In [42]:
from libsvm.svmutil import *
from liblinear.liblinearutil import *
from sklearn.metrics import accuracy_score
import multiprocessing as mp
import numpy as np
import itertools
import random
import math
from math import log, exp
import sys

In [43]:
# Load data
y_train, X_train = svm_read_problem("letter_train")
y_test, X_test = svm_read_problem("letter_test")

# Label the data: class 11 becomes +1, class 26 becomes -1, all other classes
# are dropped from the binary training set.
LABEL_MAP = {11: 1, 26: -1}

# Rows are looked up by 1-based feature number. In the sparse LIBSVM text
# format a zero-valued feature may be omitted, so sizing a row by len(row)
# can skip or mis-index features. Size every dense row by the largest feature
# index seen in training and fill absent entries with 0.
# NOTE(review): assumes each row is a dict keyed by feature index, as the
# 1-based lookups in the original cell imply — confirm against the loader.
NUM_FEATURES = max((max(row) for row in X_train if row), default=0)

train_new_x = []
train_new_y = []

for label, row in zip(y_train, X_train):
    if label in LABEL_MAP:
        train_new_y.append(LABEL_MAP[label])
        train_new_x.append([row.get(k, 0.0) for k in range(1, NUM_FEATURES + 1)])


In [44]:
def get_error_rate(pred, true, weights):
    """Return the weighted 0/1 error of ``pred`` measured against ``true``.

    Args:
        pred: predicted labels, indexable by sample position.
        true: reference labels, indexable by sample position.
        weights: per-sample weights; one entry is consumed per sample.

    Returns:
        The summed weight of the positions where ``pred`` and ``true`` differ,
        divided by the sum of all weights.
    """
    # Weight mass sitting on the misclassified positions.
    mismatched_weight = sum(
        weight
        for position, weight in enumerate(weights)
        if pred[position] != true[position]
    )
    total_weight = sum(weights)
    return mismatched_weight / total_weight

def decision_stump(X, y, weights):
    """Search for the decision stump with the smallest weighted 0/1 error.

    A stump predicts ``s`` when ``x[feature] < threshold`` and ``-s``
    otherwise. For every feature the candidate thresholds are ``-inf`` plus
    the midpoints between consecutive *distinct* sorted feature values.
    A midpoint between two equal values is the value itself and splits the
    samples exactly like the candidate before it, so it is skipped instead
    of being re-evaluated.

    The error of a candidate is the same weighted 0/1 error that
    ``get_error_rate`` computes; it is evaluated inline here so the total
    weight is summed once per call rather than once per candidate.

    Args:
        X: sequence of feature rows, each indexable by feature position.
        y: labels, compared for equality with the stump outputs ``s`` / ``-s``.
        weights: per-sample weights.

    Returns:
        ``(best_feature, best_threshold, best_s, best_error)``. On ties the
        first candidate found wins (features ascending, thresholds ascending,
        ``s = -1`` before ``s = 1``).
    """
    num_features = len(X[0])
    total_weight = sum(weights)  # loop-invariant denominator
    best_error = float('inf')
    best_feature, best_threshold, best_s = None, None, None

    for i in range(num_features):
        column = [x[i] for x in X]
        ordered = sorted(column)

        # -inf puts every sample on the "not below" side; the remaining
        # candidates separate neighbouring distinct values.
        thresholds = [float('-inf')]
        for low, high in zip(ordered, ordered[1:]):
            if low != high:
                thresholds.append((low + high) / 2)

        for threshold in thresholds:
            for s in (-1, 1):
                misclassified = sum(
                    w
                    for idx, w in enumerate(weights)
                    if (s if column[idx] < threshold else -s) != y[idx]
                )
                error = misclassified / total_weight

                # Strict comparison keeps the earliest candidate on ties.
                if error < best_error:
                    best_error = error
                    best_feature = i
                    best_threshold = threshold
                    best_s = s

    return best_feature, best_threshold, best_s, best_error

def adaboost_stump(X, y, T):
    """Run T rounds of AdaBoost using ``decision_stump`` as the base learner.

    Each round fits a stump on the current sample weights, derives its vote
    ``alpha = 0.5 * log((1 - error) / error)``, multiplies every weight by
    ``exp(-alpha * prediction * label)`` and renormalises the weights so
    they sum to 1.

    Args:
        X: sequence of feature rows, each indexable by feature position.
        y: labels, multiplied with the ±1 stump predictions in the update.
        T: number of boosting rounds.

    Returns:
        ``(alphas, classifiers)``: two lists of length T, where
        ``classifiers[t]`` is the ``(feature, threshold, s)`` stump of round
        ``t`` and ``alphas[t]`` is its vote weight.
    """
    num_samples = len(X)

    # Initialize weights
    weights = [1 / num_samples] * num_samples

    alphas = []
    classifiers = []

    # A stump that separates the weighted data perfectly reports error == 0,
    # which would divide by zero inside the log. Clamping keeps alpha large
    # but finite; errors strictly inside (eps, 1 - eps) are left untouched.
    eps = 1e-10

    for t in range(T):
        # Train a decision stump
        feature, threshold, s, error = decision_stump(X, y, weights)

        # Calculate alpha
        error = min(max(error, eps), 1 - eps)
        alpha = 0.5 * log((1 - error) / error)
        alphas.append(alpha)

        # Update weights: shrink correctly classified samples, grow mistakes.
        pred = [s if x[feature] < threshold else -s for x in X]
        weights = [w * exp(-alpha * p * label) for w, p, label in zip(weights, pred, y)]
        # Sum once; recomputing it per element made normalisation quadratic.
        total_weight = sum(weights)
        weights = [w / total_weight for w in weights]

        # Save the classifier
        classifiers.append((feature, threshold, s))

        # Print progress
        print("Iteration", t + 1, "completed")

    return alphas, classifiers

def evaluate(X, y, alphas, classifiers):
    """Return the 0/1 error rate of the weighted stump ensemble on ``(X, y)``.

    Every stump ``(feature, threshold, s)`` votes ``s`` when
    ``x[feature] < threshold`` and ``-s`` otherwise; votes are scaled by the
    matching ``alpha`` and summed. The ensemble prediction is the sign of
    that sum (a score of exactly 0 is mapped to +1), and it is this ±1
    prediction — not the raw real-valued score — that is compared to the
    label.

    Args:
        X: sequence of feature rows, each indexable by feature position.
        y: ±1 labels, one per row of ``X``.
        alphas: vote weights, paired positionally with ``classifiers``.
        classifiers: sequence of ``(feature, threshold, s)`` stumps.

    Returns:
        The fraction of rows whose ensemble prediction differs from the label.
    """
    num_samples = len(X)
    mistakes = 0

    for i, row in enumerate(X):
        score = sum(
            alpha * (s if row[feature] < threshold else -s)
            for alpha, (feature, threshold, s) in zip(alphas, classifiers)
        )
        # Collapse the weighted vote to a class label before comparing.
        prediction = 1 if score >= 0 else -1
        if prediction != y[i]:
            mistakes += 1

    return mistakes / num_samples

In [45]:
# Run AdaBoost-Stump on the training data
T = 1000
alphas, classifiers = adaboost_stump(train_new_x, train_new_y, T)

# Build the binary test set here so this cell does not rely on variables left
# over from earlier kernel state. Use the same mapping as the training set:
# class 11 -> +1, class 26 -> -1, other classes dropped. Rows are converted to
# dense 0-based lists with the same width as the training rows, because the
# stumps index features by 0-based position.
# NOTE(review): assumes X_test rows are dicts keyed by 1-based feature index,
# like the rows the training cell reads — confirm against the loader.
test_label_map = {11: 1, 26: -1}
test_num_features = len(train_new_x[0])
test_new_x = []
test_new_y = []
for label, row in zip(y_test, X_test):
    if label in test_label_map:
        test_new_y.append(test_label_map[label])
        test_new_x.append([row.get(k, 0.0) for k in range(1, test_num_features + 1)])

# Evaluate the classifier on the training data
ein = evaluate(train_new_x, train_new_y, alphas, classifiers)

# Evaluate the classifier on the test data
eout = evaluate(test_new_x, test_new_y, alphas, classifiers)

# Unweighted training error of every individual stump g_t, computed once and
# reused for both the minimum and the maximum.
uniform_weights = [1] * len(train_new_x)
stump_eins = [
    get_error_rate(
        [s if row[feature] < threshold else -s for row in train_new_x],
        train_new_y,
        uniform_weights,
    )
    for feature, threshold, s in classifiers
]

# Calculate min_{1≤t≤1000} Ein(gt)
min_ein = min(stump_eins)

# Calculate max_{1≤t≤1000} Ein(gt)
max_ein = max(stump_eins)

# Print the results
print("min_{1≤t≤1000} Ein(gt):", min_ein)
print("max_{1≤t≤1000} Ein(gt):", max_ein)
print("Ein(G):", ein)
print("Eout(G):", eout)

Iteration 1 completed
Iteration 2 completed
Iteration 3 completed
Iteration 4 completed
Iteration 5 completed
Iteration 6 completed
Iteration 7 completed
