### This notebook tests choosing the best classifier. It compares its output with another implementation of the same algorithm (the one by aparande).
### It also ensures that the best error chosen is the same on the dataset.


In [727]:
import numpy as np
from matplotlib import pyplot as plt
import time
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from accumulative import WeakClassifier, OneClassifier

In [728]:
# Use the first CUDA GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
# Last expression of the cell: displays whether CUDA is available.
torch.cuda.is_available()

False

In [729]:
def EQ(x, y, permittivity=1e-6):
    """Return whether ``x`` and ``y`` differ by strictly less than ``permittivity``.

    The check is ``|x - y| < permittivity``; for array inputs NumPy evaluates
    it element-wise and an array of booleans is returned.
    """
    gap = np.abs(x - y)
    return gap < permittivity
def EQ3(x, y, z, permittivity=1e-4):
    """Return whether ``x``/``y`` and ``y``/``z`` are both equal within ``permittivity``.

    Only adjacent pairs are compared (``x`` with ``y``, then ``y`` with ``z``);
    the second comparison is skipped when the first one already fails.
    """
    first_pair_close = EQ(x, y, permittivity)
    if not first_pair_close:
        return first_pair_close
    return EQ(y, z, permittivity)
def OK(msg='OK'):
    """Print ``msg`` wrapped in the ANSI escape codes for green text."""
    green_template = "\033[32m{}\033[0m"
    print(green_template.format(msg))
def NOK(msg='Not Equal'):
    """Print ``msg`` wrapped in the ANSI escape codes for red text."""
    red_template = "\033[31m{}\033[0m"
    print(red_template.format(msg))

# Theirs

In [730]:
class TheirWeakClassifier:
    """Plain record describing one threshold classifier from the reference trainer.

    The five constructor arguments are stored unchanged as attributes of the
    same name. Each may be a Python/NumPy scalar or a torch tensor.
    """

    def __init__(self, feature_index, feature_val, threshold, polarity, error):
        self.feature_index = feature_index
        self.feature_val = feature_val
        self.threshold = threshold
        self.polarity = polarity
        self.error = error

    @staticmethod
    def _as_numpy(value):
        """Convert a torch tensor to a NumPy array; return any other value unchanged."""
        if isinstance(value, torch.Tensor):
            # detach() so tensors that track gradients can be converted as well
            return value.detach().cpu().numpy()
        return value

    # NumPy array protocol hook for easier access, example: np.array(wc)
    def __array__(self, dtype=None, copy=None):
        """Return ``[feature_index, feature_val, threshold, polarity, error]`` as an array.

        Each member is converted on its own, so a mix of tensors and plain
        numbers is supported. ``dtype`` is applied when given. ``copy`` is
        accepted for compatibility with the NumPy array protocol; a new array
        is built on every call regardless of its value.
        """
        members = (self.feature_index, self.feature_val, self.threshold, self.polarity, self.error)
        values = np.array([self._as_numpy(member) for member in members])
        if dtype is not None:
            values = values.astype(dtype)
        return values

    def __str__(self):
        """Format the classifier the same way NumPy prints its array form."""
        return str(np.array(self))
    
def train_weak(X, y, features, weights):
    """Train one threshold classifier per feature row (reference implementation).

    For every row of ``X`` the samples are sorted by feature value and each
    sample is tried as a split point. The split with the smallest weighted
    error is kept; its threshold is the sample's value minus ``0.00001``.

    Args:
        X: array indexed as ``X[feature][sample]``; ``X.shape[0]`` is read as
            the number of features.
        y: per-sample labels; a label equal to 1 counts as positive, anything
            else as negative.
        features: unused; kept so the signature stays unchanged.
        weights: per-sample weights, aligned with ``y``.

    Returns:
        A list with one ``TheirWeakClassifier`` per row of ``X``. Its
        ``feature_index`` is the position of the chosen sample in the sorted
        order (not the row index of the feature), ``feature_val`` is that
        sample's value, and ``error`` is the minimal weighted error found.

    Prints a progress line every 1000 classifiers and the elapsed time.
    Every feature row needs at least one sample; with none, no split is found
    and building the classifier fails.
    """
    s_t = time.time()

    # Total weight of each class. Accumulated one by one (not with a
    # vectorised sum) so the floating-point results stay unchanged.
    total_pos, total_neg = 0, 0
    for w, label in zip(weights, y):
        if label == 1:
            total_pos += w
        else:
            total_neg += w

    classifiers = []
    total_features = X.shape[0]
    for feature in X:
        if len(classifiers) % 1000 == 0 and len(classifiers) != 0:
            print("Trained %d classifiers out of %d" % (len(classifiers), total_features))

        applied_feature = sorted(zip(weights, feature, y), key=lambda x: x[1])

        # Weight of positive / negative samples strictly before the current one.
        pos_weights, neg_weights = 0, 0
        min_error, best_feature, best_threshold, best_polarity = float('inf'), None, None, None
        for current_idx, (w, f, label) in enumerate(applied_feature):
            # Error if everything before the current sample is called positive
            # and the rest negative: negatives before + positives from here on.
            error_pos_before = neg_weights + total_pos - pos_weights
            # Error of the mirrored rule (negative before, positive from here on).
            error_neg_before = pos_weights + total_neg - neg_weights
            error = min(error_pos_before, error_neg_before)
            if error < min_error:
                min_error = error
                best_feature = (current_idx, f)
                # Place the threshold just below the current sample's value.
                best_threshold = f - 0.00001
                best_polarity = 1 if error_pos_before < error_neg_before else -1

            if label == 1:
                pos_weights += w
            else:
                neg_weights += w

        clf = TheirWeakClassifier(best_feature[0], best_feature[1], best_threshold, best_polarity, min_error)
        classifiers.append(clf)

    print("Time taken: %f seconds" % (time.time() - s_t))
    return classifiers

def select_best(classifiers, weights, X, y):
    best_clf, best_error, best_accuracy = None, float('inf'), None
    xt = X.T
    aaa = []
    for i, clf in enumerate(classifiers):
        error, accuracy = 0, []
        bbb = []
        for data, w, yc in zip(xt, weights, y):
            classification = 1 if data[i] * clf.polarity <= clf.threshold * clf.polarity else 0
            correctness = classification != yc
            accuracy.append(correctness)
            bbb.append(w * correctness)
            error += w * correctness
        if error < best_error:
            best_clf, best_error, best_accuracy = clf, error, accuracy
        aaa.append(bbb)
    return best_clf, best_error, best_accuracy, aaa


In [731]:
# Dataset dimensions. Other configurations that were tried:
#   n_features, n_samples = 16000, 15000
#   n_features, n_samples = 3, 10
n_features = 5
n_samples = 15
n_classes = 2

# Fixed seed so that re-running this cell (or Restart & Run All) regenerates
# the same dataset; change SEED to explore other random cases.
SEED = 0
RNG = np.random.default_rng(SEED)


def generate_data(rng=None):
    """Draw a random dataset of shape ``(n_features, n_samples)``.

    Args:
        rng: optional ``numpy.random.Generator``. When omitted, the notebook's
            seeded ``RNG`` is used, so successive calls yield different but
            reproducible datasets.

    Returns:
        ``(X, y, weights)``: standard-normal features indexed as
        ``X[feature][sample]``, integer labels drawn from ``[0, n_classes)``,
        and non-negative sample weights normalised to sum to 1.
    """
    rng = RNG if rng is None else rng
    X = rng.standard_normal((n_features, n_samples))
    y = rng.integers(0, n_classes, n_samples)
    weights = rng.random(n_samples)
    weights = weights / np.sum(weights)
    return X, y, weights


X, y, weights = generate_data()

In [732]:
# Optional: uncomment to save the generated dataset to disk (it can be read back with np.load).
# np.save("X.npy", X)
# np.save("y.npy", y)
# np.save("weights.npy", weights)


In [733]:
# Optional: uncomment to replace the generated dataset with one previously written by np.save.
# X = np.load("X.npy")
# y = np.load("y.npy")
# weights = np.load("weights.npy")


In [734]:
# Inline version of the per-feature threshold training. It builds
# OneClassifier instances so the results can be compared attribute by
# attribute with the other implementations in the cells below.
s_t = time.time()

# Total weight of each class (label 1 is positive, anything else negative).
# Accumulated one by one so the floating-point results stay unchanged.
total_pos, total_neg = 0, 0
for w, label in zip(weights, y):
    if label == 1:
        total_pos += w
    else:
        total_neg += w

classifiers = []
total_features = X.shape[0]
for feature in X:
    if len(classifiers) % 1000 == 0 and len(classifiers) != 0:
        print("Trained %d classifiers out of %d" % (len(classifiers), total_features))

    # Visit the samples of this feature in increasing order of feature value.
    applied_feature = sorted(zip(weights, feature, y), key=lambda x: x[1])

    # Weight of positive / negative samples strictly before the current one.
    pos_weights, neg_weights = 0, 0
    min_error, best_feature, best_threshold, best_polarity = float('inf'), None, None, None
    for current_idx, (w, f, label) in enumerate(applied_feature):
        # Error if everything before the current sample is called positive and
        # the rest negative: negatives before + positives from here on.
        error_pos_before = neg_weights + total_pos - pos_weights
        # Error of the mirrored rule (negative before, positive from here on).
        error_neg_before = pos_weights + total_neg - neg_weights
        error = min(error_pos_before, error_neg_before)
        if error < min_error:
            min_error = error
            # Position in the sorted order and the value of the split sample.
            best_feature = (current_idx, f)
            # Place the threshold just below the current sample's value.
            best_threshold = f - 0.00001
            best_polarity = 1 if error_pos_before < error_neg_before else -1

        if label == 1:
            pos_weights += w
        else:
            neg_weights += w

    clf = OneClassifier(best_feature[0], best_feature[1], best_threshold, best_polarity, min_error)
    classifiers.append(clf)

print("Time taken: %f seconds" % (time.time() - s_t))

Time taken: 0.000497 seconds


In [735]:
# Number of randomized trials. With a single trial the dataset generated
# earlier is reused, so the later cells compare every implementation on the
# same data; with more trials a fresh dataset is drawn each time.
L = 1
# Largest allowed gap between the reported and the recomputed best error.
error_tolerance = 0.0000001


def _stump_error(samples, sample_weights, labels, feature, threshold, polarity, fire_below=True):
    """Weighted misclassification error of a one-feature threshold classifier.

    A sample is classified as 1 when ``value * polarity <= threshold * polarity``
    (``fire_below=True``) or when ``value * polarity >= threshold * polarity``
    (``fire_below=False``), and as 0 otherwise. The weights of all samples
    whose classification differs from their label are summed.
    """
    error = 0
    for data, w, yc in zip(samples, sample_weights, labels):
        lhs = data[feature] * polarity
        rhs = threshold * polarity
        fires = lhs <= rhs if fire_below else lhs >= rhs
        classification = 1 if fires else 0
        error += w * (classification != yc)
    return error


for i in range(L):
    if L > 1:
        X, y, weights = generate_data()
    weak_classifiers = WeakClassifier(X, y, weights, 1000, False, False, delta=0.00001)
    best_index, best_threshold, best_polarity, best_error, classifiers4, LW, RW = weak_classifiers.chooseClassifier()
    if L == 1:
        print("Best classifier: index %d, threshold %f, polarity %d, error %f" % (best_index, best_threshold, best_polarity, best_error))

    # NOTE(review): assumes the first and last entries of LW and RW together
    # account for the whole weight mass - confirm against WeakClassifier.
    # The absolute value makes the check two-sided (a sum far below 1 used to pass).
    if (abs(LW[0] + RW[0] + LW[-1] + RW[-1] - 1) < 0.00001).all():
        OK('Weights sum to 1')
    else:
        NOK('Weights do not sum to 1')

    # Recompute the error of the chosen classifier directly from the data.
    datas = X.T
    compared_error = _stump_error(datas, weights, y, best_index, best_threshold, best_polarity)

    if i % 100 == 0:
        print("Finished %d iterations" % i)

    if abs(compared_error - best_error) < error_tolerance:
        OK('Best error is correct')
    else:
        NOK('Best error is not correct')
        print("Compared error: %f" % compared_error)
        print("Best error: %f" % best_error)

        # Diagnostics: error of every per-feature classifier under both
        # comparison directions ("Error" uses >=, "Error2" uses <=). Rows whose
        # error matches the reported best error are marked.
        for label, fire_below in (("Error", False), ("Error2", True)):
            for feature_idx, clf in enumerate(classifiers4):
                error = _stump_error(datas, weights, y, feature_idx, clf.threshold, clf.polarity, fire_below)
                marker = '--------' if abs(error - best_error) < error_tolerance else ''
                print("%s: %f" % (label, error), end=' ')
                print(f'Polarity: {clf.polarity}', marker)
            print()
        # Stop at the first failing trial so its data stays available for inspection.
        break

Best classifier: index 0, threshold 0.329738, polarity 1, error 0.044251
[32mWeights sum to 1[0m
Finished 0 iterations
[32mBest error is correct[0m


In [736]:
# Train and select with the reference ("their") implementation on the same data.
weak_classifiers2 = train_weak(X, y, None, weights)
best_clf2, best_error2, best_accuracy2, aaa2 = select_best(weak_classifiers2, weights, X, y)

# Report the reference implementation's own error (best_error2), not ours.
# NOTE: train_weak stores the position of the split sample in the sorted
# feature values as feature_index, not the feature row. That is why the index
# is printed for information only and not compared with best_index.
print("Best classifier: index %d, threshold %f, polarity %d, error %f" % (best_clf2.feature_index, best_clf2.threshold, best_clf2.polarity, best_error2))
try:
    assert EQ(best_error, best_error2), 'NOT EQUAL'
    assert EQ(best_clf2.threshold, best_threshold), 'NOT EQUAL'
    assert EQ(best_clf2.polarity, best_polarity), 'NOT EQUAL'
    OK('Best classifier is correct')
except AssertionError as e:
    NOK('Best classifier is not correct')
    print(e)

Time taken: 0.000198 seconds
Best classifier: index 9, threshold 0.329738, polarity 1, error 0.044251
[32mBest classifier is correct[0m


In [737]:
# Compare the classifier of every feature across the three implementations:
# classifiers4 (WeakClassifier.chooseClassifier), weak_classifiers2 (train_weak)
# and classifiers (the inline training cell). Checks run in the listed order
# and the first failing one determines the reported message.
checked_fields = (
    ('threshold', 'θ NOT EQUAL'),
    ('polarity', 'p NOT EQUAL'),
    ('error', 'e NOT EQUAL'),
)
for i, ours in enumerate(classifiers4):
    theirs, inline = weak_classifiers2[i], classifiers[i]
    try:
        for field, complaint in checked_fields:
            assert EQ3(getattr(ours, field), getattr(theirs, field), getattr(inline, field)), complaint
        OK(f'classifier {i} is correct')
    except AssertionError as e:
        NOK('Classifiers are not equal')
        print(e)
        print('4', ours)
        print('1', inline)
        print('2', theirs)
        print()

[32mclassifier 0 is correct[0m
[32mclassifier 1 is correct[0m
[32mclassifier 2 is correct[0m
[32mclassifier 3 is correct[0m
[32mclassifier 4 is correct[0m
