### This notebook tests choosing the best classifier. It compares the output of our implementation with another implementation of the same algorithm (by aparande).
### It also checks that both implementations select the same best error on the same dataset.


In [14]:
import numpy as np
from matplotlib import pyplot as plt
import time
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from classifier import BestClassifier, WeakClassifier

In [15]:
# Use the first CUDA device when torch reports one, otherwise fall back to the CPU.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda:0" if cuda_available else "cpu")
# Bare expression: as the last statement of a notebook cell it is echoed as the cell output.
cuda_available

True

In [16]:
def EQ(x, y, permittivity=1e-6):
    """Return whether ``x`` and ``y`` differ by strictly less than ``permittivity``.

    The test is ``|x - y| < permittivity``; a difference exactly equal to the
    tolerance is reported as not equal.
    """
    gap = np.abs(x - y)
    return gap < permittivity
def EQ3(x, y, z, permittivity=1e-4):
    """Return a truthy value when ``x``, ``y`` and ``z`` agree pairwise in sequence.

    ``x`` is compared with ``y`` and, only if that passes, ``y`` with ``z``;
    both comparisons use ``EQ`` with the same ``permittivity``.
    """
    first_pair = EQ(x, y, permittivity)
    if not first_pair:
        # Mirror ``a and b``: a falsy first operand is itself the result.
        return first_pair
    return EQ(y, z, permittivity)
def OK(msg='OK'):
    """Print ``msg`` wrapped in the ANSI colour code 32 (green) and a reset code."""
    success_template = "\033[32m{}\033[0m"
    print(success_template.format(msg))
def NOK(msg='Not Equal'):
    """Print ``msg`` wrapped in the ANSI colour code 31 (red) and a reset code."""
    failure_template = "\033[31m{}\033[0m"
    print(failure_template.format(msg))

# Theirs

In [17]:
class TheirWeakClassifier:
    """Plain record describing one threshold classifier of the reference implementation.

    The five constructor arguments are stored unchanged. Each may be a Python
    or NumPy scalar or a ``torch.Tensor``; conversion only happens when the
    object is turned into an array.
    """

    def __init__(self, feature_index, feature_val, threshold, polarity, error):
        self.feature_index = feature_index
        self.feature_val = feature_val
        self.threshold = threshold
        self.polarity = polarity
        self.error = error

    @staticmethod
    def _to_numpy(value):
        """Return ``value`` as NumPy data, moving tensors to host memory first."""
        if isinstance(value, torch.Tensor):
            # detach() keeps .numpy() working for tensors that track gradients.
            return value.detach().cpu().numpy()
        return value

    def __array__(self, dtype=None, copy=None):
        """Support ``np.array(wc)`` / ``np.asarray(wc, dtype=...)``.

        Args:
            dtype: optional dtype requested by NumPy for the result.
            copy: accepted because NumPy 2 passes it to ``__array__``; the
                result is always a freshly built array, so it is not consulted.

        Returns:
            1-D array ``[feature_index, feature_val, threshold, polarity, error]``.
        """
        # Convert member by member so a mix of tensors and plain scalars works,
        # instead of deciding for all five fields from feature_index alone.
        members = [
            self._to_numpy(member)
            for member in (
                self.feature_index,
                self.feature_val,
                self.threshold,
                self.polarity,
                self.error,
            )
        ]
        return np.array(members, dtype=dtype)

    def __str__(self):
        return str(np.array(self))
    
def train_weak(X, y, features, weights, delta=0.00001):
    """Train one threshold classifier per row of ``X`` (reference implementation).

    For every row of ``X`` the samples are sorted by that row's value and swept
    once from smallest to largest. At each sample the weighted error of placing
    the threshold just below it is evaluated for both polarities, and the
    placement with the smallest error is kept.

    Args:
        X: 2-D array iterated row by row; each row holds one feature's value
            for every sample (``X.shape[0]`` is reported as the feature count).
        y: per-sample labels; a label equal to 1 counts as positive, anything
            else as negative.
        features: unused; kept so existing calls keep working.
        weights: per-sample weights, aligned with ``y``.
        delta: margin subtracted from the selected sample's feature value to
            form the threshold. Defaults to the previously hard-coded 0.00001.

    Returns:
        A list with one ``TheirWeakClassifier`` per row of ``X``. Its
        ``feature_index`` is the position of the selected sample in the sorted
        order (not the row number), ``feature_val`` is that sample's value.

    Raises:
        ValueError: if a row yields no samples, so no threshold can be chosen.
    """
    s_t = time.time()

    # Total weight of each class; accumulated sequentially on purpose so the
    # floating-point result matches the running sums used in the sweep below.
    total_pos, total_neg = 0, 0
    for w, label in zip(weights, y):
        if label == 1:
            total_pos += w
        else:
            total_neg += w

    classifiers = []
    total_features = X.shape[0]
    for feature in X:
        if len(classifiers) % 1000 == 0 and len(classifiers) != 0:
            print("Trained %d classifiers out of %d" % (len(classifiers), total_features))

        applied_feature = sorted(zip(weights, feature, y), key=lambda item: item[1])
        if not applied_feature:
            raise ValueError("train_weak needs at least one sample per feature")

        # Weight of positive / negative samples strictly before the current one.
        pos_weights, neg_weights = 0, 0
        min_error, best_feature, best_threshold, best_polarity = float('inf'), None, None, None
        for current_idx, (w, f, label) in enumerate(applied_feature):
            # Error if everything before this sample is called positive and the rest negative.
            positive_below_error = neg_weights + total_pos - pos_weights
            # Error of the opposite assignment.
            negative_below_error = pos_weights + total_neg - neg_weights
            error = min(positive_below_error, negative_below_error)
            if error < min_error:
                min_error = error
                best_feature = (current_idx, f)
                best_threshold = f - delta
                # Ties go to polarity -1, as in the original comparison.
                best_polarity = 1 if positive_below_error < negative_below_error else -1

            if label == 1:
                pos_weights += w
            else:
                neg_weights += w

        clf = TheirWeakClassifier(best_feature[0], best_feature[1], best_threshold, best_polarity, min_error)
        classifiers.append(clf)

    print("Time taken: %f seconds" % (time.time() - s_t))
    return classifiers

def select_best(classifiers, weights, X, y):
    """Pick the classifier with the smallest weighted misclassification sum.

    Classifier number ``k`` is evaluated on column ``k`` of each sample
    (``X.T`` row). A sample is predicted 1 when
    ``value * polarity <= threshold * polarity`` and 0 otherwise.

    Returns:
        ``(best_clf, best_error, best_accuracy, table)`` where
        ``best_accuracy`` is the per-sample list of "prediction differs from
        label" flags of the winner and ``table`` holds, for every classifier,
        the per-sample ``weight * flag`` products. With no classifiers the
        result is ``(None, inf, None, [])``.
    """
    samples = X.T
    winner, lowest_error, winner_flags = None, float('inf'), None
    weighted_table = []
    for position, candidate in enumerate(classifiers):
        mismatch_flags = []
        weighted_row = []
        running_error = 0
        for sample, weight, target in zip(samples, weights, y):
            if sample[position] * candidate.polarity <= candidate.threshold * candidate.polarity:
                predicted = 1
            else:
                predicted = 0
            mismatch = predicted != target
            mismatch_flags.append(mismatch)
            weighted_row.append(weight * mismatch)
            # Accumulate step by step (not via sum()) to keep the exact
            # floating-point result of a plain left-to-right addition.
            running_error += weight * mismatch
        # Strict comparison: on ties the earlier classifier stays selected.
        if running_error < lowest_error:
            winner, lowest_error, winner_flags = candidate, running_error, mismatch_flags
        weighted_table.append(weighted_row)
    return winner, lowest_error, winner_flags, weighted_table


In [18]:
n_features = 5
n_samples = 15
n_classes = 2


def generate_data(n_features=n_features, n_samples=n_samples, floatornot=True, seed=None):
    """Create a random data set for exercising the classifiers.

    Args:
        n_features: number of rows of ``X``.
        n_samples: number of columns of ``X`` and length of ``y`` / ``weights``.
        floatornot: when true, ``X`` and ``weights`` are cast to ``float32``;
            otherwise they keep NumPy's default ``float64``.
        seed: optional seed for a private ``RandomState`` so a data set can be
            reproduced. With the default ``None`` the global ``np.random``
            state is used, exactly as before.

    Returns:
        ``(X, y, weights)``: standard-normal ``X`` of shape
        ``(n_features, n_samples)``, integer labels in ``[0, n_classes)`` and
        positive weights normalised to sum to 1.
    """
    # np.random and RandomState expose the same randn/randint/rand API.
    rng = np.random if seed is None else np.random.RandomState(seed)
    X = rng.randn(n_features, n_samples)
    y = rng.randint(0, n_classes, n_samples)
    weights = rng.rand(n_samples)
    if floatornot:
        X = X.astype(np.float32)
        weights = weights.astype(np.float32)
    weights = weights / np.sum(weights)
    return X, y, weights


X, y, weights = generate_data()

In [19]:
# np.save("X.npy", X)
# np.save("y.npy", y)
# np.save("weights.npy", weights)


In [20]:
# X = np.load("X.npy")
# y = np.load("y.npy")
# weights = np.load("weights.npy")


In [21]:
# Inline copy of the reference training sweep, run at cell level on the
# notebook's current X, y and weights. It fills the global list `classifiers`
# with one WeakClassifier (from the project's `classifier` module) per row of X.
s_t = time.time()

# Total weight of the positive (label == 1) and negative (any other label) samples.
total_pos, total_neg = 0, 0
for w, label in zip(weights, y):
    if label == 1:
        total_pos += w
    else:
        total_neg += w

classifiers = []
total_features = X.shape[0]
# NOTE: `index` is not read anywhere in this cell.
for index, feature in enumerate(X):
    # Progress message every 1000 finished classifiers.
    if len(classifiers) % 1000 == 0 and len(classifiers) != 0:
        print("Trained %d classifiers out of %d" % (len(classifiers), total_features))

    # Sort the samples by this feature's value so one left-to-right sweep
    # visits every candidate threshold position.
    applied_feature = sorted(zip(weights, feature, y), key=lambda x: x[1])

    # pos_seen / neg_seen, ws, last_error and pos_seen_list are bookkeeping
    # values that are written but never read within this cell.
    pos_seen, neg_seen = 0, 0
    # Weight of the positive / negative samples strictly before the current one.
    pos_weights, neg_weights = 0, 0
    min_error, best_feature, best_threshold, best_polarity = float('inf'), None, None, None
    current_idx = 0
    ws = []
    last_error = 0
    pos_seen_list = []
    for w, f, label in applied_feature:
        ws.append(w)
        # min(all before current example are positive and all after are negative, all before current example are negative and all after are positive)
        # error = sum of weights of misclassified examples
        error = min(neg_weights + total_pos - pos_weights, pos_weights + total_neg - neg_weights)
        last_error = error
        # print("error : ", error)
        if error < min_error:
            min_error = error
            # best_feature = features[index]
            # Stores the position in the sorted order and the feature value there.
            best_feature = (current_idx, f)
            # Threshold sits just below the current sample's value.
            best_threshold = f - 0.00001
            # best_polarity = 1 if pos_seen > neg_seen else -1
            # Polarity 1 when the first term of the min() above is strictly smaller; ties give -1.
            if neg_weights + total_pos - pos_weights < pos_weights + total_neg - neg_weights:
                best_polarity = 1
            else:
                best_polarity = -1
                

        # Only now is the current sample counted as "seen", so the error above
        # was computed with it still on the right-hand side of the threshold.
        if label == 1:
            pos_seen += 1
            pos_weights += w
        else:
            neg_seen += 1
            neg_weights += w
        current_idx += 1
        pos_seen_list.append(pos_seen)

    # NOTE(review): argument meaning of WeakClassifier is assumed from the
    # names passed here (sorted position, threshold, polarity, error) - confirm
    # against the project's classifier module.
    clf = WeakClassifier(best_feature[0], best_threshold, best_polarity, min_error)
    classifiers.append(clf)

print("Time taken: %f seconds" % (time.time() - s_t))

Time taken: 0.003658 seconds


# Equivalence of the two implementations

In [22]:
def _print_candidate_errors(label, candidates, samples, sample_weights, targets, reference_error, fire_below):
    """Print every candidate's weighted error under one decision rule.

    Candidate number ``k`` is evaluated on column ``k`` of each sample. With
    ``fire_below`` true a sample is predicted 1 when
    ``value * polarity <= threshold * polarity``; otherwise ``>=`` is used.
    Rows whose error equals ``reference_error`` are flagged with dashes.
    """
    for feature_pos, clf in enumerate(candidates):
        error = 0
        for data, w, yc in zip(samples, sample_weights, targets):
            lhs = data[feature_pos] * clf.polarity
            rhs = clf.threshold * clf.polarity
            fires = lhs <= rhs if fire_below else lhs >= rhs
            classification = 1 if fires else 0
            error += w * (classification != yc)
        print("%s: %f" % (label, error), end=' ')
        print(f'Polarity: {clf.polarity}', '--------' if error == reference_error else '')


# Number of repetitions; with L > 1 every repetition draws a fresh data set.
L = 1
for i in range(L):
    if L > 1:
        X, y, weights = generate_data()
    weak_classifiers = BestClassifier(X, y, weights, 1000, False, False, debug=True, delta=0.00001)
    BC, (classifiers4, LW, RW) = weak_classifiers.chooseClassifier()
    best_index, best_threshold, best_polarity, best_error = BC.feature_index, BC.threshold, BC.polarity, BC.error

    if L == 1:
        print(f"Best classifier: index {best_index}, threshold {best_threshold}, polarity {best_polarity}, error {best_error}")

    # Sanity check on the debug weights returned by chooseClassifier().
    # NOTE(review): LW / RW semantics come from the project's BestClassifier;
    # the check is two-sided now - the previous one-sided comparison also
    # accepted totals far below 1.
    boundary_total = LW[0] + RW[0] + LW[-1] + RW[-1]
    try:
        assert (abs(boundary_total - 1) < 0.00001).all(), 'NOT EQUAL'
        OK('Weights sum to 1')
    except Exception:
        # Exception (not a bare except) so Ctrl-C still interrupts the cell.
        NOK('Weights do not sum to 1')
        print(boundary_total)

    # Recompute predictions and weighted error of the chosen classifier by hand.
    datas = X.T
    compared_error = 0
    prediction1 = []
    for data, w, yc in zip(datas, weights, y):
        classification = 1 if data[best_index] * best_polarity <= best_threshold * best_polarity else 0
        prediction1.append(classification)
        correctness = classification != yc
        compared_error += w * correctness

    predictions2 = BC.predict(X)
    try:
        assert (prediction1 == predictions2).all(), 'NOT EQUAL'
        OK('Predictions are correct')
    except Exception:
        NOK('Predictions are not correct')

    if i % 100 == 0:
        print("Finished %d iterations" % i)
    try:
        assert abs(compared_error - best_error) < 0.0000001, 'NOT EQUAL'
        OK('Best error is correct')
    except Exception:
        NOK('Best error is not correct')
        print("Compared error: %f" % compared_error)
        print("Best error: %f" % best_error)

        print("Compared error: %f" % compared_error)

        # Diagnostic dump of every candidate under both comparison directions.
        # The third and fourth passes repeat the first two (the original code
        # overwrote their `==` variant immediately); they are kept so the
        # printed output stays the same.
        _print_candidate_errors("Error", classifiers4, datas, weights, y, best_error, fire_below=False)
        print()
        _print_candidate_errors("Error2", classifiers4, datas, weights, y, best_error, fire_below=True)
        print()
        _print_candidate_errors("Error2", classifiers4, datas, weights, y, best_error, fire_below=False)
        print()
        _print_candidate_errors("Error2", classifiers4, datas, weights, y, best_error, fire_below=True)
        break

Starting to choose classifier
Best classifier: index 0, threshold 1.3954046964645386, polarity -1, error 0.20713873207569122
[32mWeights sum to 1[0m
[32mPredictions are correct[0m
Finished 0 iterations
[32mBest error is correct[0m


In [23]:
# Run the reference implementation on the same data and compare its winner
# with the values kept from the BestClassifier run (best_error, best_threshold,
# best_polarity).
weak_classifiers2 = train_weak(X, y, None, weights)
best_clf2, best_error2, best_accuracy2, aaa2 = select_best(weak_classifiers2, weights, X, y)

print(f'Best classifier: index {best_clf2.feature_index}, threshold {best_clf2.threshold}, polarity {best_clf2.polarity}, error {best_error2}')

# The checks run in order and stop at the first mismatch.
if not EQ(best_error, best_error2):
    mismatch = 'ϵ NOT EQUAL'
elif not EQ(best_threshold, best_clf2.threshold):
    mismatch = 'θ NOT EQUAL'
elif not EQ(best_polarity, best_clf2.polarity):
    mismatch = 'p NOT EQUAL'
else:
    mismatch = None

if mismatch is None:
    OK('Best classifier is correct')
else:
    NOK('Best classifier is not correct')
    print(mismatch)

Time taken: 0.001384 seconds
Best classifier: index 12, threshold 1.3954047100448608, polarity -1, error 0.20713873486965895
[32mBest classifier is correct[0m


In [24]:
# Compare, position by position, the classifiers from BestClassifier
# (classifiers4), the reference train_weak (weak_classifiers2) and the inline
# training cell (classifiers).
for i in range(len(classifiers4)):
    ours, theirs, inline = classifiers4[i], weak_classifiers2[i], classifiers[i]
    # The checks run in order; the first mismatch decides the message.
    if not EQ3(ours.threshold, theirs.threshold, inline.threshold):
        mismatch = 'θ NOT EQUAL'
    elif not EQ3(ours.polarity, theirs.polarity, inline.polarity):
        mismatch = 'p NOT EQUAL'
    elif not EQ3(ours.error, theirs.error, inline.error):
        mismatch = 'e NOT EQUAL'
    else:
        OK(f'classifier {i} is correct')
        continue
    NOK('Classifiers are not equal')
    print(mismatch)
    print('4', ours)
    print('1', inline)
    print('2', theirs)
    print()

# Speed test

In [25]:
# Speed test on a large float64 data set. The timer starts before the data is
# generated, so both reported durations include data generation.
n_features, n_samples = 16000, 15000
s_t = time.time()

X, y, weights = generate_data(n_features=n_features, n_samples=n_samples, floatornot=False)

# The notebook also kept a variant of this call with getClassifier=False.
weak_classifiers = BestClassifier(
    X, y, weights, 1000,
    show_time=True, show_mem=True, debug=False,
    getClassifier=True, delete_unused=True, delta=0.000001,
)
print('Created object, now training at %f seconds' % (time.time() - s_t))
BC, (classifiers4, LW, RW) = weak_classifiers.chooseClassifier()
best_index = BC.feature_index
best_threshold = BC.threshold
best_polarity = BC.polarity
best_error = BC.error

print('Cell took: %f seconds' % (time.time() - s_t))

Created object, now training at 4.795163 seconds
Starting to choose classifier
At batch number:  0 :  Start time:  0.09859371185302734
Memory for batch:  (58.1171875, 1446.0)
At batch number:  1 :  Start time:  0.263134241104126
Memory for batch:  (647.80712890625, 1446.0)
At batch number:  2 :  Start time:  0.4236454963684082
Memory for batch:  (647.3662109375, 1446.0)
At batch number:  3 :  Start time:  0.5764758586883545
Memory for batch:  (646.26416015625, 1446.0)
At batch number:  4 :  Start time:  0.7311372756958008
Memory for batch:  (647.3662109375, 1446.0)
At batch number:  5 :  Start time:  0.8856208324432373
Memory for batch:  (647.03955078125, 1446.0)
At batch number:  6 :  Start time:  1.0397913455963135
Memory for batch:  (647.3818359375, 1446.0)
At batch number:  7 :  Start time:  1.2577593326568604
Memory for batch:  (646.26806640625, 1446.0)
At batch number:  8 :  Start time:  1.4109973907470703
Memory for batch:  (647.3818359375, 1446.0)
At batch number:  9 :  Start t

In [26]:
# Deliberate stop so running all cells does not continue into the slow
# reference benchmark below. Python 3 only allows raising BaseException
# instances; raising a plain string aborted with an unrelated TypeError
# instead of showing this message.
raise RuntimeError('STOP')

In [27]:
# Time the reference trainer on the large data set.
s_t = time.time()
weak_classifiers2 = train_weak(X, y, None, weights)
elapsed = time.time() - s_t
print('Cell took: %f seconds' % elapsed)

Trained 1000 classifiers out of 16000
Trained 2000 classifiers out of 16000
Trained 3000 classifiers out of 16000
Trained 4000 classifiers out of 16000
Trained 5000 classifiers out of 16000
Trained 6000 classifiers out of 16000
Trained 7000 classifiers out of 16000
Trained 8000 classifiers out of 16000
Trained 9000 classifiers out of 16000
Trained 10000 classifiers out of 16000
Trained 11000 classifiers out of 16000
Trained 12000 classifiers out of 16000
Trained 13000 classifiers out of 16000
Trained 14000 classifiers out of 16000
Trained 15000 classifiers out of 16000
Time taken: 277.268840 seconds
Cell took: 277.270168 seconds


In [28]:

#! dies already! 😏😒
# s_t = time.time()
# best_clf2, best_error2, best_accuracy2, aaa2 = select_best(weak_classifiers2, weights, X, y)

# print('Cell took: %f seconds' % (time.time() - s_t))

In [29]:

#! Expected to fail: the select_best() call for the large run is commented out
#! in the previous cell, so best_error2 / best_clf2 still hold the values
#! computed earlier on the small data set, while best_error, best_threshold
#! and best_polarity come from the large speed-test run. 😏😒
try:
    # The checks run in order; the first failing assert decides the printed message.
    assert EQ(best_error, best_error2), 'ϵ NOT EQUAL'
    assert EQ(best_threshold, best_clf2.threshold), 'θ NOT EQUAL'
    assert EQ(best_polarity, best_clf2.polarity), 'p NOT EQUAL'
    OK('Best classifier is correct')
except AssertionError as e:
    NOK("Best classifier is not correct, but it's ok 😏😒")
    print(e)

[31mBest classifier is not correct, but it's ok 😏😒[0m
ϵ NOT EQUAL


In [30]:
# Collect the positions at which the BestClassifier candidates (classifiers4)
# and the reference candidates (weak_classifiers2) disagree. The index uses
# EQ's default tolerance; threshold, polarity and error use 0.001.
differences = []
for i in range(len(classifiers4)):
    ours, theirs = classifiers4[i], weak_classifiers2[i]
    agrees = (
        EQ(ours.feature_index, theirs.feature_index)
        and EQ(ours.threshold, theirs.threshold, permittivity=0.001)
        and EQ(ours.polarity, theirs.polarity, permittivity=0.001)
        and EQ(ours.error, theirs.error, permittivity=0.001)
    )
    if not agrees:
        differences.append(i)

# Always printed in green, whatever the number of differences.
OK('Classifiers are equal, except for %d classifiers' % len(differences))

[32mClassifiers are equal, except for 8 classifiers[0m


In [31]:
# Show both versions of every classifier recorded in `differences`.
for i in differences:
    for tag, candidates in (('4', classifiers4), ('2', weak_classifiers2)):
        print(tag, candidates[i])
    OK('it is so close')

4 [ 2.31100000e+03 -1.02175319e+00  1.00000000e+00  4.87428665e-01]
2 [ 2.31300000e+03 -1.02143836e+00 -1.02144836e+00  1.00000000e+00
  4.87428769e-01]
[32mit is so close[0m
4 [ 1.31400000e+03 -1.35705221e+00  1.00000000e+00  4.91436005e-01]
2 [ 1.31300000e+03 -1.35705127e+00 -1.35706127e+00  1.00000000e+00
  4.91437265e-01]
[32mit is so close[0m
4 [ 5.34000000e+02 -1.82065535e+00  1.00000000e+00  4.92238462e-01]
2 [ 1.40050000e+04  1.53930030e+00  1.53929030e+00 -1.00000000e+00
  4.92238253e-01]
[32mit is so close[0m
4 [ 1.36550000e+04  1.36341846e+00 -1.00000000e+00  4.90879685e-01]
2 [ 1.36670000e+04  1.36779858e+00  1.36778858e+00 -1.00000000e+00
  4.90879320e-01]
[32mit is so close[0m
4 [ 8.95400000e+03  2.40293860e-01 -1.00000000e+00  4.87884402e-01]
2 [ 8.95800000e+03  2.40481278e-01  2.40471278e-01 -1.00000000e+00
  4.87884178e-01]
[32mit is so close[0m
4 [ 1.66800000e+03 -1.22020626e+00  1.00000000e+00  4.90470648e-01]
2 [ 1.66900000e+03 -1.22009661e+00 -1.22010661e