In [1]:
import numpy as np
from matplotlib import pyplot as plt
import time
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [2]:
class WeakClassifier:
    def __init__(self, feature_index, feature_val, threshold, polarity, error):
        self.feature_index = feature_index
        self.feature_val = feature_val
        self.threshold = threshold
        self.polarity = polarity
        self.error = error
    
    # make a function for easier access as numpy array, example: np.array(wc)
    def __array__(self):
        # return tensor.cpu() if members are tensors else np.array
        if type(self.feature_index) == torch.Tensor:
            return np.array([self.feature_index.cpu().numpy(), self.feature_val.cpu().numpy(), self.threshold.cpu().numpy(), self.polarity.cpu().numpy(), self.error.cpu().numpy()])
        else:
            return np.array([self.feature_index, self.feature_val, self.threshold, self.polarity, self.error])
        
    def __str__(self):
        return np.array(self).__str__()
    

# np init

In [11]:
n_features = 16000
n_samples = 15000
# n_features = 3
# n_samples = 10
n_classes = 2
X = np.random.randn(n_features, n_samples)
y = np.random.randint(0, n_classes, n_samples)
y = np.array([1 if i == 1 else -1 for i in y])
weights = np.random.randn(n_samples)


# Their method

In [12]:
s_t = time.time()

total_pos, total_neg = 0, 0
for w, label in zip(weights, y):
    if label == 1:
        total_pos += w
    else:
        total_neg += w

classifiers = []
total_features = X.shape[0]
for index, feature in enumerate(X):
    if len(classifiers) % 1000 == 0 and len(classifiers) != 0:
        print("Trained %d classifiers out of %d" % (len(classifiers), total_features))

    applied_feature = sorted(zip(weights, feature, y), key=lambda x: x[1])

    pos_seen, neg_seen = 0, 0
    pos_weights, neg_weights = 0, 0
    min_error, best_feature, best_threshold, best_polarity = float('inf'), None, None, None
    current_idx = 0
    ws = []
    last_error = 0
    pos_seen_list = []
    for w, f, label in applied_feature:
        ws.append(w)
        # min(all before current example are positive and all after are negative, all before current example are negative and all after are positive)
        # error = sum of weights of misclassified examples
        error = min(neg_weights + total_pos - pos_weights, pos_weights + total_neg - neg_weights)
        last_error = error
        # print("error : ", error)
        if error < min_error:
            min_error = error
            # best_feature = features[index]
            best_feature = (current_idx, f)
            best_threshold = f
            best_polarity = 1 if pos_seen > neg_seen else -1

        if label == 1:
            pos_seen += 1
            pos_weights += w
        else:
            neg_seen += 1
            neg_weights += w
        current_idx += 1
        pos_seen_list.append(pos_seen)

    # clf = WeakClassifier(best_feature[0], best_feature[1], best_threshold, best_polarity)
    clf = WeakClassifier(best_feature[0], best_feature[1], best_threshold, best_polarity, min_error)
    classifiers.append(clf)

print("Time taken: %f seconds" % (time.time() - s_t))

Trained 1000 classifiers out of 16000
Trained 2000 classifiers out of 16000
Trained 3000 classifiers out of 16000
Trained 4000 classifiers out of 16000
Trained 5000 classifiers out of 16000
Trained 6000 classifiers out of 16000
Trained 7000 classifiers out of 16000
Trained 8000 classifiers out of 16000
Trained 9000 classifiers out of 16000
Trained 10000 classifiers out of 16000
Trained 11000 classifiers out of 16000
Trained 12000 classifiers out of 16000
Trained 13000 classifiers out of 16000
Trained 14000 classifiers out of 16000
Trained 15000 classifiers out of 16000
Time taken: 345.721772 seconds


# Mine

<h3>
<div style='color: red;'>
Question: for the polarity:<br>
 Direction that gives minimum weights <span style='color:pink'> -> To be consistent with finding minimum error using weights</span><br> 
    or<br>
 Direction that gives less misclassified samples???? <span style='color:pink'> -> To find threshold giving better accuracy???</span></div>
</h3>

In [13]:
s_t = time.time()
classifiers2 = []
total_features2 = X.shape[0]
# TODO parallize this
# TODO can we get rid of this loop and make them matrices?
for index, feature in enumerate(X):
    if len(classifiers2) % 1000 == 0 and len(classifiers2) != 0:
        print("Trained %d classifiers out of %d" % (len(classifiers2), total_features2))
        
    min_error, best_feature, best_threshold, best_polarity = float('inf'), None, None, None

    # sort by feature value first, then by weight, then by label
    # TODO No need for lexsort, and argsort is ok?
    sorting_indecies = np.lexsort((y, weights, feature))
    
    # s_w: srorted weights, s_f: sorted features, s_y: sorted labels
    s_w, s_f, s_y = weights[sorting_indecies], feature[sorting_indecies], y[sorting_indecies]
    
    for left, right in [(-1, 1), (1, -1)]: #@ y: -1 or 1
        left_weights = np.concatenate(([0], np.cumsum(s_w * (s_y == left))))
        right_weights = np.flip(np.cumsum(np.flip(s_w * (s_y == right))))
        right_weights = np.concatenate((right_weights, [0]))
        idx = np.argmin(left_weights + right_weights)
        if idx >= len(s_f):
            idx = len(s_f) - 1
            
        cur_min_error = left_weights[idx] + right_weights[idx]
        if cur_min_error < min_error:
            min_error = cur_min_error
            best_feature, best_threshold = (idx, s_f[idx]), s_f[idx]
            best_polarity = left

    clf2 = WeakClassifier(best_feature[0], best_feature[1], best_threshold, best_polarity, min_error)
    classifiers2.append(clf2)

    

print("Time taken: %f seconds" % (time.time() - s_t))

Trained 1000 classifiers out of 16000
Trained 2000 classifiers out of 16000
Trained 3000 classifiers out of 16000
Trained 4000 classifiers out of 16000
Trained 5000 classifiers out of 16000
Trained 6000 classifiers out of 16000
Trained 7000 classifiers out of 16000
Trained 8000 classifiers out of 16000
Trained 9000 classifiers out of 16000
Trained 10000 classifiers out of 16000
Trained 11000 classifiers out of 16000
Trained 12000 classifiers out of 16000
Trained 13000 classifiers out of 16000
Trained 14000 classifiers out of 16000
Trained 15000 classifiers out of 16000
Time taken: 37.049964 seconds


# Even more optimized?
<h1>
    <div style='color:red'>
        Unfortunately, didn't work. The computer freezes when I try to run it.
    </div>
    <div style='color:red'>
        Maybe will need to run it on a GPU??
    </div>
</h1>


In [94]:
# s_t = time.time()
# classifiers3 = []
# total_features3 = X.shape[0]

# # TODO parallize this
# # TODO can we get rid of this loop and make them matrices?
# # for index, feature in enumerate(X):
# #     if len(classifiers3) % 1000 == 0 and len(classifiers3) != 0:
# #         print("Trained %d classifiers out of %d" % (len(classifiers3), total_features3))
        

# min_error, best_feature, best_threshold, best_polarity = np.array([float('inf')]*X.shape[0])\
#     , np.zeros((X.shape[0], 2)), np.zeros(X.shape[0]), np.zeros(X.shape[0])

# # sort by feature value first, then by weight, then by label
# # TODO No need for lexsort, and argsort is ok? looks like tile is expensive
# weights2d = np.tile(weights, (X.shape[0], 1))
# y2d = np.tile(y, (X.shape[0], 1))
# sorting_indecies = np.lexsort((y2d
#                                ,weights2d
#                                ,X))
# idx0 = np.arange(X.shape[0]).reshape(-1, 1)
# # s_w: srorted weights, s_f: sorted features, s_y: sorted labels
# s_w, s_f, s_y = weights2d[idx0, sorting_indecies], X[idx0, sorting_indecies], y2d[idx0, sorting_indecies]

# for left, right in [(-1, 1), (1, -1)]: #@ y: -1 or 1
#     # left_weights = np.concatenate(([0], np.cumsum(s_w * (s_y == left), axis=1)))
#     left_weights = np.c_[np.zeros((s_w.shape[0], 1))
#                          , np.cumsum(s_w * (s_y == left), axis=1)]
#     right_weights = np.flip(np.cumsum(np.flip(s_w * (s_y == right), axis = 1), axis=1), axis=1)
#     # right_weights = np.concatenate((right_weights, [0]))
#     right_weights = np.c_[right_weights, np.zeros((right_weights.shape[0], 1))]
#     idx = np.argmin(left_weights + right_weights, axis=1)
#     # should it be zero???
#     idx[idx >= s_f.shape[1]] = s_f.shape[1] - 1
#     # if idx >= len(s_f):
#     #     idx = len(s_f) - 1
#     ii1 = np.arange(idx.shape[0])
#     cur_min_error = left_weights[ii1, idx] + right_weights[ii1, idx]
#     temp_bool = cur_min_error < min_error
#     min_error[temp_bool] = cur_min_error[temp_bool]
#     selected_idx = idx[temp_bool]
#     selected_features = s_f[ii1[temp_bool], selected_idx]
#     best_feature[temp_bool] = np.array(list(zip(selected_idx, selected_features)))
#     best_threshold[temp_bool] = s_f[ii1[temp_bool], idx[temp_bool]]
#     best_polarity[temp_bool] = left
#     # if cur_min_error < min_error:
#     #     min_error = cur_min_error
#     #     best_feature, best_threshold = (idx, s_f[idx]), s_f[idx]
#     #     best_polarity = left
# classifiers3 = [WeakClassifier(*clf3) for clf3 in zip(best_feature[:,0], best_feature[:,1], best_threshold, best_polarity, min_error)]
# # clf3 = WeakClassifier(best_feature[0], best_feature[1], best_threshold, best_polarity, min_error)
# # classifiers3.append(clf2)

    

# print("Time taken: %f seconds" % (time.time() - s_t))

Time taken: 0.004261 seconds


# Pytorch

In [14]:

print(torch.cuda.is_available())

device = torch.device('cuda:0')
print(device)


True
cuda:0


In [15]:

# n_features = 16000
# n_samples = 15000
# n_features = 3
# n_samples = 10
# n_classes = 2

# X = torch.rand(n_features, n_samples, device=device)
# y = torch.randint(0, n_classes, (n_samples, ))
# y = torch.tensor([1 if i == 1 else -1 for i in y], device=device)
# weights = torch.rand(n_samples, device=device)
y = torch.tensor(y, device=device, dtype=torch.float32)
weights = torch.tensor(weights, device=device, dtype=torch.float32)

In [16]:
def mem(idx=None):
    if idx:print('At index: ', idx, ': ')
    print(torch.cuda.memory_allocated()/(1024**2), 'Mb')
    print(torch.cuda.memory_reserved()/(1024**2), 'Mb')
mem()

172.5654296875 Mb
1158.0 Mb


In [17]:
class FeaturesDataset(Dataset):
    def __init__(self, n_features=3, n_samples=10):
        self.X = torch.rand(n_features, n_samples)

    def __getitem__(self, index):
        return self.X[index]

    def __len__(self):
        return len(self.X)
dataset = FeaturesDataset(n_features=n_features, n_samples=n_samples)
dataset.X = torch.tensor(X, dtype=torch.float32)
dataloader = DataLoader(dataset=dataset, batch_size=1000, num_workers=2)

In [18]:

s_t = time.time()

classifiers4 = []
total_features4 = n_features

# TODO parallize this
# TODO can we get rid of this loop and make them matrices?
# for index, feature in enumerate(X):
#     if len(classifiers4) % 1000 == 0 and len(classifiers4) != 0:
#         print("Trained %d classifiers out of %d" % (len(classifiers4), total_features4))
        
for index, X in enumerate(dataloader):

    print('At index: ', index, ':', ' Start time: ', time.time() - s_t)
    X = X.to(device)
    
    mem(index)
    min_error, best_feature, best_threshold, best_polarity = torch.tensor([float('inf')]*X.shape[0], device=device)\
        , torch.zeros((X.shape[0], 2), device=device), torch.zeros(X.shape[0], device=device), torch.zeros(X.shape[0], device=device)

    # sort by feature value first, then by weight, then by label
    # TODO No need for lexsort, and argsort is ok? looks like tile is expensive
    weights2d = torch.tile(weights, (X.shape[0], 1))
    # del weights
    y2d = torch.tile(y, (X.shape[0], 1))
    # del y
    # sorting_indecies = torch.lexsort((y2d
    #                             ,weights2d
    #                             ,X)).to(device)
    sorting_indecies = torch.argsort(X,stable=True) 
    idx0 = torch.arange(X.shape[0]).reshape(-1, 1).to(device)
    # s_w: srorted weights, s_f: sorted features, s_y: sorted labels
    s_w = weights2d[idx0, sorting_indecies]
    # del weights2d
    s_f = X[idx0, sorting_indecies]
    # del X
    s_y = y2d[idx0, sorting_indecies]
    # del y2d
    # del idx0
    
    mem()


    for left, right in [(-1, 1), (1, -1)]: #@ y: -1 or 1
        print(s_w.shape)
        # left_weights = np.concatenate(([0], np.cumsum(s_w * (s_y == left), axis=1)))
        left_weights = torch.cat((torch.zeros((s_w.shape[0], 1), device=device)
                            , torch.cumsum(s_w * (s_y == left), axis=1)), axis=1)
        right_weights = torch.flip(torch.cumsum(torch.flip(s_w * (s_y == right), dims = [1]), axis=1), dims=[1])
        # right_weights = np.concatenate((right_weights, [0]))
        right_weights = torch.cat((right_weights, torch.zeros((right_weights.shape[0], 1), device=device)), axis=1)
        idx = torch.argmin(left_weights + right_weights, axis=1)
        # should it be zero???
        idx[idx >= s_f.shape[1]] = s_f.shape[1] - 1
        # if idx >= len(s_f):
        #     idx = len(s_f) - 1
        ii1 = torch.arange(idx.shape[0], device=device)
        cur_min_error = left_weights[ii1, idx] + right_weights[ii1, idx]
        temp_bool = cur_min_error < min_error
        min_error[temp_bool] = cur_min_error[temp_bool]
        selected_idx = idx[temp_bool]
        selected_features = s_f[ii1[temp_bool], selected_idx]
        best_feature[temp_bool] = torch.tensor(list(zip(selected_idx, selected_features)), device=device)
        best_threshold[temp_bool] = s_f[ii1[temp_bool], idx[temp_bool]]
        best_polarity[temp_bool] = left
        # if cur_min_error < min_error:
        #     min_error = cur_min_error
        #     best_feature, best_threshold = (idx, s_f[idx]), s_f[idx]
        #     best_polarity = left
    # add to classifiers4, converted to numpy
    classifiers4.extend([WeakClassifier(*clf4) for clf4 in zip(best_feature[:,0], best_feature[:,1], best_threshold, best_polarity, min_error)])
    # classifiers4 = [WeakClassifier(*clf4) for clf4 in zip(best_feature[:,0], best_feature[:,1], best_threshold, best_polarity, min_error)]
    # clf3 = WeakClassifier(best_feature[0], best_feature[1], best_threshold, best_polarity, min_error)
    # classifiers3.append(clf2)

        

print("Time taken: %f seconds" % (time.time() - s_t))

At index:  0 :  Start time:  1.6864206790924072
230.2529296875 Mb
1158.0 Mb
518.71337890625 Mb
1158.0 Mb
torch.Size([1000, 15000])
torch.Size([1000, 15000])
At index:  1 :  Start time:  2.5111536979675293
At index:  1 : 
633.17431640625 Mb
1158.0 Mb
633.19384765625 Mb
1158.0 Mb
torch.Size([1000, 15000])
torch.Size([1000, 15000])
At index:  2 :  Start time:  2.7511305809020996
At index:  2 : 
633.19384765625 Mb
1158.0 Mb
633.21337890625 Mb
1158.0 Mb
torch.Size([1000, 15000])
torch.Size([1000, 15000])
At index:  3 :  Start time:  2.9252548217773438
At index:  3 : 
633.21337890625 Mb
1158.0 Mb
633.67431640625 Mb
1158.0 Mb
torch.Size([1000, 15000])
torch.Size([1000, 15000])
At index:  4 :  Start time:  3.098696708679199
At index:  4 : 
633.23291015625 Mb
1158.0 Mb
634.03173828125 Mb
1158.0 Mb
torch.Size([1000, 15000])
torch.Size([1000, 15000])
At index:  5 :  Start time:  3.2732608318328857
At index:  5 : 
634.47412109375 Mb
1158.0 Mb
633.27294921875 Mb
1158.0 Mb
torch.Size([1000, 15000])


In [19]:
for el in classifiers:
    print(el)
print()
for el in classifiers2:
    print(el)
print()
# for el in classifiers3:
#     print(el)
# print()

for el in classifiers4:
    print(el)
print()

# classifiers==classifiers2
# print(classifiers[0])
# print(classifiers2[0])

[ 1.18940000e+04  8.26430887e-01  8.26430887e-01 -1.00000000e+00
 -1.09595765e+02]
[ 1.32820000e+04  1.22832837e+00  1.22832837e+00 -1.00000000e+00
 -1.80398181e+02]
[ 9.62400000e+03  3.70040043e-01  3.70040043e-01  1.00000000e+00
 -1.43018376e+02]
[ 4.36400000e+03 -5.45239215e-01 -5.45239215e-01 -1.00000000e+00
 -1.44850560e+02]
[ 5.98000000e+03 -2.64500508e-01 -2.64500508e-01 -1.00000000e+00
 -1.51520314e+02]
[ 5.26900000e+03 -3.74154498e-01 -3.74154498e-01  1.00000000e+00
 -1.87398995e+02]
[ 2.41800000e+03 -9.95523232e-01 -9.95523232e-01 -1.00000000e+00
 -1.47336623e+02]
[ 1.27440000e+04  1.04548455e+00  1.04548455e+00 -1.00000000e+00
 -1.03797434e+02]
[ 4.21800000e+03 -5.66942905e-01 -5.66942905e-01  1.00000000e+00
 -1.85498733e+02]
[ 6.55000000e+03 -1.60927120e-01 -1.60927120e-01  1.00000000e+00
 -1.66214742e+02]
[ 3.62800000e+03 -6.86730349e-01 -6.86730349e-01  1.00000000e+00
 -1.38232655e+02]
[ 6.31600000e+03 -2.06264746e-01 -2.06264746e-01 -1.00000000e+00
 -1.67230747e+02]
[ 7.

In [21]:
for i in range(4):
    print(classifiers[i])
    print(classifiers2[i])
    print(classifiers4[i])
    print()

[ 1.18940000e+04  8.26430887e-01  8.26430887e-01 -1.00000000e+00
 -1.09595765e+02]
[ 1.18940000e+04  8.26430887e-01  8.26430887e-01  1.00000000e+00
 -1.09595765e+02]
[ 1.189400e+04  8.264309e-01  8.264309e-01  1.000000e+00 -1.095957e+02]

[ 1.32820000e+04  1.22832837e+00  1.22832837e+00 -1.00000000e+00
 -1.80398181e+02]
[ 1.32820000e+04  1.22832837e+00  1.22832837e+00 -1.00000000e+00
 -1.80398181e+02]
[ 1.3282000e+04  1.2283283e+00  1.2283283e+00 -1.0000000e+00
 -1.8039819e+02]

[ 9.62400000e+03  3.70040043e-01  3.70040043e-01  1.00000000e+00
 -1.43018376e+02]
[ 9.62400000e+03  3.70040043e-01  3.70040043e-01 -1.00000000e+00
 -1.43018376e+02]
[ 9.6240000e+03  3.7004003e-01  3.7004003e-01 -1.0000000e+00
 -1.4301843e+02]

[ 4.36400000e+03 -5.45239215e-01 -5.45239215e-01 -1.00000000e+00
 -1.44850560e+02]
[ 4.36400000e+03 -5.45239215e-01 -5.45239215e-01  1.00000000e+00
 -1.44850560e+02]
[ 4.364000e+03 -5.452392e-01 -5.452392e-01  1.000000e+00 -1.448506e+02]



In [101]:
a1 = np.zeros((len(classifiers), 5))
for i in range(len(classifiers)):
    a1[i] = np.array(classifiers[i])

a2 = np.zeros((len(classifiers2), 5))
for i in range(len(classifiers2)):
    a2[i] = np.array(classifiers2[i])
a3 = np.zeros((len(classifiers3), 5))
for i in range(len(classifiers3)):
    a3[i] = np.array(classifiers3[i])
    
print(a1[a1!=a2])
print(a1)
print()
print(a2)
print()
print(a3)
print(np.argwhere(a2!=a3))
print(np.argwhere(a2!=a1))
print(a2[a1!=a2])

[-1.         -2.07043852]
[[ 7.          0.80027287  0.80027287 -1.         -0.97541437]
 [ 2.          0.15299952  0.15299952 -1.         -3.08719862]
 [ 9.          1.17385292  1.17385292 -1.         -2.07043852]]

[[ 7.          0.80027287  0.80027287 -1.         -0.97541437]
 [ 2.          0.15299952  0.15299952 -1.         -3.08719862]
 [ 9.          1.17385292  1.17385292  1.         -2.07043852]]

[[ 7.          0.80027287  0.80027287 -1.         -0.97541437]
 [ 2.          0.15299952  0.15299952 -1.         -3.08719862]
 [ 9.          1.17385292  1.17385292  1.         -2.07043852]]
[]
[[2 3]
 [2 4]]
[ 1.         -2.07043852]
