In [1]:
import numpy as np
import time
from sklearn.datasets import load_svmlight_file
from sklearn.datasets import fetch_mldata
from sklearn.linear_model import SGDClassifier

In [2]:
class DataLoader(object):
    """Load a named dataset into dense arrays and serve it one sample at a time.

    Supported names are "mnist", "isolet", "sector" and "aloi"; the files are
    read from paths relative to the working directory (``./datasets/...``).

    Attributes set by the constructor:
        train_data, train_labels: training samples (one row each) and labels.
        test_data, test_labels: held-out samples and labels.
        train_data_size, test_data_size: number of rows in each split.
        num_feature: number of columns in ``train_data``.
        num_class: class count hard-coded per dataset name.
    """

    def __init__(self, dataset_name):
        """Load ``dataset_name`` and shuffle both splits.

        Raises:
            ValueError: if ``dataset_name`` is not one of the supported names.
        """
        if dataset_name == "mnist":
            self.loadMnistData()
            self.num_class = 10
        elif dataset_name == "isolet":
            self.loadIsoletData()
            self.num_class = 26
        elif dataset_name == "sector":
            self.loadSectorData()
            self.num_class = 105
        elif dataset_name == "aloi":
            self.loadAloiData()
            self.num_class = 1000
        else:
            raise ValueError('No such dataset name exists.')

        self.train_data_size = self.train_data.shape[0]
        self.test_data_size = self.test_data.shape[0]
        self.num_feature = self.train_data.shape[1]

        self.shuffle()

    def loadMnistData(self):
        """Read the pre-split MNIST svmlight files and densify them."""
        train_data, train_labels = load_svmlight_file('./datasets/mnist/mnist.scale')
        self.train_data = train_data.toarray()
        self.train_labels = train_labels
        test_data, test_labels = load_svmlight_file('./datasets/mnist/mnist.scale.t')
        self.test_data = test_data.toarray()
        self.test_labels = test_labels

    def loadIsoletData(self):
        """Read the pre-split ISOLET CSV files; the last column is the label."""
        train_set = np.genfromtxt('./datasets/isolet/isolet_data.data',
                                  dtype=None,
                                  delimiter=',')
        test_set = np.genfromtxt('./datasets/isolet/isolet5.data',
                                 dtype=None,
                                 delimiter=',')

        self.train_data = train_set[:, :-1]
        self.train_labels = train_set[:, -1].astype(int)
        self.test_data = test_set[:, :-1]
        self.test_labels = test_set[:, -1].astype(int)

    def loadSectorData(self):
        """Read the single sector svmlight file and split it 90/10."""
        self._loadAndSplit('./datasets/sector/sector.scale')

    def loadAloiData(self):
        """Read the single aloi svmlight file and split it 90/10."""
        self._loadAndSplit('./datasets/aloi/aloi.scale')

    def _loadAndSplit(self, path, train_fraction=0.9):
        """Load one svmlight file, densify it and split it into train/test."""
        data, labels = load_svmlight_file(path)
        self._splitTrainTest(data.toarray(), labels, train_fraction)

    def _splitTrainTest(self, data, labels, train_fraction=0.9):
        """Randomly assign rows of ``data``/``labels`` to the train and test splits.

        Rows are permuted *before* the cut so the held-out rows are a random
        sample rather than simply the tail of the file.  The row order of the
        source files is not visible from here; if they are grouped by label,
        an in-order cut would leave the test split with classes the training
        split never saw.

        Args:
            data: 2-D array, one sample per row.
            labels: 1-D array aligned with the rows of ``data``.
            train_fraction: share of rows assigned to the training split.
        """
        n_rows = data.shape[0]
        order = np.random.permutation(n_rows)
        n_tr = int(n_rows * train_fraction)
        train_rows, test_rows = order[:n_tr], order[n_tr:]

        self.train_data = data[train_rows]
        self.train_labels = labels[train_rows]

        self.test_data = data[test_rows]
        self.test_labels = labels[test_rows]

    def shuffle(self):
        """Independently permute the training and the test split in place."""
        order = np.random.permutation(self.train_data_size)
        self.train_data = self.train_data[order]
        self.train_labels = self.train_labels[order]

        order = np.random.permutation(self.test_data_size)
        self.test_data = self.test_data[order]
        self.test_labels = self.test_labels[order]

    def generator(self, train=True):
        """Yield ``(x, y)`` pairs, with ``x`` reshaped to a ``(1, num_feature)`` row.

        Args:
            train: iterate the training split when true, else the test split.
        """
        if train:
            for i in range(self.train_data_size):
                yield (self.train_data[i].reshape(1, -1), self.train_labels[i])
        else:
            for i in range(self.test_data_size):
                yield (self.test_data[i].reshape(1, -1), self.test_labels[i])

In [3]:
class OnlineLogistRegression(object):
    """Binary logistic regression updated one sample at a time.

    ``w`` holds ``num_feature + 1`` weights; position 0 multiplies the constant
    1 that is prepended to every input, i.e. it acts as the bias.

    Inputs may be 1-D feature vectors or ``(1, num_feature)`` row vectors such
    as the ones ``DataLoader.generator`` yields; both are flattened before use.
    (Previously a row vector made ``np.concatenate`` raise because the 1-D bias
    and the 2-D sample had different ranks.)
    """

    def __init__(self, num_feature, eta0=0.01):
        """Create the model with small random weights.

        Args:
            num_feature: number of input features (bias excluded).
            eta0: constant learning rate used by ``train``.
        """
        self.w = self._initialWeights(num_feature + 1)
        self.lr = eta0

    @staticmethod
    def _initialWeights(size):
        """Return ``size`` weights drawn uniformly from [-0.005, 0.005)."""
        return .01 * (np.random.rand(size) - 0.5)

    def reset(self):
        """Re-draw the weights, keeping their count and the learning rate."""
        self.w = self._initialWeights(len(self.w))

    @staticmethod
    def _augment(x):
        """Flatten ``x`` and prepend the constant 1 used for the bias weight."""
        return np.concatenate((np.array([1]), np.ravel(x)), axis=0)

    def train(self, x, c):
        """Take one gradient step on the sample ``x`` with target ``c``.

        The step is ``w -= lr * A * (sigmoid(A . w) - c)`` where ``A`` is the
        bias-augmented sample.
        """
        A = self._augment(x)
        c_hat = 1 / (1 + np.exp(-A.dot(self.w)))

        fgrad = A * (c_hat - c)
        self.w = self.w - self.lr * fgrad

    def test(self, x):
        """Return the sigmoid of the linear score for ``x`` (a value in (0, 1))."""
        y_hat = self._augment(x).dot(self.w)

        return 1 / (1 + np.exp(-y_hat))

In [4]:
class OnlineClassification(object):
    """Binary classifier trained one sample at a time through SGDClassifier.

    Targets are expected to be -1 or 1: every ``partial_fit`` call declares
    ``[-1, 1]`` as the full set of classes.
    """

    def __init__(self, num_feature, learning_rate='constant', eta0=0.01):
        """Build the wrapped SGDClassifier.

        Args:
            num_feature: accepted but not used by this class.
            learning_rate: forwarded to SGDClassifier.
            eta0: forwarded to SGDClassifier.
        """
        sgd_options = dict(learning_rate=learning_rate, eta0=eta0, warm_start=True)
        self.classifier = SGDClassifier(**sgd_options)

    def train(self, x, c):
        """Run one incremental fit on the sample ``x`` with the single target ``c``."""
        target = np.array([c])
        self.classifier.partial_fit(x, target, classes=[-1, 1])

    def test(self, x):
        """Return the wrapped classifier's prediction for ``x``."""
        prediction = self.classifier.predict(x)
        return prediction

In [5]:
class Node(object):
    """One tree node: child/parent links, per-label statistics and a binary model.

    Attributes:
        left, right, parent: links to neighbouring nodes (None when absent).
        max_label, max_label_count: a label and the count recorded with it.
        n_all, m_all: number of recorded model outputs and their running sum.
        C: counter maintained by the owning tree.
        l, n, m: dicts keyed by label; ``n[y]``/``m[y]`` are the per-label
            counterparts of ``n_all``/``m_all``, ``l[y]`` is a per-label count.
        model: the binary classifier attached to this node.
    """

    def __init__(self, num_feature, learning_rate='constant', eta0=0.01):
        """Create an unlinked node with empty statistics and a fresh model."""
        self.l = {}
        self.n = {}
        self.m = {}
        self._clearLinksAndCounters()
        # Alternative model kept from the original notebook:
        #   OnlineLogistRegression(num_feature, eta0)
        self.model = OnlineClassification(num_feature, learning_rate=learning_rate, eta0=eta0)

    def _clearLinksAndCounters(self):
        """Set links to None and every scalar counter to its initial value."""
        self.left = None
        self.right = None
        self.parent = None
        self.max_label = 1
        self.max_label_count = 0
        self.n_all = 0
        self.m_all = 0
        self.C = 0

    def reset(self):
        """Detach the node and wipe its statistics; ``model`` is left untouched."""
        self._clearLinksAndCounters()
        for table in (self.l, self.n, self.m):
            table.clear()

    def testModel(self, x):
        """Return the node model's output for ``x``."""
        return self.model.test(x)

    def trainModel(self, x, c):
        """Train the node model on ``x`` with target ``c``."""
        self.model.train(x, c)

    def addClass(self, class_name):
        """Start tracking ``class_name`` with all of its counters at zero."""
        for table in (self.n, self.m, self.l):
            table[class_name] = 0

    def findExpectationAll(self):
        """Return ``m_all / n_all``, or 0 while nothing has been recorded."""
        return 0 if self.n_all == 0 else self.m_all / self.n_all

    def findExpectationOneClass(self, y):
        """Return ``m[y] / n[y]``, or 0 while nothing has been recorded for ``y``."""
        seen = self.n[y]
        return 0 if seen == 0 else self.m[y] / seen

    def judgeInTrain(self, y):
        """Pick the training target for label ``y``: -1 means left, 1 means right.

        Returns -1 when the overall mean output exceeds the mean output for
        ``y``; otherwise (including a tie) returns 1.
        """
        if self.findExpectationAll() > self.findExpectationOneClass(y):
            return -1
        return 1

In [6]:
class Tree(object):
    """Online multiclass tree whose internal nodes route samples with binary models.

    The tree starts as a single leaf.  Leaves that have seen more than one
    label are split while fewer than ``T`` splits have happened; afterwards a
    leaf can only gain children by recycling the leaf with the smallest ``C``
    together with that leaf's parent (see ``swap``).  Presumably this follows
    the LOMtree algorithm (the notebook names its instance ``LOM_tree``) —
    confirm against the paper before relying on that.

    Attributes:
        data_loader: source of ``(x, y)`` samples and of ``num_feature``.
        T: bound compared against ``t`` to decide between split and swap.
        t: starts at 1 and grows by one per ``split``.
        Rs: factor in the swap condition of ``onlineTrain``.
        epoch: number of passes ``train`` makes over the training data.
        size: number of nodes created so far.
        root: current root node.
    """

    def __init__(self, T, data_loader, Rs=16, epoch=1, learning_rate='constant', eta0=0.01):
        """Store the hyper-parameters and create the root leaf."""
        self.data_loader = data_loader
        self.num_feature = data_loader.num_feature
        self.eta0 = eta0
        self.learning_rate = learning_rate
        self.epoch = epoch
        self.Rs = Rs
        self.T = T
        self.t = 1
        self.size = 0
        self.root = self.generateNode()

    def generateNode(self):
        """Create a new node with this tree's model settings and count it."""
        node = Node(self.num_feature, self.learning_rate, self.eta0)
        self.size = self.size + 1
        return node

    def split(self, node):
        """Give ``node`` two freshly created children and bump ``t``."""
        self.t = self.t + 1
        left = self.generateNode()
        right = self.generateNode()

        node.left = left
        left.parent = node
        node.right = right
        right.parent = node

    def swap(self, node):
        """Recycle the smallest-``C`` leaf and its parent as the children of ``node``.

        The leaf reached by always descending into the child with the smaller
        ``C`` is detached together with its parent; that leaf's sibling takes
        the parent's place in the tree.  Both detached nodes are reset and
        attached below ``node``.
        """
        cur = self.root
        while cur.left is not None:
            cur = cur.left if cur.left.C < cur.right.C else cur.right

        parent = cur.parent
        grandpa = parent.parent
        # The sibling is the *other* child of ``parent``.  The previous
        # expression always selected ``cur`` itself, which orphaned the real
        # sibling and left the grandparent pointing at the recycled leaf.
        sib = parent.right if parent.left is cur else parent.left
        if grandpa is None:
            # ``parent`` was the root, so the sibling becomes the new root.
            self.root = sib
        elif parent is grandpa.left:
            grandpa.left = sib
        else:
            grandpa.right = sib
        sib.parent = grandpa

        self.updateC(sib)
        cur.reset()
        parent.reset()
        node.left = cur
        cur.parent = node
        node.right = parent
        parent.parent = node

    def updateC(self, node):
        """Propagate ``node.C`` upwards.

        Each visited ancestor's ``C`` is recomputed as the minimum of its two
        children's ``C``.  The walk ends at the root, or as soon as a parent's
        ``C`` already equals the ``C`` of the node below it.
        """
        while node is not self.root and node.parent.C != node.C:
            node = node.parent
            node.C = min(node.left.C, node.right.C)

    def train(self):
        """Run ``epoch`` passes over the training data, printing test accuracy after each."""
        start = time.time()
        print('Start training.......')
        for i in range(self.epoch):
            train_generator = self.data_loader.generator(train=True)
            for sample in train_generator:
                self.onlineTrain(sample)
            acc = self.test()
            print('epoch %d: >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> accuracy=%f' % (i, acc))
        end = time.time()
        print('time used: %d s' % (end - start))

    def test(self):
        """Return the fraction of test samples whose predicted label equals the true one."""
        test_generator = self.data_loader.generator(train=False)
        test_result = []
        for sample in test_generator:
            x, y = sample
            test_result.append(int(y == self.predict(x)))
        acc = np.mean(test_result)
        return acc

    def onlineTrain(self, xy):
        """Route one ``(x, y)`` sample from the root to a leaf, updating every node on the way.

        At each visited node the label count is updated; a leaf that has seen
        more than one label may be split or given recycled children; internal
        nodes train their model and forward the sample according to the
        model's own prediction; the leaf finally reached increments its ``C``.
        """
        x, y = xy
        node = self.root
        while node is not None:
            # register y if it is new in this node
            if node.l.get(y) is None:
                node.addClass(y)

            node.l[y] += 1

            if node.l[y] > node.max_label_count:
                node.max_label = y
                node.max_label_count = node.l[y]

            # a leaf holding at least two labels may get children
            if node.left is None and len(node.n) > 1:
                if self.t < self.T or node.C - node.l[node.max_label] > self.Rs * (self.root.C + 1):
                    if self.t < self.T:
                        # give birth
                        self.split(node)
                    else:
                        # swap
                        self.swap(node)
                    # the new children share the parent's count and inherit its label
                    node.left.C = node.C // 2
                    node.right.C = node.C - node.left.C
                    node.left.max_label = node.max_label
                    node.right.max_label = node.max_label
                    self.updateC(node.left)

            if node.left is not None:
                # internal node: train, then follow the model's own prediction
                c = node.judgeInTrain(y)
                node.trainModel(x, c)
                c_hat = node.testModel(x)
                node.n_all += 1
                node.m_all += c_hat
                node.n[y] += 1
                node.m[y] += c_hat

                node = node.left if c_hat == -1 else node.right
            else:
                # leaf: count the sample and stop
                node.C += 1
                self.updateC(node)
                break

    def predict(self, x):
        """Follow the node models from the root to a leaf and return that leaf's ``max_label``."""
        node = self.root
        while node.left is not None:
            node = node.left if node.testModel(x) == -1 else node.right
        return node.max_label

In [11]:
# Build: pick the dataset and load it; later cells read `data_loader` and `K`.
dataset_name = "isolet"
data_loader = DataLoader(dataset_name)

# Number of classes; the training cell derives the tree bound T from it.
K = data_loader.num_class

split_sizes = (data_loader.train_data_size, data_loader.test_data_size)
print('training data size: %d, test data size: %d' % split_sizes)

training data size: 6238, test data size: 1559


In [10]:
# Hyper-parameters for the tree.
T = 4 * K - 1              # bound on the tree's split counter, derived from the class count
Rs = 64                    # factor in the swap condition of Tree.onlineTrain
learning_rate = 'optimal'  # SGDClassifier schedule used by every node model
eta0 = 0.25                # NOTE(review): scikit-learn documents eta0 as unused by the 'optimal' schedule
epoch = 2                  # passes over the training data

LOM_tree = Tree(T, data_loader, epoch=epoch, Rs=Rs, learning_rate=learning_rate, eta0=eta0)

# eta0 is printed with %g: the previous %.1f rendered 0.25 as "0.2" and so
# misreported the setting in the log line.
print('dataset_name=\'%s\', T=%d, Rs=%d, learning_rate=\'%s\', eta0=%g, epoch=%d' % (dataset_name, T, Rs, learning_rate,  eta0, epoch))
#train
LOM_tree.train()

dataset_name='mnist', T=39, Rs=64, learning_rate='optimal', eta0=0.2, epoch=2
Start training.......




KeyboardInterrupt: 

In [9]:
# check balance: compare how many samples the root routed to each side, then
# count the nodes along the left-most and the right-most path from the root.
root_node = LOM_tree.root
print(root_node.left.n_all, root_node.right.n_all)

for side in ('left', 'right'):
    count = 0
    node = root_node
    # follow the same child link until it runs out; the root itself is counted
    while node is not None:
        node = getattr(node, side)
        count += 1
    print(count)

3140 3094
6
7


In [10]:
# compare with traditional O(k) algorithm: one multiclass SGDClassifier fed the
# same training stream one sample at a time.
start = time.time()
# NOTE(review): max_iter is documented as affecting fit() only; each
# partial_fit() call below performs a single update pass regardless.
model = SGDClassifier(max_iter=2, learning_rate=learning_rate, eta0=eta0, warm_start=True)
gen = data_loader.generator()
# partial_fit needs the full label set up front.  It is derived from the loaded
# training labels instead of the previous hard-coded range(1, 27), which only
# matched the 26 isolet classes and broke for every other dataset.
c = np.unique(data_loader.train_labels)
for sample in gen:
    x, y = sample
    model.partial_fit(x, np.array([y]), c)
acc = model.score(data_loader.test_data, data_loader.test_labels)
end = time.time()
print('acc=%f' % acc)
print('time=%f' % (end - start))

acc=0.853111
time=21.565372
