In [1]:
import numpy as np
from IPython import get_ipython
from sklearn.datasets import load_svmlight_file
from sklearn.datasets import fetch_mldata
get_ipython().run_line_magic('matplotlib', 'inline')

In [2]:
class DataLoader(object):
    
    def __init__(self, dataset_name):
        if dataset_name == "mnist":
            self.loadMnistData()
        elif dataset_name == "isolet":
            ## lr = 0.05
            self.loadIsoletData()
        elif dataset_name == "sector":
            self.loadSectorData()
        elif dataset_name == "aloi":
            self.loadAloiData()
        else:
            raise ValueError('No such dataset name exists.')
        
        self.data_size = self.data.shape[0]
        self.num_feature = self.data.shape[1]
        
        self.shuffle()
    
    def loadMnistData(self):
        dataset = fetch_mldata("MNIST original",data_home = './datasets/mnist-original.mat')
        data = dataset.data
        data_size = data.shape[0]
        #normalize
        data = data / np.reshape(np.std(data, axis=1), (data_size, 1))
        data = data - np.reshape(np.mean(data, axis=1), (data_size, 1))
        
        self.data, self.labels = data, dataset.target
            
    def loadIsoletData(self):
        dataset = np.genfromtxt('./datasets/isolet/isolet_data.data',
                         dtype=None,
                         delimiter=',')
        dataset_2 = np.genfromtxt('./datasets/isolet/isolet5.data',
                         dtype=None,
                         delimiter=',')
        data = np.vstack((dataset, dataset_2))
        
        Y = data[:,-1].astype(int) 
        X = np.delete(data, -1, axis =1)
        
        self.data, self.labels = X, Y
    
    def loadSectorData(self):
        data, labels = load_svmlight_file('./datasets/sector/sector.scale')
        self.data = data.toarray()
        self.labels = labels
        
    def loadAloiData(self):
        data, labels = load_svmlight_file('./datasets/aloi/aloi.scale')
        self.data = data.toarray()
        self.labels = labels    
    
    def shuffle(self):
        shuffle = np.random.permutation(self.data_size)
        self.data = self.data[shuffle]
        self.labels = self.labels[shuffle]
        
    def generator(self):
        for i in range(self.data_size):
            yield (self.data[i], self.labels[i])
    
data_loader = DataLoader("sector")

In [3]:
class OnlineLogistRegression(object):
    
    def __init__(self, num_feature, lr=0.001):
        self.w = .01 * (np.random.rand(num_feature + 1) - 0.5)
        self.lr = lr
        
    def test(self, x):
        A = np.concatenate((np.array([1]), x),axis=0)
        y_hat = A.dot(self.w)
        
        return 1 / (1 + np.exp(-y_hat))
        
    def train(self, x, c):
        A = np.concatenate((np.array([1]), x), axis=0)
        y_hat = A.dot(self.w)
        c_hat = 1 / (1 + np.exp(-y_hat))
        
        fgrad = A.T.dot(c_hat - c)
        self.w = self.w - self.lr * fgrad

In [4]:
class Node(object):
    
    def __init__(self, node_index, num_feature, lr):
        self.left = None
        self.right = None
        self.parent = None
        self.n_all = 0
        self.e_all = 0
        self.n = []
        self.e = []
        self.class_list = []
        self.node_index = node_index
        self.model = OnlineLogistRegression(num_feature, lr)
        
    def setLeft(self, node):
        self.left = node

    def setRight(self, node):
        self.right = node
        
    def setParent(self, node):
        self.parent = node
        
    def testModel(self, x):
        return self.model.test(x)
    
    def trainModel(self, x, c):
        self.model.train(x, c)
        
    def addClass(self, class_name):
        self.n.append(0)
        self.e.append(0)
        self.class_list.append(class_name)
        
    def updateExpectation(self, c, class_index_in_node):
        self.n_all = self.n_all + 1
        self.e_all = self.e_all + c
        self.n[class_index_in_node] = self.n[class_index_in_node] + 1
        self.e[class_index_in_node] = self.e[class_index_in_node] + c
        
    def findExpectationAll(self):
        if self.n_all == 0:
            return 0
        else:
            return self.e_all / self.n_all
        
    def findExpectationOneClass(self, class_index_in_node):
        if self.n[class_index_in_node] == 0:
            return 0
        else:
            return self.e[class_index_in_node] / self.n[class_index_in_node]
    
    def judgeInTrain(self, class_index_in_node):
        #c == 0: left, c == 1: right
        return int(self.findExpectationAll() <= self.findExpectationOneClass(class_index_in_node))
    
    def judgeInTest(self, x):
        return int(self.findExpectationAll() <= self.testModel(x))

In [5]:
class Tree(object):
    def __init__(self, data_loader, lr=0.001):
        self.data_loader = data_loader
        self.num_feature = data_loader.num_feature
        self.lr = lr;
        
        self.size = 0
        self.nodes = []
        self.root = self.generateNode()
        self.nodes.append(self.root)
        
        
    def generateNode(self):
        node = Node(self.size, self.num_feature, self.lr)
        self.size = self.size + 1
        return node
        
    def split(self, node_index):
        node = self.nodes[node_index]
        left = self.generateNode()
        right = self.generateNode()
        
        self.nodes.append(left)
        node.setLeft(left)
        left.setParent(node)
        
        self.nodes.append(right)
        node.setRight(self.nodes[-1])
        right.setParent(node)

    def onlineTrain(self, xy, node):
        x, y = xy
        
        #1:register if y is new in this node
        not_registered = (y not in node.class_list)
        if not_registered:
            node.addClass(y)
            
        #2:judge (if current h(x) is above average)
        class_index_in_node = node.class_list.index(y)
        #print('class_index_in_node =', class_index_in_node)
        #c == 0: left, c == 1: right
        c = node.judgeInTrain(class_index_in_node)
        #print('c =', c)
        
        #3:train
        node.trainModel(x, c)
        
        #4:update e, n
        c_hat = node.testModel(x)
        node.updateExpectation(c_hat, class_index_in_node)
        
        #5:give birth if second class arrives at this node
        if not_registered and len(node.class_list) == 2:
            self.split(node.node_index)
            #the only previous class should be arranged to other child
            #though it cannot be trained in an online algorithm
            [node.left, node.right][1-c].addClass(node.class_list[0])
            
        #6:recursive down all the way to a leaf
        del x, y, not_registered, class_index_in_node, c_hat
        if node.left != None:
            self.onlineTrain(xy, [node.left, node.right][c])

    def startOnlineTrain(self, xy):
        self.onlineTrain(xy, self.root)

    def onlineTest(self, x, node):
        if len(node.class_list) == 1:
            return node.class_list[0]
        else:
            return self.onlineTest(x, [node.left, node.right][node.judgeInTest(x)])

    def startOnlineTest(self, x):
        return self.onlineTest(x, self.root)

In [6]:
#build
dataset_name = "sector"
data_loader = DataLoader(dataset_name)
print(data_loader.data_size)


6412


In [7]:
my_tree = Tree(data_loader, lr=0.05)
my_generator = data_loader.generator()
#train
for i in range(8000):
    my_tree.startOnlineTrain(next(my_generator))
    
#test
test_result = []
for i in range(1000):
    x, y = next(my_generator)
    test_result.append(int(y == my_tree.startOnlineTest(x)))
accurancy = np.mean(test_result)
print('accurancy =', accurancy)

StopIteration: 

In [None]:
# def swap(self, node_index):
#     self.node=self.nodes[node_index]
#     self.pa=self.node.parent
#     self.gpa=self.pa.parent
#     self.sib = self.pa.right if self.pa.left == self.node else self.pa.left
#     if self.pa==self.gpa.left:
#         self.gpa.left=self.sib
#     else:
#         self.gpa.right=self.sib
#     updateC(self.sib)
# def updateC(self, )