In [1]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import threading
import queue
%matplotlib inline

In [2]:
x, y = make_classification(n_samples=2000, n_features=20, n_informative=14,
                                        n_redundant=4, n_repeated=2, n_clusters_per_class=2,
                                        n_classes=2, random_state=2018)
print(x.shape, y.shape)

(2000, 20) (2000,)


In [60]:
def base_learn(x, y, index, learner='tree'):
    data = boostrap(x, y, size=x.shape[0])
    # get the data from dictionary
    train_x, train_y = data['train_x'], data['train_y']
    out_of_bag_x, out_of_bag_y = data['out_of_bag_x'], data['out_of_bag_y']
    if learner == 'tree':
        clf = DecisionTreeClassifier(max_depth=5, min_samples_leaf=5)
    else:
        clf = learner
    clf.fit(train_x, train_y)
    train_pre = clf.predict(train_x)
    out_of_bag_pre = clf.predict(out_of_bag_x)
    print(index, ": train_acc: {0:.2f}, test_acc: {1:.2f}".format(np.mean((train_pre==train_y).astype(np.float32)),
                                                              np.mean((out_of_bag_pre==out_of_bag_y).astype(np.float32))))
    global q
    q.put(clf)
    return clf

In [4]:
def boostrap(x, y, size):
    m = x.shape[0]
    
    index = np.random.choice(m, size, replace=True)
    train_x = x[index, :]
    train_y = y[index]
    # the data not be choosen
    out_of_bag = list(set(range(m)) - set(index))
    out_of_bag_x = x[out_of_bag, :]
    out_of_bag_y = y[out_of_bag]
    
    data = {}
    data['train_x'] = train_x
    data['train_y'] = train_y
    data['out_of_bag_x'] = out_of_bag_x
    data['out_of_bag_y'] = out_of_bag_y
    return data

In [53]:
def vote(clfs, x):
    res = 0
    for tree in clfs:
        pre =  tree.predict(x)
        res += pre     
    
    criteria = len(clfs)/2
    res = np.where(res>=criteria, 1, 0)
    return res

In [6]:
class myThread(threading.Thread):
    def __init__(self,func,args=()):
        super(myThread,self).__init__()
        self.func = func
        self.args = args

    def run(self):
        self.func(*self.args)

In [64]:
def bagging(x, y, test_size=0.7, n_learner=3, random_state=2018):
    train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=test_size, random_state=random_state)
    np.random.seed(random_state)
    
    clfs = []
    threads = []
    for i in range(n_learner):
        t = myThread(base_learn, args=(x, y, i))
        threads.append(t)
    for i in range(n_learner):
        threads[i].start()
    for i in range(n_learner):
        threads[i].join()
    while not q.empty():
        clf = q.get()
        clfs.append(clf)
    """
    tree = base_learn(train_x, train_y, 1)
    clfs.append(tree)
    tree = base_learn(train_x, train_y, 2)
    clfs.append(tree)
    tree = base_learn(train_x, train_y, 3)
    clfs.append(tree)
    """
    train_pred = vote(clfs, train_x)
    print("-"*20, "\ntrain acc: {:.2f}".format(np.mean((train_pred==train_y).astype(np.float32))))
    test_pred = vote(clfs, test_x)
    print("test acc: {:.2f}".format(np.mean((test_pred==test_y).astype(np.float32))), "\n", "-"*20)
    
    return clfs

In [65]:
q = queue.Queue()
bagging(x, y)

1 : train_acc: 0.90, test_acc: 0.81
0 : train_acc: 0.91, test_acc: 0.80
2 : train_acc: 0.90, test_acc: 0.82
-------------------- 
train acc: 0.93
test acc: 0.88 
 --------------------


[DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
             max_features=None, max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=5, min_samples_split=2,
             min_weight_fraction_leaf=0.0, presort=False, random_state=None,
             splitter='best'),
 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
             max_features=None, max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=5, min_samples_split=2,
             min_weight_fraction_leaf=0.0, presort=False, random_state=None,
             splitter='best'),
 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
             max_features=None, max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=5, min_samples_split=2,
             min_weight_fraction_leaf=0.0, pre