Bagging

In [9]:
import pandas
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier

In [3]:
# train_data.csv -> X_train (plain NumPy array taken from the DataFrame)
Trainfile = pandas.read_csv('train_data.csv')
X_train = Trainfile.values

# train_labels.csv -> Y_train (still 2-D here; later cells call .flatten() on it)
Trainlabels = pandas.read_csv('train_labels.csv')
Y_train = Trainlabels.values

# test_data.csv -> X_test
Testfile = pandas.read_csv('test_data.csv')
X_test = Testfile.values

In [4]:
def train_and_test(model, filename, pr=False):
    """Fit `model` on the training data and save its test-set predictions as CSV.

    Reads the notebook-level arrays `X_train`, `Y_train` and `X_test`.

    Parameters
    ----------
    model : object exposing `fit(X, y)` and `predict(X)`.
    filename : path of the CSV to write. The file starts with the header
        line "ID,Font"; each following row is a 1-based row number and the
        predicted label.
    pr : reporting hook. Falsy (the default) reports nothing. A callable
        (e.g. `print`) is called with a label string and the training
        accuracy. Any other truthy value (e.g. `True`) falls back to `print`.
    """
    y_train = Y_train.flatten()  # 1-D label vector for fitting and scoring
    model.fit(X_train, y_train)
    if pr:
        report = pr if callable(pr) else print
        # Fraction of training rows predicted correctly. Computed with NumPy
        # because `accuracy_score` is not imported in the notebook's import cell,
        # so the previous call raised NameError whenever `pr` was supplied.
        report('training accuracy:', np.mean(model.predict(X_train) == y_train))
    # object dtype keeps the integer IDs and the labels side by side without
    # NumPy coercing both columns to one common type.
    Y_test_pred = model.predict(X_test).astype(object)
    ids = np.arange(1, Y_test_pred.size + 1)
    rows = np.dstack((ids, Y_test_pred))[0]  # shape (n, 2): [id, label]
    np.savetxt(filename, rows, "%d,%s", header="ID,Font", comments="")

Random Forest

In [5]:
# Bagging with decision trees. This is close to a Random Forest but not identical:
# a Random Forest additionally draws a random feature subset at every split.

tree = DecisionTreeClassifier(max_depth = 100, min_samples_split = 3, min_samples_leaf = 2)
# The estimator is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2; positional works with both.
# The ensemble is not fitted here: train_and_test() calls fit() itself, so an extra
# .fit() on this line would train all 2048 trees twice.
clf = BaggingClassifier(tree, n_estimators=2048)
train_and_test(clf, "bagging_dt.csv")

# 1000 estimators, max_depth=10: 68.740% test accuracy 

# 30 estimators, max_depth=30: 84.209% test accuracy 

# 20 estimators, max_depth=50: 88.774% test accuracy 

# 20 estimators, max_depth = 100: 89.137% test accuracy 
# 50 estimators, max_depth = 100: 89.698% test accuracy 
# 300 estimators, max_depth = 100: 89.986% test accuracy 
# 2048 estimators, max_depth = 100: 90.157% test accuracy <- Best

# 500 estimators, max_depth = 150: 84.236% test accuracy 

# 20 estimators, max_depth=200: 89.295% test accuracy 
# 100 estimators, max_depth=200: 89.616% test accuracy 

# 100 estimators, max_depth=400: 89.883% test accuracy 
# 2048 estimators, max_depth=400: 90.109% test accuracy


In [6]:
# RandomForestClassifier is bagged decision trees plus a random feature subset at each
# split, so expect results similar (not identical) to the BaggingClassifier cell, and
# only when the tree hyperparameters match. They do not match here: these forests use
# the library defaults apart from the arguments shown.

# Just tried some different # of estimators here
train_and_test(RandomForestClassifier(n_estimators=256), "random_forest.csv")
# gives 79.938% test accuracy with n=256 estimators 

# Written to its own file so this run does not overwrite the predictions saved just above.
train_and_test(RandomForestClassifier(n_estimators=256, max_depth=8), "random_forest_depth8.csv")
# gives 58.110% test accuracy with n=256 estimators, max_depth=8 (UNDERFITTING)

Extremely Randomized Trees (Extra-Trees Classifier)

In [10]:
# Extra-Trees run: 1000 trees, depth capped at 50, fixed seed.
train_and_test(
    ExtraTreesClassifier(
        n_estimators=1000,
        max_depth=50,
        min_samples_split=2,
        random_state=0,
    ),
    "extreme_random_forest.csv",
)

# 100 estimators, max_depth=200: 80.417% test accuracy
# 1000 estimators, max_depth=100: 81.327% test accuracy
# 1000 estimators, max_depth=50: 81.279% test accuracy
# 100 estimators, max_depth=100: 80.417% test accuracy
# 1000 estimators, max_depth=10: 60.047% test accuracy
# 100 estimators, max_depth=10: 59.664% test accuracy




Bagging with 1-NN (k=1 in k-NN)

In [11]:
# Bagging with 1-NN

knn = KNeighborsClassifier(n_neighbors=1)
# The estimator is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2; positional works with both.
clf = BaggingClassifier(knn, n_estimators=300)
# Fit, predict and save through the shared helper instead of a copy of its savetxt
# line. The helper fits `clf` in place, so `clf` is still the fitted ensemble afterwards.
train_and_test(clf, "bagging_1nn_300.csv")

# 300 estimators: 67% test accuracy