XGBoost (extreme gradient boosting)

In [2]:
import numpy as np
import pandas
from sklearn import linear_model
from sklearn import metrics
import matplotlib.pyplot as plt

import xgboost as xgb
from xgboost import XGBClassifier

from sklearn.experimental import enable_hist_gradient_boosting  # explicitly require this experimental feature
from sklearn.ensemble import HistGradientBoostingClassifier

In [3]:
# Training features: keep the DataFrame and expose its raw array.
Trainfile = pandas.read_csv('train_data.csv')
X_train = Trainfile.values

# Training labels, read from their own CSV file.
Trainlabels = pandas.read_csv('train_labels.csv')
Y_train = Trainlabels.values

# Test features; predictions for these rows are what gets written out later.
Testfile = pandas.read_csv('test_data.csv')
X_test = Testfile.values

In [4]:
def train_and_test(model, filename, pr=False):
    """Fit ``model`` on the training data and write its test predictions to a CSV.

    Uses the module-level arrays ``X_train``, ``Y_train`` and ``X_test``.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    filename
        Path of the CSV file to write. The file has the header ``ID,Font``
        followed by one ``<row number>,<prediction>`` line per test row,
        with row numbers starting at 1.
    pr
        When true, also print the accuracy measured on the training data.
    """
    # Y_train comes from DataFrame.values (2-D); flatten once and reuse.
    y_train = Y_train.flatten()
    model.fit(X_train, y_train)
    if pr:
        # accuracy_score is only reachable through the imported `metrics`
        # module; arguments follow its (y_true, y_pred) order.
        print('training accuracy:', metrics.accuracy_score(y_train, model.predict(X_train)))
    # object dtype lets integer IDs and predicted labels share one array
    # without being coerced to a common type before formatting.
    Y_test_pred = model.predict(X_test).astype(object)
    ids = np.arange(1, Y_test_pred.size + 1)
    np.savetxt(filename, np.column_stack((ids, Y_test_pred)), "%d,%s", header="ID,Font", comments="")

In [None]:
# Grid search over tree depth and number of boosting rounds for XGBoost.
# Each configuration is trained and its test predictions are written to a
# separately numbered CSV file.
depths = [4, 8, 10, 16, 20, 30, 32, 40, 50]
num_estimators = [250, 500, 1000]

# Flatten the grid with the estimator count as the outer dimension, so files
# are numbered in the order the configurations are visited.
grid = [(n, d) for n in num_estimators for d in depths]
for i, (n, d) in enumerate(grid, start=1):
    clf = XGBClassifier(n_estimators=n, max_depth=d)
    train_and_test(clf, 'mgz27_xgb' + str(i) + '.csv')
    # Report which hyperparameters produced which numbered file; the file
    # name alone does not record them.
    print('finished model', i, '(n_estimators=%d, max_depth=%d)' % (n, d))

# max_depth=9, 1500 estimators: 93.921% test accuracy
# max_depth=9, 1350 estimators: 93.915% test accuracy
# max_depth=9, 1300 estimators: 94.010% test accuracy 
# max_depth=9, 1250 estimators: 94.017% test accuracy
# max_depth=9, 1200 estimators: 94.024% test accuracy 
# max_depth=9, 1150 estimators: 93.915% test accuracy
# max_depth=9, 1000 estimators: 93.976% test accuracy 
        
# max_depth=500, 500 estimators: 93.210% test accuracy

# max_depth=100, 250 estimators: 93.011% test accuracy

# max_depth=50, 500 estimators: 93.127% test accuracy 
# max_depth=50, 250 estimators: 92.984% test accuracy

# max_depth=40, 500 estimators: 93.107% test accuracy
# max_depth=40, 250 estimators: 92.936% test accuracy

# max_depth=32, 500 estimators: 93.271% test accuracy
# max_depth=32, 250 estimators: 92.991% test accuracy

# max_depth=30, 500 estimators: 93.251% test accuracy
# max_depth=30, 250 estimators: 93.011% test accuracy

# max_depth=20, 500 estimators: 93.182% test accuracy
# max_depth=20, 250 estimators: 92.970% test accuracy

# max_depth=16, 1000 estimators: 93.566% test accuracy
# max_depth=16, 500 estimators: 93.381% test accuracy
# max_depth=16, 250 estimators: 93.237% test accuracy

# max_depth=10, 1000 estimators: 93.853% test accuracy
# max_depth=10, 500 estimators: 93.620% test accuracy
# max_depth=10, 250 estimators: 93.436% test accuracy

# max_depth=8, 2000 estimators: 93.921% test accuracy      
# max_depth=8, 1300 estimators: 93.956% test accuracy 
# max_depth=8, 1000 estimators: 93.908% test accuracy 
# max_depth=8, 500 estimators: 93.730% test accuracy
# max_depth=8, 250 estimators: 92.977% test accuracy

# max_depth=4, 1000 estimators: 91.108% test accuracy
# max_depth=4, 500 estimators: 88.199% test accuracy
# max_depth=4, 250 estimators: 84.086% test accuracy

In [None]:
# Histogram-based gradient boosting classifiers (scikit-learn's HistGradientBoostingClassifier, not XGBoost):

# Train the histogram-based model with 1225 iterations and depth 9, then
# write its test predictions to 'histxgb.csv'.
train_and_test(
    model=HistGradientBoostingClassifier(max_depth=9, max_iter=1225),
    filename='histxgb.csv',
)

# The results below use the default max_leaf_nodes = 31

# 1200 iterations, max depth 10: 94.366% test accuracy 

# 1300 iterations, max depth 9: 94.428% test accuracy 
# 1250 iterations, max depth 9: 94.455% test accuracy 
# 1245 iterations, max depth 9: 94.462% test accuracy
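# 1140 iterations, max depth 9: 94.476% test accuracy (out of sequence; possibly a typo for 1240 -- unverified)
# 1135 iterations, max depth 9: 94.496% test accuracy (out of sequence; possibly a typo for 1235 -- unverified)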
# 1230 iterations, max depth 9: 94.496% test accuracy
# 1225 iterations, max depth 9: 94.531% test accuracy <- BEST MODEL OVERALL
# 1220 iterations, max depth 9: 94.496% test accuracy
# 1215 iterations, max depth 9: 94.524% test accuracy 
# 1210 iterations, max depth 9: 94.510% test accuracy
# 1205 iterations, max depth 9: 94.510% test accuracy
# 1200 iterations, max depth 9: 94.517% test accuracy 
# 1195 iterations, max depth 9: 94.510% test accuracy
# 1190 iterations, max depth 9: 94.510% test accuracy
# 1185 iterations, max depth 9: 94.517% test accuracy
# 1180 iterations, max depth 9: 94.503% test accuracy
# 1175 iterations, max depth 9: 94.476% test accuracy
# 1170 iterations, max depth 9: 94.483% test accuracy
# 1165 iterations, max depth 9: 94.476% test accuracy
# 1160 iterations, max depth 9: 94.490% test accuracy
# 1155 iterations, max depth 9: 94.483% test accuracy
# 1150 iterations, max depth 9: 94.366% test accuracy 

# 1200 iterations, max depth 8: 94.339% test accuracy 

# 1300 iterations, max depth 7: 94.449% test accuracy 
# 1250 iterations, max depth 7: 94.442% test accuracy 
# 1200 iterations, max depth 7: 94.428% test accuracy 

In [10]:
# max leaf nodes 30, 1250 iterations, max depth 9: 94.414% test accuracy

# max leaf nodes 30, 1240 iterations, max depth 9: 94.401% test accuracy

# max leaf nodes 45, 1240 iterations, max depth 9: 94.339% test accuracy
# max leaf nodes 40, 1240 iterations, max depth 9: 94.264% test accuracy
# max leaf nodes 35, 1240 iterations, max depth 9: 94.442% test accuracy
# max leaf nodes 30, 1240 iterations, max depth 9: 94.401% test accuracy
# max leaf nodes 25, 1240 iterations, max depth 9: ____% test accuracy 

# max leaf nodes 45, 1235 iterations, max depth 9: 94.339% test accuracy
# max leaf nodes 40, 1235 iterations, max depth 9: 94.277% test accuracy
# max leaf nodes 35, 1235 iterations, max depth 9: 94.428% test accuracy
# max leaf nodes 30, 1235 iterations, max depth 9: 94.387% test accuracy
# max leaf nodes 25, 1235 iterations, max depth 9: 94.250% test accuracy 

# max leaf nodes 45, 1230 iterations, max depth 9: 94.353% test accuracy
# max leaf nodes 40, 1230 iterations, max depth 9: 94.277% test accuracy
# max leaf nodes 35, 1230 iterations, max depth 9: 94.435% test accuracy
# max leaf nodes 30, 1230 iterations, max depth 9: 94.373% test accuracy
# max leaf nodes 25, 1230 iterations, max depth 9: 94.264% test accuracy 

# max leaf nodes 50, 1225 iterations, max depth 9: 94.449% test accuracy
# max leaf nodes 45, 1225 iterations, max depth 9: 94.353% test accuracy
# max leaf nodes 40, 1225 iterations, max depth 9: 94.291% test accuracy
# max leaf nodes 35, 1225 iterations, max depth 9: 94.449% test accuracy
# max leaf nodes 32, 1225 iterations, max depth 9: 93.613% test accuracy
# max leaf nodes 30, 1225 iterations, max depth 9: 94.394% test accuracy
# max leaf nodes 25, 1225 iterations, max depth 9: 94.277% test accuracy 
# max leaf nodes 20, 1225 iterations, max depth 9: 94.325% test accuracy

# max leaf nodes 45, 1220 iterations, max depth 9: 94.360% test accuracy
# max leaf nodes 40, 1220 iterations, max depth 9: 94.305% test accuracy
# max leaf nodes 35, 1220 iterations, max depth 9: 94.490% test accuracy
# max leaf nodes 30, 1220 iterations, max depth 9: 94.394% test accuracy
# max leaf nodes 25, 1220 iterations, max depth 9: 94.298% test accuracy 

# max leaf nodes 45, 1215 iterations, max depth 9: 94.387% test accuracy
# max leaf nodes 40, 1215 iterations, max depth 9: 94.271% test accuracy
# max leaf nodes 35, 1215 iterations, max depth 9: 94.476% test accuracy
# max leaf nodes 32, 1215 iterations, max depth 9: 94.401% test accuracy 
# max leaf nodes 30, 1215 iterations, max depth 9: 94.353% test accuracy
# max leaf nodes 25, 1215 iterations, max depth 9: 94.298% test accuracy 

# max leaf nodes 45, 1210 iterations, max depth 9: 94.387% test accuracy
# max leaf nodes 40, 1210 iterations, max depth 9: 94.236% test accuracy
# max leaf nodes 35, 1210 iterations, max depth 9: 94.469% test accuracy
# max leaf nodes 30, 1210 iterations, max depth 9: 94.373% test accuracy
# max leaf nodes 25, 1210 iterations, max depth 9: ____% test accuracy 

# max leaf nodes 45, 1205 iterations, max depth 9: 94.360% test accuracy
# max leaf nodes 40, 1205 iterations, max depth 9: 94.257% test accuracy
# max leaf nodes 35, 1205 iterations, max depth 9: 93.490% test accuracy
# max leaf nodes 30, 1205 iterations, max depth 9: 94.380% test accuracy
# max leaf nodes 25, 1205 iterations, max depth 9: 94.229% test accuracy 

# max leaf nodes 45, 1200 iterations, max depth 9: 94.366% test accuracy
# max leaf nodes 40, 1200 iterations, max depth 9: 94.291% test accuracy
# max leaf nodes 35, 1200 iterations, max depth 9: 94.462% test accuracy
# max leaf nodes 30, 1200 iterations, max depth 9: 94.401% test accuracy
# max leaf nodes 25, 1200 iterations, max depth 9: ____% test accuracy 