# Split Data into Training and Test

In [1]:
!pip install imblearn
!pip install xgboost

[0mCollecting imblearn
  Using cached imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Collecting imbalanced-learn (from imblearn)
  Using cached imbalanced_learn-0.12.0-py3-none-any.whl.metadata (8.2 kB)
Using cached imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Using cached imbalanced_learn-0.12.0-py3-none-any.whl (257 kB)
[0mInstalling collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.12.0 imblearn-0.0
[0mCollecting xgboost
  Using cached xgboost-2.0.3-py3-none-manylinux2014_x86_64.whl.metadata (2.0 kB)
Using cached xgboost-2.0.3-py3-none-manylinux2014_x86_64.whl (297.1 MB)
[0mInstalling collected packages: xgboost
Successfully installed xgboost-2.0.3


In [89]:
# load dependencies
import pandas as pd # for importing and handling data
import numpy as np # for working with arrays
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold # for splitting data and hyperparameter tuning
from sklearn.preprocessing import LabelEncoder # for processing data for xgboost
from sklearn.metrics import confusion_matrix, f1_score, mean_squared_error # for model performance metrics
from imblearn.over_sampling import SMOTE # for smote oversampling
import xgboost as xgb # for xgboost model

In [26]:
# read pisa_median.csv into a DataFrame and preview the first rows
pisa_csv = "/home/jupyter/final_project/pisa_median.csv"
df = pd.read_csv(pisa_csv)
df.head()

Unnamed: 0.1,Unnamed: 0,ST352Q06JA,AGE,ST004D01T,DURECEC,REPEAT,MISSSC,SKIPPING,TARDYSD,EXPECEDU,...,ST353Q07JA,ST353Q08JA,ST348Q01JA,ST348Q02JA,ST348Q03JA,ST348Q04JA,ST348Q05JA,ST348Q06JA,ST348Q07JA,ST348Q08JA
0,1.0,2.0,15.58,1.0,2.0,0.0,0.0,1.0,2.0,7.0,...,2.0,2.0,4.0,4.0,4.0,3.0,4.0,3.0,2.0,2.0
1,2.0,2.0,16.17,2.0,2.0,0.0,1.0,0.0,1.0,7.0,...,4.0,4.0,2.0,3.0,2.0,3.0,4.0,4.0,4.0,4.0
2,3.0,4.0,15.58,2.0,0.0,0.0,0.0,0.0,0.0,9.0,...,2.0,3.0,3.0,3.0,2.0,3.0,3.0,2.0,2.0,2.0
3,4.0,4.0,15.42,2.0,2.0,0.0,0.0,0.0,0.0,4.0,...,1.0,2.0,1.0,3.0,4.0,1.0,4.0,1.0,2.0,1.0
4,5.0,2.0,15.75,2.0,1.0,0.0,0.0,0.0,0.0,8.0,...,2.0,2.0,1.0,3.0,4.0,3.0,4.0,3.0,2.0,2.0


In [27]:
# separate the target column from the features; the leftover "Unnamed: 0"
# row-counter column is dropped as well so it is not used as a feature
target_col = "ST352Q06JA"
y = df[target_col].values
X = df.drop(columns=[target_col, "Unnamed: 0"]).values

In [28]:
# hold out 20% of the rows as the test set (20-30% is a common choice);
# random_state fixes the shuffle, and stratifying on y keeps the class
# proportions of the target the same in the training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2001,
    stratify=y,
)

In [6]:
# peek at the first three training rows, then at their labels
for preview in (X_train, y_train):
    print(preview[:3])

[[ 1.5670e+01  2.0000e+00  2.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00
   0.0000e+00  8.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00
   6.0000e+00  1.0000e+01  0.0000e+00  6.0000e+00 -7.1900e-02 -1.2280e+00
   1.1246e+00  1.4750e-01 -7.1000e-03 -3.2030e-01  2.7474e+00 -9.7550e-01
   1.8227e+00  5.2030e-01  2.3650e+00  3.8830e+00  1.1187e+00  4.9584e+00
   1.3749e+00  2.4100e-02  1.3679e+00  3.1637e+00  1.7174e+00  1.7465e+00
   1.0481e+00  1.0000e+00  1.0000e+00  1.0000e+00  2.0000e+00  1.0000e+00
   2.0000e+00  1.0000e+00  3.0000e+00  3.0000e+00  4.0000e+00  3.0000e+00
   4.0000e+00  2.0000e+00  2.0000e+00  1.0000e+00  1.0000e+00  1.0000e+00
   2.0000e+00  2.0000e+00  2.0000e+00  2.0000e+00  3.0000e+00  3.0000e+00
   3.0000e+00  4.0000e+00  4.0000e+00  2.0000e+00  1.0000e+00]
 [ 1.5830e+01  1.0000e+00  4.0000e+00  0.0000e+00  0.0000e+00  1.0000e+00
   0.0000e+00  6.0000e+00  1.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00
   0.0000e+00  1.0000e+01  6.0000e+00  1.0000e+01

# Conduct Oversampling with SMOTE on Training Data

In [29]:
# frequency of observations per class in the training data pre-oversampling
# (np.bincount only accepts non-negative integers, hence the cast; index 0 of
# the result is the count for label 0)
np.bincount(y_train.astype(int))

array([   0, 1349, 2325, 2088, 3182])

In [30]:
# conduct oversampling on training data using SMOTE
# SMOTE overview here: https://www.youtube.com/watch?v=1Ic7GRtDrPM
# Only the training split is resampled; the test split is left untouched.
# NOTE(review): the resampled data is later passed to K-fold cross-validation,
# so synthetic points built from a validation fold's rows can end up in the
# training folds and make the CV scores optimistic. Consider resampling inside
# each fold (e.g. with an imblearn Pipeline) — confirm whether this matters here.
smote = SMOTE(random_state = 2001)
X_smote, y_smote = smote.fit_resample(X_train, y_train)

In [31]:
# convert y_smote to integer (needed for xgboost later), then count the
# observations per class to confirm the classes are balanced after SMOTE
y_smote = y_smote.astype(int)
np.bincount(y_smote)

array([   0, 3182, 3182, 3182, 3182])

In [32]:
# transform data to work with xgboost (encoding needs to start at zero);
# `le` is kept so predictions can be mapped back to the original labels later
# NOTE(review): this overwrites y_smote, so re-running only this cell would
# re-fit `le` on the already-encoded labels and le.inverse_transform would no
# longer return the original labels — re-run from the SMOTE cell instead
le = LabelEncoder()
y_smote = le.fit_transform(y_smote)

In [45]:
# sanity check: (rows, features) of the oversampled training data.
# Only the last expression of a cell is displayed, so the bare
# `type(X_smote)` that used to precede this line had no visible effect.
X_smote.shape

(12728, 65)

# Hyperparameter Tuning and Cross-Validation for xgboost

We want to tune the hyperparameters `eta` (learning rate; passed to xgboost's scikit-learn API as `learning_rate`), `n_estimators` (number of boosted trees), and `colsample_bytree` (proportion of the features sampled when building each tree). Specifically, we will search over the following values:

* `eta`: 0.01, 0.05, 0.1, 0.3
* `n_estimators`: 100, 200, 500, 1000, 2000
* `colsample_bytree`: 0.1, 0.5, 0.7, 1.0

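Furthermore, we will conduct 5-fold cross-validation for each candidate model to return an aggregate error value (the mean squared error across folds, which scikit-learn reports as the negative MSE). In total, that yields 4 (`eta`) x 5 (`n_estimators`) x 4 (`colsample_bytree`) = 80 candidates x 5 (cross-validation folds) = 400 estimated models. Because every combination is evaluated, we will use an exhaustive grid search (`GridSearchCV`) for this endeavour.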

In [33]:
# candidate values for each tuned hyperparameter (4 x 5 x 4 = 80 combinations)
parameter_grid = dict(
    learning_rate=[0.01, 0.05, 0.1, 0.3],
    n_estimators=[100, 200, 500, 1000, 2000],
    colsample_bytree=[0.1, 0.5, 0.7, 1.0],
)

In [34]:
# 5-fold splitter with shuffling; the fixed seed keeps the folds reproducible
cross_val = KFold(n_splits=5, shuffle=True, random_state=2001)

# base classifier for the search: library defaults apart from the fixed seed
clf = xgb.XGBClassifier(random_state=2001)

In [None]:
# NOTE: the grid search in this and the following cells is commented out on
# purpose. Its results were written to xgb_cv_results.csv and are reloaded
# further down, so the 400 fits do not have to be repeated on every run.
# Uncomment these cells to redo the search.
# NOTE(review): "neg_mean_squared_error" scores the encoded class labels as if
# they were numeric values — confirm this ordinal treatment is intended.
# clf_grid = GridSearchCV(estimator = clf,
#                                 param_grid = parameter_grid,
#                                 scoring = "neg_mean_squared_error",
#                                 cv = cross_val,
#                                 verbose = 1,
#                                 return_train_score = True)

In [23]:
# runs the full search (5 folds for each of 80 candidates = 400 fits, per the
# log kept below); left commented out so re-running the notebook stays cheap
# clf_grid.fit(X_smote, y_smote)

Fitting 5 folds for each of 80 candidates, totalling 400 fits


In [24]:
# inspect the winning candidate of the (disabled) search: its mean CV score
# (negative MSE, so closer to zero is better), its parameters, and its row
# index in cv_results_
# clf_grid.cv_results_
# print(clf_grid.best_score_)
# print(clf_grid.best_params_)
# print(clf_grid.best_index_)
# clf_grid.cv_results_['params'][clf_grid.best_index_]

-0.7029368306847521
{'colsample_bytree': 0.5, 'learning_rate': 0.1, 'n_estimators': 1000}
33


{'colsample_bytree': 0.5, 'learning_rate': 0.1, 'n_estimators': 1000}

In [25]:
# (disabled together with the grid search) persist the full cv_results_ table
# so the best hyperparameters can be reloaded without re-running the search
# # convert results to pandas df
# cv_results_dict = pd.DataFrame.from_dict(clf_grid.cv_results_)

# # write to csv for future reference
# cv_results_dict.to_csv("/home/jupyter/final_project/xgb_cv_results.csv")

# cv_results_dict

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_colsample_bytree,param_learning_rate,param_n_estimators,params,split0_test_score,split1_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,1.768965,0.035886,0.040885,0.020003,0.1,0.01,100,"{'colsample_bytree': 0.1, 'learning_rate': 0.0...",-1.071877,-0.985860,...,-1.013591,0.034698,80,-0.827735,-0.794245,-0.789039,-0.811254,-0.803201,-0.805095,0.013635
1,3.480168,0.082015,0.058982,0.000561,0.1,0.01,200,"{'colsample_bytree': 0.1, 'learning_rate': 0.0...",-1.023566,-0.981540,...,-1.004322,0.030071,79,-0.776763,-0.739442,-0.746022,-0.747029,-0.749190,-0.751689,0.012952
2,8.484597,0.076963,0.174741,0.041907,0.1,0.01,500,"{'colsample_bytree': 0.1, 'learning_rate': 0.0...",-0.912019,-0.889631,...,-0.909728,0.021197,76,-0.616185,-0.605775,-0.600275,-0.598056,-0.605814,-0.605221,0.006272
3,16.870224,0.259230,0.355146,0.019923,0.1,0.01,1000,"{'colsample_bytree': 0.1, 'learning_rate': 0.0...",-0.846819,-0.835428,...,-0.848368,0.018492,68,-0.424475,-0.403261,-0.421528,-0.411667,-0.418737,-0.415933,0.007628
4,33.750319,1.261094,0.746841,0.024281,0.1,0.01,2000,"{'colsample_bytree': 0.1, 'learning_rate': 0.0...",-0.791830,-0.813040,...,-0.803740,0.016039,58,-0.186309,-0.170595,-0.175800,-0.170677,-0.181970,-0.177070,0.006225
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,4.400641,0.063208,0.041248,0.014610,1.0,0.3,100,"{'colsample_bytree': 1.0, 'learning_rate': 0.3...",-0.687353,-0.763158,...,-0.757388,0.036248,36,-0.013455,-0.008839,-0.013062,-0.008838,-0.015516,-0.011942,0.002668
76,8.081089,0.169628,0.062616,0.007492,1.0,0.3,200,"{'colsample_bytree': 1.0, 'learning_rate': 0.3...",-0.684996,-0.727808,...,-0.729261,0.028217,21,-0.000000,-0.000000,-0.000000,-0.000000,-0.000393,-0.000079,0.000157
77,16.640020,0.201813,0.148675,0.001219,1.0,0.3,500,"{'colsample_bytree': 1.0, 'learning_rate': 0.3...",-0.669285,-0.705420,...,-0.720698,0.030012,11,-0.000000,-0.000000,-0.000000,-0.000000,-0.000000,0.000000,0.000000
78,27.887004,0.362617,0.318817,0.004580,1.0,0.3,1000,"{'colsample_bytree': 1.0, 'learning_rate': 0.3...",-0.664179,-0.721917,...,-0.719675,0.029154,9,-0.000000,-0.000000,-0.000000,-0.000000,-0.000000,0.000000,0.000000


# Visualizing and Pulling Hyperparameters

In [50]:
# read csv of results (so you don't have to run hyperparameter tuning again);
# this is the same absolute location the tuning cell writes the results to, so
# the read no longer depends on the notebook's working directory
hyperparams = pd.read_csv("/home/jupyter/final_project/xgb_cv_results.csv")

# get best hyperparameters: select the top-ranked candidate(s) once and reuse
best_rows = hyperparams[hyperparams["rank_test_score"] == 1]
print(best_rows)
best_row = best_rows.iloc[0]  # first row wins if several candidates tie for rank 1

# cast to plain Python numbers so the estimator receives int/float values
# rather than numpy scalars
eta = float(best_row["param_learning_rate"])
n_estimators = int(best_row["param_n_estimators"])
colsample_bytree = float(best_row["param_colsample_bytree"])

    Unnamed: 0  mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
33          33      24.466784      0.326777         0.319741        0.020982   

    param_colsample_bytree  param_learning_rate  param_n_estimators  \
33                     0.5                  0.1                1000   

                                               params  split0_test_score  ...  \
33  {'colsample_bytree': 0.5, 'learning_rate': 0.1...          -0.689317  ...   

    mean_test_score  std_test_score  rank_test_score  split0_train_score  \
33        -0.702937        0.017051                1                -0.0   

    split1_train_score  split2_train_score  split3_train_score  \
33                -0.0                -0.0                -0.0   

    split4_train_score  mean_train_score  std_train_score  
33                -0.0               0.0              0.0  

[1 rows x 24 columns]


In [51]:
# show the selected learning rate, number of trees and column-sampling ratio
tuned_values = (eta, n_estimators, colsample_bytree)
print(*tuned_values)

0.1 1000 0.5


In [None]:
# TODO: graph the mean CV score (negative MSE) by each hyperparameter to visualize

# TODO: could even use plotly for a 3D plot

# TODO: feature importance in training data

# Training Model Performance

In [59]:
# build the final classifier from the best hyperparameters found by the grid
# search; it is fitted and cross-validated in the next cells to obtain
# metrics beyond the search's own score
tuned_params = {
    "n_estimators": n_estimators,
    "learning_rate": eta,
    "colsample_bytree": colsample_bytree,
}
clf = xgb.XGBClassifier(random_state=2001, **tuned_params)

In [60]:
# fit classifier to the SMOTE-balanced training data
# (y_smote holds the zero-based encoded labels at this point)
clf.fit(X_smote, y_smote)

In [61]:
# 5-fold cross-validation of the tuned classifier on the SMOTE-balanced
# training data (not the test set), one score per fold
# cross_val KFold object created earlier
# NOTE(review): no `scoring` is passed, so the estimator's default score is
# reported — presumably accuracy for a classifier; confirm
train_scores = cross_val_score(clf, X_smote, y_smote, cv=cross_val)

In [62]:
# per-fold scores from the cross-validation on the training data
train_scores

array([0.71052632, 0.69520817, 0.69520817, 0.71198428, 0.70294695])

# Test Model Performance

In [91]:
# compare predicted values to actual y_test
# the model predicts the zero-based codes it was trained on ...
y_pred = clf.predict(X_test)
# ... so reverse the transformation xgboost required to get the original labels
y_pred = le.inverse_transform(y_pred)
# could do in one line, but it's harder to read: le.inverse_transform(clf.predict(X_test))

# confusion matrix: rows are the true labels, columns the predicted labels
cm = confusion_matrix(y_test, y_pred)
print(cm)

# micro-averaged F1, computed on the same decoded predictions (`y_pred`) as
# the other metrics; the misspelled name `ypred` is not defined in this
# notebook and raises a NameError on a fresh kernel
f1 = f1_score(y_test, y_pred, average="micro")
print(f1)

# RMSE treats the class labels as numbers, so it also reflects how far a
# wrong prediction is from the true label
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(rmse)

[[182  86  24  45]
 [ 67 312  93 110]
 [ 22 139 189 172]
 [ 41  87  96 572]]
0.5610192221725525
1.03557636972049


In [None]:
# TODO: feature importance in test data