# Split Data into Training and Test

In [1]:
!pip install imblearn
!pip install xgboost

[0mCollecting xgboost
  Using cached xgboost-2.0.3-py3-none-manylinux2014_x86_64.whl.metadata (2.0 kB)
Using cached xgboost-2.0.3-py3-none-manylinux2014_x86_64.whl (297.1 MB)
[0mInstalling collected packages: xgboost
Successfully installed xgboost-2.0.3


In [20]:
# load dependencies
import pandas as pd # for importing and handling data
import numpy as np # for working with arrays
from sklearn.model_selection import train_test_split, RandomizedSearchCV # for splitting data and grid search
from sklearn.preprocessing import LabelEncoder # for processing data for xgboost
from imblearn.over_sampling import SMOTE # for smote oversampling
import xgboost as xgb # for xgboost model

In [2]:
# load data
df = pd.read_csv("/home/jupyter/final_project/pisa_median.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,ST352Q06JA,AGE,ST004D01T,DURECEC,REPEAT,MISSSC,SKIPPING,TARDYSD,EXPECEDU,...,ST353Q07JA,ST353Q08JA,ST348Q01JA,ST348Q02JA,ST348Q03JA,ST348Q04JA,ST348Q05JA,ST348Q06JA,ST348Q07JA,ST348Q08JA
0,1.0,2.0,15.58,1.0,2.0,0.0,0.0,1.0,2.0,7.0,...,2.0,2.0,4.0,4.0,4.0,3.0,4.0,3.0,2.0,2.0
1,2.0,2.0,16.17,2.0,2.0,0.0,1.0,0.0,1.0,7.0,...,4.0,4.0,2.0,3.0,2.0,3.0,4.0,4.0,4.0,4.0
2,3.0,4.0,15.58,2.0,0.0,0.0,0.0,0.0,0.0,9.0,...,2.0,3.0,3.0,3.0,2.0,3.0,3.0,2.0,2.0,2.0
3,4.0,4.0,15.42,2.0,2.0,0.0,0.0,0.0,0.0,4.0,...,1.0,2.0,1.0,3.0,4.0,1.0,4.0,1.0,2.0,1.0
4,5.0,2.0,15.75,2.0,1.0,0.0,0.0,0.0,0.0,8.0,...,2.0,2.0,1.0,3.0,4.0,3.0,4.0,3.0,2.0,2.0


In [3]:
# split into target and features
y = df["ST352Q06JA"].values
X = df.drop(["ST352Q06JA", "Unnamed: 0"], axis=1).values

In [4]:
# split data into training and test
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size = 0.2, # common to use 20-30% of data as test set
                                                    random_state = 2001, # set seed equivalent
                                                    stratify = y) # churn amounts equal in train and test

In [24]:
print(X_train[0:3])
print(y_train[0:3])

[[ 1.5670e+01  2.0000e+00  2.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00
   0.0000e+00  8.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00
   6.0000e+00  1.0000e+01  0.0000e+00  6.0000e+00 -7.1900e-02 -1.2280e+00
   1.1246e+00  1.4750e-01 -7.1000e-03 -3.2030e-01  2.7474e+00 -9.7550e-01
   1.8227e+00  5.2030e-01  2.3650e+00  3.8830e+00  1.1187e+00  4.9584e+00
   1.3749e+00  2.4100e-02  1.3679e+00  3.1637e+00  1.7174e+00  1.7465e+00
   1.0481e+00  1.0000e+00  1.0000e+00  1.0000e+00  2.0000e+00  1.0000e+00
   2.0000e+00  1.0000e+00  3.0000e+00  3.0000e+00  4.0000e+00  3.0000e+00
   4.0000e+00  2.0000e+00  2.0000e+00  1.0000e+00  1.0000e+00  1.0000e+00
   2.0000e+00  2.0000e+00  2.0000e+00  2.0000e+00  3.0000e+00  3.0000e+00
   3.0000e+00  4.0000e+00  4.0000e+00  2.0000e+00  1.0000e+00]
 [ 1.5830e+01  1.0000e+00  4.0000e+00  0.0000e+00  0.0000e+00  1.0000e+00
   0.0000e+00  6.0000e+00  1.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00
   0.0000e+00  1.0000e+01  6.0000e+00  1.0000e+01

# Conduct Oversampling with SMOTE on Training Data

In [5]:
# frequency of observations in training data pre-oversampling
temp = y_train.astype(int)
np.bincount(temp)

array([   0, 1349, 2325, 2088, 3182])

In [5]:
# conduct oversampling on training data using SMOTE
# SMOTE overview here: https://www.youtube.com/watch?v=1Ic7GRtDrPM
smote = SMOTE(random_state = 2001)
X_smote, y_smote = smote.fit_resample(X_train, y_train)

In [6]:
# convert y_smote to integer (needed for xgboost later)
y_smote = y_smote.astype(int)
np.bincount(y_smote)

array([   0, 3182, 3182, 3182, 3182])

In [7]:
# transform data to work with xgboost (encoding needs to start at zero)
le = LabelEncoder()
y_smote = le.fit_transform(y_smote)

# Hyperparameter Tuning and Cross-Validation for xgboost

We want to train hyperparameters for `eta` (learning rate), `n_estimators` (number of estimators), and `max_features` (maximum features considered at each split). Specifically, using the following values:

* `eta`: 0.1, 0.01, 0.001
* `n_estimators`: 50, 100, 250, 500
* `colsample_bytree`: 0.1, 0.5, 0.7, 1.0

Furthermore, we will conduct 5-fold cross-validation for each training model to return an aggregate RMSE value. In total, that yields 3 (`eta`) x 4 (`n_estimators`) x 4 (`colsample_bytree`) x 5 (cross-validation) = 240 estimated models. We will use randomized search for this endeavour.

In [18]:
# EXAMPLE CODE, TO BE DELETED

from sklearn.metrics import confusion_matrix, mean_squared_error as MSE

# test single xgboost
bst = xgb.XGBClassifier(eta=0.1, n_estimators=50, colsample_bytree=0.5)
bst.fit(X_smote, y_smote)

# undo y_smote refactoring
y_pred = bst.predict(X_test)
print(f"ypred before reversing transformation: {y_pred}")
y_pred = le.inverse_transform(y_pred)
print(f"ypred after reversing transformation: {y_pred}")

cm = confusion_matrix(y_test, y_pred)
rmse = np.sqrt(MSE(y_test, y_pred))

print(cm)
print(f"RMSE: {rmse}")

ypred before reversing transformation: [0 3 2 ... 3 3 1]
ypred after reversing transformation: [1 4 3 ... 4 4 2]
[[222  74  14  27]
 [ 80 344  74  84]
 [ 28 162 170 162]
 [ 58 120  97 521]]
RMSE: 1.0353605122153373


In [26]:
# create values for hyperparameter tuning
parameter_grid = {'learning_rate': [0.1, 0.01, 0.001],
             'n_estimators': [50, 100, 250, 500],
             'colsample_bytree': [0.1, 0.5, 0.7, 1.0]}

# for test
#parameter_grid = {'eta': [0.1, 0.01, 0.001],
#                  'n_estimators': [50, 100],
#                  'colsample_bytree': [0.1, 1.0]
#}

# instantiate the regressor
xgboost_class = xgb.XGBClassifier(random_state = 2001)

In [27]:
# create grid search object
# neg_mean_squared_error per https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
grid_xgboost = RandomizedSearchCV(
    estimator = xgboost_class,
    param_distributions = parameter_grid,
    scoring = 'neg_mean_squared_error',
    n_iter = 10,
    cv = 5,
    refit = True,
    return_train_score = True,
    verbose = 1)

# fit grid search object to imputed and oversampled training data
grid_xgboost.fit(X_smote, y_smote)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [35]:
# print values used for search
print(f"\n The learning rate parameters searched:\n {grid_xgboost.cv_results_['param_learning_rate']}")
print(f"\n The number of estimators parameters searched:\n {grid_xgboost.cv_results_['param_n_estimators']}")
print(f"\n The number of features considered at each split, parameters searched:\n {grid_xgboost.cv_results_['param_colsample_bytree']}")

#print(grid_xgboost.cv_results_)

print("\n The best score across ALL searched params:\n", grid_xgboost.best_score_)
print("\n The best parameters across ALL searched params:\n", grid_xgboost.best_params_)


 The learning rate parameters searched:
 [0.1 0.01 0.1 0.1 0.01 0.1 0.01 0.001 0.001 0.01]

 The number of estimators parameters searched:
 [50 250 500 250 100 100 50 250 50 500]

 The number of features considered at each split, parameters searched:
 [1.0 0.1 1.0 0.5 0.5 1.0 1.0 0.7 0.5 1.0]

 The best score across ALL searched params:
 -0.7555364630677652

 The best parameters across ALL searched params:
 {'n_estimators': 500, 'learning_rate': 0.1, 'colsample_bytree': 1.0}


# Testing Model

In [None]:
# get predicted values

# compare predicted values to actual y_test


# Performance Metrics for Training and Test