# Split Data into Training and Test

In [2]:
!pip install imblearn

[0mCollecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Collecting imbalanced-learn (from imblearn)
  Downloading imbalanced_learn-0.12.0-py3-none-any.whl.metadata (8.2 kB)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Downloading imbalanced_learn-0.12.0-py3-none-any.whl (257 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m257.7/257.7 kB[0m [31m46.8 kB/s[0m eta [36m0:00:00[0m00:01[0m
[0mInstalling collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.12.0 imblearn-0.0


In [26]:
# load dependencies
import pandas as pd # for importing and handling data
import numpy as np # for working with arrays
from sklearn.model_selection import train_test_split # for splitting data
from imblearn.over_sampling import SMOTE # for smote oversampling

In [13]:
# load data
df = pd.read_csv("/home/jupyter/final_project/pisa_median.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,ST352Q06JA,AGE,ST004D01T,DURECEC,REPEAT,MISSSC,SKIPPING,TARDYSD,EXPECEDU,...,ST353Q07JA,ST353Q08JA,ST348Q01JA,ST348Q02JA,ST348Q03JA,ST348Q04JA,ST348Q05JA,ST348Q06JA,ST348Q07JA,ST348Q08JA
0,1.0,2.0,15.58,1.0,2.0,0.0,0.0,1.0,2.0,7.0,...,2.0,2.0,4.0,4.0,4.0,3.0,4.0,3.0,2.0,2.0
1,2.0,2.0,16.17,2.0,2.0,0.0,1.0,0.0,1.0,7.0,...,4.0,4.0,2.0,3.0,2.0,3.0,4.0,4.0,4.0,4.0
2,3.0,4.0,15.58,2.0,0.0,0.0,0.0,0.0,0.0,9.0,...,2.0,3.0,3.0,3.0,2.0,3.0,3.0,2.0,2.0,2.0
3,4.0,4.0,15.42,2.0,2.0,0.0,0.0,0.0,0.0,4.0,...,1.0,2.0,1.0,3.0,4.0,1.0,4.0,1.0,2.0,1.0
4,5.0,2.0,15.75,2.0,1.0,0.0,0.0,0.0,0.0,8.0,...,2.0,2.0,1.0,3.0,4.0,3.0,4.0,3.0,2.0,2.0


In [18]:
# split into target and features
y = df["ST352Q06JA"].values
X = df.drop(["ST352Q06JA"], axis=1).values

In [19]:
# split data into training and test
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size = 0.2, # common to use 20-30% of data as test set
                                                    random_state = 2001, # set seed equivalent
                                                    stratify = y) # churn amounts equal in train and test

In [22]:
print(X_train[0:3])
print(y_train[0:3])

[[ 2.0870e+03  1.5670e+01  2.0000e+00  2.0000e+00  0.0000e+00  0.0000e+00
   0.0000e+00  0.0000e+00  8.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00
   0.0000e+00  6.0000e+00  1.0000e+01  0.0000e+00  6.0000e+00 -7.1900e-02
  -1.2280e+00  1.1246e+00  1.4750e-01 -7.1000e-03 -3.2030e-01  2.7474e+00
  -9.7550e-01  1.8227e+00  5.2030e-01  2.3650e+00  3.8830e+00  1.1187e+00
   4.9584e+00  1.3749e+00  2.4100e-02  1.3679e+00  3.1637e+00  1.7174e+00
   1.7465e+00  1.0481e+00  1.0000e+00  1.0000e+00  1.0000e+00  2.0000e+00
   1.0000e+00  2.0000e+00  1.0000e+00  3.0000e+00  3.0000e+00  4.0000e+00
   3.0000e+00  4.0000e+00  2.0000e+00  2.0000e+00  1.0000e+00  1.0000e+00
   1.0000e+00  2.0000e+00  2.0000e+00  2.0000e+00  2.0000e+00  3.0000e+00
   3.0000e+00  3.0000e+00  4.0000e+00  4.0000e+00  2.0000e+00  1.0000e+00]
 [ 6.3190e+03  1.5830e+01  1.0000e+00  4.0000e+00  0.0000e+00  0.0000e+00
   1.0000e+00  0.0000e+00  6.0000e+00  1.0000e+00  0.0000e+00  0.0000e+00
   0.0000e+00  0.0000e+00  1.0000e+01

# Conduct Oversampling with SMOTE on Training Data

In [32]:
# frequency of observations in training data pre-oversampling
temp = y_train.astype(int)
np.bincount(temp)

array([   0, 1349, 2325, 2088, 3182])

In [33]:
# conduct oversampling on training data using SMOTE
# SMOTE overview here: https://www.youtube.com/watch?v=1Ic7GRtDrPM
smote = SMOTE(random_state = 2001)
X_smote, y_smote = smote.fit_resample(X_train, y_train)

In [37]:
temp2 = y_smote.astype(int)
np.bincount(temp2)

array([   0, 3182, 3182, 3182, 3182])

# Hyperparameter Tuning and Cross-Validation for xgboost

# Testing Model

# Performance Metrics for Training and Test