## Syria-Tel Classification Modeling

## Business Understanding

Telecom company.  Trying to come up with a model that predicts churn based on customer location and usage data. 

## Data Understanding

#### Imports

In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer,  make_column_selector as selector
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import plot_confusion_matrix, recall_score,\
    accuracy_score, precision_score, f1_score

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImPipeline

from pandas.api.types import is_numeric_dtype
from sklearn.svm import SVC

from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

#### Functions and Constants

In [81]:
test_size = .35

def cross_val_and_print_results_on_model(model_name,pipe, X_tr, y_tr, X_te, y_te):
    results = cross_val_score(pipe, X_tr, y_tr, cv=10)
    y_hat = pipe.fit(X_tr, y_tr).predict(X_te)
    print(model_name + ":")
    print(f"CV score on training data (mean): {round(np.mean(results), 4)}")
    print(f"Results on test data:             {round(accuracy_score(y_te, y_hat), 4)}")
    
    
def grid_search_and_print_results(params, pipe, X_tr, y_tr, X_te, y_te):
    gs = GridSearchCV(estimator=pipe,param_grid=params, cv=5, verbose=1)
    gs.fit(X_tr, y_tr)
    print("Results:")
    print("- Best params: ", gs.best_params_)
    print("- Best score: ", gs.best_score_)
    print("- Test data score: ", gs.best_estimator_.score(X_te, y_te))

#### Importing data

In [4]:
df = pd.read_csv('data/bigml_59c28831336c6604c800002a.csv')
df.head()

Unnamed: 0,state,account length,area code,phone number,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,...,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls,churn
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,...,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,375-9999,yes,no,0,299.4,71,50.9,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,330-6626,yes,no,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


#### Data exploration / cleaning

Review and fix the feature names to be more code friendly. 

In [5]:
# View column names
df.columns

Index(['state', 'account length', 'area code', 'phone number',
       'international plan', 'voice mail plan', 'number vmail messages',
       'total day minutes', 'total day calls', 'total day charge',
       'total eve minutes', 'total eve calls', 'total eve charge',
       'total night minutes', 'total night calls', 'total night charge',
       'total intl minutes', 'total intl calls', 'total intl charge',
       'customer service calls', 'churn'],
      dtype='object')

In [6]:
# Rename columns
df.columns = df.columns.str.replace(" ", "_")
df.columns

Index(['state', 'account_length', 'area_code', 'phone_number',
       'international_plan', 'voice_mail_plan', 'number_vmail_messages',
       'total_day_minutes', 'total_day_calls', 'total_day_charge',
       'total_eve_minutes', 'total_eve_calls', 'total_eve_charge',
       'total_night_minutes', 'total_night_calls', 'total_night_charge',
       'total_intl_minutes', 'total_intl_calls', 'total_intl_charge',
       'customer_service_calls', 'churn'],
      dtype='object')

Check out the features we have and if there are any null values.

In [7]:
df

Unnamed: 0,state,account_length,area_code,phone_number,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,...,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,customer_service_calls,churn
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.70,1,False
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.70,1,False
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,...,110,10.30,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,375-9999,yes,no,0,299.4,71,50.90,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,330-6626,yes,no,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3328,AZ,192,415,414-4276,no,yes,36,156.2,77,26.55,...,126,18.32,279.1,83,12.56,9.9,6,2.67,2,False
3329,WV,68,415,370-3271,no,no,0,231.1,57,39.29,...,55,13.04,191.3,123,8.61,9.6,4,2.59,3,False
3330,RI,28,510,328-8230,no,no,0,180.8,109,30.74,...,58,24.55,191.9,91,8.64,14.1,6,3.81,2,False
3331,CT,184,510,364-6381,yes,no,0,213.8,105,36.35,...,84,13.57,139.2,137,6.26,5.0,10,1.35,2,False


Check out the values in each feature to see what type of data each really is.

In [8]:
for x in df.columns:
    print(df[x].value_counts())        

WV    106
MN     84
NY     83
AL     80
OH     78
WI     78
OR     78
VA     77
WY     77
CT     74
ID     73
VT     73
MI     73
TX     72
UT     72
IN     71
MD     70
KS     70
MT     68
NC     68
NJ     68
CO     66
WA     66
NV     66
RI     65
MA     65
MS     65
AZ     64
MO     63
FL     63
ME     62
ND     62
NM     62
OK     61
DE     61
NE     61
SD     60
SC     60
KY     59
IL     58
NH     56
AR     55
GA     54
DC     54
TN     53
HI     53
AK     52
LA     51
PA     45
IA     44
CA     34
Name: state, dtype: int64
105    43
87     42
93     40
101    40
90     39
       ..
191     1
199     1
215     1
221     1
2       1
Name: account_length, Length: 212, dtype: int64
415    1655
510     840
408     838
Name: area_code, dtype: int64
404-6337    1
351-4616    1
389-7012    1
408-2119    1
362-5579    1
           ..
413-4831    1
412-2520    1
375-5547    1
395-6030    1
349-3005    1
Name: phone_number, Length: 3333, dtype: int64
no     3010
yes     323
Name: internati

In [9]:
df.describe()

Unnamed: 0,account_length,area_code,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,customer_service_calls
count,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0
mean,101.064806,437.182418,8.09901,179.775098,100.435644,30.562307,200.980348,100.114311,17.08354,200.872037,100.107711,9.039325,10.237294,4.479448,2.764581,1.562856
std,39.822106,42.37129,13.688365,54.467389,20.069084,9.259435,50.713844,19.922625,4.310668,50.573847,19.568609,2.275873,2.79184,2.461214,0.753773,1.315491
min,1.0,408.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23.2,33.0,1.04,0.0,0.0,0.0,0.0
25%,74.0,408.0,0.0,143.7,87.0,24.43,166.6,87.0,14.16,167.0,87.0,7.52,8.5,3.0,2.3,1.0
50%,101.0,415.0,0.0,179.4,101.0,30.5,201.4,100.0,17.12,201.2,100.0,9.05,10.3,4.0,2.78,1.0
75%,127.0,510.0,20.0,216.4,114.0,36.79,235.3,114.0,20.0,235.3,113.0,10.59,12.1,6.0,3.27,2.0
max,243.0,510.0,51.0,350.8,165.0,59.64,363.7,170.0,30.91,395.0,175.0,17.77,20.0,20.0,5.4,9.0


In [10]:
df.isna().sum()

state                     0
account_length            0
area_code                 0
phone_number              0
international_plan        0
voice_mail_plan           0
number_vmail_messages     0
total_day_minutes         0
total_day_calls           0
total_day_charge          0
total_eve_minutes         0
total_eve_calls           0
total_eve_charge          0
total_night_minutes       0
total_night_calls         0
total_night_charge        0
total_intl_minutes        0
total_intl_calls          0
total_intl_charge         0
customer_service_calls    0
churn                     0
dtype: int64

## Data Preparation

#### Figure out feature types

In [11]:
X_numeric_cols = ['account_length','number_vmail_messages',
       'total_day_minutes', 'total_day_calls', 'total_day_charge',
       'total_eve_minutes', 'total_eve_calls', 'total_eve_charge',
       'total_night_minutes', 'total_night_calls', 'total_night_charge',
       'total_intl_minutes', 'total_intl_calls', 'total_intl_charge',
       'customer_service_calls']
X_categorical_cols = ['state', 'area_code','international_plan', 'voice_mail_plan']
X_ignore_cols = ['phone_number']
y_target_col = 'churn'

#### Train-test-split of the data

In [12]:
X = df.drop(X_ignore_cols + [y_target_col], axis=1)
y = df[y_target_col]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=test_size)

In [13]:
df.churn.value_counts(normalize=True)

False    0.855086
True     0.144914
Name: churn, dtype: float64

This seems to be imbalanced.  Plan on using SMOTE to oversample the minority class later on. 

#### Create column transformer pipelines

In [14]:
# Numerical column transformers
num_pipe = Pipeline([('ss',StandardScaler())])

# Categorical column transformers
cat_pipe = Pipeline([
    ('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore'))
])
CT = ColumnTransformer(transformers=[
    ('num_trans', num_pipe, X_numeric_cols),
    ('cat_trans', cat_pipe, X_categorical_cols)
],remainder='passthrough')

## Modeling

### Baseline Model

#### Dummy Classifier

In [33]:
model_name = "Baseline"
model = DummyClassifier(strategy='most_frequent', random_state=42)
pipe = Pipeline([ ('ct',CT), ('model', model) ])
cross_val_and_print_results_on_model(model_name, pipe, X_train, y_train, X_test, y_test)

Baseline:
CV score on training data (mean): 0.8555
Results on test data:             0.8543


### Round 1 - Simple Models (default hyperparameters)

#### Logistic Regression Classifier

In [38]:
model_name = "Logistic Regression"
model = LogisticRegression(random_state=42)
pipe = Pipeline([ ('ct',CT), ('model', model) ])
cross_val_and_print_results_on_model(model_name, pipe, X_train, y_train, X_test, y_test)

Logistic Regression:
CV score on training data (mean): 0.8569
Results on test data:             0.8646


Results:
- The Logistic Regression Classifier did not improve the accuracy over the baseline model notably.
- Appears potentially underfit as the test data outperformed the training data.

#### Decision Tree Classifier

In [39]:
model_name = "Decision Tree Classifier"
model = DecisionTreeClassifier(random_state=42)
pipe = Pipeline([ ('ct',CT), ('model', model) ])
cross_val_and_print_results_on_model(model_name, pipe, X_train, y_train, X_test, y_test)

Decision Tree Classifier:
CV score on training data (mean): 0.9031
Results on test data:             0.9186


Results:
- This model improved the accuracy over the baseline model by ~6%.  
- Appears potentially underfit as the test data outperformed the training data.

#### KNearestNeighbors Classifier

In [31]:
model_name = "KNearestNeighbors Classifier"
model = KNeighborsClassifier()
pipe = Pipeline([('ct',CT), ('model', model) ])
cross_val_and_print_results_on_model(model_name, pipe, X_train, y_train, X_test, y_test)

KNearestNeighbors Classifier:
CV score on training data (mean): 0.8818
Results on test data:             0.88


Results:
- The KNN model improved the accuracy over the baseline model by ~3%.
- The model does not appear notably underfit or overfit.

#### Random Forest Classifier

In [30]:
model_name = "Random Forest Classifier"
model = RandomForestClassifier(random_state=42)
pipe = Pipeline([ ('ct',CT), ('model', model) ])
cross_val_and_print_results_on_model(model_name, pipe, X_train, y_train, X_test, y_test)

Random Forest Classifier:
CV score on training data (mean): 0.9381
Results on test data:             0.9383


Results:
- The Random Forest model improved the accuracy over the baseline model on test data by ~9%.  
- The model does not appear notably underfit or overfit.
- This is a top performing model.

#### GuassianNB Classifier

In [29]:
model_name = "GuassianNB Classifier"
model = GaussianNB()
pipe = Pipeline([ ('ct',CT), ('model', model) ])
cross_val_and_print_results_on_model(model_name, pipe, X_train, y_train, X_test, y_test)

GuassianNB Classifier:
CV score on training data (mean): 0.5527
Results on test data:             0.5758


Results:
- The Guassian NB model did not perform well. 

#### SVC Classifier

In [34]:
model_name = "SVC Classifier"
model = SVC(random_state=42)
pipe = Pipeline([ ('ct',CT), ('model', model) ])
cross_val_and_print_results_on_model(model_name, pipe, X_train, y_train, X_test, y_test)

SVC Classifier:
CV score on training data (mean): 0.9017
Results on test data:             0.9109


Results:
- The SVC model improved the accuracy over the baseline model on test data by 4-5%.  
- Appears potentially underfit as the test data outperformed the training data.

#### Gradient Booster Classifier

In [35]:
model_name = "Gradient Booster"
model = GradientBoostingClassifier(random_state=42)
pipe = Pipeline([ ('ct',CT), ('model', model) ])
cross_val_and_print_results_on_model(model_name, pipe, X_train, y_train, X_test, y_test)

Gradient Booster:
CV score on training data (mean): 0.9451
Results on test data:             0.952


Results:
- The Gradient Boosting Classifier improved the accuracy over the baseline model on test data by ~9%.  
- Appears potentially underfit as the test data outperformed the training data.
- This is a top performing model.

#### XGBoost Classifier

In [40]:
encoder = LabelEncoder()
y_train_xgb = pd.Series(encoder.fit_transform(y_train))
y_test_xgb = pd.Series(encoder.transform(y_test))

model_name = "XGBoost Classifier"
model = XGBClassifier(random_state=42)
pipe = Pipeline([ ('ct',CT), ('model', model) ])
cross_val_and_print_results_on_model(model_name, pipe, X_train, y_train_xgb, X_test, y_test_xgb)

XGBoost Classifier:
CV score on training data (mean): 0.9529
Results on test data:             0.9529


Results:
- The XGBoost Classifier improved the accuracy over the baseline model on test data by ~10%.  
- It does not appear to be notably underfit or overfit. 
- This is a top performing model.

#### Conclusion from Round 1
- The XGBoost, Gradient Booster, and Random Forest classifiers performed the best on test data. 

### Round 2 - Simple Models with Balanced Data

As we can see below, the target classes are pretty unbalanced (86%/14%).  I'd like to balance these using SMOTE to see if we can get any better results with the simple models. 

In [23]:
y_train.value_counts()

False    1853
True      313
Name: churn, dtype: int64

In [24]:
y_train.value_counts(normalize=True)

False    0.855494
True     0.144506
Name: churn, dtype: float64

#### Baseline Model

In [51]:
model_name = "Baseline - Original"
model = DummyClassifier(strategy='most_frequent', random_state=42)
pipe = Pipeline([ ('ct',CT), ('model', model) ])
cross_val_and_print_results_on_model(model_name, pipe, X_train, y_train, X_test, y_test)

print("\n")
model_name = "Baseline - Balanced"
model = DummyClassifier(strategy='most_frequent', random_state=42)
pipe = ImPipeline([ ('ct',CT), ('sm1', SMOTE()), ('model', model) ])
cross_val_and_print_results_on_model(model_name, pipe, X_train, y_train, X_test, y_test)

Baseline - Original:
CV score on training data (mean): 0.8555
Results on test data:             0.8543


Baseline - Balanced:
CV score on training data (mean): 0.8555
Results on test data:             0.8543


Results:
- Balancing the data did not change the results on the baseline model.  I'm not certain of the reason.  Moving on to see if balancing the data changes the results of the other models.  

#### Logistic Regression Classifier

In [52]:
model_name = "Logistic Regression - Original"
model = LogisticRegression(random_state=42, max_iter=200)
pipe = Pipeline([ ('ct',CT), ('model', model) ])
cross_val_and_print_results_on_model(model_name, pipe, X_train, y_train, X_test, y_test)

print("\n")
model_name = "Logistic Regression - Balanced"
model = LogisticRegression(random_state=42, max_iter=200)
pipe = ImPipeline([ ('ct',CT), ('sm1', SMOTE()), ('model', model) ])
cross_val_and_print_results_on_model(model_name, pipe, X_train, y_train, X_test, y_test)

Logistic Regression - Original:
CV score on training data (mean): 0.8569
Results on test data:             0.8646


Logistic Regression - Balanced:
CV score on training data (mean): 0.7558
Results on test data:             0.7592


Results:  
- Balancing the data decreased the accuracy of the model on test data.   

#### Decision Tree Classifier

In [54]:
model_name = "Decision Tree Classifier - Original"
model = DecisionTreeClassifier(random_state=42)
pipe = Pipeline([ ('ct',CT), ('model', model) ])
cross_val_and_print_results_on_model(model_name, pipe, X_train, y_train, X_test, y_test)

print("\n")
model_name = "Decision Tree Classifier - Balanced"
model = DecisionTreeClassifier(random_state=42)
pipe = ImPipeline([ ('ct',CT), ('sm1', SMOTE()), ('model', model) ])
cross_val_and_print_results_on_model(model_name, pipe, X_train, y_train, X_test, y_test)

Decision Tree Classifier - Original:
CV score on training data (mean): 0.9031
Results on test data:             0.9186


Decision Tree Classifier - Balanced:
CV score on training data (mean): 0.8842
Results on test data:             0.9229


Results:
- Balancing the data slightly improved accuracy of the model on test data.   

#### KNearestNeighbors Classifier

In [56]:
model_name = "KNearestNeighbors Classifier - Original"
model = KNeighborsClassifier()
pipe = Pipeline([('ct',CT), ('model', model) ])
cross_val_and_print_results_on_model(model_name, pipe, X_train, y_train, X_test, y_test)

print("\n")
model_name = "KNearestNeighbors Classifier - Balanced"
model = KNeighborsClassifier()
pipe = ImPipeline([ ('ct',CT), ('sm1', SMOTE()), ('model', model) ])
cross_val_and_print_results_on_model(model_name, pipe, X_train, y_train, X_test, y_test)

KNearestNeighbors Classifier - Original:
CV score on training data (mean): 0.8818
Results on test data:             0.88


KNearestNeighbors Classifier - Balanced:
CV score on training data (mean): 0.7429
Results on test data:             0.7515


Results:
- Balancing the data decreased the accuracy of the model on test data.  

#### Random Forest Classifier

In [57]:
model_name = "Random Forest Classifier - Original"
model = RandomForestClassifier(random_state=42)
pipe = Pipeline([ ('ct',CT), ('model', model) ])
cross_val_and_print_results_on_model(model_name, pipe, X_train, y_train, X_test, y_test)

print("\n")
model_name = "Random Forest Classifier - Balanced"
model = RandomForestClassifier(random_state=42)
pipe = ImPipeline([ ('ct',CT), ('sm1', SMOTE()), ('model', model) ])
cross_val_and_print_results_on_model(model_name, pipe, X_train, y_train, X_test, y_test)

Random Forest Classifier - Original:
CV score on training data (mean): 0.9381
Results on test data:             0.9383


Random Forest Classifier - Balanced:
CV score on training data (mean): 0.9312
Results on test data:             0.9392


Results:
- Balancing the data did not notably increase the accuracy of the model on test data.  

#### GuassianNB Classifier

In [30]:
model_name = "GuassianNB Classifier - Original"
model = GaussianNB()
pipe = Pipeline([ ('ct',CT), ('model', model) ])
cross_val_and_print_results_on_model(model_name, pipe, X_train, y_train, X_test, y_test)


model_name = "GuassianNB Classifier - Balanced"
model = GaussianNB()
pipe = ImPipeline([ ('ct',CT), ('sm1', SMOTE()), ('model', model) ])
cross_val_and_print_results_on_model(model_name, pipe, X_train, y_train, X_test, y_test)

GuassianNB Classifier - Unbalanced:
CV score on training data (mean): 0.5527
Results on test data:             0.5758


GuassianNB Classifier - Balanced:
CV score on training data (mean): 0.5513
Results on test data:             0.5527


Results:
- Balancing the data decreased the accuracy of the model on test data.   

#### SVC Classifier

In [59]:
model_name = "SVC Classifier - Original"
model = SVC(random_state=42)
pipe = Pipeline([ ('ct',CT), ('model', model) ])
cross_val_and_print_results_on_model(model_name, pipe, X_train, y_train, X_test, y_test)

print("\n")
model_name = "SVC Classifier - Balanced"
model = SVC(random_state=42)
pipe = ImPipeline([ ('ct',CT), ('sm1', SMOTE()), ('model', model) ])
cross_val_and_print_results_on_model(model_name, pipe, X_train, y_train, X_test, y_test)

SVC Classifier - Original:
CV score on training data (mean): 0.9017
Results on test data:             0.9109


SVC Classifier - Balanced:
CV score on training data (mean): 0.9017
Results on test data:             0.9109


Results:
- Balancing the data decreased the accuracy of the model on test data.  

#### Gradient Booster Classifier

In [61]:
model_name = "Gradient Booster - Original"
model = GradientBoostingClassifier(random_state=42)
pipe = Pipeline([ ('ct',CT), ('model', model) ])
cross_val_and_print_results_on_model(model_name, pipe, X_train, y_train, X_test, y_test)

print("\n")
model_name = "Gradient Booster - Balanced"
model = GradientBoostingClassifier(random_state=42)
pipe = ImPipeline([ ('ct',CT), ('sm1', SMOTE()), ('model', model) ])
cross_val_and_print_results_on_model(model_name, pipe, X_train, y_train, X_test, y_test)

Gradient Booster - Original:
CV score on training data (mean): 0.9451
Results on test data:             0.952


Gradient Booster - Balanced:
CV score on training data (mean): 0.928
Results on test data:             0.94


Results:
- Balancing the data decreased the accuracy of the model on test data.

#### XGBoost Classifier

In [33]:
encoder = LabelEncoder()
y_train_xgb = pd.Series(encoder.fit_transform(y_train))
y_test_xgb = pd.Series(encoder.transform(y_test))

model_name = "XGBoost Classifier - Original"
model = XGBClassifier(random_state=42)
pipe = Pipeline([ ('ct',CT), ('model', model) ])
cross_val_and_print_results_on_model(model_name, pipe, X_train, y_train_xgb, X_test, y_test_xgb)

print("\n")
model_name = "XGBoost Classifier - Balanced"
model = XGBClassifier(random_state=42)
pipe = ImPipeline([ ('ct',CT), ('sm1', SMOTE()), ('model', model) ])
cross_val_and_print_results_on_model(model_name, pipe, X_train, y_train_xgb, X_test, y_test_xgb)

XGBoost Classifier - Unbalanced:
CV score on training data (mean): 0.9529
Results on test data:             0.9529


XGBoost Classifier - Balanced:
CV score on training data (mean): 0.9511
Results on test data:             0.9572


Results:
- Balancing the data slightly improved the accuracy of the model on test data.  

#### Conclusion from Round 2
- Using SMOTE to balance the data did not notably improve any of the models performance, and in fact it decreased the performance of most of the models.  It appears that using data as is (86/14 split) is not that bad for the models.  This would align with the 90/10 split threshold discussed during our class.  As such, I will not use SMOTE going forward in the next round of modeling.  

### Round 3 - Hyperparameter Tuning the 3 Best Performing Models

#### XGBoost Classifier

**Create the model and pipeline**

In [99]:
# Transform y-train and y-test based using LabelEncoder
# This is specific to XGBoost
encoder = LabelEncoder()
y_train_xgb = pd.Series(encoder.fit_transform(y_train))
y_test_xgb = pd.Series(encoder.transform(y_test))

model_name = "XGBoost Classifier"
model = XGBClassifier(random_state=42)
pipe = Pipeline([('ct',CT),
                 ('model', model) 
])

**Round 1 of Gridsearching on XGBoost Classifer (try parameters to the left and right of defaults)**

In [100]:
parameters = {'model__learning_rate':[.1, .3, .5],          # default is 0.3 
              'model__max_depth': [4, 6, 8],                # default is 6    
              'model__min_child_weight': [0, 1, 2],         # default is 1    
              'model__subsample': [.75, 1.0]                # default is 1   
             }
grid_search_and_print_results(parameters, pipe, X_train, y_train_xgb, X_test, y_test_xgb)

Fitting 5 folds for each of 54 candidates, totalling 270 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 270 out of 270 | elapsed:   48.1s finished


Results:
- Best params:  {'model__learning_rate': 0.5, 'model__max_depth': 6, 'model__min_child_weight': 0, 'model__subsample': 1.0}
- Best score:  0.9547588893264226
- Test data score:  0.9562982005141388


Results:
- Best params:  {'model__learning_rate': 0.5, 'model__max_depth': 6, 'model__min_child_weight': 0, 'model__subsample': 1.0}
- Best score:  0.9547588893264226
- Test data score:  0.9562982005141388

**Round 2 of Gridsearching on XGBoost Classifer (use results of last gridsearch to narrow search)**

In [101]:
parameters = {'model__learning_rate':[.4, .5, .6],            
              'model__max_depth': [5, 6, 7],                 
              'model__min_child_weight': [0, .25, .5],      
              'model__subsample': [.8, .9, 1.0]                 
             }
grid_search_and_print_results(parameters, pipe, X_train, y_train_xgb, X_test, y_test_xgb)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 405 out of 405 | elapsed:  1.5min finished


Results:
- Best params:  {'model__learning_rate': 0.4, 'model__max_depth': 7, 'model__min_child_weight': 0, 'model__subsample': 1.0}
- Best score:  0.9561392492629921
- Test data score:  0.9545844044558698


Results:
- Best params:  {'model__learning_rate': 0.4, 'model__max_depth': 7, 'model__min_child_weight': 0, 'model__subsample': 1.0}
- Best score:  0.9561392492629921
- Test data score:  0.9545844044558698

**Round 3 of Gridsearching on XGBoost Classifer (use results of last gridsearch to narrow search)**

In [103]:
parameters = {'model__learning_rate':[.3, .4, .5],            
              'model__max_depth': [6, 7, 8],                 
              'model__min_child_weight': [0],      
              'model__subsample': [.9, .95]                 
             }
grid_search_and_print_results(parameters, pipe, X_train, y_train_xgb, X_test, y_test_xgb)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed:   20.1s finished


Results:
- Best params:  {'model__learning_rate': 0.4, 'model__max_depth': 7, 'model__min_child_weight': 0, 'model__subsample': 0.95}
- Best score:  0.9538340375262078
- Test data score:  0.9528706083976007


Results:
- Best params:  {'model__learning_rate': 0.4, 'model__max_depth': 7, 'model__min_child_weight': 0, 'model__subsample': 0.95}
- Best score:  0.9538340375262078
- Test data score:  0.9528706083976007

#### Gradient Booster Classifier

**Create the model and pipeline**

In [91]:
model_name = "Gradient Booster"
model = GradientBoostingClassifier(random_state=42)
pipe = Pipeline([ ('ct', CT), ('model', model) ])

**Round 1 of Gridsearching on Gradient Booster Classifier (try parameters to the left and right of defaults)**

In [93]:
parameters = {'model__n_estimators':[100, 200],                      # default is 100
              'model__learning_rate':[.1, .3],                       # default is 0.1 
              'model__max_depth': [5, 7],                            # default is 3 
              'model__subsample': [.8, 1.0]                          # default is 1   
             }

grid_search_and_print_results(parameters, pipe, X_train, y_train, X_test, y_test)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:  2.0min finished


Results:
- Best params:  {'model__learning_rate': 0.1, 'model__max_depth': 5, 'model__n_estimators': 200, 'model__subsample': 0.8}
- Best score:  0.9547556965123828
- Test data score:  0.9511568123393316


Results:
- Best params:  {'model__learning_rate': 0.4, 'model__max_depth': 7, 'model__min_child_weight': 0, 'model__subsample': 0.95}
- Best score:  0.9538340375262078
- Test data score:  0.9528706083976007

**Round 2 of Gridsearching on Gradient Booster Classifier (adjusted from prior gridsearch)**

In [94]:
parameters = {'model__n_estimators':[150, 200, 250],                      # default is 100
              'model__learning_rate':[.1, .2],                       # default is 0.1 
              'model__max_depth': [4, 5, 6],                            # default is 3 
              'model__subsample': [.7, .8, .9]                          # default is 1   
             }

grid_search_and_print_results(parameters, pipe, X_train, y_train, X_test, y_test)

Fitting 5 folds for each of 54 candidates, totalling 270 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 270 out of 270 | elapsed:  6.7min finished


Results:
- Best params:  {'model__learning_rate': 0.1, 'model__max_depth': 5, 'model__n_estimators': 200, 'model__subsample': 0.9}
- Best score:  0.9552197188195102
- Test data score:  0.9554413024850043


Results:
- Best params:  {'model__learning_rate': 0.1, 'model__max_depth': 5, 'model__n_estimators': 200, 'model__subsample': 0.9}
- Best score:  0.9552197188195102
- Test data score:  0.9554413024850043

#### Random Forest Classifier

**Create the model and pipeline**

In [104]:
model_name = "Random Forest Classifier"
model = RandomForestClassifier(random_state=42)
pipe = Pipeline([('ct',CT),
                 ('model', model) 
])

**Round 1 of Gridsearching on Random Forest Classifier (try parameters to the left and right of defaults)**

In [105]:
parameters = {'model__n_estimators':[50, 100, 200],                       # default is 100
              'model__max_depth': [None, 3, 6, 9],                        # default is None
              'model__criterion': ['gini', 'entropy'],                    # default is gini
              'model__min_samples_split': [1.0, 3, 5]                     # default is 1
             }
grid_search_and_print_results(parameters, pipe, X_train, y_train, X_test, y_test)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 360 out of 360 | elapsed:  1.3min finished


Results:
- Best params:  {'model__criterion': 'gini', 'model__max_depth': None, 'model__min_samples_split': 5, 'model__n_estimators': 100}
- Best score:  0.935827630612701
- Test data score:  0.9451585261353899


Results:
- Best params:  {'model__criterion': 'gini', 'model__max_depth': None, 'model__min_samples_split': 5, 'model__n_estimators': 100}
- Best score:  0.935827630612701
- Test data score:  0.9451585261353899

#### Conclusion from Round 3
- XGBoost was the model with the best accuracy on test data using gridsearching.

### Best Model

#### XGBoost (Best Hyperparameters from Gridsearch)

In [108]:
# Transform y-train and y-test based using LabelEncoder
# This is specific to XGBoost
encoder = LabelEncoder()
y_train_xgb = pd.Series(encoder.fit_transform(y_train))
y_test_xgb = pd.Series(encoder.transform(y_test))

model_name = "XGBoost Classifier"
model = XGBClassifier(random_state=42)
pipe = Pipeline([('ct',CT),
                 ('model', model) 
])

parameters = {'model__learning_rate':[.5],          
              'model__max_depth': [6],                 
              'model__min_child_weight': [0],        
              'model__subsample': [1.0]}           
              
grid_search_and_print_results(parameters, pipe, X_train, y_train_xgb, X_test, y_test_xgb)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.8s finished


Results:
- Best params:  {'model__learning_rate': 0.5, 'model__max_depth': 6, 'model__min_child_weight': 0, 'model__subsample': 1.0}
- Best score:  0.9547588893264226
- Test data score:  0.9562982005141388


## Evaluation

## Conclusion