## Syria-Tel Classification Modeling

## Business Understanding

Syria-Tel is a telecom company.  The goal is to build a classification model that predicts customer churn based on customer location and usage data. 

## Data Understanding

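The dataset (`data/bigml_59c28831336c6604c800002a.csv`) contains 3,333 customer records with 20 features (state, account length, area code, phone number, international and voice mail plan flags, number of voice mail messages, minutes/calls/charges for day, evening, night and international usage, and customer service calls) plus the boolean target `churn`.  About 14.5% of the customers churned. 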

## Data Preparation and Modeling

#### Imports

In [99]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from pandas.api.types import is_numeric_dtype

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer,  make_column_selector as selector
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import plot_confusion_matrix, recall_score,\
    accuracy_score, precision_score, f1_score

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImPipeline

from pandas.api.types import is_numeric_dtype
from sklearn.svm import SVC

from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

#### Functions

#### Importing data

In [100]:
# Load the raw customer dataset from the local data folder and preview the
# first five rows.
df = pd.read_csv('data/bigml_59c28831336c6604c800002a.csv')
df.head()

Unnamed: 0,state,account length,area code,phone number,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,...,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls,churn
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,...,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,375-9999,yes,no,0,299.4,71,50.9,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,330-6626,yes,no,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


#### Data preparation

Review and fix the feature names to be more code friendly. 

In [101]:
# Inspect the raw column names; they contain spaces, which the next cell
# replaces with underscores.
df.columns

Index(['state', 'account length', 'area code', 'phone number',
       'international plan', 'voice mail plan', 'number vmail messages',
       'total day minutes', 'total day calls', 'total day charge',
       'total eve minutes', 'total eve calls', 'total eve charge',
       'total night minutes', 'total night calls', 'total night charge',
       'total intl minutes', 'total intl calls', 'total intl charge',
       'customer service calls', 'churn'],
      dtype='object')

In [102]:
# Make the column names code friendly: swap every space for an underscore,
# then show the result.
df.columns = [col_name.replace(" ", "_") for col_name in df.columns]
df.columns

Index(['state', 'account_length', 'area_code', 'phone_number',
       'international_plan', 'voice_mail_plan', 'number_vmail_messages',
       'total_day_minutes', 'total_day_calls', 'total_day_charge',
       'total_eve_minutes', 'total_eve_calls', 'total_eve_charge',
       'total_night_minutes', 'total_night_calls', 'total_night_charge',
       'total_intl_minutes', 'total_intl_calls', 'total_intl_charge',
       'customer_service_calls', 'churn'],
      dtype='object')

#### Data Cleaning

Check out the features we have and if there are any null values.

In [103]:
# Count missing values per predictor column. This works on `df` directly
# because X_train is only created later, in the train-test-split cell, so
# referencing it here raised a NameError on a fresh top-to-bottom run.
# phone_number and churn are left out so the report covers the same
# predictor columns as before.
df.drop(columns=['phone_number', 'churn']).isna().sum()

state                     0
account_length            0
area_code                 0
international_plan        0
voice_mail_plan           0
number_vmail_messages     0
total_day_minutes         0
total_day_calls           0
total_day_charge          0
total_eve_minutes         0
total_eve_calls           0
total_eve_charge          0
total_night_minutes       0
total_night_calls         0
total_night_charge        0
total_intl_minutes        0
total_intl_calls          0
total_intl_charge         0
customer_service_calls    0
dtype: int64

In [104]:
# Display the frame (first and last rows) with the renamed columns.
df

Unnamed: 0,state,account_length,area_code,phone_number,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,...,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,customer_service_calls,churn
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.70,1,False
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.70,1,False
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,...,110,10.30,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,375-9999,yes,no,0,299.4,71,50.90,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,330-6626,yes,no,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3328,AZ,192,415,414-4276,no,yes,36,156.2,77,26.55,...,126,18.32,279.1,83,12.56,9.9,6,2.67,2,False
3329,WV,68,415,370-3271,no,no,0,231.1,57,39.29,...,55,13.04,191.3,123,8.61,9.6,4,2.59,3,False
3330,RI,28,510,328-8230,no,no,0,180.8,109,30.74,...,58,24.55,191.9,91,8.64,14.1,6,3.81,2,False
3331,CT,184,510,364-6381,yes,no,0,213.8,105,36.35,...,84,13.57,139.2,137,6.26,5.0,10,1.35,2,False


Check out the values in each feature

In [105]:
# Per-column overview: value counts for every inspected column, plus
# min/max/mean for the continuous ones.
#
# The column groups are derived from `df` itself so the cell runs on a fresh
# kernel; it previously relied on X_ignore_cols and X_numeric_cols, which are
# only defined in a later cell.
skipped_cols = {'phone_number'}  # the column the notebook excludes from the feature set
# Only numeric dtypes are selected (the True/False `churn` target is not).
# area_code is stored as an integer but is treated as a category, so it gets
# value counts only.
summarised_cols = set(df.select_dtypes(include='number').columns) - {'area_code'}

for col in df.columns:
    if col in skipped_cols:
        continue
    print("\n", col)
    if col in summarised_cols:
        print("min:", df[col].min())
        print("max:", df[col].max())
        print("mean:", df[col].mean())
    print(df[col].value_counts())
            


 state
WV    106
MN     84
NY     83
AL     80
OR     78
OH     78
WI     78
VA     77
WY     77
CT     74
MI     73
VT     73
ID     73
UT     72
TX     72
IN     71
KS     70
MD     70
NJ     68
MT     68
NC     68
CO     66
NV     66
WA     66
MS     65
RI     65
MA     65
AZ     64
MO     63
FL     63
NM     62
ME     62
ND     62
DE     61
OK     61
NE     61
SD     60
SC     60
KY     59
IL     58
NH     56
AR     55
GA     54
DC     54
TN     53
HI     53
AK     52
LA     51
PA     45
IA     44
CA     34
Name: state, dtype: int64

 account_length
min: 1
max: 243
mean: 101.06480648064806
105    43
87     42
93     40
101    40
90     39
       ..
191     1
199     1
215     1
221     1
2       1
Name: account_length, Length: 212, dtype: int64

 area_code
415    1655
510     840
408     838
Name: area_code, dtype: int64

 international_plan
no     3010
yes     323
Name: international_plan, dtype: int64

 voice_mail_plan
no     2411
yes     922
Name: voice_mail_plan, dtype: int64


In [106]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric
# columns.
df.describe()

Unnamed: 0,account_length,area_code,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,customer_service_calls
count,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0
mean,101.064806,437.182418,8.09901,179.775098,100.435644,30.562307,200.980348,100.114311,17.08354,200.872037,100.107711,9.039325,10.237294,4.479448,2.764581,1.562856
std,39.822106,42.37129,13.688365,54.467389,20.069084,9.259435,50.713844,19.922625,4.310668,50.573847,19.568609,2.275873,2.79184,2.461214,0.753773,1.315491
min,1.0,408.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23.2,33.0,1.04,0.0,0.0,0.0,0.0
25%,74.0,408.0,0.0,143.7,87.0,24.43,166.6,87.0,14.16,167.0,87.0,7.52,8.5,3.0,2.3,1.0
50%,101.0,415.0,0.0,179.4,101.0,30.5,201.4,100.0,17.12,201.2,100.0,9.05,10.3,4.0,2.78,1.0
75%,127.0,510.0,20.0,216.4,114.0,36.79,235.3,114.0,20.0,235.3,113.0,10.59,12.1,6.0,3.27,2.0
max,243.0,510.0,51.0,350.8,165.0,59.64,363.7,170.0,30.91,395.0,175.0,17.77,20.0,20.0,5.4,9.0


- Separate X (predictor) and y (target) column names.  
- Distinguish between numeric and categorical X columns.

In [110]:
# Column roles used by the train-test split and the column transformer.

# The usage columns all follow the pattern total_<period>_<measure>.
_usage_periods = ('day', 'eve', 'night', 'intl')
_usage_measures = ('minutes', 'calls', 'charge')

# Continuous predictors.
X_numeric_cols = (
    ['account_length', 'number_vmail_messages']
    + [f'total_{period}_{measure}'
       for period in _usage_periods
       for measure in _usage_measures]
    + ['customer_service_calls']
)

# Predictors handled as categories (area_code is numeric in dtype only).
X_categorical_cols = [
    'state',
    'area_code',
    'international_plan',
    'voice_mail_plan',
]

# Columns dropped before modeling, and the target column.
X_ignore_cols = ['phone_number']
y_target_col = 'churn'

#### Train-test-split of the data

In [117]:
# Target vector, and predictors = every column except the ignored ones and
# the target itself.
y = df[y_target_col]
X = df.drop(columns=X_ignore_cols + [y_target_col])

# Hold out 25% of the rows for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.25, random_state=42
)

In [118]:
# Class balance of the target, as a proportion of all rows.
df.churn.value_counts(normalize=True)

False    0.855086
True     0.144914
Name: churn, dtype: float64

This seems to be imbalanced.  Plan on using SMOTE to oversample the minority class later on. 

#### Modeling

#### Create Column Transformer Pipelines

In [119]:
# Show which columns feed each branch of the transformer.
print(X_numeric_cols, '\n')
print(X_categorical_cols)

# Numeric branch: standardise every feature.
num_pipe = Pipeline(steps=[('ss', StandardScaler())])

# Categorical branch: dense one-hot encoding; categories not seen during fit
# are ignored rather than raising an error.
# A most_frequent SimpleImputer step ('cat_impute') is left disabled here.
# NOTE: `sparse` is the pre-1.2 scikit-learn spelling (later renamed
# `sparse_output`); it matches the older API the notebook's imports target.
cat_pipe = Pipeline(steps=[
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse=False)),
])

# Any column not named in either list is passed through unchanged.
CT = ColumnTransformer(
    remainder='passthrough',
    transformers=[
        ('num_trans', num_pipe, X_numeric_cols),
        ('cat_trans', cat_pipe, X_categorical_cols),
    ],
)

['account_length', 'number_vmail_messages', 'total_day_minutes', 'total_day_calls', 'total_day_charge', 'total_eve_minutes', 'total_eve_calls', 'total_eve_charge', 'total_night_minutes', 'total_night_calls', 'total_night_charge', 'total_intl_minutes', 'total_intl_calls', 'total_intl_charge', 'customer_service_calls'] 

['state', 'area_code', 'international_plan', 'voice_mail_plan']


#### Modeling - Round One - Simple Models (default hyperparameters)

**Baseline**

In [146]:
# Majority-class dummy: the accuracy floor the real models are compared to.
model_name = "Baseline"
model = DummyClassifier(random_state=42, strategy='most_frequent')
pipe = Pipeline(steps=[('ct', CT), ('model', model)])

# 10-fold cross-validation on the training split.
results = cross_val_score(pipe, X_train, y_train, cv=10)

# Refit on the whole training split, then predict the held-out test split.
pipe.fit(X_train, y_train)
y_hat = pipe.predict(X_test)

cv_mean = round(np.mean(results), 4)
test_accuracy = round(accuracy_score(y_test, y_hat), 4)
print(f"{model_name}:")
print(f"CV score on training data (mean): {cv_mean}")
print(f"Results on test data:             {test_accuracy}")

Baseline:
CV score on training data (mean): 0.8567
Results on test data:             0.8501


**Decision Tree Classifier**

In [147]:
# Decision tree with default hyperparameters (seeded for reproducibility).
model_name = "Decision Tree Classifier"
model = DecisionTreeClassifier(random_state=42)
pipe = Pipeline(steps=[('ct', CT), ('model', model)])

# 10-fold cross-validation on the training split.
results = cross_val_score(pipe, X_train, y_train, cv=10)

# Refit on the whole training split, then predict the held-out test split.
pipe.fit(X_train, y_train)
y_hat = pipe.predict(X_test)

cv_mean = round(np.mean(results), 4)
test_accuracy = round(accuracy_score(y_test, y_hat), 4)
print(f"{model_name}:")
print(f"CV score on training data (mean): {cv_mean}")
print(f"Results on test data:             {test_accuracy}")

Decision Tree Classifier:
CV score on training data (mean): 0.9096
Results on test data:             0.9257


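The decision tree improved accuracy over the baseline model by roughly 5 percentage points in cross-validation and about 7.5 points on the test data.  It scored slightly higher on the test data than in cross-validation, so it appears potentially underfit. 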

**KNearestNeighbors Classifier**

In [148]:
# k-nearest neighbours with default hyperparameters.
model_name = "KNearestNeighbors Classifier"
model = KNeighborsClassifier()
pipe = Pipeline(steps=[('ct', CT), ('model', model)])

# 10-fold cross-validation on the training split.
results = cross_val_score(pipe, X_train, y_train, cv=10)

# Refit on the whole training split, then predict the held-out test split.
pipe.fit(X_train, y_train)
y_hat = pipe.predict(X_test)

cv_mean = round(np.mean(results), 4)
test_accuracy = round(accuracy_score(y_test, y_hat), 4)
print(f"{model_name}:")
print(f"CV score on training data (mean): {cv_mean}")
print(f"Results on test data:             {test_accuracy}")

KNearestNeighbors Classifier:
CV score on training data (mean): 0.884
Results on test data:             0.8837


The KNN model improved the accuracy over the baseline model by ~3%.

**Random Forest Classifier**

In [149]:
# Random forest with default hyperparameters (seeded for reproducibility).
model_name = "Random Forest Classifier"
model = RandomForestClassifier(random_state=42)
pipe = Pipeline(steps=[('ct', CT), ('model', model)])

# 10-fold cross-validation on the training split.
results = cross_val_score(pipe, X_train, y_train, cv=10)

# Refit on the whole training split, then predict the held-out test split.
pipe.fit(X_train, y_train)
y_hat = pipe.predict(X_test)

cv_mean = round(np.mean(results), 4)
test_accuracy = round(accuracy_score(y_test, y_hat), 4)
print(f"{model_name}:")
print(f"CV score on training data (mean): {cv_mean}")
print(f"Results on test data:             {test_accuracy}")

Random Forest Classifier:
CV score on training data (mean): 0.9384
Results on test data:             0.9424


The Random Forest model improved the accuracy over the baseline model on test data by ~9%.  The model seems a little underfit as it performed slightly better on test data than on the training data.  This appears to be the best performing model so far.

**GaussianNB Classifier**

In [150]:
# Gaussian naive Bayes with default hyperparameters.
# (The label below keeps its original spelling so the printed report is
# unchanged.)
model_name = "GuassianNB Classifier"
model = GaussianNB()
pipe = Pipeline(steps=[('ct', CT), ('model', model)])

# 10-fold cross-validation on the training split.
results = cross_val_score(pipe, X_train, y_train, cv=10)

# Refit on the whole training split, then predict the held-out test split.
pipe.fit(X_train, y_train)
y_hat = pipe.predict(X_test)

cv_mean = round(np.mean(results), 4)
test_accuracy = round(accuracy_score(y_test, y_hat), 4)
print(f"{model_name}:")
print(f"CV score on training data (mean): {cv_mean}")
print(f"Results on test data:             {test_accuracy}")

GuassianNB Classifier:
CV score on training data (mean): 0.5658
Results on test data:             0.5815


The Gaussian NB model did not perform well. 

**SVC Classifier**

In [151]:
# Support vector classifier with default hyperparameters.
model_name = "SVC Classifier"
model = SVC(random_state=42)
pipe = Pipeline(steps=[('ct', CT), ('model', model)])

# 10-fold cross-validation on the training split.
results = cross_val_score(pipe, X_train, y_train, cv=10)

# Refit on the whole training split, then predict the held-out test split.
pipe.fit(X_train, y_train)
y_hat = pipe.predict(X_test)

cv_mean = round(np.mean(results), 4)
test_accuracy = round(accuracy_score(y_test, y_hat), 4)
print(f"{model_name}:")
print(f"CV score on training data (mean): {cv_mean}")
print(f"Results on test data:             {test_accuracy}")

SVC Classifier:
CV score on training data (mean): 0.9032
Results on test data:             0.9221


The SVC model improved the accuracy over the baseline model by about 5 percentage points in cross-validation and about 7 points on test data.  It does not look overfit: the accuracy on test data (0.9221) is slightly higher than the mean cross-validation accuracy on the training data (0.9032).     

**Gradient Booster Classifier**

In [152]:
# Gradient boosting with default hyperparameters (seeded for reproducibility).
model_name = "Gradient Booster"
model = GradientBoostingClassifier(random_state=42)
# Use the estimator created above; previously `model` was unused and the
# pipeline built a second, identical classifier, unlike the other cells.
pipe = Pipeline([('ct', CT),
                 ('model', model)
])

# 10-fold cross-validation on the training split, then a refit on the whole
# training split scored against the held-out test split.
results = cross_val_score(pipe, X_train, y_train, cv=10)
y_hat = pipe.fit(X_train, y_train).predict(X_test)
print(model_name + ":")
print(f"CV score on training data (mean): {round(np.mean(results), 4)}")
print(f"Results on test data:             {round(accuracy_score(y_test, y_hat), 4)}")

Gradient Booster:
CV score on training data (mean): 0.9492
Results on test data:             0.9496


The Gradient Booster Classifier improved the accuracy over the baseline model on test data by ~9%.  It does not appear to be notably underfit or overfit.  This is another top performing model.

**Logistic Regression Classifier**

In [127]:
# Logistic regression. max_iter is raised to 1000 to match the configuration
# of the "Unbalanced" logistic regression in round two, which repeats this
# model and reports the same scores.
model_name = "Logistic Regression"
model = LogisticRegression(random_state=42, max_iter=1000)
pipe = Pipeline([('ct',CT),
                 ('model', model) 
])

# 10-fold cross-validation on the training split, then a refit on the whole
# training split scored against the held-out test split.
results = cross_val_score(pipe, X_train, y_train, cv=10)
y_hat = pipe.fit(X_train, y_train).predict(X_test)
print(model_name + ":")
print(f"CV score on training data (mean): {round(np.mean(results), 4)}")
print(f"Results on test data:             {round(accuracy_score(y_test, y_hat), 4)}")

Logistic Regression:
CV score on training data (mean): 0.8591
Results on test data:             0.8573


The Gradient Boosting Classifier had the highest score.  

#### Modeling - Round Two - Simple Models with Balanced Data using SMOTE

As we can see below, the target is pretty unbalanced.  I'd like to balance these using SMOTE to see if we can get any better results with the simple models. 

In [134]:
# Absolute class counts of the target in the training split.
y_train.value_counts()

False    2141
True      358
Name: churn, dtype: int64

In [135]:
# The same class counts expressed as proportions of the training split.
y_train.value_counts(normalize=True)

False    0.856743
True     0.143257
Name: churn, dtype: float64

**Baseline Model - Unbalanced vs. Balanced**

In [162]:
# Evaluate the majority-class dummy twice: on the data as is, and with SMOTE
# oversampling inserted between the column transformer and the model.
# The SMOTE variant uses imblearn's pipeline class.
smote_step = ('sm', SMOTE(sampling_strategy='auto', random_state=42))
variants = (
    ("Baseline - Unbalanced", Pipeline, []),
    ("Baseline - Balanced", ImPipeline, [smote_step]),
)

for position, (model_name, pipeline_cls, sampler_steps) in enumerate(variants):
    if position:
        print("\n")  # spacer between the two reports
    model = DummyClassifier(strategy='most_frequent', random_state=42)
    pipe = pipeline_cls([('ct', CT), *sampler_steps, ('model', model)])
    # 10-fold CV on the training split, then refit and score the test split.
    results = cross_val_score(pipe, X_train, y_train, cv=10)
    y_hat = pipe.fit(X_train, y_train).predict(X_test)
    print(model_name + ":")
    print(f"CV score on training data (mean): {round(np.mean(results), 4)}")
    print(f"Results on test data:             {round(accuracy_score(y_test, y_hat), 4)}")


Baseline - Unbalanced:
CV score on training data (mean): 0.8567
Results on test data:             0.8501


Baseline - Balanced:
CV score on training data (mean): 0.8567
Results on test data:             0.8501


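Balancing the data did not change the results on the baseline model.  This is expected: SMOTE only resamples the data the model is fit on, and once the two classes are the same size the `most_frequent` dummy still predicts a single class for every row.  The identical scores show it keeps predicting `False`, so its accuracy on the untouched validation and test rows is still just the share of non-churners.  Moving on to see if balancing the data changes the results of the other models.  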

**Decision Tree Classifier - Unbalanced vs. Balanced**

In [174]:
# Evaluate the decision tree twice: on the data as is, and with SMOTE
# oversampling inserted between the column transformer and the model.
# The SMOTE variant uses imblearn's pipeline class.
smote_step = ('sm', SMOTE(sampling_strategy='auto', random_state=42))
variants = (
    ("Decision Tree Classifier - Unbalanced", Pipeline, []),
    ("Decision Tree Classifier - Balanced", ImPipeline, [smote_step]),
)

for position, (model_name, pipeline_cls, sampler_steps) in enumerate(variants):
    if position:
        print("\n")  # spacer between the two reports
    model = DecisionTreeClassifier(random_state=42)
    pipe = pipeline_cls([('ct', CT), *sampler_steps, ('model', model)])
    # 10-fold CV on the training split, then refit and score the test split.
    results = cross_val_score(pipe, X_train, y_train, cv=10)
    y_hat = pipe.fit(X_train, y_train).predict(X_test)
    print(model_name + ":")
    print(f"CV score on training data (mean): {round(np.mean(results), 4)}")
    print(f"Results on test data:             {round(accuracy_score(y_test, y_hat), 4)}")

Decision Tree Classifier - Unbalanced:
CV score on training data (mean): 0.9096
Results on test data:             0.9257


Decision Tree Classifier - Balanced:
CV score on training data (mean): 0.9032
Results on test data:             0.8909


Balancing the data decreased the accuracy of the model on test data.   

**KNearestNeighbors Classifier - Unbalanced vs. Balanced**

In [179]:
# Evaluate k-nearest neighbours twice: on the data as is, and with SMOTE
# oversampling inserted between the column transformer and the model.
# The SMOTE variant uses imblearn's pipeline class.
smote_step = ('sm', SMOTE(sampling_strategy='auto', random_state=42))
variants = (
    ("KNearestNeighbors Classifier - Unbalanced", Pipeline, []),
    ("KNearestNeighbors Classifier - Balanced", ImPipeline, [smote_step]),
)

for position, (model_name, pipeline_cls, sampler_steps) in enumerate(variants):
    if position:
        print("\n")  # spacer between the two reports
    model = KNeighborsClassifier()
    pipe = pipeline_cls([('ct', CT), *sampler_steps, ('model', model)])
    # 10-fold CV on the training split, then refit and score the test split.
    results = cross_val_score(pipe, X_train, y_train, cv=10)
    y_hat = pipe.fit(X_train, y_train).predict(X_test)
    print(model_name + ":")
    print(f"CV score on training data (mean): {round(np.mean(results), 4)}")
    print(f"Results on test data:             {round(accuracy_score(y_test, y_hat), 4)}")

KNearestNeighbors Classifier - Unbalanced:
CV score on training data (mean): 0.884
Results on test data:             0.8837


KNearestNeighbors Classifier - Balanced:
CV score on training data (mean): 0.7607
Results on test data:             0.759


Balancing the data decreased the accuracy of the model on test data.   

**Random Forest Classifier - Unbalanced vs. Balanced**

In [165]:
# Evaluate the random forest twice: on the data as is, and with SMOTE
# oversampling inserted between the column transformer and the model.
# The SMOTE variant uses imblearn's pipeline class.
smote_step = ('sm', SMOTE(sampling_strategy='auto', random_state=42))
variants = (
    ("Random Forest Classifier - Unbalanced", Pipeline, []),
    ("Random Forest Classifier - Balanced", ImPipeline, [smote_step]),
)

for position, (model_name, pipeline_cls, sampler_steps) in enumerate(variants):
    if position:
        print("\n")  # spacer between the two reports
    model = RandomForestClassifier(random_state=42)
    pipe = pipeline_cls([('ct', CT), *sampler_steps, ('model', model)])
    # 10-fold CV on the training split, then refit and score the test split.
    results = cross_val_score(pipe, X_train, y_train, cv=10)
    y_hat = pipe.fit(X_train, y_train).predict(X_test)
    print(model_name + ":")
    print(f"CV score on training data (mean): {round(np.mean(results), 4)}")
    print(f"Results on test data:             {round(accuracy_score(y_test, y_hat), 4)}")

Random Forest Classifier - Unbalanced:
CV score on training data (mean): 0.9384
Results on test data:             0.9424


Random Forest Classifier - Balanced:
CV score on training data (mean): 0.9408
Results on test data:             0.9365


Balancing the data decreased the accuracy of the model on test data.

**GaussianNB Classifier - Unbalanced vs. Balanced**

In [166]:
# Evaluate Gaussian naive Bayes twice: on the data as is, and with SMOTE
# oversampling inserted between the column transformer and the model.
# The SMOTE variant uses imblearn's pipeline class.
# (The labels keep their original spelling so the printed report is unchanged.)
smote_step = ('sm', SMOTE(sampling_strategy='auto', random_state=42))
variants = (
    ("GuassianNB Classifier - Unbalanced", Pipeline, []),
    ("GuassianNB Classifier - Balanced", ImPipeline, [smote_step]),
)

for position, (model_name, pipeline_cls, sampler_steps) in enumerate(variants):
    if position:
        print("\n")  # spacer between the two reports
    model = GaussianNB()
    pipe = pipeline_cls([('ct', CT), *sampler_steps, ('model', model)])
    # 10-fold CV on the training split, then refit and score the test split.
    results = cross_val_score(pipe, X_train, y_train, cv=10)
    y_hat = pipe.fit(X_train, y_train).predict(X_test)
    print(model_name + ":")
    print(f"CV score on training data (mean): {round(np.mean(results), 4)}")
    print(f"Results on test data:             {round(accuracy_score(y_test, y_hat), 4)}")

GuassianNB Classifier - Unbalanced:
CV score on training data (mean): 0.5658
Results on test data:             0.5815


GuassianNB Classifier - Balanced:
CV score on training data (mean): 0.5594
Results on test data:             0.5647


Balancing the data decreased the accuracy of the model.   

**C-Support Vector Classifier - Unbalanced vs. Balanced**

In [169]:
# Evaluate the support vector classifier twice: on the data as is, and with
# SMOTE oversampling inserted between the column transformer and the model.
# The SMOTE variant uses imblearn's pipeline class.
smote_step = ('sm', SMOTE(sampling_strategy='auto', random_state=42))
variants = (
    ("SVC Classifier - Unbalanced", Pipeline, []),
    ("SVC Classifier - Balanced", ImPipeline, [smote_step]),
)

for position, (model_name, pipeline_cls, sampler_steps) in enumerate(variants):
    if position:
        print("\n")  # spacer between the two reports
    model = SVC(random_state=42)
    pipe = pipeline_cls([('ct', CT), *sampler_steps, ('model', model)])
    # 10-fold CV on the training split, then refit and score the test split.
    results = cross_val_score(pipe, X_train, y_train, cv=10)
    y_hat = pipe.fit(X_train, y_train).predict(X_test)
    print(model_name + ":")
    print(f"CV score on training data (mean): {round(np.mean(results), 4)}")
    print(f"Results on test data:             {round(accuracy_score(y_test, y_hat), 4)}")

SVC Classifier - Unbalanced:
CV score on training data (mean): 0.9032
Results on test data:             0.9221


SVC Classifier - Balanced:
CV score on training data (mean): 0.9016
Results on test data:             0.9113


Balancing the data decreased the accuracy of the model.   

**Gradient Booster Classifier - Unbalanced vs. Balanced**

In [180]:
# Evaluate gradient boosting twice: on the data as is, and with SMOTE
# oversampling inserted between the column transformer and the model.
# Both pipelines now use the `model` created just above them; previously
# `model` was unused and each pipeline built a second, identical classifier.
model_name = "Gradient Booster Classifier - Unbalanced"
model = GradientBoostingClassifier(random_state=42)
pipe = Pipeline([('ct', CT), 
                 ('model', model)
])
# 10-fold CV on the training split, then refit and score the test split.
results = cross_val_score(pipe, X_train, y_train, cv=10)
y_hat = pipe.fit(X_train, y_train).predict(X_test)
print(model_name + ":")
print(f"CV score on training data (mean): {round(np.mean(results), 4)}")
print(f"Results on test data:             {round(accuracy_score(y_test, y_hat), 4)}")

print("\n")

model_name = "Gradient Booster Classifier - Balanced"
# A fresh, unfitted estimator for the SMOTE variant, which uses imblearn's
# pipeline class.
model = GradientBoostingClassifier(random_state=42)
pipe = ImPipeline([('ct', CT), 
                   ('sm', SMOTE(sampling_strategy='auto', random_state=42)),
                   ('model', model)
])
results = cross_val_score(pipe, X_train, y_train, cv=10)
y_hat = pipe.fit(X_train, y_train).predict(X_test)
print(model_name + ":")
print(f"CV score on training data (mean): {round(np.mean(results), 4)}")
print(f"Results on test data:             {round(accuracy_score(y_test, y_hat), 4)}")

Gradient Booster Classifier - Unbalanced:
CV score on training data (mean): 0.9492
Results on test data:             0.9496


Gradient Booster Classifier - Balanced:
CV score on training data (mean): 0.9364
Results on test data:             0.9448


Balancing the data decreased the accuracy of the model.  

**Logistic Regression Classifier - Unbalanced vs. Balanced**

In [184]:
# Evaluate logistic regression twice: on the data as is, and with SMOTE
# oversampling inserted between the column transformer and the model.
# The SMOTE variant uses imblearn's pipeline class.
smote_step = ('sm', SMOTE(sampling_strategy='auto', random_state=42))
variants = (
    ("Logistic Regression Classifier - Unbalanced", Pipeline, []),
    ("Logistic Regression Classifier - Balanced", ImPipeline, [smote_step]),
)

for position, (model_name, pipeline_cls, sampler_steps) in enumerate(variants):
    if position:
        print("\n")  # spacer between the two reports
    model = LogisticRegression(random_state=42, max_iter=1000)
    pipe = pipeline_cls([('ct', CT), *sampler_steps, ('model', model)])
    # 10-fold CV on the training split, then refit and score the test split.
    results = cross_val_score(pipe, X_train, y_train, cv=10)
    y_hat = pipe.fit(X_train, y_train).predict(X_test)
    print(model_name + ":")
    print(f"CV score on training data (mean): {round(np.mean(results), 4)}")
    print(f"Results on test data:             {round(accuracy_score(y_test, y_hat), 4)}")


Logistic Regression Classifier - Unbalanced:
CV score on training data (mean): 0.8591
Results on test data:             0.8573


Logistic Regression Classifier - Balanced:
CV score on training data (mean): 0.7639
Results on test data:             0.777


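Using SMOTE to balance the data decreased the test accuracy of every model except the baseline, which was unchanged.  It appears that using data as is (85/15) is not that bad for the models.  This would align with the 90/10 threshold discussed during our class.  As such, I will not use balanced data going forward in the next round of modeling.  One caveat: these comparisons use accuracy only, and SMOTE is meant to improve detection of the minority (churn) class, so recall and F1 on churners should be checked before ruling it out completely.  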