## Syria-Tel Classification Modeling

## Business Understanding

SyriaTel is a telecommunications company. The goal of this project is to build a classification model that predicts customer churn from customer location and usage data.

## Data Understanding

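The dataset contains 3,333 customer records and 21 columns: each customer's state, area code, phone number, plan subscriptions (international and voice mail), call usage (minutes, calls and charges for the day, evening, night and international periods), number of customer service calls, and the `churn` target. There are no missing values, and roughly 14.5% of customers churned.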

## Data Preparation and Modeling

#### Imports

In [144]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from pandas.api.types import is_numeric_dtype

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer,  make_column_selector as selector
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import plot_confusion_matrix, recall_score,\
    accuracy_score, precision_score, f1_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImPipeline
from pandas.api.types import is_numeric_dtype
from sklearn.svm import SVC

from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

#### Functions

In [126]:
# Class copied from Flatiron School content

class ModelWithCV():
    '''Structure to save the model and more easily see its crossvalidation.

    Holds an estimator together with the data it is evaluated on, and caches
    the per-fold cross-validation scores along with their mean, median and
    standard deviation.
    '''

    def __init__(self, model, model_name, X, y, cv_now=True):
        '''
        Store the model and its data, optionally cross-validating right away.

        Args:
          model:
            Estimator (or pipeline) handed to `cross_val_score`.
          model_name:
            Label used in the printed summary and in the plot title.
          X:
            Feature data used by `cross_validate` when no override is given.
          y:
            Target data used by `cross_validate` when no override is given.
          cv_now:
            Optional; when True (default) `cross_validate` runs immediately.
        '''
        self.model = model
        self.name = model_name
        self.X = X
        self.y = y
        # For CV results (populated by `cross_validate`)
        self.cv_results = None
        self.cv_mean = None
        self.cv_median = None
        self.cv_std = None
        #
        if cv_now:
            self.cross_validate()

    def cross_validate(self, X=None, y=None, kfolds=10):
        '''
        Perform cross-validation and store the results on the object.

        The fold scores are saved in `cv_results`; their mean, median and
        standard deviation in `cv_mean`, `cv_median` and `cv_std`.
        Nothing is returned.

        Args:
          X:
            Optional; Training data to perform CV on. Otherwise use X from object
          y:
            Optional; Training data to perform CV on. Otherwise use y from object
          kfolds:
            Optional; Number of folds for CV (default is 10)
        '''
        # Compare against None explicitly: taking the truth value of a
        # DataFrame/Series/ndarray (`X if X else ...`) raises ValueError.
        cv_X = self.X if X is None else X
        cv_y = self.y if y is None else y

        # No `scoring` argument is given, so the estimator's default scorer
        # is used.
        self.cv_results = cross_val_score(self.model, cv_X, cv_y, cv=kfolds)
        self.cv_mean = np.mean(self.cv_results)
        self.cv_median = np.median(self.cv_results)
        self.cv_std = np.std(self.cv_results)

    def print_cv_summary(self):
        '''Print the mean and standard deviation of the stored CV scores.

        `cross_validate` must have run first; otherwise the stored values
        are None and cannot be formatted.
        '''
        cv_summary = (
        f'''CV Results for `{self.name}` model: {self.cv_mean:.5f} ± {self.cv_std:.5f} accuracy''')
        print(cv_summary)

    def plot_cv(self, ax):
        '''
        Plot the cross-validation values using the array of results and given
        Axis for plotting.

        Args:
          ax:
            Axis to draw on; the same object is returned.
        '''
        ax.set_title(f'CV Results for `{self.name}` Model')
        # Thinner violinplot with higher bw
        sns.violinplot(y=self.cv_results, ax=ax, bw=.4)
        # Overlay the individual fold scores on top of the violin.
        sns.swarmplot(
                y=self.cv_results,
                color='orange',
                size=10,
                alpha= 0.8,
                ax=ax
        )

        return ax

Helper function to cross-validate a model on both the training and the test data, using the `ModelWithCV` class defined above.

In [160]:
def ModelItAndPrintResults(model_pipe, model_name, X_train, y_train, X_test, y_test):
    '''
    Cross-validate `model_pipe` on the training data and on the test data.

    Two `ModelWithCV` objects are built (named `<model_name>_train` and
    `<model_name>_test`), each running its cross-validation on construction,
    and both summaries are then printed.

    Returns:
      Tuple `(train_wrapper, test_wrapper)` of the two `ModelWithCV` objects.
    '''
    datasets = (
        ('_train', X_train, y_train),
        ('_test', X_test, y_test),
    )

    # Build the train wrapper first, then the test wrapper.
    wrappers = tuple(
        ModelWithCV(model_pipe, model_name=model_name + suffix, X=features, y=labels)
        for suffix, features, labels in datasets
    )

    # Summaries are printed only after both cross-validations have finished.
    for wrapper in wrappers:
        wrapper.print_cv_summary()

    return wrappers

#### Importing data

In [81]:
# Load the customer dataset from the local CSV file and preview the first rows.
df = pd.read_csv('data/bigml_59c28831336c6604c800002a.csv')
df.head()

Unnamed: 0,state,account length,area code,phone number,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,...,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls,churn
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,...,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,375-9999,yes,no,0,299.4,71,50.9,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,330-6626,yes,no,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


#### Data preparation

Review and fix the feature names to be more code friendly. 

In [73]:
# View the raw column names (they contain spaces, which are replaced below)
df.columns

Index(['state', 'account length', 'area code', 'phone number',
       'international plan', 'voice mail plan', 'number vmail messages',
       'total day minutes', 'total day calls', 'total day charge',
       'total eve minutes', 'total eve calls', 'total eve charge',
       'total night minutes', 'total night calls', 'total night charge',
       'total intl minutes', 'total intl calls', 'total intl charge',
       'customer service calls', 'churn'],
      dtype='object')

In [85]:
# Make the column labels code friendly: swap every space for an underscore
df.columns = [label.replace(" ", "_") for label in df.columns]
df.columns

Index(['state', 'account_length', 'area_code', 'phone_number',
       'international_plan', 'voice_mail_plan', 'number_vmail_messages',
       'total_day_minutes', 'total_day_calls', 'total_day_charge',
       'total_eve_minutes', 'total_eve_calls', 'total_eve_charge',
       'total_night_minutes', 'total_night_calls', 'total_night_charge',
       'total_intl_minutes', 'total_intl_calls', 'total_intl_charge',
       'customer_service_calls', 'churn'],
      dtype='object')

- Separate X (predictor) and y (target) column names.
- Distinguish between numeric and categorical X columns.

In [86]:
# Numeric predictors. The twelve usage columns all follow the pattern
# total_<period>_<measure>, so they are generated rather than typed out.
X_numeric_cols = (
    ['account_length', 'number_vmail_messages']
    + [
        f'total_{period}_{measure}'
        for period in ('day', 'eve', 'night', 'intl')
        for measure in ('minutes', 'calls', 'charge')
    ]
    + ['customer_service_calls']
)

# Categorical predictors (area_code is stored as an integer but is treated
# as a category here).
X_categorical_cols = [
    'state',
    'area_code',
    'international_plan',
    'voice_mail_plan',
]

# Columns left out of the predictors.
X_ignore_cols = ['phone_number']

# Target column, kept as a one-element list.
y_target_col = ['churn']

#### Train-test-split of the data

In [94]:
# Predictors: every column except the ignored identifier and the target.
X = df.drop(columns=X_ignore_cols + y_target_col)
# `target` was never defined; the target name lives in `y_target_col`.
# That is a one-element list, so index into it to get a 1-D Series
# rather than a single-column DataFrame.
y = df[y_target_col[0]]

# Hold out 25% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=.25)

#### Data Cleaning

Check out the features we have and if there are any null values.

In [88]:
# Print each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   state                   3333 non-null   object 
 1   account_length          3333 non-null   int64  
 2   area_code               3333 non-null   int64  
 3   phone_number            3333 non-null   object 
 4   international_plan      3333 non-null   object 
 5   voice_mail_plan         3333 non-null   object 
 6   number_vmail_messages   3333 non-null   int64  
 7   total_day_minutes       3333 non-null   float64
 8   total_day_calls         3333 non-null   int64  
 9   total_day_charge        3333 non-null   float64
 10  total_eve_minutes       3333 non-null   float64
 11  total_eve_calls         3333 non-null   int64  
 12  total_eve_charge        3333 non-null   float64
 13  total_night_minutes     3333 non-null   float64
 14  total_night_calls       3333 non-null   

In [89]:
# Count the missing values in each column.
df.isna().sum()

state                     0
account_length            0
area_code                 0
phone_number              0
international_plan        0
voice_mail_plan           0
number_vmail_messages     0
total_day_minutes         0
total_day_calls           0
total_day_charge          0
total_eve_minutes         0
total_eve_calls           0
total_eve_charge          0
total_night_minutes       0
total_night_calls         0
total_night_charge        0
total_intl_minutes        0
total_intl_calls          0
total_intl_charge         0
customer_service_calls    0
churn                     0
dtype: int64

In [91]:
# Display the DataFrame with the renamed columns.
df

Unnamed: 0,state,account_length,area_code,phone_number,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,...,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,customer_service_calls,churn
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.70,1,False
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.70,1,False
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,...,110,10.30,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,375-9999,yes,no,0,299.4,71,50.90,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,330-6626,yes,no,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3328,AZ,192,415,414-4276,no,yes,36,156.2,77,26.55,...,126,18.32,279.1,83,12.56,9.9,6,2.67,2,False
3329,WV,68,415,370-3271,no,no,0,231.1,57,39.29,...,55,13.04,191.3,123,8.61,9.6,4,2.59,3,False
3330,RI,28,510,328-8230,no,no,0,180.8,109,30.74,...,58,24.55,191.9,91,8.64,14.1,6,3.81,2,False
3331,CT,184,510,364-6381,yes,no,0,213.8,105,36.35,...,84,13.57,139.2,137,6.26,5.0,10,1.35,2,False


Check out the values in each feature

In [152]:
# For every column: print its name, the min/max/mean when it is a numeric
# predictor, and the frequency of each distinct value.
for x in df.columns:
    # Skip the ignored columns. The list is named `X_ignore_cols`;
    # `ignore_cols` is not defined anywhere in this notebook.
    if x in X_ignore_cols:
        continue
    print("\n", x)
    if x in X_numeric_cols:
        print("min:", df[x].min())
        print("max:", df[x].max())
        print("mean:", df[x].mean())
    print(df[x].value_counts())
            


 state
WV    106
MN     84
NY     83
AL     80
OH     78
OR     78
WI     78
VA     77
WY     77
CT     74
ID     73
VT     73
MI     73
TX     72
UT     72
IN     71
KS     70
MD     70
NC     68
NJ     68
MT     68
NV     66
WA     66
CO     66
MA     65
MS     65
RI     65
AZ     64
FL     63
MO     63
NM     62
ME     62
ND     62
NE     61
DE     61
OK     61
SD     60
SC     60
KY     59
IL     58
NH     56
AR     55
DC     54
GA     54
HI     53
TN     53
AK     52
LA     51
PA     45
IA     44
CA     34
Name: state, dtype: int64

 account_length
min: 1
max: 243
mean: 101.06480648064806
105    43
87     42
93     40
101    40
90     39
       ..
191     1
199     1
215     1
221     1
2       1
Name: account_length, Length: 212, dtype: int64

 area_code
415    1655
510     840
408     838
Name: area_code, dtype: int64

 international_plan
no     3010
yes     323
Name: international_plan, dtype: int64

 voice_mail_plan
no     2411
yes     922
Name: voice_mail_plan, dtype: int64


In [8]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()

Unnamed: 0,account length,area code,number vmail messages,total day minutes,total day calls,total day charge,total eve minutes,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls
count,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0
mean,101.064806,437.182418,8.09901,179.775098,100.435644,30.562307,200.980348,100.114311,17.08354,200.872037,100.107711,9.039325,10.237294,4.479448,2.764581,1.562856
std,39.822106,42.37129,13.688365,54.467389,20.069084,9.259435,50.713844,19.922625,4.310668,50.573847,19.568609,2.275873,2.79184,2.461214,0.753773,1.315491
min,1.0,408.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23.2,33.0,1.04,0.0,0.0,0.0,0.0
25%,74.0,408.0,0.0,143.7,87.0,24.43,166.6,87.0,14.16,167.0,87.0,7.52,8.5,3.0,2.3,1.0
50%,101.0,415.0,0.0,179.4,101.0,30.5,201.4,100.0,17.12,201.2,100.0,9.05,10.3,4.0,2.78,1.0
75%,127.0,510.0,20.0,216.4,114.0,36.79,235.3,114.0,20.0,235.3,113.0,10.59,12.1,6.0,3.27,2.0
max,243.0,510.0,51.0,350.8,165.0,59.64,363.7,170.0,30.91,395.0,175.0,17.77,20.0,20.0,5.4,9.0


In [6]:
# Share of each target class, as fractions of all rows.
df.churn.value_counts(normalize=True)

False    0.855086
True     0.144914
Name: churn, dtype: float64

The target classes are imbalanced: about 85.5% of customers stayed and about 14.5% churned. We plan to use SMOTE to oversample the minority (churned) class.

#### Modeling

#### Create Column Transformer Pipelines

In [149]:
# Numerical column transformers: standardise each numeric feature
num_pipe = Pipeline([('ss',StandardScaler())])

# Categorical column transformers: one-hot encode into a dense array.
# Categories that were not seen during fit are ignored instead of raising.
# No imputation step is needed because the data has no missing values.
cat_pipe = Pipeline([
    ('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore'))
])

# Route each column group to its pipeline. The column lists are named
# `X_numeric_cols` / `X_categorical_cols`; `numeric_cols` and
# `categorical_cols` are not defined in this notebook.
CT = ColumnTransformer(transformers=[
    ('num_trans', num_pipe, X_numeric_cols),
    ('cat_trans', cat_pipe, X_categorical_cols)
],remainder='passthrough')

#### Baseline Model

In [150]:
# Create Dummy/Baseline: always predicts the most frequent class
dummy_model_pipe = Pipeline([('ct',CT),('dummy',DummyClassifier(strategy='most_frequent'))])
# Use the shared helper rather than repeating its body: it cross-validates
# on the train and test data and prints both summaries.
dummy_model_train, dummy_model_test = ModelItAndPrintResults(
    dummy_model_pipe, 'baseline', X_train, y_train, X_test, y_test
)

CV Results for `baseline_train` model: 0.85674 ± 0.00149 accuracy
CV Results for `baseline_test` model: 0.85014 ± 0.00530 accuracy


In [162]:
# Baseline via the shared helper: a dummy classifier that always predicts
# the most frequent class, cross-validated on the train and test data.
baseline_model_train, baseline_model_test = ModelItAndPrintResults(
    model_pipe=Pipeline([
        ('ct', CT),
        ('dummy', DummyClassifier(strategy='most_frequent')),
    ]),
    model_name="baseline",
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

CV Results for `baseline_train` model: 0.85674 ± 0.00149 accuracy
CV Results for `baseline_test` model: 0.85014 ± 0.00530 accuracy


#### Round One - Simple Models

Simple Model 1 - Decision Tree Classifier (default hyperparameters)

In [130]:
# Decision Tree Classifier
dtc_model_pipe = Pipeline([('ct',CT),('dtc',DecisionTreeClassifier())])
# Use the shared helper rather than repeating its body: it cross-validates
# on the train and test data and prints both summaries.
dtc_model_train, dtc_model_test = ModelItAndPrintResults(
    dtc_model_pipe, 'dtc_model', X_train, y_train, X_test, y_test
)

CV Results for `dtc_model_train` model: 0.89996 ± 0.01620 accuracy
CV Results for `dtc_model_test` model: 0.89803 ± 0.03783 accuracy


This improved the average accuracy of the model by ~4% over the baseline model. The model's accuracy on test data was about the same as the model's accuracy on training data, so it does not appear overfit or underfit.  

Simple Model 2 - KNearestNeighbors Classifier (default hyperparameters)

In [131]:
# KNN Classifier
knn_model_pipe = Pipeline([('ct',CT),('knn',KNeighborsClassifier())])
# Use the shared helper rather than repeating its body: it cross-validates
# on the train and test data and prints both summaries.
knn_model_train, knn_model_test = ModelItAndPrintResults(
    knn_model_pipe, 'knn_model', X_train, y_train, X_test, y_test
)

CV Results for `knn_model_train` model: 0.88396 ± 0.01276 accuracy
CV Results for `knn_model_test` model: 0.87530 ± 0.02296 accuracy


The KNN model improved the average accuracy over the baseline model by 2-3%.  The model seems slightly overfit as it performed a little worse on test data than on the training data.  

Simple Model 3 - Random Forest Classifier (default hyperparameters)

In [147]:
# Random Forest Classifier
rfc_model_pipe = Pipeline([('ct',CT),('rfc',RandomForestClassifier())])
# Use the shared helper rather than repeating its body: it cross-validates
# on the train and test data and prints both summaries.
rfc_model_train, rfc_model_test = ModelItAndPrintResults(
    rfc_model_pipe, 'rfc_model', X_train, y_train, X_test, y_test
)

CV Results for `rfc_model_train` model: 0.94278 ± 0.00715 accuracy
CV Results for `rfc_model_test` model: 0.90403 ± 0.02482 accuracy


The Random Forest model improved the average accuracy over the baseline model on test data by ~5%.  The model seems considerably overfit as it performed worse on test data than on the training data.  

Simple Model 4 - GaussianNB Classifier (default hyperparameters)

In [146]:
# GaussianNB Classifier
gnb_model_pipe = Pipeline([('ct',CT),('gnb', GaussianNB())])
# Use the shared helper rather than repeating its body: it cross-validates
# on the train and test data and prints both summaries.
gnb_model_train, gnb_model_test = ModelItAndPrintResults(
    gnb_model_pipe, 'gnb_model', X_train, y_train, X_test, y_test
)

CV Results for `gnb_model_train` model: 0.56585 ± 0.07002 accuracy
CV Results for `gnb_model_test` model: 0.29129 ± 0.02762 accuracy


The Gaussian NB model did not perform well: its accuracy is far below the baseline on both the training and the test data.

Simple Model 5 - C-Support Vector Classifier (default hyperparameters)

In [142]:
# C-Support Vector Classifier (SVC)
svc_model_pipe = Pipeline([('ct',CT), ('svc', SVC())])
# Use the shared helper rather than repeating its body: it cross-validates
# on the train and test data and prints both summaries.
svc_model_train, svc_model_test = ModelItAndPrintResults(
    svc_model_pipe, 'svc_model', X_train, y_train, X_test, y_test
)

CV Results for `svc_model_train` model: 0.90316 ± 0.01358 accuracy
CV Results for `svc_model_test` model: 0.89567 ± 0.02151 accuracy


The SVC model improved the average accuracy over the baseline model by 4-5%.  It is slightly overfit as the performance on test data is a little less accurate than the performance on training data.     

Simple Model 6 - Logistic Regression Classifier (default hyperparameters)

In [143]:
# Logistic Regression Classifier
logreg_model_pipe = Pipeline([
    ('ct',CT),
    ('logreg', LogisticRegression())
])
# Use the shared helper rather than repeating its body: it cross-validates
# on the train and test data and prints both summaries.
logreg_model_train, logreg_model_test = ModelItAndPrintResults(
    logreg_model_pipe, 'logreg_model', X_train, y_train, X_test, y_test
)

CV Results for `logreg_model_train` model: 0.85915 ± 0.01447 accuracy
CV Results for `logreg_model_test` model: 0.86338 ± 0.02604 accuracy


The Logistic Regression Classifier barely outperforms the baseline model.  It also appears potentially underfit, as it is slightly more accurate on the test data than on the training data.

Simple Model 7 - AdaBoost Classifier (default hyperparameters)

In [145]:
# AdaBoost Classifier
ada_model_pipe = Pipeline([
    ('ct',CT),
    # The step was labelled 'svc' (copied from the SVC cell); it holds an
    # AdaBoost estimator, so its parameters are addressed as `ada__<param>`.
    ('ada', AdaBoostClassifier())
])
# Use the shared helper rather than repeating its body: it cross-validates
# on the train and test data and prints both summaries.
ada_model_train, ada_model_test = ModelItAndPrintResults(
    ada_model_pipe, 'ada_model', X_train, y_train, X_test, y_test
)

CV Results for `ada_model_train` model: 0.87676 ± 0.01943 accuracy
CV Results for `ada_model_test` model: 0.85974 ± 0.04410 accuracy
