## Telco Customer Churn - Modeling 

### Import required packages and load dataset

In [63]:
# general packages
import pandas as pd
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
import warnings

# sklearn packages
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model, metrics
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold, train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier

# imbelearn package
from imblearn.over_sampling import SMOTE

In [18]:
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv', sep = ',')

# print first few rows of df 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [19]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


### Convert Total Charges variable to float for numeric analysis and impute median value for records converted to NA after float conversion

In [20]:
# convert the data type of TotalCharges from 'object' to 'float'
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'],errors='coerce')

df.TotalCharges.fillna(value=df['TotalCharges'].median(),inplace=True)

### Convert Tenure with the company into categories for new, middle, long-term customers. Break monthly charges into categories for low, medium, high monthly charges. Break total charges into categories for low, high total charges

In [21]:
# Convert Tenure with the company into categories for new, middle, long-term customers 
df.loc[df['tenure']<=15, 'tenureCat'] = "New Customers"
df.loc[df['tenure'].between(16,50), 'tenureCat'] = "Middle Customers"
df.loc[df['tenure']>=51, 'tenureCat'] = "Long-Time Customers"

# Break monthly charges into categories for low, medium, high monthly charges 
df.loc[df['MonthlyCharges']<=35, 'MCC'] = "0-35"
df.loc[df['MonthlyCharges'].between(36,65), 'MCC'] = "36-65"
df.loc[df['MonthlyCharges']>=66, 'MCC'] = ">66"

# Break total charges into categories for low, high total charges 
df.loc[df['TotalCharges']<=2000, 'TCC'] = "0-2000"
df.loc[df['TotalCharges']>=2001, 'TCC'] = ">2000"

### Convert categorical variables into numeric categories

In [22]:
df['gender'] =df['gender'].astype('category').cat.codes
df['tenure'] =df['tenure'].astype('category').cat.codes
df['SeniorCitizen'] =df['SeniorCitizen'].astype('category').cat.codes
df['Partner'] =df['Partner'].astype('category').cat.codes
df['Dependents'] =df['Dependents'].astype('category').cat.codes
df['PhoneService'] =df['PhoneService'].astype('category').cat.codes
df['MultipleLines'] =df['MultipleLines'].astype('category').cat.codes
df['InternetService'] =df['InternetService'].astype('category').cat.codes
df['OnlineSecurity'] =df['OnlineSecurity'].astype('category').cat.codes
df['OnlineBackup'] =df['OnlineBackup'].astype('category').cat.codes
df['DeviceProtection'] =df['DeviceProtection'].astype('category').cat.codes
df['TechSupport'] =df['TechSupport'].astype('category').cat.codes
df['StreamingTV'] =df['StreamingTV'].astype('category').cat.codes
df['StreamingMovies'] =df['StreamingMovies'].astype('category').cat.codes
df['Contract'] =df['Contract'].astype('category').cat.codes
df['PaperlessBilling'] =df['PaperlessBilling'].astype('category').cat.codes
df['PaymentMethod'] =df['PaymentMethod'].astype('category').cat.codes
df['Churn'] =df['Churn'].astype('category').cat.codes
# User created categorical variabels 
df['tenureCat'] =df['tenureCat'].astype('category').cat.codes
df['TCC'] =df['TCC'].astype('category').cat.codes
df['MCC'] =df['MCC'].astype('category').cat.codes

### Drop customerID, as we will not need it for modeling 

In [23]:
df = df.drop('customerID', 1)

df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,...,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,tenureCat,MCC,TCC
0,0,0,1,0,1,0,1,0,0,2,...,0,0,1,2,29.85,29.85,0,2,0,0
1,1,0,0,0,34,1,0,0,2,0,...,0,1,0,3,56.95,1889.5,0,1,1,0
2,1,0,0,0,2,1,0,0,2,2,...,0,0,1,3,53.85,108.15,1,2,1,0
3,1,0,0,0,45,0,1,0,2,0,...,0,1,0,0,42.3,1840.75,0,1,1,0
4,0,0,0,0,2,1,0,1,0,0,...,0,0,1,2,70.7,151.65,1,2,2,0


### Identify highly correlated features and remove

In [11]:
# # Create correlation matrix
# corr_matrix = df.corr().abs()

# # Select upper triangle of correlation matrix
# upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

# # Find features with correlation greater than 0.90
# to_drop = [column for column in upper.columns if any(upper[column] > 0.90)]

# to_drop

### Split Dataset into test/train, check to ensure spilt and binary ratios match between train and test data

In [30]:
X = df.iloc[:, [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,20,21,22]]
y = df.iloc[:, 19]
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y)

# check split of data
len(x_train), len(y_train), len(x_test), len(y_test), len(df)

(5634, 5634, 1409, 1409, 7043)

### Conduct standard scaling on the dataset. For the training dataset we will fit and normalize. For the test dataset we will just trasnform the variables, using the same params that it learned from the training data

In [34]:
# Conduct Standard Scaling 
scaler = StandardScaler()
x_train = scaler.fit_transform( x_train )
x_test = scaler.transform( x_test )

### Employ SMOTE on training dataset. We are using SMOTE because of the imbalance in our churn rate response variable. This is an oversampling technique

In [36]:
sm = SMOTE()
x_train, y_train = sm.fit_resample(x_train, y_train)

# check shape of x_train and y_train, and new response variable ratio
print(x_train.shape)
print(y_train.shape)
print(y_train.value_counts(normalize = True))

(8278, 22)
(8278,)
0    0.5
1    0.5
Name: Churn, dtype: float64


### We will run our training dataset through a number of different binary classification models, using negative mean squared error as a measuring stick to determine which models may provide optimized outputs. 

### Initial standard hyperparamters were used for initial testing, but we will run hyperparameter tuning for models we choose to move forward with

### As we can see, models employing ensemble learning perform better than standard binary classification models 

In [64]:
pipelines = []
# Standard binary classification models 
pipelines.append(('LogisticRegression', Pipeline([('LR',linear_model.LogisticRegression(max_iter=1000))])))
pipelines.append(('KNearestNeighbors', Pipeline([('KNN',KNeighborsClassifier(algorithm = 'brute', n_jobs=-1))])))
pipelines.append(('LinearSVC', Pipeline([('SVC',LinearSVC(C=0.0001))])))
pipelines.append(('DecisionTree', Pipeline([('DTREE',DecisionTreeClassifier())])))
# Employment of ensemble learning 
pipelines.append(('BaggingClassifier', Pipeline([('BAG',BaggingClassifier(DecisionTreeClassifier(),max_samples=0.5,max_features=1.0,n_estimators=10))])))
pipelines.append(('BoostClassifier', Pipeline([('BOOST',AdaBoostClassifier(DecisionTreeClassifier(min_samples_split=10,max_depth=4),n_estimators=10,learning_rate=0.6))])))
pipelines.append(('RandomForest', Pipeline([('FOREST',RandomForestClassifier(n_estimators=30, max_depth=9))])))
pipelines.append(('GradientBoost', Pipeline([('GBoost',GradientBoostingClassifier())])))

results = []
names = []
for name, model in pipelines:
    kfold = KFold(n_splits=10, random_state=21, shuffle=True)
    cv_results = cross_val_score(model, x_train, y_train, cv=kfold, scoring='neg_mean_squared_error')
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

LogisticRegression: -0.222881 (0.013103)
KNearestNeighbors: -0.204882 (0.012196)
LinearSVC: -0.237740 (0.011994)
DecisionTree: -0.199446 (0.013023)
BaggingClassifier: -0.169607 (0.016159)
BoostClassifier: -0.159820 (0.013245)
RandomForest: -0.162842 (0.012550)
GradientBoost: -0.153058 (0.013637)
