In [24]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from scipy.stats import kstest
from scipy.stats import norm
from sklearn.pipeline import Pipeline
#model modules
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import GradientBoostingClassifier
import warnings
warnings.filterwarnings("ignore")

In [2]:
org_data=pd.read_csv("datasets/Telco_Cusomer_Churn.csv")

In [3]:
org_data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
org_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [5]:
#impute org_data["TotalCharges"] with mode
org_data["TotalCharges"].replace(" ", 0, inplace=True)

In [6]:
org_data["TotalCharges"]=pd.to_numeric(org_data["TotalCharges"])

In [7]:
org_data=org_data.drop("customerID", axis=1)

In [8]:
numeric_cols={}
nonnumeric_cols={}
for i in org_data.columns:
    if isinstance(org_data[i][0], str):
        labeler=LabelEncoder()
        labels=labeler.fit_transform(org_data[i])
        nonnumeric_cols[i]=labels
    else:
        numeric_cols[i]=org_data[i]

In [9]:
dataset=pd.concat([pd.DataFrame.from_dict(nonnumeric_cols), pd.DataFrame.from_dict(numeric_cols)], axis=1)

In [10]:
dataset.head()

Unnamed: 0,gender,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,Churn,SeniorCitizen,tenure,MonthlyCharges,TotalCharges
0,0,1,0,0,1,0,0,2,0,0,0,0,0,1,2,0,0,1,29.85,29.85
1,1,0,0,1,0,0,2,0,2,0,0,0,1,0,3,0,0,34,56.95,1889.5
2,1,0,0,1,0,0,2,2,0,0,0,0,0,1,3,1,0,2,53.85,108.15
3,1,0,0,0,1,0,2,0,2,2,0,0,1,0,0,0,0,45,42.3,1840.75
4,0,0,0,1,0,1,0,0,0,0,0,0,0,1,2,1,0,2,70.7,151.65


In [85]:
#inspect multicollinearity
dataset.corr().where(abs(dataset.corr())>.79)

Unnamed: 0,gender,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,Churn,SeniorCitizen,tenure,MonthlyCharges,TotalCharges
gender,1.0,,,,,,,,,,,,,,,,,,,
Partner,,1.0,,,,,,,,,,,,,,,,,,
Dependents,,,1.0,,,,,,,,,,,,,,,,,
PhoneService,,,,1.0,,,,,,,,,,,,,,,,
MultipleLines,,,,,1.0,,,,,,,,,,,,,,,
InternetService,,,,,,1.0,,,,,,,,,,,,,,
OnlineSecurity,,,,,,,1.0,,,,,,,,,,,,,
OnlineBackup,,,,,,,,1.0,,,,,,,,,,,,
DeviceProtection,,,,,,,,,1.0,,,,,,,,,,,
TechSupport,,,,,,,,,,1.0,,,,,,,,,,


In [86]:
dataset.corr()

Unnamed: 0,gender,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,Churn,SeniorCitizen,tenure,MonthlyCharges,TotalCharges
gender,1.0,-0.001808,0.010517,-0.006488,-0.006739,-0.000863,-0.015017,-0.012057,0.000549,-0.006825,-0.006421,-0.008743,0.000126,-0.011754,0.017352,-0.008612,-0.001874,0.005106,-0.014569,-8e-05
Partner,-0.001808,1.0,0.452676,0.017706,0.14241,0.000891,0.150828,0.15313,0.16633,0.126733,0.137341,0.129574,0.294806,-0.014877,-0.154798,-0.150448,0.016479,0.379697,0.096848,0.317504
Dependents,0.010517,0.452676,1.0,-0.001762,-0.024991,0.04459,0.152166,0.091015,0.080537,0.133524,0.046885,0.021321,0.243187,-0.111377,-0.040292,-0.164221,-0.211185,0.159712,-0.11389,0.062078
PhoneService,-0.006488,0.017706,-0.001762,1.0,-0.020538,0.387436,-0.015198,0.024105,0.003727,-0.019158,0.055353,0.04387,0.002247,0.016505,-0.004184,0.011942,0.008576,0.008448,0.247398,0.113214
MultipleLines,-0.006739,0.14241,-0.024991,-0.020538,1.0,-0.109216,0.007141,0.117327,0.122318,0.011466,0.175059,0.180957,0.110842,0.165146,-0.176793,0.038037,0.146185,0.343032,0.433576,0.452577
InternetService,-0.000863,0.000891,0.04459,0.387436,-0.109216,1.0,-0.028416,0.036138,0.044944,-0.026047,0.107417,0.09835,0.099721,-0.138625,0.08614,-0.047291,-0.03231,-0.030359,-0.32326,-0.175755
OnlineSecurity,-0.015017,0.150828,0.152166,-0.015198,0.007141,-0.028416,1.0,0.185126,0.175985,0.285028,0.044669,0.055954,0.374416,-0.157641,-0.096726,-0.289309,-0.128221,0.325468,-0.053878,0.253224
OnlineBackup,-0.012057,0.15313,0.091015,0.024105,0.117327,0.036138,0.185126,1.0,0.187757,0.195748,0.147186,0.136722,0.28098,-0.01337,-0.124847,-0.195525,-0.013632,0.370876,0.119777,0.37441
DeviceProtection,0.000549,0.16633,0.080537,0.003727,0.122318,0.044944,0.175985,0.187757,1.0,0.240593,0.276652,0.288799,0.350277,-0.038234,-0.13575,-0.178134,-0.021398,0.371105,0.163652,0.387897
TechSupport,-0.006825,0.126733,0.133524,-0.019158,0.011466,-0.026047,0.285028,0.195748,0.240593,1.0,0.161305,0.161316,0.425367,-0.1136,-0.10467,-0.282492,-0.151268,0.322942,-0.008682,0.275625


In [87]:
dataset=dataset.drop("TotalCharges", axis=1)

In [88]:
#test for normality
for i in dataset.columns:
    stat, p = kstest(dataset[i], 'norm', args=(dataset[i].mean(), dataset[i].std()))
    print(f'Statistic={stat:.3f}, p-value={p:.3f}')

Statistic=0.344, p-value=0.000
Statistic=0.350, p-value=0.000
Statistic=0.444, p-value=0.000
Statistic=0.531, p-value=0.000
Statistic=0.321, p-value=0.000
Statistic=0.225, p-value=0.000
Statistic=0.318, p-value=0.000
Statistic=0.287, p-value=0.000
Statistic=0.287, p-value=0.000
Statistic=0.316, p-value=0.000
Statistic=0.266, p-value=0.000
Statistic=0.264, p-value=0.000
Statistic=0.346, p-value=0.000
Statistic=0.389, p-value=0.000
Statistic=0.220, p-value=0.000
Statistic=0.461, p-value=0.000
Statistic=0.508, p-value=0.000
Statistic=0.111, p-value=0.000
Statistic=0.126, p-value=0.000


In [89]:
X=dataset.drop("Churn", axis=1)
y=dataset["Churn"]

In [90]:
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=.30, random_state=42)

In [91]:
print('Training Features Shape:', X_train.shape)
print('Training Labels Shape:', X_test.shape)
print('Testing Features Shape:', y_train.shape)
print('Testing Labels Shape:', y_test.shape)

Training Features Shape: (4930, 18)
Training Labels Shape: (2113, 18)
Testing Features Shape: (4930,)
Testing Labels Shape: (2113,)


In [92]:
def classifier_scoring (X_train, X_test, y_train, y_test):
	classifiers = [
	RandomForestClassifier(random_state=42),
	RidgeClassifier(random_state=42),
	GradientBoostingClassifier(random_state=42),
	LogisticRegression(random_state=42)
	]
	
	scores=[]
	for i in classifiers:
		pipe = Pipeline(steps=[('classifier', i)])
		pi=pipe.fit(X_train, y_train)
		scores.append(cross_val_score(pi, X_test, y_test, cv=5,
		verbose=True).mean())
	return ["Random Forest Score {}".format(scores[0]), "RidgeClassifier Score {}".format(scores[1]), "Gradient Boosting Score {}".format(scores[2]),
	"Logistic Regression Score {}".format(scores[3])]

In [93]:
#get best classifiers by score results. 
#raise exception if number of classifiers do not match number of score results
try:
	score_results=classifier_scoring(X_train, X_test, y_train, y_test)
	print("Best Scoring Classifier: {}".format(sorted(score_results, reverse=False)[0]))
except IndexError:
	print("No Results. Number of classifers need to match number of scores to return!")
else:
	pass

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.9s finished
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.9s finished


Best Scoring Classifier: Gradient Boosting Score 0.8012324515702554


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.1s finished


In [94]:
gb=GradientBoostingClassifier(max_depth=3, min_samples_split=2,
                              min_samples_leaf=1, subsample=.9, random_state=42)

In [95]:
gb.fit(X_train, y_train)

0,1,2
,loss,'log_loss'
,learning_rate,0.1
,n_estimators,100
,subsample,0.9
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,3
,min_impurity_decrease,0.0


In [96]:
predictions=gb.predict(X_test)

In [97]:
accuracy_score(predictions, y_test)

0.8045433033601515

In [98]:
import joblib
joblib.dump(gb, 'telco_customer_churn.pkl')

['telco_customer_churn.pkl']