# Model Development

In this notebook, we will be developing predictive models using the `Telco Customer Churn` dataset. Using `sklearn` models: Logistic Regression, Decision Tree, and K-Nearest Neighbors; we will craft various classifiers as base models and then to optimize using the cost function, regularization and hyperparameter tuning.

By the end of this notebook, we will establish foundations for evaluating which model will perform best for predicting churn.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import log_loss, roc_curve, roc_auc_score
import warnings
warnings.filterwarnings('ignore')
plt.style.use('ggplot')
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:,.2f}'.format)

df = pd.read_csv('../data/encoded_telco_churn.csv')
df

Unnamed: 0,Male,Partner,Dependents,SeniorCitizen,DurationMonths,PhoneService,MultipleLines,NoInternet,DSLInternet,FiberOpticInternet,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,MonthlyContract,AnnualContract,BiannualContract,AutoBankTransfer,AutoCreditCard,ElectronicCheck,MailedCheck,PaperlessBilling,MonthlyCharges,TotalCharges,Churn
0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,29.85,29.85,0
1,1,0,0,0,34,1,0,0,1,0,1,0,1,0,0,0,0,1,0,0,0,0,1,0,56.95,1889.50,0
2,1,0,0,0,2,1,0,0,1,0,1,1,0,0,0,0,1,0,0,0,0,0,1,1,53.85,108.15,1
3,1,0,0,0,45,0,0,0,1,0,1,0,1,1,0,0,0,1,0,1,0,0,0,0,42.30,1840.75,0
4,0,0,0,0,2,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,1,70.70,151.65,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,1,1,1,0,24,1,1,0,1,0,1,0,1,1,1,1,0,1,0,0,0,0,1,1,84.80,1990.50,0
7028,0,1,1,0,72,1,1,0,0,1,0,1,1,0,1,1,0,1,0,0,1,0,0,1,103.20,7362.90,0
7029,0,1,1,0,11,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,29.60,346.45,0
7030,1,1,0,1,4,1,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,1,74.40,306.60,1


## Logistic Regression

In [2]:
X = df.drop('Churn', axis=1)
y = df['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [3]:
logreg_base = LogisticRegression()
logreg_base.fit(X_train, y_train)

In [4]:
logreg_ypred = logreg_base.predict(X_test)
logreg_accuracy = accuracy_score(y_test, logreg_ypred)

logreg_ypred_proba = logreg_base.predict_proba(X_test)
logreg_logloss = log_loss(y_test, logreg_ypred_proba)

In [5]:
print(accuracy)
print(logreg_logloss)

NameError: name 'accuracy' is not defined

## Logistic Regression with C Regularization

In [None]:
C_list = [1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1e3]
cv_scores = []
cv_scores_std = []

for c in C_list:
    logreg = LogisticRegression(C=c, random_state=42)
    cv_loop_results = cross_validate(
                                    X=X_train,
                                    y=y_train,
                                    estimator=logreg,
                                    cv=8)
    cv_scores.append(np.mean(np.sqrt(np.abs(cv_loop_results['test_score']))))
    cv_scores_std.append(np.std(np.sqrt(np.abs(cv_loop_results['test_score']))))

In [None]:
cv_scores, cv_scores_std

In [None]:
fig, ax = plt.subplots()
sns.lineplot(x = np.log10(C_list), y = cv_scores, marker = 's', ax = ax)
ax.set_xlabel('Log(C)')
ax.set_ylabel('Mean Accuracy')
ax.set_title('Accuracy Averaged on LogReg C Validation Folds')
plt.show()

Regularization Strength `C=-2` average cross-validation score is almost 0.898

In [None]:
logreg_best = LogisticRegression(C=1e-2)
logreg_best.fit(X_train, y_train)

In [None]:
logbest_ypred = logreg_best.predict(X_test)
logbest_accuracy = accuracy_score(y_test, logbest_ypred)
logbest_ypred_proba = logreg_best.predict_proba(X_test)
logbest_logloss = log_loss(y_test, logbest_ypred_proba)

In [None]:
print(logbest_accuracy)
print(logbest_logloss)

## Decision Tree

In [None]:
X = df.drop('Churn', axis=1)
y = df['Churn']
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=42)

In [None]:
dtree_base = DecisionTreeClassifier()
dtree_base.fit(X_train, y_train)

In [None]:
dtree_ypred = dtree_base.predict(X_test)
dtree_accuracy = accuracy_score(y_test, dtree_ypred)
dtree_report = classification_report(y_test, dtree_ypred)

In [None]:
print(dtree_accuracy)
print(dtree_report)

## Decision Tree with Tuning

In [None]:
# StackOverflow
# https://stackoverflow.com/questions/72924835/sklearn-decisiontreeclassifier-loop-for-max-depth

In [None]:
max_depth_list = [10, 20, 30, 40, 50]
cv_scores = []
cv_scores_std = []

for depth in max_depth_list:
    dtree = DecisionTreeClassifier(max_depth=depth, random_state=42)
    cv_loop_results = cross_validate(
                                X=X_train,
                                y=y_train,
                                estimator=dtree,
                                cv=8)
    cv_scores.append(np.mean(np.sqrt(np.abs(cv_loop_results['test_score']))))
    cv_scores_std.append(np.std(np.sqrt(np.abs(cv_loop_results['test_score']))))

In [None]:
best_depth = {'max_depth':max_depth_list, 'cv_scores':cv_scores, 'cv_scores_std':cv_scores_std}
best_depth = pd.DataFrame(best_depth)
best_depth

In [None]:
fig, ax = plt.subplots()
sns.lineplot(x = max_depth_list, y = cv_scores, marker = 's', ax = ax)
ax.set_xlabel('Max Depth')
ax.set_ylabel('Mean Accuracy')
ax.set_title('Accuracy averaged on Max Depth Validation Folds')
plt.show()

## Script

In [None]:
try:
    print('Script Executed Successfully')
except:
    print('FAILED')