# Metrics, Evaluation, Training and Tuning
We will explore the tools tecniques to:
- assest the performance of a predictor model
- Choose the best model

We will do this in 2 parts:
1. Building a test model: Cleaning data, using logistic regression, create some predictions
2. Evaluating and Tuning

# The predictor model: Logistic Regression

In [22]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline
#Data overview
df = pd.read_csv('../../datasets/kaggle/WA_Fn-UseC_-Telco-Customer-Churn.csv')
print(f'number of rows:{len(df)}')
df.head().T

number of rows:7043


Unnamed: 0,0,1,2,3,4
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No phone service,No,No,No phone service,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No


In [23]:
# The column: TotalCharges needs to be a number
# Lets fill with 0 where function couldn't create a numeric value
df.TotalCharges = pd.to_numeric(df.TotalCharges, errors="coerce")
df.TotalCharges = df.TotalCharges.fillna(0)
# Let’s make it uniform by lowercasing everything and replacing spaces with underscores
df.columns = df.columns.str.lower().str.replace(' ', '_')
string_columns = list(df.dtypes[df.dtypes == 'object'].index)
 
for col in string_columns:
    df[col] = df[col].str.lower().str.replace(' ', '_')
df.head()

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,7590-vhveg,female,0,yes,no,1,no,no_phone_service,dsl,no,...,no,no,no,no,month-to-month,yes,electronic_check,29.85,29.85,no
1,5575-gnvde,male,0,no,no,34,yes,no,dsl,yes,...,yes,no,no,no,one_year,no,mailed_check,56.95,1889.5,no
2,3668-qpybk,male,0,no,no,2,yes,no,dsl,yes,...,no,no,no,no,month-to-month,yes,mailed_check,53.85,108.15,yes
3,7795-cfocw,male,0,no,no,45,no,no_phone_service,dsl,yes,...,yes,yes,no,no,one_year,no,bank_transfer_(automatic),42.3,1840.75,no
4,9237-hqitu,female,0,no,no,2,yes,no,fiber_optic,no,...,no,no,no,no,month-to-month,yes,electronic_check,70.7,151.65,yes


In [24]:
# turn our attention to churn column, which is going to be our Target
df.churn = (df.churn == 'yes').astype(int)
df.churn.value_counts()

0    5174
1    1869
Name: churn, dtype: int64

In [49]:
# Separate data for training and testing
from sklearn.model_selection import train_test_split
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=1)
print(f'rows in full training data: {len(df_train_full)}')
print(f'rows in test data: {len(df_test)}')

# Use training data to separate into train and validation
df_train, df_val = train_test_split(df_train_full, test_size=0.33, random_state=11)
print(f'rows in training data: {len(df_train)}')
print(f'rows in validation data: {len(df_val)}')

# Obtain Target feature
y_train = df_train.churn.values
y_val = df_val.churn.values

# Remove target from X features
del df_train['churn']
del df_val['churn']

rows in full training data: 5634
rows in test data: 1409
rows in training data: 3774
rows in validation data: 1860


### Exploratory Data Analysis

In [26]:
#check for any missing values
df_train_full.isnull().sum()

customerid          0
gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

In [27]:
# Calculate churn rate
global_mean = round(df_train_full.churn.mean(),3)
# The global_mean indicates that this dataset is 'imbalanced'
print(f'global churn mean: {global_mean}')
# Let's divide our data in Numerical and Categorical variables
categorical = ['gender', 'seniorcitizen', 'partner', 'dependents',
               'phoneservice', 'multiplelines', 'internetservice',
               'onlinesecurity', 'onlinebackup', 'deviceprotection',
               'techsupport', 'streamingtv', 'streamingmovies',
               'contract', 'paperlessbilling', 'paymentmethod']
numerical = ['tenure', 'monthlycharges', 'totalcharges']

# Notice that our categorical values have few unique values
# Which is something good as it reduces the amount of cleaning
df_train_full[categorical].nunique()

global churn mean: 0.27


gender              2
seniorcitizen       2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64

In [28]:
# Churn Rates
#Lets calculate the risk for each category
for col in categorical:
    df_group = df_train_full.groupby(by=col).churn.agg(['mean']) 
    df_group['diff'] = df_group['mean'] - global_mean
    df_group['risk'] = df_group['mean'] / global_mean
    display(df_group)

Unnamed: 0_level_0,mean,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.276824,0.006824,1.025274
male,0.263214,-0.006786,0.974865


Unnamed: 0_level_0,mean,diff,risk
seniorcitizen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.24227,-0.02773,0.897297
1,0.413377,0.143377,1.531027


Unnamed: 0_level_0,mean,diff,risk
partner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.329809,0.059809,1.221515
yes,0.205033,-0.064967,0.759383


Unnamed: 0_level_0,mean,diff,risk
dependents,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.31376,0.04376,1.162074
yes,0.165666,-0.104334,0.613579


Unnamed: 0_level_0,mean,diff,risk
phoneservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.241316,-0.028684,0.893764
yes,0.273049,0.003049,1.011292


Unnamed: 0_level_0,mean,diff,risk
multiplelines,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.257407,-0.012593,0.953361
no_phone_service,0.241316,-0.028684,0.893764
yes,0.290742,0.020742,1.07682


Unnamed: 0_level_0,mean,diff,risk
internetservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
dsl,0.192347,-0.077653,0.712398
fiber_optic,0.425171,0.155171,1.574709
no,0.077805,-0.192195,0.288167


Unnamed: 0_level_0,mean,diff,risk
onlinesecurity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.420921,0.150921,1.558967
no_internet_service,0.077805,-0.192195,0.288167
yes,0.153226,-0.116774,0.567503


Unnamed: 0_level_0,mean,diff,risk
onlinebackup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.404323,0.134323,1.497494
no_internet_service,0.077805,-0.192195,0.288167
yes,0.217232,-0.052768,0.804564


Unnamed: 0_level_0,mean,diff,risk
deviceprotection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.395875,0.125875,1.466205
no_internet_service,0.077805,-0.192195,0.288167
yes,0.230412,-0.039588,0.853379


Unnamed: 0_level_0,mean,diff,risk
techsupport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.418914,0.148914,1.551534
no_internet_service,0.077805,-0.192195,0.288167
yes,0.159926,-0.110074,0.59232


Unnamed: 0_level_0,mean,diff,risk
streamingtv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.342832,0.072832,1.269747
no_internet_service,0.077805,-0.192195,0.288167
yes,0.302723,0.032723,1.121195


Unnamed: 0_level_0,mean,diff,risk
streamingmovies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.338906,0.068906,1.255209
no_internet_service,0.077805,-0.192195,0.288167
yes,0.307273,0.037273,1.138047


Unnamed: 0_level_0,mean,diff,risk
contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
month-to-month,0.431701,0.161701,1.598893
one_year,0.120573,-0.149427,0.446568
two_year,0.028274,-0.241726,0.104718


Unnamed: 0_level_0,mean,diff,risk
paperlessbilling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.172071,-0.097929,0.6373
yes,0.338151,0.068151,1.252412


Unnamed: 0_level_0,mean,diff,risk
paymentmethod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bank_transfer_(automatic),0.168171,-0.101829,0.622854
credit_card_(automatic),0.164339,-0.105661,0.608661
electronic_check,0.45589,0.18589,1.688482
mailed_check,0.19387,-0.07613,0.718036


In [29]:
# Mutual information
from sklearn.metrics import mutual_info_score
 
def calculate_mi(series):                                      
    return mutual_info_score(series, df_train_full.churn)      
 
df_mi = df_train_full[categorical].apply(calculate_mi)         
df_mi = df_mi.sort_values(ascending=False).to_frame(name='MI') 
# As we see, contract, onlinesecurity, and techsupport are among the most important features
df_mi

Unnamed: 0,MI
contract,0.09832
onlinesecurity,0.063085
techsupport,0.061032
internetservice,0.055868
onlinebackup,0.046923
deviceprotection,0.043453
paymentmethod,0.04321
streamingtv,0.031853
streamingmovies,0.031581
paperlessbilling,0.017589


In [50]:
# Correlation
df_train_full[numerical].corrwith(df_train_full.churn)

tenure           -0.351885
monthlycharges    0.196805
totalcharges     -0.196353
dtype: float64

In [52]:
# Feature Engineering
from sklearn.feature_extraction import DictVectorizer
# Train Dictionary
train_dict = df_train[categorical + numerical].to_dict(orient='records')

dv = DictVectorizer(sparse=False)
dv.fit(train_dict)
# Obtain the X_train feature matrix
X_train = dv.transform(train_dict)
X_train.shape

(3774, 45)

## The logistic Regression Model

In [53]:
from sklearn.linear_model import LogisticRegression
# Create and train model
model = LogisticRegression(solver='liblinear', random_state=1)
model.fit(X_train, y_train)

# Validate
# Use the validation data to obtain a validation data frame
val_dict = df_val[categorical + numerical].to_dict(orient='records') 
X_val = dv.transform(val_dict) 

# Create some predictions
y_pred = model.predict_proba(X_val)
y_pred

array([[0.76508733, 0.23491267],
       [0.73112969, 0.26887031],
       [0.6805478 , 0.3194522 ],
       ...,
       [0.94274623, 0.05725377],
       [0.38476886, 0.61523114],
       [0.93872731, 0.06127269]])

In [54]:
# Notice that the shape is of type (1860,2)
# We have 1860 rows and 2 columns. The first column denotes the probablity
# of a customer NOT Churning, the second the probability of the customer 
# Churning, thus the sum of this probabilities is alwats 1
y_pred[:,0]+y_pred[:,1]

array([1., 1., 1., ..., 1., 1., 1.])

In [34]:
# We only need one column
churn = y_pred[:,1] >= 0.5
churn

array([False, False, False, ..., False,  True, False])

In [35]:
# Accuracy
(y_val == churn).mean()

0.8016129032258065

In [36]:

small_subset = ['contract', 'tenure', 'totalcharges']
train_dict_small = df_train[small_subset].to_dict(orient='records')
dv_small = DictVectorizer(sparse=False)
dv_small.fit(train_dict_small)

X_small_train = dv_small.transform(train_dict_small)
model_small = LogisticRegression(solver='liblinear', random_state=1)
model_small.fit(X_small_train, y_train)

LogisticRegression(random_state=1, solver='liblinear')

# Evaluation Metrics for classification

## Accuracy

### Logistic Regression Model

In [37]:
from sklearn.metrics import accuracy_score
def classification_accuracy(pred, y):
    return (pred == y).mean()

my_accuracy = classification_accuracy(pred= churn, y=y_val)
churn_accuracy = accuracy_score(y_val, churn)
print(f'my_accuracy: {my_accuracy}, sklearn:{churn_accuracy}')


my_accuracy: 0.8016129032258065, sklearn:0.8016129032258065


### Small Logistic Regression Model

In [38]:
# Compare accuracy with another model (small model)
val_dict_small = df_val[small_subset].to_dict(orient='records')  
X_small_val = dv_small.transform(val_dict_small)                 
y_pred_small = model_small.predict_proba(X_small_val)[:, 1]      
 
churn_small = y_pred_small >= 0.5                                
print(f'small model accuracy:{accuracy_score(y_val, churn_small)}')  

small model accuracy:0.7672043010752688


### Dummy Model


In [39]:
dummy_pred = np.repeat(False, len(y_val))
print(f'dummy model accuracy: {accuracy_score(y_val, dummy_pred)}')

dummy model accuracy: 0.7387096774193549


### Tree Model

In [72]:
# Add a Decision Tree
from sklearn.tree import DecisionTreeClassifier
# Instantiate your decision tree model
tree_model = DecisionTreeClassifier()
tree_model.fit(X_train, y_train)
tree_pred = tree_model.predict(X_val)
tree_pred



array([0, 1, 1, ..., 0, 0, 1])

### Compare models

In [73]:
model_accuracy_results = [['LogisticRegression', accuracy_score(y_val, churn)],
['SmallLogisticRegression', accuracy_score(y_val, churn_small)],
['Dummy', accuracy_score(y_val, dummy_pred)],
['DecisionTree', accuracy_score(y_val, tree_pred)]]
df_model_accuracies = pd.DataFrame(model_accuracy_results, columns=['Model','Accuracy'])
df_model_accuracies

Unnamed: 0,Model,Accuracy
0,LogisticRegression,0.801613
1,SmallLogisticRegression,0.767204
2,Dummy,0.73871
3,DecisionTree,0.72043


### Thresholds VS Accuracy

In [None]:
y_pred_small
y_pred
threshold = np.linspace(0, 1, 21)
accuracies = {"Accuracy":[], "LogisticRegressionModel":[], "SmallModel":[]}
for t in threshold:
    accuracies["Accuracy"].append(t)
    model_accuracy = accuracy_score(y_val, (y_pred >=t))
    accuracies["LogisticRegressionModel"].append(model_accuracy)
    small_model_accuracy = accuracy_score(y_val, (y_pred_small >=t))
    accuracies["SmallModel"].append(small_model_accuracy)

df_logisticRegression_accuracies = pd.DataFrame(accuracies)
df_logisticRegression_accuracies.head()
plt.plot(df_logisticRegression_accuracies.Accuracy, df_logisticRegression_accuracies.LogisticRegressionModel, color='orange')
plt.plot(df_logisticRegression_accuracies.Accuracy, df_logisticRegression_accuracies.SmallModel, color='green')


In [None]:
def confusion_table(y_val,y_pred):
    t = 0.5 #threshold
    predict_churn = y_pred >= t
    predict_no_churn = y_pred < t
    actual_churn = y_val == 1
    actual_no_churn = y_val == 0
    
    true_positives =  (predict_churn & actual_churn).sum()
    false_positives =  (predict_churn & actual_no_churn).sum()
    true_negatives = (predict_no_churn & actual_no_churn).sum()
    false_negatives = (predict_no_churn & actual_churn).sum()

    table = np.array([[true_negatives, false_positives],[false_negatives, true_positives]])
    display(table)
    

y_pred = model.predict_proba(X_val)[:,1]
confusion_table(y_val, y_pred)
