In [29]:
import pandas as pd
import numpy as np

import pickle
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
import joblib
from matplotlib import pyplot as plt
%matplotlib inline

In [30]:
loaded_model = joblib.load("./models/logistic_reg.bin")

In [31]:
loaded_model

(DictVectorizer(sparse=False),
 GridSearchCV(cv=5,
              estimator=LogisticRegression(class_weight='balanced',
                                           random_state=1, solver='liblinear'),
              param_grid={'C': [0.001, 0.01, 0.1, 1],
                          'fit_intercept': [True, False]},
              scoring='roc_auc'))

In [32]:
def read_dataframe(filename: str):
    df = pd.read_csv(filename)

    df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
    df['TotalCharges'] = df['TotalCharges'].fillna(0)

    df.columns = df.columns.str.lower().str.replace(' ','_')
    string_columns = list(df.dtypes[df.dtypes == 'object'].index)

    for col in string_columns:
        df[col] = df[col].str.lower().str.replace(' ', '_')

    df.churn = (df.churn == 'yes').astype(int)

    return df

In [33]:
def prepare_dictionaries(df: pd.DataFrame):

    categorical = ['gender', 'seniorcitizen', 'partner', 'dependents','phoneservice', 'multiplelines', 'internetservice','onlinesecurity', 'onlinebackup', 'deviceprotection','techsupport', 'streamingtv', 'streamingmovies','contract', 'paperlessbilling', 'paymentmethod']
    numerical = ['tenure', 'monthlycharges', 'totalcharges']

    dicts = df[categorical + numerical].to_dict(orient='records')
    return dicts

In [34]:
df = read_dataframe("./data/WA_Fn-UseC_-Telco-Customer-Churn.csv")

dicts = prepare_dictionaries(df)
y_pred = loaded_model.predict(dicts)

AttributeError: 'tuple' object has no attribute 'predict'

In [42]:
X_train = dv.transform(train_dict)

In [43]:
X_train.shape

(3774, 45)

In [44]:
dv.get_feature_names_out()

array(['contract=month-to-month', 'contract=one_year',
       'contract=two_year', 'dependents=no', 'dependents=yes',
       'deviceprotection=no', 'deviceprotection=no_internet_service',
       'deviceprotection=yes', 'gender=female', 'gender=male',
       'internetservice=dsl', 'internetservice=fiber_optic',
       'internetservice=no', 'monthlycharges', 'multiplelines=no',
       'multiplelines=no_phone_service', 'multiplelines=yes',
       'onlinebackup=no', 'onlinebackup=no_internet_service',
       'onlinebackup=yes', 'onlinesecurity=no',
       'onlinesecurity=no_internet_service', 'onlinesecurity=yes',
       'paperlessbilling=no', 'paperlessbilling=yes', 'partner=no',
       'partner=yes', 'paymentmethod=bank_transfer_(automatic)',
       'paymentmethod=credit_card_(automatic)',
       'paymentmethod=electronic_check', 'paymentmethod=mailed_check',
       'phoneservice=no', 'phoneservice=yes', 'seniorcitizen',
       'streamingmovies=no', 'streamingmovies=no_internet_service',

Modelling: Training

I have built 3 prototype models using Grid Search and 5 fold cross validation for hyperparameter tuning.

1) Logistic Regression
2) Decision Tree
3) Random Forest
4) Gradient Boosted Classifier(GBC)

In [45]:
# Define a machine learning pipline with k-fold crossvalidation and gridsearch
import time
from sklearn.model_selection import GridSearchCV


def model_pipeline(model, param_grid, x_train, Y_train):

    """
    Pipeline to train sklearn model using k-fold
    cross validation and grid search

    returns the best model and results for all
    traning runs

    parameters -
    model: an sklearn machine learning model
    param_gird: search space for grid search as dict

    """

    # Initialisa model with GridSearchCV or just GridSearch

    Tuned_Model = GridSearchCV(
        estimator=model, param_grid=param_grid, scoring="roc_auc", cv=5
    )

    # Fit model & Time the process for training the model
    print("Training Model")
    start_time = time.process_time()

    Tuned_Model.fit(x_train, Y_train)

    print("Finished training model")

    # End of fit time
    print(time.process_time() - start_time, "Seconds")

    return Tuned_Model, pd.DataFrame(Tuned_Model.cv_results_)

In [56]:
# Model 1: Logistic Regression

# Train and test a logistic regression model
from sklearn.linear_model import LogisticRegression

# Initialise the random forest model
model = LogisticRegression(penalty="l2", solver="liblinear", class_weight="balanced", random_state=1)
# Set paramters for Grid Search
param_grid = {"C": [0.001, 0.01, 0.1, 1], "fit_intercept": [True, False]}

# Train model and get results
Tuned_LogReg, Results_LogReg = model_pipeline(model, param_grid, X_train, y_train)

Results_LogReg

Training Model
Finished training model
0.171875 Seconds


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_fit_intercept,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.013607,0.001829,0.002797,0.000739,0.001,True,"{'C': 0.001, 'fit_intercept': True}",0.815746,0.8491,0.827921,0.819497,0.817785,0.82601,0.012265,8
1,0.013919,0.001354,0.002396,0.000486,0.001,False,"{'C': 0.001, 'fit_intercept': False}",0.815887,0.849091,0.827947,0.819489,0.817706,0.826024,0.012251,7
2,0.017669,0.002696,0.00309,0.000935,0.01,True,"{'C': 0.01, 'fit_intercept': True}",0.831605,0.856884,0.847941,0.84084,0.834707,0.842395,0.00915,5
3,0.026632,0.010461,0.006587,0.007049,0.01,False,"{'C': 0.01, 'fit_intercept': False}",0.831605,0.856893,0.847994,0.840813,0.834583,0.842378,0.009181,6
4,0.015093,0.001551,0.002211,0.000409,0.1,True,"{'C': 0.1, 'fit_intercept': True}",0.831747,0.855465,0.85115,0.847037,0.836341,0.844348,0.008944,1
5,0.013992,0.001093,0.002012,0.000633,0.1,False,"{'C': 0.1, 'fit_intercept': False}",0.831773,0.855456,0.851105,0.847019,0.836208,0.844312,0.008951,2
6,0.016338,0.002762,0.002209,0.000406,1.0,True,"{'C': 1, 'fit_intercept': True}",0.831535,0.854874,0.851555,0.847196,0.836137,0.844259,0.008971,3
7,0.01551,0.000994,0.002023,0.00065,1.0,False,"{'C': 1, 'fit_intercept': False}",0.831509,0.854733,0.851599,0.84724,0.836049,0.844226,0.008972,4


In [57]:
val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [58]:
model = Tuned_LogReg

In [59]:
model.predict_proba(X_val)

array([[0.54637744, 0.45362256],
       [0.49385026, 0.50614974],
       [0.44481904, 0.55518096],
       ...,
       [0.85740081, 0.14259919],
       [0.19919673, 0.80080327],
       [0.84628487, 0.15371513]])

In [60]:
y_pred = model.predict_proba(X_val)[:, 1]

In [61]:
y_pred

array([0.45362256, 0.50614974, 0.55518096, ..., 0.14259919, 0.80080327,
       0.15371513])

In [62]:
churn = y_pred > 0.5

In [63]:
# Simple Accuracy Calculation
(y_val == churn).mean()

0.7365591397849462

In [None]:
import pickle
with open('models/lin_reg.bin', 'wb') as f_out:
    pickle.dump((dv, model), f_out)