#### Cleaned-up code for training a churn-prediction model

In [1]:
#Import libraries

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

In [None]:
# Parameters
n_splits = 5  # number of folds handed to KFold in the cross-validation cell
C = 1.0       # value forwarded to LogisticRegression(C=...) by train()

In [3]:
# Load the raw Telco customer-churn CSV (expected in the working directory)
print("Loading data")
df = pd.read_csv(filepath_or_buffer="WA_Fn-UseC_-Telco-Customer-Churn.csv")

Loading data


In [8]:
# Show the first row as a dict to inspect the raw column names and value formats
df.iloc[0].to_dict()

{'customerID': '7590-VHVEG',
 'gender': 'Female',
 'SeniorCitizen': 0,
 'Partner': 'Yes',
 'Dependents': 'No',
 'tenure': 1,
 'PhoneService': 'No',
 'MultipleLines': 'No phone service',
 'InternetService': 'DSL',
 'OnlineSecurity': 'No',
 'OnlineBackup': 'Yes',
 'DeviceProtection': 'No',
 'TechSupport': 'No',
 'StreamingTV': 'No',
 'StreamingMovies': 'No',
 'Contract': 'Month-to-month',
 'PaperlessBilling': 'Yes',
 'PaymentMethod': 'Electronic check',
 'MonthlyCharges': 29.85,
 'TotalCharges': '29.85',
 'Churn': 'No'}

In [None]:
print("Preparing data")
# Normalize column names: lower-case, spaces replaced by underscores.
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Apply the same normalization to the values of every object-dtype (string) column.
# Note: `categorical_cols` here is only a scratch list of string columns; a later
# cell rebinds it to the explicit list of model features.
categorical_cols = list(df.dtypes[df.dtypes == 'object'].index)
for c in categorical_cols:
    df[c] = df[c].str.lower().str.replace(' ', '_')

# `totalcharges` arrives from the CSV as text. Unparseable entries become NaN
# (errors='coerce') and are then filled with 0.
# Fix: the original read from an undefined name `data`, raising NameError.
df['totalcharges'] = pd.to_numeric(df['totalcharges'], errors='coerce')
df['totalcharges'] = df['totalcharges'].fillna(0)

# Encode the target as 1 (churned) / 0 (stayed).
# Caution: this cell is not idempotent. Re-running it after `churn` is already
# 0/1 compares integers with 'yes' and zeroes out the target; reload the data first.
df['churn'] = (df['churn'] == 'yes').astype(int)

In [None]:
# Feature columns fed to the model (names as they appear after lower-casing).
numerical_cols = ['tenure', 'monthlycharges', 'totalcharges']

categorical_cols = [
    # customer profile
    'gender', 'seniorcitizen', 'partner', 'dependents',
    # phone
    'phoneservice', 'multiplelines',
    # internet and add-ons
    'internetservice', 'onlinesecurity', 'onlinebackup', 'deviceprotection',
    'techsupport', 'streamingtv', 'streamingmovies',
    # contract and billing
    'contract', 'paperlessbilling', 'paymentmethod',
]

In [None]:
# Hold out 20% of the rows as the final test set; the fixed seed makes the split repeatable
df_full_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=1,
)

In [None]:
def train(df_train, y_train, C=1.0):
    """Fit a DictVectorizer and a LogisticRegression on the training data.

    Feature columns are taken from the notebook-level lists
    `categorical_cols` and `numerical_cols`.

    Parameters
    ----------
    df_train : DataFrame containing at least those feature columns.
    y_train : target values, one per row of `df_train`.
    C : value passed to `LogisticRegression(C=...)`.

    Returns
    -------
    (dv, model) : the fitted vectorizer and the fitted classifier.
    """
    feature_cols = categorical_cols + numerical_cols
    records = df_train[feature_cols].to_dict(orient='records')

    dv = DictVectorizer(sparse=False)
    features = dv.fit_transform(records)

    model = LogisticRegression(C=C, max_iter=1000)
    model.fit(features, y_train)

    return dv, model

In [None]:
def predict(df, dv, model):
    """Return the predicted probability of the positive class for each row of `df`.

    Feature columns are taken from the notebook-level lists
    `categorical_cols` and `numerical_cols`; `dv` and `model` are the
    fitted objects returned by `train`.
    """
    feature_cols = categorical_cols + numerical_cols
    records = df[feature_cols].to_dict(orient='records')

    # Column 1 of predict_proba holds the probability of the second class.
    return model.predict_proba(dv.transform(records))[:, 1]

In [None]:
# K-fold cross-validation on the full training set: train on k-1 folds,
# score AUC on the held-out fold, then report mean and spread.
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=1)

scores = []
fold = 0

print("Training model")
for train_idx, val_idx in kfold.split(df_full_train):
    df_train, df_val = df_full_train.iloc[train_idx], df_full_train.iloc[val_idx]
    y_train, y_val = df_train['churn'].values, df_val['churn'].values

    dv, model = train(df_train, y_train, C=C)
    y_pred = predict(df_val, dv, model)

    auc = roc_auc_score(y_val, y_pred)
    scores.append(auc)
    print(f"AUC for fold-{fold} is {auc}")
    fold += 1

print(f'C={C}, mean_auc={np.mean(scores)}, std={np.std(scores)}')

In [None]:
# Refit on the whole training split, then score once on the held-out test set
print("Training final model")
dv, model = train(df_full_train, df_full_train.churn.values, C=C)

y_test = df_test.churn.values
y_pred = predict(df_test, dv, model)

auc = roc_auc_score(y_true=y_test, y_score=y_pred)
print(f"AUC on final model is {auc}")

Save model

In [None]:
import pickle

In [None]:
# Encode the regularization parameter in the file name (C=1.0 gives 'model_C=1.0.bin')
output_file = f'model_C={C}.bin'
output_file

In [None]:
# Manual file handling: open the file for binary writing, dump, then close.
f_out = open(output_file, 'wb')
try:
    # Store the vectorizer and the model together so they are always loaded as a pair
    pickle.dump((dv, model), f_out)
finally:
    # It is very important to close the file; the finally clause makes sure
    # that happens even if pickle.dump raises.
    f_out.close()

In [None]:
# A better way to handle files is a `with` block: the file is closed
# automatically when the block exits, so no explicit close() is needed.
with open(output_file, mode='wb') as f_out:
    pickle.dump((dv, model), f_out)

Load the model

In [1]:
#Restart the Jupyter notebook and simply run from this code block
import pickle

#Note that although we do not import sklearn here, it needs to be installed on the machine where you will be running this code, else it will complain about missing sklearn

In [2]:
# File written by the "Save model" section when C=1.0
model_file = 'model_C=1.0.bin'

# pickle can run arbitrary code while loading: only unpickle files you trust
with open(model_file, mode='rb') as f_in:
    dv, model = pickle.load(f_in)

In [3]:
# Display the unpickled objects to confirm that both were restored
dv,model

(DictVectorizer(sparse=False), LogisticRegression(max_iter=1000))

In [None]:
def prepare_customer(raw):
    """Normalize a raw customer record the way the training data was prepared.

    The model was trained on lower-cased column names and lower-cased string
    values with spaces replaced by underscores, and on a numeric
    `totalcharges`. DictVectorizer silently ignores features it did not see
    during fit, so a record with the raw CSV keys ('Contract', 'Partner', ...)
    would have almost all of its fields dropped.

    Uses only built-ins on purpose: this section runs on a fresh kernel where
    only `pickle` has been imported.

    Parameters
    ----------
    raw : dict with the raw CSV field names and values.

    Returns
    -------
    dict with normalized keys and values; `totalcharges` is a float, or 0.0
    when it is missing or cannot be parsed (mirrors coerce + fillna(0)).
    """
    prepared = {}
    for key, value in raw.items():
        if isinstance(value, str):
            value = value.lower().replace(' ', '_')
        prepared[key.lower().replace(' ', '_')] = value

    try:
        prepared['totalcharges'] = float(prepared['totalcharges'])
    except (KeyError, TypeError, ValueError):
        prepared['totalcharges'] = 0.0

    return prepared


customer1 = prepare_customer({
    'gender': 'Female',
    'SeniorCitizen': 0,
    'Partner': 'Yes',
    'Dependents': 'No',
    'tenure': 1,
    'PhoneService': 'No',
    'MultipleLines': 'No phone service',
    'InternetService': 'DSL',
    'OnlineSecurity': 'No',
    'OnlineBackup': 'Yes',
    'DeviceProtection': 'No',
    'TechSupport': 'No',
    'StreamingTV': 'No',
    'StreamingMovies': 'No',
    'Contract': 'Month-to-month',
    'PaperlessBilling': 'Yes',
    'PaymentMethod': 'Electronic check',
    'MonthlyCharges': 29.85,
    'TotalCharges': '29.85',
})

customer2 = prepare_customer({
    'gender': 'Female',
    'SeniorCitizen': 0,
    'Partner': 'Yes',
    'Dependents': 'No',
    'tenure': 1,
    'PhoneService': 'No',
    'MultipleLines': 'No phone service',
    'InternetService': 'DSL',
    'OnlineSecurity': 'No',
    'OnlineBackup': 'Yes',
    'DeviceProtection': 'No',
    'TechSupport': 'No',
    'StreamingTV': 'No',
    'StreamingMovies': 'No',
    # The original had the number 1 here, which matches no contract category
    # and was silently ignored. 'One year' is assumed to be the intended
    # value; TODO confirm against the contract values in the dataset.
    'Contract': 'One year',
    'PaperlessBilling': 'Yes',
    'PaymentMethod': 'Electronic check',
    'MonthlyCharges': 29.85,
    'TotalCharges': '29.85',
})

customer = customer2

In [None]:
# transform() takes a list of feature dicts, so the single customer is wrapped in a list.
# Feature names the vectorizer did not see during fit are silently ignored.
X = dv.transform([customer])

In [None]:
# Row 0 is the only customer; column 1 is the probability of the positive class
y_pred = model.predict_proba(X)[0, 1]
# Flag the customer as churning when the probability reaches 0.5
churn = y_pred >= 0.5
print(f'churn probability is {y_pred}, likelihood of customer churning: {churn}')

Make requests: create a separate notebook for this, to represent a request coming from a user.