In [15]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mutual_info_score, accuracy_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import roc_auc_score

from tqdm.auto import tqdm
import warnings
from sklearn.exceptions import ConvergenceWarning

# Suppress only ConvergenceWarnings
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [10]:
# Load the Telco churn CSV, normalise names/values, and build the
# full-train / train / validation / test splits plus their targets.

df = pd.read_csv('../chap3/data/WA_Fn-UseC_-Telco-Customer-Churn.csv')

# Column names: lower-case with underscores instead of spaces.
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Same normalisation for the values of every object-dtype column.
# (This temporary `categorical` list is replaced by the explicit one below.)
categorical = df.dtypes[df.dtypes == 'object'].index.tolist()
for c in categorical:
    df[c] = df[c].str.lower().str.replace(' ', '_')

# totalcharges: values that cannot be parsed become NaN, then 0.
df['totalcharges'] = pd.to_numeric(df['totalcharges'], errors='coerce').fillna(0)

# Target as 0/1: 1 when churn == 'yes'.
df['churn'] = (df['churn'] == 'yes').astype(int)

# 20% test; the remaining 80% is split 75/25 into train/validation.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)
print(f"Split DF len (Full Train, Train, Val, Test): {len(df_full_train), len(df_train), len(df_val), len(df_test)}")

# Drop the shuffled original index from every split.
df_full_train, df_train, df_val, df_test = (
    part.reset_index(drop=True)
    for part in (df_full_train, df_train, df_val, df_test)
)

# Remove the target column from each frame and keep it as a separate series.
y_full_train, y_train, y_val, y_test = (
    part.pop('churn')
    for part in (df_full_train, df_train, df_val, df_test)
)
print(f"Y DF len (Full Train, Train, Val, Test): {len(y_full_train), len(y_train), len(y_val), len(y_test)}")

# Feature lists read by train() and predict().
numerical = ['tenure', 'monthlycharges', 'totalcharges']

categorical = [
    'gender', 'seniorcitizen', 'partner', 'dependents',
    'phoneservice', 'multiplelines', 'internetservice',
    'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
    'streamingtv', 'streamingmovies',
    'contract', 'paperlessbilling', 'paymentmethod',
]

Split DF len (Full Train, Train, Val, Test): (5634, 4225, 1409, 1409)
Y DF len (Full Train, Train, Val, Test): (5634, 4225, 1409, 1409)


In [3]:
def train(df_t, y, C=1.0):
    """Fit a DictVectorizer and a logistic regression on the given frame.

    Args:
        df_t: DataFrame holding the columns listed in the module-level
            ``categorical`` and ``numerical`` lists.
        y: Target values aligned with the rows of ``df_t``.
        C: Passed through to ``LogisticRegression(C=...)``.

    Returns:
        Tuple ``(dv, model)``: the fitted DictVectorizer and the fitted
        LogisticRegression.
    """
    feature_cols = categorical + numerical
    records = df_t[feature_cols].to_dict(orient='records')

    # Dense output so the encoded matrix is a plain ndarray.
    vectorizer = DictVectorizer(sparse=False)
    features = vectorizer.fit_transform(records)

    classifier = LogisticRegression(C=C, max_iter=1000)
    classifier.fit(features, y)

    return vectorizer, classifier


In [5]:
def predict(df_v, dv, model):
    """Return the model's probability for class index 1 for every row.

    Args:
        df_v: DataFrame holding the columns listed in the module-level
            ``categorical`` and ``numerical`` lists.
        dv: Already-fitted vectorizer; only ``transform`` is called on it.
        model: Fitted classifier exposing ``predict_proba``.

    Returns:
        1-D array: column 1 of ``model.predict_proba``.
    """
    records = df_v[categorical + numerical].to_dict(orient='records')
    probabilities = model.predict_proba(dv.transform(records))
    return probabilities[:, 1]

In [11]:
# Run configuration.
n_splits = 5  # number of folds handed to KFold
C = 1.0       # LogisticRegression C value; also embedded in the saved model's file name


In [16]:
# K-fold cross-validation on the full training set: one ROC AUC per fold.
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=1)

scores = []

# kfold.split() is a generator without a length, so tqdm needs the fold
# count passed explicitly to render a real progress bar.
# NOTE: df_train / df_val / y_train / y_val are rebound on every iteration,
# replacing the hold-out split created in the data-prep cell.
for train_idx, val_idx in tqdm(kfold.split(df_full_train), total=n_splits):
    df_train = df_full_train.iloc[train_idx]
    df_val = df_full_train.iloc[val_idx]

    y_train = y_full_train.iloc[train_idx]
    y_val = y_full_train.iloc[val_idx]

    # Pass the configured C so cross-validation evaluates the same
    # regularisation setting that the notebook reports and saves.
    dv, model = train(df_train, y_train, C=C)
    y_pred = predict(df_val, dv, model)

    score = roc_auc_score(y_val, y_pred)
    scores.append(score)


0it [00:00, ?it/s]

In [17]:
# ROC AUC on the held-out fold of each cross-validation split.
scores

[np.float64(0.8445573078160937),
 np.float64(0.8449563799496755),
 np.float64(0.8333493879189244),
 np.float64(0.8347529097653003),
 np.float64(0.8517225691067114)]

In [18]:
# Final model: fit on the full training set and score once on the test set.
# Uses the configured C (rather than a literal) so the trained model always
# matches the value embedded in the saved artifact's file name.
dv, model = train(df_full_train, y_full_train.values, C=C)
y_pred = predict(df_test, dv, model)

auc = roc_auc_score(y_test, y_pred)
auc

np.float64(0.858357166845418)

In [19]:
## Save the model


In [20]:
import pickle

In [21]:
# Artifact file name; embeds the current value of C.
output_file = f'model_C={C}.bin'
output_file

'model_C=1.0.bin'

In [28]:
# Pickle the vectorizer and the model together as one tuple: a prediction
# needs both, and the load step unpacks them in this same order.
with open(output_file, 'wb') as f_out:
    pickle.dump((dv, model), f_out)

In [29]:
# Restart the kernel here to simulate loading the model from a different process.

In [2]:
import pickle

In [7]:
# Must match the file name produced when the model was saved.
model_file = 'model_C=1.0.bin'

In [10]:
# Restore the (vectorizer, model) tuple from disk.
# NOTE: pickle.load can execute arbitrary code contained in the file, so only
# load artifacts that you produced yourself or otherwise trust.
with open(model_file, 'rb') as f_in:
    dv, model = pickle.load(f_in)

# scikit-learn must be installed in this environment: the pickled objects are
# scikit-learn classes (DictVectorizer, LogisticRegression) and cannot be
# reconstructed without the library, even though it is not imported here.

In [11]:
# Sanity check: display the objects restored from the pickle.
dv, model

(DictVectorizer(sparse=False), LogisticRegression(max_iter=1000))

In [12]:
# A single customer record to score. String values are already lower-cased
# with underscores, i.e. the same form the training data was normalised to.
# 'customerid' is not among the feature columns used by train(); feature names
# the DictVectorizer did not see during fit are ignored by transform().
customer = {
 'customerid': '0921-ohlvp',
 'gender': 'male',
 'seniorcitizen': 0,
 'partner': 'no',
 'dependents': 'no',
 'tenure': 22,
 'phoneservice': 'yes',
 'multiplelines': 'yes',
 'internetservice': 'fiber_optic',
 'onlinesecurity': 'no',
 'onlinebackup': 'no',
 'deviceprotection': 'no',
 'techsupport': 'no',
 'streamingtv': 'no',
 'streamingmovies': 'yes',
 'contract': 'month-to-month',
 'paperlessbilling': 'no',
 'paymentmethod': 'electronic_check',
 'monthlycharges': 83.05,
 'totalcharges': 1799.3
}

In [15]:
# transform() takes a sequence of dicts, so the single record is wrapped in a list.
X = dv.transform([customer])

In [16]:
# Column 1 of predict_proba is the probability of the positive class (churn == 1).
model.predict_proba(X)[:, 1]

array([0.57204321])