In [7]:
# Location of the Telco customer-churn CSV; the literal is split across
# two lines only to keep the line length readable.
data = (
    "https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/"
    "chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv"
)

In [8]:
# Fetch the URL held in the Python variable `data` (interpolated by IPython
# via `$data`) and write it to the local file data-week-3.csv.
!wget  $data -O data-week-3.csv

--2023-10-14 12:57:23--  https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 977501 (955K) [text/plain]
Saving to: ‘data-week-3.csv’


2023-10-14 12:57:23 (13.7 MB/s) - ‘data-week-3.csv’ saved [977501/977501]



In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [10]:
df = pd.read_csv('data-week-3.csv')

# Normalise the header: lower-case names with underscores instead of spaces.
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Apply the same normalisation to the values of every object-dtype column.
strings = list(df.dtypes[df.dtypes == 'object'].index)
for col in strings:
    df[col] = df[col].str.lower().str.replace(' ', '_')

# Entries of totalcharges that cannot be parsed as numbers are coerced to
# NaN and then replaced with 0.
df['totalcharges'] = pd.to_numeric(df['totalcharges'], errors='coerce').fillna(0)

# Binary target: 1 when churn == 'yes', otherwise 0.
df['churn'] = (df['churn'] == 'yes').astype(int)

In [11]:
# Hold out 20% of the rows as the final test set; the fixed random_state
# keeps the split reproducible between runs.
df_full_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=1,
)

In [12]:
# Columns fed to the model, grouped by kind. train() and predict() select
# `categorical + numerical` from the frame they are given.
numerical = [
    'tenure',
    'monthlycharges',
    'totalcharges',
]

categorical = [
    'gender',
    'seniorcitizen',
    'partner',
    'dependents',
    'phoneservice',
    'multiplelines',
    'internetservice',
    'onlinesecurity',
    'onlinebackup',
    'deviceprotection',
    'techsupport',
    'streamingtv',
    'streamingmovies',
    'contract',
    'paperlessbilling',
    'paymentmethod',
]

In [17]:
def train(df_train, y_train, C=1.0):
    """Fit a DictVectorizer and a logistic-regression model on one frame.

    Parameters
    ----------
    df_train : DataFrame containing every column listed in the module-level
        `categorical` and `numerical` lists.
    y_train : target values, one per row of `df_train`.
    C : value forwarded to `LogisticRegression(C=...)`.

    Returns
    -------
    (dv, model) : the fitted DictVectorizer and the fitted model.
    """
    feature_records = df_train[categorical + numerical].to_dict(orient='records')

    dv = DictVectorizer(sparse=False)
    model = LogisticRegression(C=C, max_iter=1000)

    # The vectorizer learns its feature layout from the training records only.
    model.fit(dv.fit_transform(feature_records), y_train)
    return dv, model


In [14]:
def predict(df, dv, model):
    """Return the positive-class probability for every row of `df`.

    Parameters
    ----------
    df : DataFrame containing every column listed in the module-level
        `categorical` and `numerical` lists.
    dv : an already-fitted vectorizer (as returned by `train`); only its
        `transform` method is used.
    model : a fitted classifier exposing `predict_proba`.

    Returns
    -------
    1-D array holding column 1 of `model.predict_proba(X)`.
    """
    dicts = df[categorical + numerical].to_dict(orient='records')

    # Encode with the vectorizer that was fitted at training time. Fitting a
    # fresh DictVectorizer here (as the code previously did) derives the
    # column layout from the data being scored, so the feature count/order
    # can silently disagree with what `model` was trained on whenever `df`
    # does not contain every category seen during training.
    X = dv.transform(dicts)

    y_pred = model.predict_proba(X)[:, 1]
    return y_pred

In [15]:
# Settings shared by the cross-validation and final-training cells.
C = 1.0        # forwarded to train(), which passes it to LogisticRegression
n_splits = 5   # number of folds handed to KFold

In [29]:
# K-fold cross-validation over the full training frame: fit on each training
# fold, score the matching validation fold with ROC AUC, then report the
# mean and standard deviation of the fold scores.
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=1)
scores = []

for train_idx, val_idx in kfold.split(df_full_train):
    df_train, df_val = df_full_train.iloc[train_idx], df_full_train.iloc[val_idx]
    y_train, y_val = df_train.churn.values, df_val.churn.values

    dv, model = train(df_train, y_train, C=C)
    y_pred = predict(df_val, dv, model)

    auc_score = roc_auc_score(y_val, y_pred)
    scores.append(auc_score)

print('C=%s %.3f +- %.3f' % (C, np.mean(scores), np.std(scores)))

C=1.0 0.841 +- 0.009


In [30]:
  # Refit on the whole training frame and score the held-out test set.
  y_test = df_test.churn.values
  dv, model = train(df_full_train, df_full_train.churn.values, C=C)
  y_pred = predict(df_test, dv, model)

  # The bare name on the last line makes the notebook display the test AUC.
  auc_score = roc_auc_score(y_test, y_pred)
  auc_score

0.8572386167896259

Save the model: pickle the fitted vectorizer and model together in one file.

In [20]:
import pickle

In [31]:
# The file name embeds the current value of C.
output_file = f'model_C={C}.bin'

# Store the vectorizer and the model as one tuple so they are always
# restored as a matching pair.
with open(output_file, 'wb') as f_out:
    pickle.dump((dv, model), f_out)


Load the model: read the pickled vectorizer and model back from the file.

In [32]:
import pickle

# Same naming scheme as the save step: the file name embeds C.
model_file = f'model_C={C}.bin'

# NOTE: unpickling can execute arbitrary code, so only load files that were
# produced by a trusted source (here, the save cell of this notebook).
with open(model_file, 'rb') as f_in:
    dv, model = pickle.load(f_in)

In [27]:
# One example customer record using the same feature names as the
# `categorical` and `numerical` lists.
customer = dict(
    gender='female',
    seniorcitizen=0,
    partner='yes',
    dependents='no',
    phoneservice='no',
    multiplelines='no_phone_service',
    internetservice='dsl',
    onlinesecurity='no',
    onlinebackup='yes',
    deviceprotection='no',
    techsupport='no',
    streamingtv='no',
    streamingmovies='no',
    contract='month-to-month',
    paperlessbilling='yes',
    paymentmethod='electronic_check',
    tenure=1,
    monthlycharges=29.85,
    totalcharges=29.85,
)

In [33]:
# Encode the single record with the loaded vectorizer, then read the
# probability in column 1 of the first (only) row.
X = dv.transform([customer])
model.predict_proba(X)[0, 1]

0.636358415272119