In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

import helper as h

In [42]:
from sklearn.metrics import roc_auc_score

In [5]:
# Prepare data: feature-name collections come from the project helper module.
categorical = h.get_categorical(explore=False)
numerical = h.get_numerical(explore=False)

# Alternative kept for reference (unused):
#train, val, test, y_train, y_val, y_test = h.split_telco_data(explore=False)

# Hold out 20% of the rows as the test set; the fixed seed keeps the split reproducible.
df_train, df_test = h.train_test_split(
    h.get_telco_data(),
    test_size=0.2,
    random_state=42,
)

# Targets are the raw values of the `churn` column of each split.
y_train, y_test = df_train.churn.values, df_test.churn.values

# Feature frames keep only the categorical + numerical columns (target excluded,
# assuming `churn` is not listed in either collection -- TODO confirm in helper).
train, test = (frame[categorical + numerical] for frame in (df_train, df_test))

In [36]:
def train_model(train, y):
    """Fit a DictVectorizer and a logistic regression on a feature frame.

    Parameters
    ----------
    train : DataFrame-like
        Feature rows; converted to a list of per-row dicts with
        ``to_dict(orient='records')`` before vectorization.
    y : array-like
        Target values aligned with the rows of ``train``.

    Returns
    -------
    tuple
        ``(dv, model)``: the fitted ``DictVectorizer`` and the fitted
        ``LogisticRegression``.
    """
    records = train.to_dict(orient='records')

    # Learn the feature mapping and encode the rows as a dense matrix in one step.
    dv = DictVectorizer(sparse=False)
    features = dv.fit_transform(records)

    model = LogisticRegression(solver='liblinear')
    model.fit(features, y)
    return dv, model

In [19]:
def predict(df, dv, model):
    """Score the rows of ``df`` with an already-fitted vectorizer and model.

    Parameters
    ----------
    df : DataFrame-like
        Feature rows; converted to per-row dicts via ``to_dict(orient='records')``.
    dv : object
        Fitted vectorizer exposing ``transform(records)``.
    model : object
        Fitted classifier exposing ``predict_proba(X)``.

    Returns
    -------
    array
        Column 1 of ``model.predict_proba`` for every row of ``df``.
    """
    records = df.to_dict(orient='records')
    features = dv.transform(records)
    # Keep only the second probability column.
    return model.predict_proba(features)[:, 1]

In [20]:
from sklearn.model_selection import KFold

In [21]:
# 10-fold splitter; rows are shuffled with a fixed seed so the folds are reproducible.
kfold = KFold(n_splits=10, shuffle=True, random_state=1)

In [22]:
# split() returns a generator; iterating over it yields one
# (train indices, validation indices) pair per fold -- n_splits (10) pairs in total.
kfold.split(train)

<generator object _BaseKFold.split at 0x1b4fa2110>

In [28]:
# Each item yielded by split() is a (train indices, validation indices) pair;
# [0] shows only the training indices of the first fold.
next(kfold.split(train))[0]

array([   0,    1,    2, ..., 5631, 5632, 5633])

In [31]:
# Unpack the first fold into its training and validation index arrays,
# then display the training indices.
train_idx, val_idx = next(kfold.split(train))
train_idx

array([   0,    1,    2, ..., 5631, 5632, 5633])

In [32]:
# Sizes of the training and validation portions of the first fold.
len(train_idx), len(val_idx)

(5070, 564)

In [34]:
# First rows of the first fold's training portion; iloc selects by position,
# which is what the index arrays produced by split() contain.
train.iloc[train_idx].head()

Unnamed: 0,seniorcitizen,partner,dependents,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,tenure,monthlycharges
2142,0,no,yes,dsl,yes,no,yes,no,no,yes,one_year,no,21,64.85
1623,0,no,no,fiber_optic,no,yes,no,no,yes,yes,two_year,yes,54,97.2
6074,0,yes,no,dsl,no,no,no,no,no,no,month-to-month,yes,1,23.45
1362,0,no,no,fiber_optic,no,no,no,no,no,no,month-to-month,yes,4,70.2
1212,0,no,no,fiber_optic,no,no,no,no,no,no,month-to-month,yes,7,69.55


In [46]:
!pip install tqdm



In [47]:
from tqdm.auto import tqdm

In [48]:
# Validation AUC of each fold, in the order the folds are produced.
scores = []

for train_idx, val_idx in tqdm(kfold.split(train)):
    # Slice features and targets for this fold's training and validation portions.
    X_train, X_val = train.iloc[train_idx], train.iloc[val_idx]
    yt, yv = y_train[train_idx], y_train[val_idx]

    # Fit on the training portion only, then score the held-out portion.
    dv, model = train_model(X_train, yt)
    y_pred = predict(X_val, dv, model)

    auc = roc_auc_score(yv, y_pred)
    scores.append(auc)

10it [00:02,  4.79it/s]


In [45]:
# Per-fold validation AUC values collected by the cross-validation loop.
scores

[0.8023935684268226,
 0.8584746431106554,
 0.8353623188405798,
 0.8494983277591972,
 0.8497714230133256,
 0.8225851896048793,
 0.8542695722356738,
 0.8372645332814577,
 0.8425943425943426,
 0.8179211347902811]

In [50]:
# Mean and standard deviation of the per-fold AUC scores.
np.mean(scores), np.std(scores)

(0.8370135053657213, 0.01697691859464629)

In [51]:
# Same summary, formatted to three decimals as "mean +/- std".
print('%.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

0.837 +/- 0.017


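Tune the `C` hyperparameter of logistic regression. In scikit-learn, `C` is the inverse of the regularization strength (smaller values mean stronger regularization); the default is 1.0.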

In [53]:
def train_model(train, y, C=1):
    """Fit a DictVectorizer and a logistic regression with a tunable ``C``.

    Parameters
    ----------
    train : DataFrame-like
        Feature rows; converted to a list of per-row dicts with
        ``to_dict(orient='records')`` before vectorization.
    y : array-like
        Target values aligned with the rows of ``train``.
    C : float, default 1
        Passed straight through to ``LogisticRegression(C=...)``.

    Returns
    -------
    tuple
        ``(dv, model)``: the fitted ``DictVectorizer`` and the fitted
        ``LogisticRegression``.
    """
    records = train.to_dict(orient='records')

    # Learn the feature mapping and encode the rows as a dense matrix in one step.
    dv = DictVectorizer(sparse=False)
    features = dv.fit_transform(records)

    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000)
    model.fit(features, y)
    return dv, model

In [54]:
# One fold count drives both KFold and the progress-bar total. Previously the
# bar was told total=5 while KFold was hard-coded to 10 folds, so the bar
# overran its total. The value is 10 to keep the same 10-fold protocol used
# for the baseline cross-validation above.
n_splits = 10

for C in [0.001, 0.01, 0.1, 1, 5, 10]:
    # Same shuffle seed for every C, so each candidate is scored on identical folds.
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=1)
    scores = []

    for train_idx, val_idx in tqdm(kfold.split(train), total=n_splits):
        X_train = train.iloc[train_idx]
        X_val = train.iloc[val_idx]

        yt = y_train[train_idx]
        yv = y_train[val_idx]

        # Fit on this fold's training portion with the candidate C, score the rest.
        dv, model = train_model(X_train, yt, C=C)
        y_pred = predict(X_val, dv, model)

        auc = roc_auc_score(yv, y_pred)
        scores.append(auc)

    # Report mean and spread of the per-fold AUC for this C.
    print('%s %.3f +/- %.3f' % (C, np.mean(scores), np.std(scores)))

10it [00:02,  4.82it/s]                      


0 0.837 +/- 0.017


10it [00:02,  4.74it/s]                      


0.001 0.837 +/- 0.017


10it [00:02,  4.75it/s]                      


0.01 0.837 +/- 0.017


10it [00:02,  4.81it/s]                      


0.1 0.837 +/- 0.017


10it [00:02,  4.82it/s]                      


1 0.837 +/- 0.017


10it [00:02,  4.81it/s]                      


5 0.837 +/- 0.017


10it [00:02,  4.81it/s]                      

10 0.837 +/- 0.017





In [57]:
# The fold count is defined once and actually passed to KFold. Previously
# `n_splits = 5` was never used while KFold was hard-coded to 10 folds, which
# misreported the protocol. The value is 10 so the evaluation stays 10-fold.
n_splits = 10

# The progress bar tracks the candidate C values (one tick per full cross-validation).
for C in tqdm([0.001, 0.01, 0.1, 1, 5, 10]):
    # Same shuffle seed for every C, so each candidate is scored on identical folds.
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=1)
    scores = []

    for train_idx, val_idx in kfold.split(train):
        X_train = train.iloc[train_idx]
        X_val = train.iloc[val_idx]

        yt = y_train[train_idx]
        yv = y_train[val_idx]

        # Fit on this fold's training portion with the candidate C, score the rest.
        dv, model = train_model(X_train, yt, C=C)
        y_pred = predict(X_val, dv, model)

        auc = roc_auc_score(yv, y_pred)
        scores.append(auc)

    # Report mean and spread of the per-fold AUC for this C.
    print('%s %.3f +/- %.3f' % (C, np.mean(scores), np.std(scores)))

 17%|█▋        | 1/6 [00:01<00:09,  2.00s/it]

0.001 0.811 +/- 0.018


 33%|███▎      | 2/6 [00:04<00:08,  2.04s/it]

0.01 0.834 +/- 0.017


 50%|█████     | 3/6 [00:06<00:06,  2.07s/it]

0.1 0.837 +/- 0.017


 67%|██████▋   | 4/6 [00:08<00:04,  2.17s/it]

1 0.837 +/- 0.017


 83%|████████▎ | 5/6 [00:10<00:02,  2.15s/it]

5 0.837 +/- 0.017


100%|██████████| 6/6 [00:12<00:00,  2.12s/it]

10 0.837 +/- 0.017





In [60]:
# Train on the full training set, leaving C at its default value of 1,
# then compute ROC AUC on the held-out test set.
dv, model = train_model(train, y_train, C=1)
y_pred = predict(test, dv, model)
roc_auc_score(y_test, y_pred)

0.8586140238285012