# 04 Cross Validation

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

In [2]:
data = "https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv"

In [3]:
df = pd.read_csv(data)

df.columns = df.columns.str.lower().str.replace(' ', '_')

categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)

for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(' ', '_')

df.totalcharges = pd.to_numeric(df.totalcharges, errors='coerce')
df.totalcharges = df.totalcharges.fillna(0)

df.churn = (df.churn == 'yes').astype(int)

df['tenure'] = df.tenure.astype(float)

In [4]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train.churn.values
y_val = df_val.churn.values
y_test = df_test.churn.values

del df_train['churn']
del df_val['churn']
del df_test['churn']

In [5]:
numerical = ['tenure', 'monthlycharges', 'totalcharges']

categorical = [
    'gender',
    'seniorcitizen',
    'partner',
    'dependents',
    'phoneservice',
    'multiplelines',
    'internetservice',
    'onlinesecurity',
    'onlinebackup',
    'deviceprotection',
    'techsupport',
    'streamingtv',
    'streamingmovies',
    'contract',
    'paperlessbilling',
    'paymentmethod',
]

In [6]:
def train(df_train, y_train, C=1.0):
    dicts = df_train[categorical + numerical].to_dict(orient='records')

    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(dicts)

    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000)
    model.fit(X_train, y_train)
    
    return dv, model

In [7]:
def predict(df, dv, model):
    dicts = df[categorical + numerical].to_dict(orient='records')

    X = dv.transform(dicts)
    y_pred = model.predict_proba(X)[:, 1]

    return y_pred

In [8]:
kfold = KFold(n_splits=5, shuffle=True, random_state=1)

auc_scores = []

for train_idx, val_idx in kfold.split(df_full_train):
    df_train = df_full_train.iloc[train_idx]
    df_val = df_full_train.iloc[val_idx]

    y_train = df_train.churn.values
    y_val = df_val.churn.values

    dv, model = train(df_train, y_train)
    y_pred = predict(df_val, dv, model)

    auc = roc_auc_score(y_val, y_pred)
    auc_scores.append(auc)

print(auc_scores)
print("Mean AUC:", np.mean(auc_scores))
print("Std AUC:", np.std(auc_scores))

[0.8423279509541489, 0.8453247086478611, 0.8335059201284366, 0.8323627454115241, 0.8521736060995889]
Mean AUC: 0.841138986248312
Std AUC: 0.0074294393591401095


Try adding StandardScaler

In [9]:
def train_sc(dfx_train, y_train, C=1.0):
    #df_train.loc[:, numerical] = df_train[numerical].astype(float)
    df_train = dfx_train.copy()
    # for numerical columns, apply StandardScaler
    scaler = StandardScaler()
    scaler.fit(df_train[numerical])
    df_train.loc[:, numerical] = scaler.transform(df_train[numerical])

    dicts = df_train[categorical + numerical].to_dict(orient='records')

    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(dicts)

    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000)
    model.fit(X_train, y_train)
    
    return dv, model, scaler

In [10]:
def predict_sc(dfx, dv, model, scaler):
    df = dfx.copy()
    #df.loc[:, numerical] = df[numerical].astype(float)
    df.loc[:, numerical] = scaler.transform(df[numerical])
    dicts = df[categorical + numerical].to_dict(orient='records')

    X = dv.transform(dicts)
    y_pred = model.predict_proba(X)[:, 1]

    return y_pred

In [11]:
kfold = KFold(n_splits=5, shuffle=True, random_state=1)

auc_scores_sc = []

for train_idx, val_idx in kfold.split(df_full_train):
    df_train = df_full_train.iloc[train_idx]
    df_val = df_full_train.iloc[val_idx]

    y_train = df_train.churn.values
    y_val = df_val.churn.values

    dv, model, scaler = train_sc(df_train, y_train)
    y_pred = predict_sc(df_val, dv, model, scaler)

    auc = roc_auc_score(y_val, y_pred)
    auc_scores_sc.append(auc)

print(auc_scores_sc)
print("Mean AUC:", np.mean(auc_scores_sc))
print("Std AUC:", np.std(auc_scores_sc))

[0.8445337582717775, 0.8449025791285922, 0.8334136062612884, 0.8344251614759866, 0.8515068557622917]
Mean AUC: 0.8417563921799873
Std AUC: 0.006870703243682796


Change C value

In [14]:
array_c = [0.01, 0.1, 1.0, 10.0, 100]

for c in array_c:
    kfold = KFold(n_splits=5, shuffle=True, random_state=1)

    auc_scores_c = []

    for train_idx, val_idx in kfold.split(df_full_train):
        df_train = df_full_train.iloc[train_idx]
        df_val = df_full_train.iloc[val_idx]

        y_train = df_train.churn.values 
        y_val = df_val.churn.values

        dv, model, scaler= train_sc(df_train, y_train, C=c)
        y_pred = predict_sc(df_val, dv, model, scaler)

        auc = roc_auc_score(y_val, y_pred)
        auc_scores_c.append(auc)

    print(f"C={c}")
    print(auc_scores_c)
    print("Mean AUC:", np.mean(auc_scores_c))
    print("Std AUC:", np.std(auc_scores_c))
    print("-----")

C=0.01
[0.8446044069047264, 0.8447577307641372, 0.8296207104154124, 0.8274065517682421, 0.8467964607323272]
Mean AUC: 0.838637172116969
Std AUC: 0.008331464876085558
-----
C=0.1
[0.8450950224113163, 0.8445963283008873, 0.8322175396347582, 0.8326225458847606, 0.8497458504596654]
Mean AUC: 0.8408554573382776
Std AUC: 0.007119010755357397
-----
C=1.0
[0.8445337582717775, 0.8449025791285922, 0.8334136062612884, 0.8344251614759866, 0.8515068557622917]
Mean AUC: 0.8417563921799873
Std AUC: 0.006870703243682796
-----
C=10.0
[0.844007818448713, 0.8448322242087141, 0.8336584386915513, 0.8347529097653001, 0.8519971133632456]
Mean AUC: 0.8418497008955048
Std AUC: 0.006840820379419872
-----
C=100
[0.8438586713347096, 0.844658406171368, 0.8335460565924142, 0.8348248545117349, 0.8519814251200151]
Mean AUC: 0.8417738827460483
Std AUC: 0.006824175507453879
-----


Train final model with full dataset, train+val

In [15]:
# combine train and val
df_full_train = pd.concat([df_train, df_val]).reset_index(drop=True)
y_full_train = df_full_train.churn.values


In [17]:
# use C = 1
dv_full, model_full, scaler_full = train_sc(df_train, y_train, C=1)
y_pred_full = predict_sc(df_test, dv_full, model_full, scaler_full)

auc_full = roc_auc_score(y_test, y_pred_full)

print(auc_full)

0.8575284106297465


In [16]:
# use C = 10
dv_full, model_full, scaler_full = train_sc(df_train, y_train, C=10)
y_pred_full = predict_sc(df_test, dv_full, model_full, scaler_full)

auc_full = roc_auc_score(y_test, y_pred_full)

print(auc_full)

0.8577234120922574


AUC with C=10, is better. Choose model with C=10