# **Library**
-----------------------------------------------------------------------

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

import pickle

# Data Preparation
------------------------------------------------------------------

In [2]:
data = 'https://raw.githubusercontent.com/madityarafip/My-Machine-Learning/main/Dataset/WA_Fn-UseC_-Telco-Customer-Churn.csv'

In [3]:
!wget --no-check-certificate $data -O data-week3.csv

--2021-10-05 18:41:00--  https://raw.githubusercontent.com/madityarafip/My-Machine-Learning/main/Dataset/WA_Fn-UseC_-Telco-Customer-Churn.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
  Unable to locally verify the issuer's authority.
HTTP request sent, awaiting response... 200 OK
Length: 977501 (955K) [text/plain]
Saving to: 'data-week3.csv'

     0K .......... .......... .......... .......... ..........  5% 3.59K 4m12s
    50K .......... .......... .......... .......... .......... 10% 6.30K 3m7s
   100K .......... .......... .......... .......... .......... 15% 8.16K 2m30s
   150K .......... .......... .......... .......... .......... 20% 8.41K 2m8s
   200K .......... .......... .......... .......... .......... 26% 12.6K 1m47s
   250K .......... .......... .......... .......... .......... 31% 9.17K 95s
 

In [4]:
df = pd.read_csv('data-week3.csv')
df.head(3)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes


In [5]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)

for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(' ', '_')

In [6]:
tc = pd.to_numeric(df.totalcharges, errors='coerce')

In [7]:
df.totalcharges = pd.to_numeric(df.totalcharges, errors='coerce')

In [8]:
df.totalcharges = df.totalcharges.fillna(0)

In [9]:
df.churn.head()

0     no
1     no
2    yes
3     no
4    yes
Name: churn, dtype: object

In [10]:
df.churn = (df.churn == 'yes').astype(int)

In [11]:
df.head(3)

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,7590-vhveg,female,0,yes,no,1,no,no_phone_service,dsl,no,...,no,no,no,no,month-to-month,yes,electronic_check,29.85,29.85,0
1,5575-gnvde,male,0,no,no,34,yes,no,dsl,yes,...,yes,no,no,no,one_year,no,mailed_check,56.95,1889.5,0
2,3668-qpybk,male,0,no,no,2,yes,no,dsl,yes,...,no,no,no,no,month-to-month,yes,mailed_check,53.85,108.15,1


# Train and Predict 
-----------------------------------------------------------------

In [12]:
def train(df_train, y_train, col, C=1.0):
    dicts = df_train[col].to_dict(orient='records')
    
    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(dicts)
    
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000)
    model.fit(X_train, y_train)
    
    return dv, model
#--------------------------------------------------------------------------
def predict(df, dv, model, col):
    dicts = df[col].to_dict(orient='records')
    
    X_data = dv.transform(dicts)
    y_pred = model.predict_proba(X_data)[:, 1]

    return y_pred

# Split Train, Val, and Test
-------------------------------------------------------------------------------

In [13]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)

In [14]:
C = 1.0
n_splits = 5
col = ['tenure', 'monthlycharges', 'contract']

In [15]:
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=1)
scores = []
for train_idx, val_idx in kfold.split(df_full_train):
    df_train = df_full_train.iloc[train_idx]
    df_val = df_full_train.iloc[val_idx]

    y_train = df_train.churn.values
    y_val = df_val.churn.values

    dv, model = train(df_train, y_train, col, C=C)
    y_pred = predict(df_val, dv, model, col)

    auc = roc_auc_score(y_val, y_pred)
    scores.append(auc)

print('C = %5s | Mean = %.3f | STD = +- %.3f' % (C, np.mean(scores), np.std(scores)))

C =   1.0 | Mean = 0.826 | STD = +- 0.007


# Model
-----------------------------------------------------------------------

In [16]:
dicts = df[col].to_dict(orient='records')

dv = DictVectorizer(sparse=False)
X = dv.fit_transform(dicts)
y = df.churn.values

model = LogisticRegression().fit(X, y)

# Save Model
---------------------------------------------------------------

In [17]:
output_file = f'model_C={C}.bin'
output_file

'model_C=1.0.bin'

In [18]:
with open(output_file, 'wb') as f_out:
    pickle.dump((model, dv), f_out)

# Load Model
-----------------------------------------------------------------------------

In [1]:
import pickle

In [2]:
customer = {'tenure': 12, 'monthlycharges': 19.7, 'contract': 'two-year'}

In [3]:
model_file = 'model_C=1.0.bin'

In [4]:
with open(model_file, 'rb') as f_in:
    model, dv = pickle.load(f_in)

In [5]:
dv, model

(DictVectorizer(sparse=False), LogisticRegression())

In [6]:
X = dv.transform(customer)

In [8]:
y_pred = model.predict_proba(X)[0,1]
y_pred

0.1374092267741354