### Week 3 - Explore more

Exploring the accuracy of the model when using all features for prediction Vs after removing the least important features

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, LogisticRegression
from sklearn.metrics import mean_squared_error, mutual_info_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

import os

In [2]:
# filename = "telco-churn.csv"
# url = "https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv -O telco-churn.csv"
# if not os.path.isfile(filename):
#     !wget $url -O $filename

In [3]:
df = pd.read_csv("telco-churn.csv")

In [4]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [5]:
df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [6]:
df.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [7]:
df.columns = df.columns.str.lower().str.replace(' ','_')

In [8]:
df.columns

Index(['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents',
       'tenure', 'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod', 'monthlycharges', 'totalcharges', 'churn'],
      dtype='object')

In [9]:
#Features without customerid and churn(the target)
consider_features = ['gender', 'seniorcitizen', 'partner', 'dependents',
       'tenure', 'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod', 'monthlycharges', 'totalcharges']

target = ['churn']

#Columns to process for removing spaces and to make everything lowercase
process_cols = ['gender', 'partner', 'dependents',
       'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod','churn']

categorical_cols = ['gender', 'seniorcitizen', 'partner', 'dependents',
       'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod']

numerical_cols = ['tenure', 'monthlycharges', 'totalcharges']

In [10]:
for col in process_cols:
    df[col] = df[col].str.lower().str.replace(' ','_')

In [11]:
df

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,7590-VHVEG,female,0,yes,no,1,no,no_phone_service,dsl,no,...,no,no,no,no,month-to-month,yes,electronic_check,29.85,29.85,no
1,5575-GNVDE,male,0,no,no,34,yes,no,dsl,yes,...,yes,no,no,no,one_year,no,mailed_check,56.95,1889.5,no
2,3668-QPYBK,male,0,no,no,2,yes,no,dsl,yes,...,no,no,no,no,month-to-month,yes,mailed_check,53.85,108.15,yes
3,7795-CFOCW,male,0,no,no,45,no,no_phone_service,dsl,yes,...,yes,yes,no,no,one_year,no,bank_transfer_(automatic),42.30,1840.75,no
4,9237-HQITU,female,0,no,no,2,yes,no,fiber_optic,no,...,no,no,no,no,month-to-month,yes,electronic_check,70.70,151.65,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,male,0,yes,yes,24,yes,yes,dsl,yes,...,yes,yes,yes,yes,one_year,yes,mailed_check,84.80,1990.5,no
7039,2234-XADUH,female,0,yes,yes,72,yes,yes,fiber_optic,no,...,yes,no,yes,yes,one_year,yes,credit_card_(automatic),103.20,7362.9,no
7040,4801-JZAZL,female,0,yes,yes,11,no,no_phone_service,dsl,yes,...,no,no,no,no,month-to-month,yes,electronic_check,29.60,346.45,no
7041,8361-LTMKD,male,1,yes,no,4,yes,yes,fiber_optic,no,...,no,no,no,no,month-to-month,yes,mailed_check,74.40,306.6,yes


In [12]:
df['churn'] = (df['churn'] == 'yes').astype(int)

In [13]:
df['totalcharges'] = pd.to_numeric(df['totalcharges'],errors='coerce')

In [14]:
df.dtypes

customerid           object
gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
multiplelines        object
internetservice      object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
paperlessbilling     object
paymentmethod        object
monthlycharges      float64
totalcharges        float64
churn                 int64
dtype: object

In [15]:
df['totalcharges'] = df['totalcharges'].fillna(0)

In [16]:
df.columns

Index(['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents',
       'tenure', 'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod', 'monthlycharges', 'totalcharges', 'churn'],
      dtype='object')

In [17]:
# df['seniorcitizen'] = df['seniorcitizen'].astype(object)

In [18]:
# df.dtypes

In [19]:
#Find mutual information score for categorical features
def get_mutual_info_score(series):
    return mutual_info_score(series,df['churn'])

mi_score = []
for col in categorical_cols:
    entry = get_mutual_info_score(df[col])
    mi_score.append({'feature': col, 'score':entry})
    
mi_score

[{'feature': 'gender', 'score': 3.7082914405128786e-05},
 {'feature': 'seniorcitizen', 'score': 0.010577263953987642},
 {'feature': 'partner', 'score': 0.011453657253317984},
 {'feature': 'dependents', 'score': 0.014467261139424592},
 {'feature': 'phoneservice', 'score': 7.215949186982484e-05},
 {'feature': 'multiplelines', 'score': 0.0008012658524292199},
 {'feature': 'internetservice', 'score': 0.05557418477268879},
 {'feature': 'onlinesecurity', 'score': 0.06467728245735829},
 {'feature': 'onlinebackup', 'score': 0.04679232253922637},
 {'feature': 'deviceprotection', 'score': 0.04391690927485155},
 {'feature': 'techsupport', 'score': 0.06302103606897548},
 {'feature': 'streamingtv', 'score': 0.031907975162527094},
 {'feature': 'streamingmovies', 'score': 0.03200094959522297},
 {'feature': 'contract', 'score': 0.09845305342598942},
 {'feature': 'paperlessbilling', 'score': 0.019194399646111526},
 {'feature': 'paymentmethod', 'score': 0.044518668630902994}]

In [20]:
#Sort scores to find the least important feature
def get_score(entry):
    return entry['score']

mi_score.sort(key=get_score,reverse=True)
mi_score

[{'feature': 'contract', 'score': 0.09845305342598942},
 {'feature': 'onlinesecurity', 'score': 0.06467728245735829},
 {'feature': 'techsupport', 'score': 0.06302103606897548},
 {'feature': 'internetservice', 'score': 0.05557418477268879},
 {'feature': 'onlinebackup', 'score': 0.04679232253922637},
 {'feature': 'paymentmethod', 'score': 0.044518668630902994},
 {'feature': 'deviceprotection', 'score': 0.04391690927485155},
 {'feature': 'streamingmovies', 'score': 0.03200094959522297},
 {'feature': 'streamingtv', 'score': 0.031907975162527094},
 {'feature': 'paperlessbilling', 'score': 0.019194399646111526},
 {'feature': 'dependents', 'score': 0.014467261139424592},
 {'feature': 'partner', 'score': 0.011453657253317984},
 {'feature': 'seniorcitizen', 'score': 0.010577263953987642},
 {'feature': 'multiplelines', 'score': 0.0008012658524292199},
 {'feature': 'phoneservice', 'score': 7.215949186982484e-05},
 {'feature': 'gender', 'score': 3.7082914405128786e-05}]

In [21]:
df[numerical_cols].corrwith(df['churn'])

tenure           -0.352229
monthlycharges    0.193356
totalcharges     -0.198324
dtype: float64

In [22]:
use_categorical_features = ['seniorcitizen', 'partner', 'dependents', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod']

use_numerical_features = ['tenure', 'monthlycharges', 'totalcharges']

all_features = use_categorical_features + use_numerical_features + target
all_features

['seniorcitizen',
 'partner',
 'dependents',
 'internetservice',
 'onlinesecurity',
 'onlinebackup',
 'deviceprotection',
 'techsupport',
 'streamingtv',
 'streamingmovies',
 'contract',
 'paperlessbilling',
 'paymentmethod',
 'tenure',
 'monthlycharges',
 'totalcharges',
 'churn']

#### Model prediction with all features

In [23]:
df_full_train,df_test = train_test_split(df[all_features],test_size=0.2,random_state=42)
df_train,df_val = train_test_split(df_full_train,test_size=0.25,random_state=42)

y_train = df_train['churn'].values
y_val = df_val['churn'].values
y_test = df_test['churn'].values

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
                                            
del df_train['churn']
del df_val['churn']
del df_test['churn']

In [24]:
dv = DictVectorizer(sparse=False)

dict_train = df_train.to_dict(orient='records')
dict_val = df_val.to_dict(orient='records')
dict_test = df_test.to_dict(orient='records')

X_train = dv.fit_transform(dict_train)
X_val = dv.transform(dict_val)
X_test = dv.transform(dict_test)

In [25]:
#LogisticRegression

model = LogisticRegression()
model.fit(X_train,y_train)

y_pred = model.predict(X_val)

accuracy_all_features = (y_pred == y_val).mean()
accuracy_all_features

0.8019872249822569

#### Model prediction removing least useful features

In [26]:
df_full_train,df_test = train_test_split(df[all_features],test_size=0.2,random_state=42)
df_train,df_val = train_test_split(df_full_train,test_size=0.25,random_state=42)

y_train = df_train['churn'].values
y_val = df_val['churn'].values
y_test = df_test['churn'].values

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
                                            
del df_train['churn']
del df_val['churn']
del df_test['churn']

In [27]:
dv = DictVectorizer(sparse=False)

dict_train = df_train.to_dict(orient='records')
dict_val = df_val.to_dict(orient='records')
dict_test = df_test.to_dict(orient='records')

X_train = dv.fit_transform(dict_train)
X_val = dv.transform(dict_val)
X_test = dv.transform(dict_test)

In [28]:
#LogisticRegression

model = LogisticRegression()
model.fit(X_train,y_train)

y_pred = model.predict(X_val)

accuracy_important_features = (y_pred == y_val).mean()
accuracy_important_features

0.8019872249822569

In [29]:
print(accuracy_all_features, accuracy_important_features, abs(accuracy_all_features - accuracy_important_features))

0.8019872249822569 0.8019872249822569 0.0


#### Accuracy of prediction using all features is exactly the same as that excluding the least useful features