# 1. Load Data and Initial Read

## 1.1 Load Libraries

In [1]:
# Python libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as ss
%matplotlib inline
import itertools
import lightgbm as lgbm
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import precision_score, recall_score, confusion_matrix,  roc_curve, precision_recall_curve, accuracy_score, roc_auc_score
from datetime import datetime
import lightgbm as lgbm
import warnings
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.figure_factory as ff
import warnings

from contextlib import contextmanager

@contextmanager
def timer(title):
    t0 = time.time()
    yield
    print("{} - done in {:.0f}s".format(title, time.time() - t0))

warnings.filterwarnings('ignore') #ignore warning messages 

## 1.2 Load Dataset

In [2]:
telcom = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
telcom.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


## 1.3 Dataset Info

In [3]:
display(telcom.shape)
display(telcom.info())

(7043, 21)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
customerID          7043 non-null object
gender              7043 non-null object
SeniorCitizen       7043 non-null int64
Partner             7043 non-null object
Dependents          7043 non-null object
tenure              7043 non-null int64
PhoneService        7043 non-null object
MultipleLines       7043 non-null object
InternetService     7043 non-null object
OnlineSecurity      7043 non-null object
OnlineBackup        7043 non-null object
DeviceProtection    7043 non-null object
TechSupport         7043 non-null object
StreamingTV         7043 non-null object
StreamingMovies     7043 non-null object
Contract            7043 non-null object
PaperlessBilling    7043 non-null object
PaymentMethod       7043 non-null object
MonthlyCharges      7043 non-null float64
TotalCharges        7043 non-null object
Churn               7043 non-null object
dtypes: float64(1), int64(2), obj

None

## 1.4 Initial Data Processing

In [4]:
# Reassign target
telcom.Churn.replace(to_replace = dict(Yes = 1, No = 0), inplace = True)

# Encode as object
col_name = ['SeniorCitizen', 'Churn']
telcom[col_name] = telcom[col_name].astype(object)

# Encode as float
telcom['TotalCharges'] = telcom['TotalCharges'].replace(" ", 0).astype('float64')

# 2. Feature Engineering and Selection

## 2.1 New Features

In [5]:
telcom.loc[:,'Engaged']=1 
telcom.loc[(telcom['Contract']=='Month-to-month'),'Engaged']=0

telcom.loc[:,'YandNotE']=0
telcom.loc[(telcom['SeniorCitizen']==0) & (telcom['Engaged']==0),'YandNotE']=1

telcom.loc[:,'ElectCheck']=0 
telcom.loc[(telcom['PaymentMethod']=='Electronic check') & (telcom['Engaged']==0),'ElectCheck']=1

telcom.loc[:,'fiberopt']=1 
telcom.loc[(telcom['InternetService']!='Fiber optic'),'fiberopt']=0

telcom.loc[:,'StreamNoInt']=1 
telcom.loc[(telcom['StreamingTV']!='No internet service'),'StreamNoInt']=0

telcom.loc[:,'NoProt']=1 
telcom.loc[(telcom['OnlineBackup']!='No') | (telcom['DeviceProtection']!='No') | (telcom['TechSupport']!='No'),'NoProt']=0

telcom['TotalServices'] = (telcom[['PhoneService', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']]== 'Yes').sum(axis=1)

In [6]:
telcom['tenure'] = pd.cut(telcom['tenure'], 5)

## 2.2 Drop Irrelavant Features

In [7]:
telcom = telcom.drop(columns = [
                            'Contract',
                            'DeviceProtection', 
                            'Partner'
                           ])

## 2.3 Feature encoding and scaling

In [8]:
#customer id col
Id_col     = ['customerID']
#Target columns
target_col = ["Churn"]
#categorical columns
cat_cols   = telcom.nunique()[telcom.nunique() < 10].keys().tolist()
cat_cols   = [x for x in cat_cols if x not in target_col]
#numerical columns
num_cols   = [x for x in telcom.columns if x not in cat_cols + target_col + Id_col]
#Binary columns with 2 values
bin_cols   = telcom.nunique()[telcom.nunique() == 2].keys().tolist()
#Columns more than 2 values
multi_cols = [i for i in cat_cols if i not in bin_cols]

#Label encoding Binary columns
le = LabelEncoder()
for i in bin_cols :
    telcom[i] = le.fit_transform(telcom[i])
    
#Duplicating columns for multi value columns
telcom = pd.get_dummies(data = telcom,columns = multi_cols )

#Scaling Numerical columns
std = StandardScaler()
scaled = std.fit_transform(telcom[num_cols])
scaled = pd.DataFrame(scaled,columns=num_cols)

#dropping original values merging scaled values for numerical columns
df_telcom_og = telcom.copy()
telcom = telcom.drop(columns = num_cols,axis = 1)
telcom = telcom.merge(scaled,left_index=True,right_index=True,how = "left")
telcom = telcom.drop(['customerID'],axis = 1)

## 2.4 Correlation Matrix

In [9]:
def correlation_plot():
    #correlation
    correlation = telcom.corr()
    #tick labels
    matrix_cols = correlation.columns.tolist()
    #convert to array
    corr_array  = np.array(correlation)
    trace = go.Heatmap(z = corr_array,
                       x = matrix_cols,
                       y = matrix_cols,
                       colorscale='Viridis',
                       colorbar   = dict() ,
                      )
    layout = go.Layout(dict(title = 'Correlation Matrix for variables',
                            #autosize = False,
                            #height  = 1400,
                            #width   = 1600,
                            margin  = dict(r = 0 ,l = 210,
                                           t = 25,b = 210,
                                         ),
                            yaxis   = dict(tickfont = dict(size = 9)),
                            xaxis   = dict(tickfont = dict(size = 9)),
                           )
                      )
    fig = go.Figure(data = [trace],layout = layout)
    py.iplot(fig)

## 2.5 Remove Collinear Features

In [10]:
#Threshold for removing correlated variables
threshold = 0.9

# Absolute value correlation matrix
corr_matrix = telcom.corr().abs()
corr_matrix.head()

# Upper triangle of correlations
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
upper.head()

# Select columns with correlations above threshold
to_drop = [column for column in upper.columns if any(upper[column] > threshold)]

print('There are %d columns to remove :' % (len(to_drop)))

telcom = telcom.drop(columns = to_drop)

to_drop

There are 8 columns to remove :


['MultipleLines_No phone service',
 'InternetService_Fiber optic',
 'InternetService_No',
 'OnlineSecurity_No internet service',
 'OnlineBackup_No internet service',
 'TechSupport_No internet service',
 'StreamingTV_No internet service',
 'StreamingMovies_No internet service']

In [11]:
correlation_plot()

# 3. Prepare Dataset 

In [12]:
# Def X and Y
y = telcom.Churn
X = telcom.drop('Churn', 1)

# Train_test split
random_state = 42
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = random_state)

In [13]:
# Cross val metric
def cross_val_metrics(model) :
    scores = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
    for sc in scores:
        scores = cross_val_score(model, X, y, cv = 5, scoring = sc)
        print('[%s] : %0.5f (+/- %0.5f)'%(sc, scores.mean(), scores.std()))

# 4. LightGBM Model

In [21]:
%%time
lgbm_clf = lgbm.LGBMClassifier(n_estimators=1000, random_state = 42)

fit = lgbm_clf.fit(X_train, y_train)
predictions = fit.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
roc = roc_auc_score(y_test, predictions)
print(confusion_matrix(y_test, predictions))
print(accuracy)
print(roc)

[[918 118]
 [188 185]]
0.7828246983676366
0.6910394691896034
Wall time: 942 ms


### RandomizedSearchCV to optimiza hyperparameters

In [22]:
fit_params = {"early_stopping_rounds" : 50, 
             "eval_metric" : 'binary', 
             "eval_set" : [(X_test,y_test)],
             'eval_names': ['valid'],
             'verbose': 0,
             'categorical_feature': 'auto'}

param_test = {'learning_rate' : [0.01, 0.02, 0.03, 0.04, 0.05, 0.08, 0.1, 0.2, 0.3, 0.4],
              'n_estimators' : [100, 200, 300, 400, 500, 600, 800, 1000, 1500, 2000, 3000, 5000],
              'num_leaves': sp_randint(6, 50), 
              'min_child_samples': sp_randint(100, 500), 
              'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
              'subsample': sp_uniform(loc=0.2, scale=0.8), 
              'max_depth': [-1, 1, 2, 3, 4, 5, 6, 7],
              'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
              'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
              'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]}

#number of combinations
n_iter = 200

#intialize lgbm and lunch the search
lgbm_clf = lgbm.LGBMClassifier(random_state=random_state, silent=True, metric='None', n_jobs=4)
grid_search = RandomizedSearchCV(
    estimator=lgbm_clf, param_distributions=param_test, 
    n_iter=n_iter,
    scoring='accuracy',
    cv=5,
    refit=True,
    random_state=random_state,
    verbose=True)

grid_search.fit(X_train, y_train, **fit_params)
print('Best params: {} '.format(grid_search.best_params_))

opt_parameters =  grid_search.best_params_

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:  2.6min finished


Best params: {'colsample_bytree': 0.6982071692927494, 'learning_rate': 0.03, 'max_depth': 6, 'min_child_samples': 231, 'min_child_weight': 1e-05, 'n_estimators': 1000, 'num_leaves': 34, 'reg_alpha': 10, 'reg_lambda': 50, 'subsample': 0.8025969765979608} 


In [24]:
%%time
lgbm_clf = lgbm.LGBMClassifier(**opt_parameters)

fit = lgbm_clf.fit(X_train, y_train)
predictions = fit.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
roc = roc_auc_score(y_test, predictions)
print(confusion_matrix(y_test, predictions))
print(accuracy)
print(roc)

[[946  90]
 [169 204]]
0.8161816891412349
0.7300221516039209
Wall time: 337 ms


accuracy increased to 81.6%

### Cross validation 

In [25]:
cross_val_metrics(lgbm_clf)

[accuracy] : 0.79356 (+/- 0.00801)
[precision] : 0.63920 (+/- 0.02227)
[recall] : 0.51204 (+/- 0.00738)
[f1] : 0.56842 (+/- 0.01079)
[roc_auc] : 0.83378 (+/- 0.00829)


5 fold cross validation has increased roc_auc score to 83.4%