In [13]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_selection import f_classif, SelectPercentile
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

%matplotlib inline

# Import dataset 

In [2]:
df = pd.read_csv("Customer-Churn.csv")
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   OnlineSecurity    7043 non-null   object 
 9   OnlineBackup      7043 non-null   object 
 10  DeviceProtection  7043 non-null   object 
 11  TechSupport       7043 non-null   object 
 12  StreamingTV       7043 non-null   object 
 13  StreamingMovies   7043 non-null   object 
 14  Contract          7043 non-null   object 
 15  PaperlessBilling  7043 non-null   object 
 16  PaymentMethod     7043 non-null   object 


# Missing values

In [5]:
df.isnull().sum()

gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [3]:
# Dropping columns with missing values 
df.dropna(inplace=True)

# Splitting data

In [4]:
#Separate features and label

X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['Churn']), df["Churn"], 
                                                    train_size = 0.7, stratify = df["Churn"] )

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(4922, 19) (4922,)
(2110, 19) (2110,)


# Preprocessing 

In [5]:
#preprocessor for feature matrix X
preprocessor = ColumnTransformer([('scaler', StandardScaler(), make_column_selector(dtype_include = 'number')),
                                 ('encoder', OneHotEncoder(drop = 'first'), make_column_selector(dtype_include = 'object'))])

In [6]:
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [10]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(4922, 30) (4922,)
(2110, 30) (2110,)


# Model Selection 

Testing different baseline models

In [7]:
# Iterate through the groups of model to find the model with higest accuracy
# using 5 fold cross validation
for model in [GaussianNB(),KNeighborsClassifier(),GradientBoostingClassifier(), RandomForestClassifier(),SVC()]:
    cv_score = cross_val_score(model, X_train, y_train, scoring='accuracy' ,cv=5)
    print("Accuracy score for %s: %.3f"%(model.__class__.__name__, cv_score.mean()))

Accuracy score for GaussianNB: 0.659
Accuracy score for KNeighborsClassifier: 0.764
Accuracy score for GradientBoostingClassifier: 0.804
Accuracy score for RandomForestClassifier: 0.786
Accuracy score for SVC: 0.795


Cross validation score indicates that SVC and GradientBoostinglassifier are the best performing models 

# Model Optimization

## Feature Selection 

In [21]:
Selector = SelectPercentile(f_classif,percentile=80)

In [22]:
X_train = Selector.fit_transform(X_train,y_train)
X_test = Selector.transform(X_test)

## Fine Tuning Model 

Support Vector Machine 

In [23]:
rbf_svc = SVC(kernel='rbf')

# setting possible values for the penalty parameter of LogisticRegression()
rbf_param = {'C':np.arange(0.1,1,0.1),
            'gamma':np.arange(0.01,0.1,0.01)}

rbf_tune = GridSearchCV(estimator = rbf_svc, param_grid = rbf_param,
                       scoring='accuracy',n_jobs=-1, cv=5,
                       return_train_score=True)
# Fit the training data into it,
#It will determine which parameter 
#returns the best model using cross validation
rbf_tune.fit(X_train,y_train)

# Get the best hyper parameters for the model
rbf_tune.best_params_

{'C': 0.9, 'gamma': 0.09}

In [24]:
# Get the best models from GridSearch
svc_model = rbf_tune.best_estimator_

svc_model.fit(X_train, y_train)

SVC(C=0.9, gamma=0.09)

___

GradientBoostinglassifier

In [29]:
gbc_param_grid = [{'loss':['deviance', 'exponential'],
                   'learning_rate':np.arange(0,1,0.1),
                   'n_estimators':np.arange(50,400,50)}]

gbc_tune = GridSearchCV(GradientBoostingClassifier(), gbc_param_grid, 
                        cv=5,scoring='accuracy',n_jobs=-1,
                        return_train_score=True)

gbc_tune.fit(X_train,y_train)

gbc_tune.best_params_

{'learning_rate': 0.1, 'loss': 'deviance', 'n_estimators': 100}

In [30]:
gbr_model = gbc_tune.best_estimator_

gbr_model.fit(X_train, y_train)

GradientBoostingClassifier()

# Evaluate Classification model 

## Support Vector Classifier

In [25]:
#fitting training set to data
y_predicted = svc_model.predict(X_test)

In [26]:
accuracy_score(y_true= y_test, y_pred= y_predicted)

0.8028436018957346

In [27]:
confusion_matrix(y_true = y_test,y_pred = y_predicted)

array([[1432,  117],
       [ 299,  262]])

In [28]:
print(classification_report(y_test, y_predicted))

              precision    recall  f1-score   support

          No       0.83      0.92      0.87      1549
         Yes       0.69      0.47      0.56       561

    accuracy                           0.80      2110
   macro avg       0.76      0.70      0.72      2110
weighted avg       0.79      0.80      0.79      2110



## Gradient Boosting Classifier 

In [31]:
y_predicted = gbr_model.predict(X_test)

In [32]:
accuracy_score(y_true= y_test, y_pred= y_predicted)

0.8028436018957346

In [33]:
confusion_matrix(y_true = y_test,y_pred = y_predicted)

array([[1390,  159],
       [ 257,  304]])

In [34]:
print(classification_report(y_test, y_predicted))

              precision    recall  f1-score   support

          No       0.84      0.90      0.87      1549
         Yes       0.66      0.54      0.59       561

    accuracy                           0.80      2110
   macro avg       0.75      0.72      0.73      2110
weighted avg       0.79      0.80      0.80      2110

