In [1]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import train_test_split

import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("https://raw.githubusercontent.com/srivatsan88/YouTubeLI/master/dataset/WA_Fn-UseC_-Telco-Customer-Churn.csv")
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
X = df.drop(columns=['Churn'])
y = df['Churn']

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)


In [5]:
DROP_FEAT = ['customerID', 'gender', 'PhoneService', 'MultipleLines', 'PaperlessBilling', 'PaymentMethod']
NUMERIC_FEAT = ['tenure', 'TotalCharges']
CATEGORICAL_FEAT = ['SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
                    'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract']

In [6]:
drop_transformer = ColumnTransformer(transformers=[('drop_columns', 'drop', DROP_FEAT)], remainder='passthrough')

In [7]:
pipeline = Pipeline([('drop_column', drop_transformer)])
pipeline.fit(X_train)


Pipeline(memory=None,
         steps=[('drop_column',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('drop_columns', 'drop',
                                                  ['customerID', 'gender',
                                                   'PhoneService',
                                                   'MultipleLines',
                                                   'PaperlessBilling',
                                                   'PaymentMethod'])],
                                   verbose=False))],
         verbose=False)

In [8]:
transformed_train = pipeline.transform(X_train)

In [9]:
transformed_train

array([[0, 'Yes', 'Yes', ..., 'One year', 70.7, '3770'],
       [0, 'No', 'No', ..., 'Month-to-month', 80.55, '80.55'],
       [0, 'No', 'No', ..., 'One year', 19.3, '259.65'],
       ...,
       [0, 'Yes', 'Yes', ..., 'Month-to-month', 21.15, '306.05'],
       [1, 'No', 'No', ..., 'Month-to-month', 99.45, '1200.15'],
       [0, 'No', 'No', ..., 'One year', 19.8, '457.3']], dtype=object)

In [10]:
df.isna().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [11]:
df.replace(r"^\s*$", np.nan, regex=True).isna().sum()

customerID           0
gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [12]:
def remove_spaces(input_df):
  input_df['TotalCharges'] = input_df['TotalCharges'].replace(r"^\s*$", np.nan, regex=True)
  return input_df

In [13]:
class SpaceImputeTransformer():

  def __init__(self, func):
    self.func = func

  def transform(self, input_df, **transform_params):
    return self.func(input_df)

  def fit(self, X, y=None, **fit_params):
    return self

In [14]:
pipeline = Pipeline([
                     ('space_remover', SpaceImputeTransformer(remove_spaces)),
                     ('drop_column', drop_transformer)
                     ])

In [15]:
pipeline.fit(X_train)

Pipeline(memory=None,
         steps=[('space_remover',
                 <__main__.SpaceImputeTransformer object at 0x7ff46cb0b990>),
                ('drop_column',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('drop_columns', 'drop',
                                                  ['customerID', 'gender',
                                                   'PhoneService',
                                                   'MultipleLines',
                                                   'PaperlessBilling',
                                                   'PaymentMethod'])],
                                   verbose=False))],
         verbose=False)

In [16]:
transformed_train = pipeline.transform(X_train)

In [17]:
transformed_train

array([[0, 'Yes', 'Yes', ..., 'One year', 70.7, '3770'],
       [0, 'No', 'No', ..., 'Month-to-month', 80.55, '80.55'],
       [0, 'No', 'No', ..., 'One year', 19.3, '259.65'],
       ...,
       [0, 'Yes', 'Yes', ..., 'Month-to-month', 21.15, '306.05'],
       [1, 'No', 'No', ..., 'Month-to-month', 99.45, '1200.15'],
       [0, 'No', 'No', ..., 'One year', 19.8, '457.3']], dtype=object)

In [18]:
np.isnan(transformed_train[:, -1].astype(np.float)).sum()

8

In [19]:
np.isnan(pipeline.transform(X_test)[:, -1].astype(np.float)).sum()

3

In [20]:
numeric_transformer = Pipeline(steps=[
                                      ('mean_imputer', SimpleImputer(strategy='mean')),
                                      ('std_scaler', StandardScaler())
                                      ])

categorical_transformer = Pipeline(steps=[
                                          ('onehotenc', OneHotEncoder(handle_unknown='ignore'))
                                          ])

In [21]:
col_transformer = ColumnTransformer(transformers=[('drop_cols', 'drop', DROP_FEAT),
                                                  ('numeric_processing', numeric_transformer, NUMERIC_FEAT),
                                                  ('categorical_processing', categorical_transformer, CATEGORICAL_FEAT)],
                                    remainder='drop')

In [22]:
pipeline = Pipeline([
                     ('space_remover', SpaceImputeTransformer(remove_spaces)),
                     ('transform_column', col_transformer),
                     ])

In [23]:
pipeline.fit(X_train)

Pipeline(memory=None,
         steps=[('space_remover',
                 <__main__.SpaceImputeTransformer object at 0x7ff4680c4bd0>),
                ('transform_column',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('drop_cols', 'drop',
                                                  ['customerID', 'gender',
                                                   'PhoneService',
                                                   'MultipleLines',
                                                   'PaperlessBilling',
                                                   'PaymentMethod']),
                                                 ('numeric_...
                                                           steps=[('onehotenc',
                                                                   OneHotEncoder

In [24]:
transformed_train = pipeline.transform(X_train)

In [25]:
pd.DataFrame(transformed_train)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33
0,0.881078,0.653404,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
1,-1.284263,-0.976492,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
2,-0.793997,-0.897371,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,-0.344587,-0.012148,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
4,-1.079985,-0.812882,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4925,-1.284263,-0.970108,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
4926,-0.385442,-0.040929,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
4927,-0.834852,-0.876872,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4928,-0.834852,-0.481884,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0


In [26]:
pipeline.transform(X_test)

array([[-1.28426262, -1.00112086,  1.        , ...,  1.        ,
         0.        ,  0.        ],
       [ 0.34995661, -0.57187301,  1.        , ...,  1.        ,
         0.        ,  0.        ],
       [ 0.7993669 , -0.55630055,  1.        , ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 1.12621075,  0.62002807,  1.        , ...,  0.        ,
         0.        ,  1.        ],
       [-0.50800848, -0.2251261 ,  1.        , ...,  1.        ,
         0.        ,  0.        ],
       [-0.42629752, -0.467416  ,  1.        , ...,  1.        ,
         0.        ,  0.        ]])

In [32]:
CAT_COLS = pipeline.named_steps['transform_column'].transformers_[2][1].named_steps['onehotenc'].get_feature_names(CATEGORICAL_FEAT)

In [33]:
NUMERIC_FEAT + CAT_COLS.tolist()

['tenure',
 'TotalCharges',
 'SeniorCitizen_0',
 'SeniorCitizen_1',
 'Partner_No',
 'Partner_Yes',
 'Dependents_No',
 'Dependents_Yes',
 'PhoneService_No',
 'PhoneService_Yes',
 'InternetService_DSL',
 'InternetService_Fiber optic',
 'InternetService_No',
 'OnlineSecurity_No',
 'OnlineSecurity_No internet service',
 'OnlineSecurity_Yes',
 'OnlineBackup_No',
 'OnlineBackup_No internet service',
 'OnlineBackup_Yes',
 'DeviceProtection_No',
 'DeviceProtection_No internet service',
 'DeviceProtection_Yes',
 'TechSupport_No',
 'TechSupport_No internet service',
 'TechSupport_Yes',
 'StreamingTV_No',
 'StreamingTV_No internet service',
 'StreamingTV_Yes',
 'StreamingMovies_No',
 'StreamingMovies_No internet service',
 'StreamingMovies_Yes',
 'Contract_Month-to-month',
 'Contract_One year',
 'Contract_Two year']

In [34]:
for name, estimator, features in pipeline.named_steps['transform_column'].transformers_:
  print("=============================")
  print(name)
  print(features)

drop_cols
['customerID', 'gender', 'PhoneService', 'MultipleLines', 'PaperlessBilling', 'PaymentMethod']
numeric_processing
['tenure', 'TotalCharges']
categorical_processing
['SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract']
remainder
[18]


In [36]:
pd.DataFrame(transformed_train, columns = NUMERIC_FEAT + CAT_COLS.tolist())

Unnamed: 0,tenure,TotalCharges,SeniorCitizen_0,SeniorCitizen_1,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,PhoneService_Yes,InternetService_DSL,InternetService_Fiber optic,InternetService_No,OnlineSecurity_No,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_No,DeviceProtection_No internet service,DeviceProtection_Yes,TechSupport_No,TechSupport_No internet service,TechSupport_Yes,StreamingTV_No,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year
0,0.881078,0.653404,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
1,-1.284263,-0.976492,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
2,-0.793997,-0.897371,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,-0.344587,-0.012148,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
4,-1.079985,-0.812882,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4925,-1.284263,-0.970108,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
4926,-0.385442,-0.040929,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
4927,-0.834852,-0.876872,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4928,-0.834852,-0.481884,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0


In [37]:
pipeline = Pipeline([
                     ('space_remover', SpaceImputeTransformer(remove_spaces)),
                     ('transform_column', col_transformer),
                     ('logistics', LogisticRegression())
                     ])

In [38]:
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('space_remover',
                 <__main__.SpaceImputeTransformer object at 0x7ff467a9f910>),
                ('transform_column',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('drop_cols', 'drop',
                                                  ['customerID', 'gender',
                                                   'PhoneService',
                                                   'MultipleLines',
                                                   'PaperlessBilling',
                                                   'PaymentMethod']),
                                                 ('numeric_...
                                                   'DeviceProtection',
                                                   'TechSupport', 'StreamingTV',
        

In [39]:
pipeline.score(X_test, y_test)

0.8059630856601988

In [40]:
y_pred = pipeline.predict(X_test)

In [41]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[1381,  158],
       [ 252,  322]])