In [79]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, PowerTransformer, PolynomialFeatures
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.impute import SimpleImputer

In [46]:
# 10-fold stratified CV splitter (shuffled, fixed seed) passed as `cv` to the
# cross-validation calls below.
k10 = StratifiedKFold(n_splits=10, shuffle=True, random_state=560)

In [19]:
data_file = "data/WA_Fn-UseC_-Telco-Customer-Churn.csv"

def get_data(filename):
    """Load the churn CSV and split it into features and target.

    The first CSV column is used as the row index. `TotalCharges` is
    converted to float; entries that are empty or contain only whitespace
    become NaN.

    Parameters
    ----------
    filename : str or path-like
        Location of the CSV file.

    Returns
    -------
    X : pandas.DataFrame
        Every column except "Churn".
    y : pandas.Series
        The "Churn" column.

    Raises
    ------
    ValueError
        If `TotalCharges` holds a non-blank value that is not a number.
    """
    data = pd.read_csv(filename, index_col=0)

    total = data['TotalCharges']
    # Treat any whitespace-only field as missing, not just the single-space
    # string, so a stray "" or "  " does not crash the float conversion.
    blank = total.astype(str).str.strip() == ""
    data['TotalCharges'] = total.mask(blank).astype(float)

    X = data.drop("Churn", axis=1)
    y = data['Churn']
    return X, y

In [21]:
# Load the feature frame and the target series, then display their shapes.
X, y = get_data(data_file)
X.shape, y.shape

((7043, 19), (7043,))

In [23]:
# Class balance of the target: fraction of customers in each Churn class
y.value_counts(normalize=True)

No     0.73463
Yes    0.26537
Name: Churn, dtype: float64

In [24]:
# Hold out 25% of the rows as a test set. Stratifying on y and fixing the seed
# keeps the split repeatable.
X_train, X_test, y_train , y_test = train_test_split(X, y, test_size=.25, 
                                                     stratify=y, random_state=42)

In [25]:
# Row/column counts of the train and test feature frames
X_train.shape, X_test.shape

((5282, 19), (1761, 19))

In [26]:
# Class proportions in the training target
y_train.value_counts(normalize=True)

No     0.73457
Yes    0.26543
Name: Churn, dtype: float64

In [27]:
# Class proportions in the test target
y_test.value_counts(normalize=True)

No     0.73481
Yes    0.26519
Name: Churn, dtype: float64

In [28]:
# Summary statistics of the numeric columns
X.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges
count,7043.0,7043.0,7043.0,7032.0
mean,0.162147,32.371149,64.761692,2283.300441
std,0.368612,24.559481,30.090047,2266.771362
min,0.0,0.0,18.25,18.8
25%,0.0,9.0,35.5,401.45
50%,0.0,29.0,70.35,1397.475
75%,0.0,55.0,89.85,3794.7375
max,1.0,72.0,118.75,8684.8


In [30]:
# Summary of the object-dtype columns, transposed so each column is one row
X.describe(include="O").T

Unnamed: 0,count,unique,top,freq
gender,7043,2,Male,3555
Partner,7043,2,No,3641
Dependents,7043,2,No,4933
PhoneService,7043,2,Yes,6361
MultipleLines,7043,3,No,3390
InternetService,7043,3,Fiber optic,3096
OnlineSecurity,7043,3,No,3498
OnlineBackup,7043,3,No,3088
DeviceProtection,7043,3,No,3095
TechSupport,7043,3,No,3473


In [39]:
# Partition the feature columns into three groups: numeric, binary, nominal.

cols_by_dtype = X.dtypes

# anything that is not object dtype counts as numeric; object dtype as categorical
num_cols = [col for col, dtype in cols_by_dtype.items() if dtype != "object"]
cat_cols = [col for col, dtype in cols_by_dtype.items() if dtype == "object"]

# number of distinct values per categorical column
# (variable name kept as-is, spelling included, because other cells may use it)
cols_by_nunqiue = X[cat_cols].nunique()

# two distinct values -> binary; more than two -> nominal (rebinds cat_cols)
bin_cols = [col for col, n_levels in cols_by_nunqiue.items() if n_levels == 2]
cat_cols = [col for col, n_levels in cols_by_nunqiue.items() if n_levels > 2]

In [47]:
# classifier candidates

# Fixed random_state so repeated runs of the cross-validation cells give the
# same scores (the estimator is randomized unless a seed is supplied).
gb = GradientBoostingClassifier(random_state=42)

In [68]:
def get_cv_score(pipe, X, y, cv, scoring, text=""):
    """Cross-validate `pipe` and print the mean and std of the fold scores.

    Parameters
    ----------
    pipe : estimator
        Estimator handed to `cross_val_score`.
    X, y : features and target to cross-validate on.
    cv : splitter or int forwarded as `cv`.
    scoring : scoring spec forwarded as `scoring`.
    text : str, optional
        Label printed on the first line of the report.

    Returns
    -------
    None; the report is printed.
    """
    fold_scores = cross_val_score(pipe, X, y, cv=cv, scoring=scoring)
    mean_score = np.mean(fold_scores)
    std_score = np.std(fold_scores)
    report = "CV-score: {}\n- mean = {:.3f}, std = {:.3f}"
    print(report.format(text, mean_score, std_score))

class FeatureSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that keeps only the requested DataFrame columns.

    Parameters
    ----------
    columns : list of column labels or None, default None
        Columns to keep. None keeps every column.
    """

    def __init__(self, columns=None):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned; present for pipeline compatibility. Returns self."""
        return self

    def transform(self, X):
        """Return an independent copy of `X` limited to `self.columns`.

        Indexing with labels missing from `X` raises pandas' KeyError.
        """
        if self.columns is None:
            return X.copy()
        # Select first, then copy: avoids duplicating columns that are dropped anyway.
        return X[self.columns].copy()
    
class OrdinalEncoderDf(OrdinalEncoder):
    """OrdinalEncoder variant that takes a DataFrame and returns a DataFrame.

    The encoded values are wrapped in a DataFrame carrying the column names
    seen during `fit` and the row index of the frame passed to `transform`.

    Parameters
    ----------
    categories, dtype
        Forwarded unchanged to `OrdinalEncoder`.
    """

    def __init__(self, categories="auto", dtype=np.float64):
        # Forward by keyword: the parent constructor does not accept these
        # positionally on scikit-learn releases where they are keyword-only.
        super().__init__(categories=categories, dtype=dtype)

    def fit(self, X, y=None):
        """Fit the encoder on DataFrame `X` and remember its column names."""
        super().fit(X)
        self.column_names_ = X.columns.tolist()
        return self

    def transform(self, X):
        """Encode `X`; returns a DataFrame with the fitted column names and X's index."""
        Xt = super().transform(X)
        idx = X.index
        return pd.DataFrame(Xt, columns=self.column_names_, index=idx)

    def get_feature_names(self):
        """Return the list of column names recorded by `fit`."""
        return self.column_names_
    
class MySimpleImputer(SimpleImputer):
    """SimpleImputer variant that takes a DataFrame and returns a DataFrame.

    The imputed values are wrapped in a DataFrame carrying the column names
    seen during `fit` and the row index of the frame passed to `transform`.

    Parameters
    ----------
    missing_values, strategy, fill_value, copy, add_indicator
        Forwarded unchanged to `SimpleImputer`.
    verbose : int, default 0
        Kept for signature compatibility; stored on the instance but not
        forwarded to the parent constructor.

    Notes
    -----
    `transform` labels the output with the fitted column names, so it assumes
    the parent returns one output column per input column.
    NOTE(review): settings that change the column count (e.g.
    add_indicator=True) would presumably fail here - confirm before using them.
    """

    def __init__(self, missing_values=np.nan, strategy="mean",
                 fill_value=None, verbose=0, copy=True, add_indicator=False):
        # Forward by keyword: positional forwarding fails on scikit-learn
        # releases where the parent's parameters are keyword-only.
        super().__init__(missing_values=missing_values, strategy=strategy,
                         fill_value=fill_value, copy=copy,
                         add_indicator=add_indicator)
        # `verbose` is not accepted by every SimpleImputer release, so it is
        # recorded as a plain attribute (get_params/clone still see it).
        self.verbose = verbose

    def fit(self, X, y=None):
        """Fit the imputer on DataFrame `X` and remember its column names."""
        super().fit(X)
        self.column_names_ = X.columns.tolist()
        return self

    def transform(self, X):
        """Impute `X`; returns a DataFrame with the fitted column names and X's index."""
        Xt = super().transform(X)
        idx = X.index
        return pd.DataFrame(Xt, columns=self.column_names_, index=idx)

    def get_feature_names(self):
        """Return the list of column names recorded by `fit`."""
        return self.column_names_

In [89]:
# pipeline: four preprocessing branches joined side by side, then the classifier


# numeric preprocessing: select feature -> impute missing value
num_prep = Pipeline(steps=[
    ('selector', FeatureSelector(num_cols)),
    ('imputer', MySimpleImputer(strategy='median'))   # nan in TotalCharges column
])

# binary preprocessing: select feature -> ordinal encoding
bin_prep = Pipeline(steps=[
    ('selector', FeatureSelector(bin_cols)),
    ('encoder', OrdinalEncoderDf()),
])

# nominal preprocessing: select feature -> onehot encoding
# handle_unknown='ignore': a category that was absent when the encoder was
# fitted (possible inside a CV fold or on the test set) is encoded as all
# zeros instead of raising at transform time.
cat_prep = Pipeline(steps=[
    ('selector', FeatureSelector(cat_cols)),
    ('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore'))
])

# numeric polytransform: select -> power transform -> impute -> degree-2 interaction terms
poly_prep = Pipeline(steps=[
    ('selector', FeatureSelector(num_cols)),
    ('yeo', PowerTransformer()),
    ('impute', SimpleImputer()),
    ('poly', PolynomialFeatures(degree=2, interaction_only=True))
])

# all preprocessing: outputs of the four branches are concatenated column-wise
all_prep = FeatureUnion(transformer_list=[
    ("num_prep", num_prep),
    ("bin_prep", bin_prep),
    ("cat_prep", cat_prep),
    ("poly_prep", poly_prep)
], n_jobs=4)

# full pipeline: preprocessing followed by the gradient boosting classifier
full_pipe = Pipeline(steps=[
    ("prep", all_prep),
    ("clf", gb)
])

In [70]:
# Score the numeric branch only. The pipeline is built here explicitly so the
# printed label matches what is evaluated on a fresh Restart & Run All
# (full_pipe contains every preprocessing branch).
numeric_pipe = Pipeline(steps=[
    ("prep", FeatureUnion(transformer_list=[
        ("num_prep", num_prep),
    ])),
    ("clf", gb)
])
get_cv_score(numeric_pipe, X_train, y_train, cv=k10, scoring='accuracy',
             text="[numeric] + gb")

CV-score: [numeric] + gb
- mean = 0.791, std = 0.009


In [73]:
# Score the numeric + binary branches only. The pipeline is built here
# explicitly so the printed label matches what is evaluated on a fresh
# Restart & Run All (full_pipe contains every preprocessing branch).
numeric_binary_pipe = Pipeline(steps=[
    ("prep", FeatureUnion(transformer_list=[
        ("num_prep", num_prep),
        ("bin_prep", bin_prep),
    ])),
    ("clf", gb)
])
get_cv_score(numeric_binary_pipe, X_train, y_train, cv=k10, scoring='accuracy',
             text="[numeric, binary] + gb")

CV-score: [numeric, binary] + gb
- mean = 0.794, std = 0.012


In [77]:
# Score the numeric + binary + one-hot branches (no polynomial branch). The
# pipeline is built here explicitly so the printed label matches what is
# evaluated on a fresh Restart & Run All (full_pipe also contains poly_prep).
numeric_binary_onehot_pipe = Pipeline(steps=[
    ("prep", FeatureUnion(transformer_list=[
        ("num_prep", num_prep),
        ("bin_prep", bin_prep),
        ("cat_prep", cat_prep),
    ])),
    ("clf", gb)
])
get_cv_score(numeric_binary_onehot_pipe, X_train, y_train, cv=k10, scoring='accuracy',
             text="[numeric, binary, cat_onehot] + gb")

CV-score: [numeric, binary, cat_onehot] + gb
- mean = 0.806, std = 0.011


In [90]:
# Score the complete pipeline: full_pipe as defined above (all preprocessing
# branches feeding gb), 10-fold CV accuracy on the training split.
get_cv_score(full_pipe, X_train, y_train, cv=k10, scoring='accuracy', 
             text="[numeric, binary, cat_onehot, poly] + gb")

CV-score: [numeric, binary, cat_onehot, poly] + gb
- mean = 0.803, std = 0.009
