In [2]:
import pandas as pd
import numpy as np
pd.options.display.max_columns = None
pd.options.display.max_rows = 50

In [3]:
# імопртвання навчальної вибірки
df = pd.read_csv("./raw_data/churn_sample.csv")

# вивід перших 5-и обьектів
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [22]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline,make_pipeline, FeatureUnion
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split,RandomizedSearchCV,StratifiedKFold
from sklearn.metrics import accuracy_score,roc_auc_score,classification_report,confusion_matrix

In [13]:
cat_features = [
    'gender', 'SeniorCitizen', 'Partner', 'Dependents',
     'PhoneService', 'MultipleLines', 'InternetService',
     'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
     'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling','PaymentMethod'
]
num_features = [
    'tenure',
    'MonthlyCharges',
    'TotalCharges'
]

In [14]:
df.head()
df = df.replace(" ", np.nan)

In [15]:
X = df.drop(['customerID','Churn'],1)
y = df['Churn'].map({'Yes':1,'No':0})

In [16]:
class MultiColumnLabelEncoder(BaseEstimator,TransformerMixin):
    def __init__(self,columns = None):
        self.columns = columns # array of column names to encode

    def fit(self,X,y=None):
        return self # not relevant here

    def transform(self,X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X.
        '''
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            for colname,col in output.iteritems():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self,X,y=None):
        return self.fit(X,y).transform(X)
    
class VariableSelector(BaseEstimator, TransformerMixin):
    def __init__(self, names=None):
        self.names = names

    def fit(self, X, y=None, **fit_params):
        return self

    def transform(self, X):
        return X[self.names]

In [54]:
pipe = Pipeline([
    ("features", FeatureUnion([
        ('categorical', make_pipeline(VariableSelector(names = cat_features), MultiColumnLabelEncoder())),
        ('numeric', make_pipeline(VariableSelector(names = num_features), SimpleImputer(), StandardScaler()))    
    ])
    )
]
)

In [55]:
X_t = pipe.fit(X).transform(X)

In [56]:
X_train, X_test, y_train, y_test = train_test_split(X_t ,y, test_size = .20 ,random_state = 111,stratify = y)

In [57]:
cv = StratifiedKFold(n_splits=3,shuffle=True,random_state=111)

In [71]:
params = {
    'n_estimators':[100,250,500],
    'max_depth':[2,5,7,10,12],
    'max_features':['auto','sqrt'],
    'min_samples_split':[2,5,10],
    'min_samples_leaf':[1,2,5,10,12],
    'class_weight':['balanced',None]
}

grid = RandomizedSearchCV(RandomForestClassifier(), params, scoring='f1', cv=cv, verbose=1).fit(X_train,y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   23.0s finished


In [72]:
def validate(estimator,X_train,y_train,X_test, y_test):
    model = estimator
    model.fit(X_train, y_train)
    y_hat_proba = model.predict_proba(X_test)[:,1] > 0.5
    y_hat_train = model.predict(X_train)
    accuracy_score_train = accuracy_score(y_train, y_hat_train)
    accuracy_score_test =  accuracy_score(y_test, y_hat_proba)
    roc_auc_train,roc_auc_test = roc_auc_score(y_train,y_hat_train), roc_auc_score(y_test, y_hat_proba)
    print("---------------------- TRAINING REPORT -------------------------\n")
    print(f'Accuracy score: train sample={accuracy_score_train:.3f} vs test sample={accuracy_score_test:.3f}')
    print(f'ROC_AUC score: train sample={roc_auc_train:.3f} vs test sample={roc_auc_test:.3f}')
    print("\n==================== Classification report ====================")
    print(classification_report(y_test,y_hat_proba))
    print("\n==================== Confusion matrix =========================")
    print(confusion_matrix(y_test,y_hat_proba))

In [73]:
grid.best_params_

{'n_estimators': 500,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 10,
 'class_weight': 'balanced'}

In [74]:
validate(grid.best_estimator_, X_train ,y_train, X_test,y_test)

---------------------- TRAINING REPORT -------------------------

Accuracy score: train sample=0.863 vs test sample=0.781
ROC_AUC score: train sample=0.874 vs test sample=0.754

              precision    recall  f1-score   support

           0       0.88      0.81      0.85      1035
           1       0.57      0.70      0.63       374

    accuracy                           0.78      1409
   macro avg       0.73      0.75      0.74      1409
weighted avg       0.80      0.78      0.79      1409


[[841 194]
 [114 260]]


In [77]:
pipe = Pipeline([
    ("features", FeatureUnion([
        ('categorical', make_pipeline(VariableSelector(names = cat_features), MultiColumnLabelEncoder())),
        ('numeric', make_pipeline(VariableSelector(names = num_features), SimpleImputer(strategy='mean'), StandardScaler()))    
    ])
    ),
    ('prediction', grid.best_estimator_)
]
)

In [78]:
pipe.fit(X,y)

Pipeline(memory=None,
         steps=[('features',
                 FeatureUnion(n_jobs=None,
                              transformer_list=[('categorical',
                                                 Pipeline(memory=None,
                                                          steps=[('variableselector',
                                                                  VariableSelector(names=['gender',
                                                                                          'SeniorCitizen',
                                                                                          'Partner',
                                                                                          'Dependents',
                                                                                          'PhoneService',
                                                                                          'MultipleLines',
                                                            