In [173]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import itertools

import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)

In [174]:
df = pd.read_csv('churn_data.csv')
df.head(3)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1


In [175]:
df['Exited'].value_counts()

0    7963
1    2037
Name: Exited, dtype: int64

In [176]:
# разделим данные на train/test
X_train, X_test, y_train, y_test = train_test_split(df, df['Exited'], random_state=0)

In [177]:
#соберем наш простой pipeline, но нам понадобится написать класс для выбора нужного поля
class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X[self.column]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        self.columns = [col for col in pd.get_dummies(X, prefix=self.key).columns]
        return self

    def transform(self, X):
        X = pd.get_dummies(X, prefix=self.key)
        test_columns = [col for col in X.columns]
        for col_ in self.columns:
            if col_ not in test_columns:
                X[col_] = 0
        return X[self.columns]

In [178]:
# Зададим списки признаков
categorical_columns = ['Geography', 'Gender', 'Tenure', 'HasCrCard', 'IsActiveMember']
continuous_columns = ['CreditScore', 'Age', 'Balance', 'NumOfProducts', 'EstimatedSalary']

In [179]:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

gender = Pipeline([
                ('selector', FeatureSelector(column='Gender')),
                ('ohe', OHEEncoder(key='Gender'))
            ])
gender.fit(X_train)
gender.transform(X_test).head(3)

Unnamed: 0,Gender_Female,Gender_Male
9394,1,0
898,1,0
2398,1,0


In [180]:
# Теперь нам нужно под каждый признак создать трансформер и объединить их в список 
# (сделаем это в цикле, чтобы не мучиться)

from sklearn.preprocessing import StandardScaler

final_transformers = list()

for cat_col in categorical_columns:
    cat_transformer = Pipeline([
                ('selector', FeatureSelector(column=cat_col)),
                ('ohe', OHEEncoder(key=cat_col))
            ])
    
    final_transformers.append((cat_col, cat_transformer))
    
for cont_col in continuous_columns:
    cont_transformer = Pipeline([
                ('selector', NumberSelector(key=cont_col)),
                ('scaler', StandardScaler())
            ])
    
    final_transformers.append((cont_col, cont_transformer))


In [181]:
# Объединим все это в единый пайплайн
feats = FeatureUnion(final_transformers)

feature_processing = Pipeline([('feats', feats)])

In [182]:
from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix

In [183]:
# Создаем пустой датафрейм для хранения результатов
results_df = pd.DataFrame(columns=['Model', 'Thresholds', 'Precision', 'Recall', 'F1-score', 'roc_auc'])

### бустинг

In [184]:
# Добавим модель
from xgboost import XGBClassifier

pipeline_xgb = Pipeline([
    ('features', feats),
    ('classifier', XGBClassifier(random_state=42)),
])

In [185]:
# обучим наш пайплайн
pipeline_xgb.fit(X_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('Geography',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Geography')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Geography'))])),
                                                ('Gender',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Gender')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Gender'))])),
                                                ('Tenure',
                                                 Pipeline(steps=[('selector',
           

In [186]:
# прогнозы для тестовой выборки
preds = pipeline_xgb.predict_proba(X_test)[:, 1]
preds[:10]

array([0.47611395, 0.23087966, 0.07372608, 0.03141731, 0.02561874,
       0.9679959 , 0.06951059, 0.2885457 , 0.11367497, 0.4902508 ],
      dtype=float32)

In [187]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)

fscore = (2 * precision * recall) / (precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))

Best Threshold=0.373318, F-Score=0.626, Precision=0.619, Recall=0.633


In [188]:
roc_auc = roc_auc_score(y_test, preds)
roc_auc

0.8609212971140268

In [189]:
results_df = results_df.append({'Model': 'XGBClassifier', 'Thresholds': thresholds[ix],
                                'Precision': precision[ix], 'Recall': recall[ix], 'F1-score': fscore[ix],
                                'roc_auc':roc_auc}, ignore_index=True)

  results_df = results_df.append({'Model': 'XGBClassifier', 'Thresholds': thresholds[ix],


In [190]:
# Добавим модель
from lightgbm import LGBMClassifier


pipeline_lgbm = Pipeline([
    ('features', feats),
    ('classifier', LGBMClassifier(random_state=42)),
])
# обучим наш пайплайн
pipeline_lgbm.fit(X_train, y_train)
# прогнозы для тестовой выборки
preds = pipeline_lgbm.predict_proba(X_test)[:, 1]
preds[:10]

array([0.43369048, 0.24735086, 0.1632904 , 0.04189257, 0.05688283,
       0.92200055, 0.03408225, 0.09064776, 0.10556057, 0.69556762])

In [191]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)

fscore = (2 * precision * recall) / (precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))
roc_auc = roc_auc_score(y_test, preds)
results_df = results_df.append({'Model': 'LGBMClassifier', 'Thresholds': thresholds[ix],
                                'Precision': precision[ix], 'Recall': recall[ix], 'F1-score': fscore[ix],
                                'roc_auc':roc_auc}, ignore_index=True)

Best Threshold=0.371377, F-Score=0.653, Precision=0.661, Recall=0.646


  results_df = results_df.append({'Model': 'LGBMClassifier', 'Thresholds': thresholds[ix],


In [192]:
# Добавим модель
from catboost import CatBoostClassifier


pipeline_cat = Pipeline([
    ('features', feats),
    ('classifier', CatBoostClassifier(random_state=42)),
])
# обучим наш пайплайн
pipeline_cat.fit(X_train, y_train)
# прогнозы для тестовой выборки
preds = pipeline_cat.predict_proba(X_test)[:, 1]

Learning rate set to 0.024355
0:	learn: 0.6726374	total: 3.65ms	remaining: 3.65s
1:	learn: 0.6548174	total: 5.47ms	remaining: 2.73s
2:	learn: 0.6405685	total: 7.17ms	remaining: 2.38s
3:	learn: 0.6237743	total: 9.51ms	remaining: 2.37s
4:	learn: 0.6080844	total: 11.8ms	remaining: 2.35s
5:	learn: 0.5929243	total: 14.5ms	remaining: 2.4s
6:	learn: 0.5787577	total: 17.1ms	remaining: 2.42s
7:	learn: 0.5659958	total: 19.3ms	remaining: 2.4s
8:	learn: 0.5556154	total: 21.3ms	remaining: 2.35s
9:	learn: 0.5441443	total: 23.4ms	remaining: 2.31s
10:	learn: 0.5348181	total: 25.6ms	remaining: 2.3s
11:	learn: 0.5257042	total: 27.1ms	remaining: 2.23s
12:	learn: 0.5169250	total: 29.3ms	remaining: 2.23s
13:	learn: 0.5074365	total: 31.5ms	remaining: 2.22s
14:	learn: 0.4985522	total: 33.3ms	remaining: 2.18s
15:	learn: 0.4914367	total: 35.6ms	remaining: 2.19s
16:	learn: 0.4826085	total: 37.9ms	remaining: 2.19s
17:	learn: 0.4757232	total: 40.1ms	remaining: 2.19s
18:	learn: 0.4680217	total: 42.5ms	remaining: 2

165:	learn: 0.3191350	total: 396ms	remaining: 1.99s
166:	learn: 0.3188395	total: 398ms	remaining: 1.99s
167:	learn: 0.3187127	total: 401ms	remaining: 1.99s
168:	learn: 0.3185197	total: 403ms	remaining: 1.98s
169:	learn: 0.3182822	total: 406ms	remaining: 1.98s
170:	learn: 0.3181452	total: 410ms	remaining: 1.99s
171:	learn: 0.3180427	total: 413ms	remaining: 1.99s
172:	learn: 0.3178044	total: 415ms	remaining: 1.99s
173:	learn: 0.3176387	total: 418ms	remaining: 1.98s
174:	learn: 0.3174666	total: 420ms	remaining: 1.98s
175:	learn: 0.3173161	total: 422ms	remaining: 1.98s
176:	learn: 0.3170997	total: 425ms	remaining: 1.97s
177:	learn: 0.3170085	total: 427ms	remaining: 1.97s
178:	learn: 0.3167982	total: 430ms	remaining: 1.97s
179:	learn: 0.3165682	total: 432ms	remaining: 1.97s
180:	learn: 0.3164511	total: 435ms	remaining: 1.97s
181:	learn: 0.3162469	total: 438ms	remaining: 1.97s
182:	learn: 0.3160913	total: 441ms	remaining: 1.97s
183:	learn: 0.3159072	total: 444ms	remaining: 1.97s
184:	learn: 

402:	learn: 0.2879262	total: 990ms	remaining: 1.47s
403:	learn: 0.2878334	total: 992ms	remaining: 1.46s
404:	learn: 0.2877128	total: 994ms	remaining: 1.46s
405:	learn: 0.2875663	total: 997ms	remaining: 1.46s
406:	learn: 0.2874436	total: 999ms	remaining: 1.46s
407:	learn: 0.2873507	total: 1s	remaining: 1.45s
408:	learn: 0.2871910	total: 1s	remaining: 1.45s
409:	learn: 0.2870917	total: 1.01s	remaining: 1.45s
410:	learn: 0.2869697	total: 1.01s	remaining: 1.45s
411:	learn: 0.2868853	total: 1.01s	remaining: 1.44s
412:	learn: 0.2867556	total: 1.01s	remaining: 1.44s
413:	learn: 0.2865891	total: 1.02s	remaining: 1.44s
414:	learn: 0.2864611	total: 1.02s	remaining: 1.44s
415:	learn: 0.2863785	total: 1.02s	remaining: 1.44s
416:	learn: 0.2862088	total: 1.02s	remaining: 1.43s
417:	learn: 0.2860982	total: 1.03s	remaining: 1.43s
418:	learn: 0.2859766	total: 1.03s	remaining: 1.43s
419:	learn: 0.2857909	total: 1.03s	remaining: 1.43s
420:	learn: 0.2856790	total: 1.03s	remaining: 1.42s
421:	learn: 0.2855

636:	learn: 0.2622173	total: 1.59s	remaining: 906ms
637:	learn: 0.2620754	total: 1.59s	remaining: 904ms
638:	learn: 0.2619709	total: 1.59s	remaining: 902ms
639:	learn: 0.2618826	total: 1.6s	remaining: 899ms
640:	learn: 0.2617855	total: 1.6s	remaining: 896ms
641:	learn: 0.2616932	total: 1.6s	remaining: 894ms
642:	learn: 0.2615980	total: 1.6s	remaining: 891ms
643:	learn: 0.2614714	total: 1.61s	remaining: 889ms
644:	learn: 0.2613800	total: 1.61s	remaining: 886ms
645:	learn: 0.2613212	total: 1.61s	remaining: 884ms
646:	learn: 0.2612142	total: 1.62s	remaining: 882ms
647:	learn: 0.2611173	total: 1.62s	remaining: 880ms
648:	learn: 0.2609928	total: 1.62s	remaining: 877ms
649:	learn: 0.2608781	total: 1.62s	remaining: 875ms
650:	learn: 0.2607816	total: 1.63s	remaining: 872ms
651:	learn: 0.2607248	total: 1.63s	remaining: 869ms
652:	learn: 0.2606345	total: 1.63s	remaining: 867ms
653:	learn: 0.2605217	total: 1.63s	remaining: 864ms
654:	learn: 0.2603684	total: 1.64s	remaining: 862ms
655:	learn: 0.26

869:	learn: 0.2413696	total: 2.19s	remaining: 328ms
870:	learn: 0.2412852	total: 2.2s	remaining: 326ms
871:	learn: 0.2412021	total: 2.2s	remaining: 323ms
872:	learn: 0.2411157	total: 2.2s	remaining: 321ms
873:	learn: 0.2410512	total: 2.21s	remaining: 318ms
874:	learn: 0.2409808	total: 2.21s	remaining: 315ms
875:	learn: 0.2408490	total: 2.21s	remaining: 313ms
876:	learn: 0.2407881	total: 2.21s	remaining: 310ms
877:	learn: 0.2406894	total: 2.21s	remaining: 308ms
878:	learn: 0.2406367	total: 2.22s	remaining: 305ms
879:	learn: 0.2405835	total: 2.22s	remaining: 303ms
880:	learn: 0.2404916	total: 2.22s	remaining: 300ms
881:	learn: 0.2404056	total: 2.22s	remaining: 298ms
882:	learn: 0.2403417	total: 2.23s	remaining: 295ms
883:	learn: 0.2402392	total: 2.23s	remaining: 293ms
884:	learn: 0.2401893	total: 2.23s	remaining: 290ms
885:	learn: 0.2401013	total: 2.24s	remaining: 288ms
886:	learn: 0.2400366	total: 2.24s	remaining: 285ms
887:	learn: 0.2399462	total: 2.24s	remaining: 283ms
888:	learn: 0.2

In [193]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)

fscore = (2 * precision * recall) / (precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))
roc_auc = roc_auc_score(y_test, preds)
results_df = results_df.append({'Model': 'CatBoostClassifier', 'Thresholds': thresholds[ix],
                                'Precision': precision[ix], 'Recall': recall[ix], 'F1-score': fscore[ix],
                                'roc_auc':roc_auc}, ignore_index=True)

Best Threshold=0.384476, F-Score=0.645, Precision=0.661, Recall=0.629


  results_df = results_df.append({'Model': 'CatBoostClassifier', 'Thresholds': thresholds[ix],


### Логистическая регрессия

In [194]:
model_lr = Pipeline([
    ('features', feats),
    ('classifier', LogisticRegression(random_state=42)),
])
# Добавим модель

pipeline_lr = Pipeline([
    ('features', feats),
    ('classifier', LogisticRegression(random_state=42)),
])
# обучим наш пайплайн
pipeline_lr.fit(X_train, y_train)
# прогнозы для тестовой выборки
preds = pipeline_lr.predict_proba(X_test)[:, 1]



In [195]:
if np.isin(1, y_test):
    precision, recall, thresholds = precision_recall_curve(y_test, preds)
    fscore = np.zeros_like(precision)
    mask = (precision + recall) > 0
    fscore[mask] = (2 * precision[mask] * recall[mask]) / (precision[mask] + recall[mask])
    # locate the index of the largest f score
    ix = np.argmax(fscore)
    print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                            fscore[ix],
                                                                            precision[ix],
                                                                            recall[ix]))
    roc_auc = roc_auc_score(y_test, preds)
    new_row = pd.DataFrame({'Model': ['LogisticRegression'],
                            'Thresholds': [thresholds[ix]],
                            'Precision': [precision[ix]],
                            'Recall': [recall[ix]],
                            'F1-score': [fscore[ix]],
                            'roc_auc': [roc_auc]})
    results_df = pd.concat([results_df, new_row], ignore_index=True)
else:
    print('No positive samples in the test data')


Best Threshold=0.289522, F-Score=0.510, Precision=0.462, Recall=0.568


In [196]:
results_df

Unnamed: 0,Model,Thresholds,Precision,Recall,F1-score,roc_auc
0,XGBClassifier,0.373318,0.619231,0.632613,0.62585,0.860921
1,LGBMClassifier,0.371377,0.660643,0.646365,0.653426,0.870576
2,CatBoostClassifier,0.384476,0.661157,0.628684,0.644512,0.876613
3,LogisticRegression,0.289522,0.4624,0.56778,0.5097,0.772077


### Выводы

#### F1-score выше у модели LGBMClassifier, что при практически лучшим значением roc_auc(0.870576), делает  модель LGBMClassifier лучшей


Когда важным является баланс между точностью и полнотой предсказаний, поэтому F1-score более подходящая метрика. 

In [197]:
preds = pipeline_lgbm.predict_proba(X_test)[:, 1]

precision, recall, thresholds = precision_recall_curve(y_test, preds)
fscore = (2 * precision * recall) / (precision + recall)
ix = np.argmax(fscore)

cnf_matrix = confusion_matrix(y_test, preds > thresholds[ix])
save = (cnf_matrix[0][1] + cnf_matrix[1][1]) * 1
income = cnf_matrix[1][1] * 2

income - save

159

In [198]:
from sklearn.model_selection import GridSearchCV

params = {
    'classifier__max_features': [0.3, 0.5, 0.7],
    'classifier__min_samples_leaf': [1, 15, 30, 50],
    'classifier__n_estimators': [50, 100, 150, 300]
}

In [199]:
%%time
grid = GridSearchCV(pipeline_lgbm,
                    param_grid=params,
                    cv=5,
                    scoring='recall',
                    refit=False)

search = grid.fit(X_train, y_train)
search.best_params_











CPU times: user 27min 39s, sys: 8min 21s, total: 36min 1s
Wall time: 8min 37s


{'classifier__max_features': 0.3,
 'classifier__min_samples_leaf': 1,
 'classifier__n_estimators': 100}

In [200]:
pipeline_lgbm = Pipeline([
    ('features', feats),
    ('classifier', LGBMClassifier(n_estimators=100, min_samples_leaf=1, max_features=0.3, random_state=42)),
])
# обучим наш пайплайн
pipeline_lgbm.fit(X_train, y_train)
# прогнозы для тестовой выборки
preds = pipeline_lgbm.predict_proba(X_test)[:, 1]
preds[:10]



array([0.34635332, 0.2700885 , 0.15793721, 0.14126463, 0.02184439,
       0.92602278, 0.01784738, 0.08355752, 0.10653037, 0.84747103])

In [201]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)

fscore = (2 * precision * recall) / (precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))
roc_auc = roc_auc_score(y_test, preds)
results_df = results_df.append({'Model': 'LGBMClassifier New', 'Thresholds': thresholds[ix],
                                'Precision': precision[ix], 'Recall': recall[ix], 'F1-score': fscore[ix],
                                'roc_auc':roc_auc}, ignore_index=True)

Best Threshold=0.370187, F-Score=0.643, Precision=0.637, Recall=0.648


  results_df = results_df.append({'Model': 'LGBMClassifier New', 'Thresholds': thresholds[ix],


In [202]:
results_df

Unnamed: 0,Model,Thresholds,Precision,Recall,F1-score,roc_auc
0,XGBClassifier,0.373318,0.619231,0.632613,0.62585,0.860921
1,LGBMClassifier,0.371377,0.660643,0.646365,0.653426,0.870576
2,CatBoostClassifier,0.384476,0.661157,0.628684,0.644512,0.876613
3,LogisticRegression,0.289522,0.4624,0.56778,0.5097,0.772077
4,LGBMClassifier New,0.370187,0.637066,0.64833,0.642648,0.871427


In [203]:
preds = pipeline_lgbm.predict_proba(X_test)[:, 1]

precision, recall, thresholds = precision_recall_curve(y_test, preds)
fscore = (2 * precision * recall) / (precision + recall)
ix = np.argmax(fscore)

cnf_matrix = confusion_matrix(y_test, preds > thresholds[ix])
save = (cnf_matrix[0][1] + cnf_matrix[1][1]) * 1
income = cnf_matrix[1][1] * 2

income - save

141