### 1. Для нашего пайплайна (Case1) поэкспериментировать с разными моделями: 1 - бустинг, 2 - логистическая регрессия (не забудьте здесь добавить в cont_transformer стандартизацию - нормирование вещественных признаков)

In [61]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
#from sklearn.feature_extraction.text import TfidfVectorizer
import itertools

import matplotlib.pyplot as plt

%matplotlib inline

In [62]:
df = pd.read_csv("./Churn_Modelling.csv")
df.head(3)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1


In [3]:
# Распределение классов
df['Exited'].value_counts()

0    7963
1    2037
Name: Exited, dtype: int64

x, x_test, y, y_test = train_test_split(xtrain,labels,test_size=0.2,train_size=0.8)
x_train, x_cv, y_train, y_cv = train_test_split(x,y,test_size = 0.25,train_size =0.75)


In [63]:
#разделим данные на train/test
X_train, X_test, y_train, y_test = train_test_split(df, df['Exited'], test_size = 0.3, random_state=42)

    -Категориальные признаки закодируем с помощью OneHotEncoding
    -Вещественные оставим пока как есть


In [5]:
#соберем наш простой pipeline, но нам понадобится написать класс для выбора нужного поля
class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X[self.column]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        self.columns = [col for col in pd.get_dummies(X, prefix=self.key).columns]
        return self

    def transform(self, X):
        X = pd.get_dummies(X, prefix=self.key)
        test_columns = [col for col in X.columns]
        for col_ in self.columns:
            if col_ not in test_columns:
                X[col_] = 0
        return X[self.columns]

In [6]:
df.head(3)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1


In [7]:
categorical_columns = ['Geography', 'Gender', 'Tenure', 'HasCrCard', 'IsActiveMember']
continuous_columns = ['CreditScore', 'Age', 'Balance', 'NumOfProducts', 'EstimatedSalary']

In [8]:
# пример для одного признака
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

gender = Pipeline([
                ('selector', FeatureSelector(column='Gender')),
                ('ohe', OHEEncoder(key='Gender'))
            ])
gender.fit(X_train)
gender.transform(X_test).head(3)

Unnamed: 0,Gender_Female,Gender_Male
6252,0,1
4684,0,1
1731,1,0


In [9]:
# Теперь нам нужно под каждый признак создать трансформер и объединить их в список
# (сделаем это в цикле, чтобы не мучиться)

final_transformers = list()

for cat_col in categorical_columns:
    cat_transformer = Pipeline([
                ('selector', FeatureSelector(column=cat_col)),
                ('ohe', OHEEncoder(key=cat_col))
            ])
    final_transformers.append((cat_col, cat_transformer))
    
for cont_col in continuous_columns:
    cont_transformer = Pipeline([
                ('selector', NumberSelector(key=cont_col))
            ])
    final_transformers.append((cont_col, cont_transformer))

In [10]:
# Объединим все это в единый пайплайн

feats = FeatureUnion(final_transformers)

feature_processing = Pipeline([('feats', feats)])

In [11]:
# Теперь у нас есть пайплайн, который готовит признаки для моделирования.
# Добавим модель

from sklearn.ensemble import RandomForestClassifier

pipeline = Pipeline([
    ('features',feats),
    ('classifier', RandomForestClassifier(random_state = 42)),
])

In [12]:
#обучим наш пайплайн
pipeline.fit(X_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('Geography',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Geography')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Geography'))])),
                                                ('Gender',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Gender')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Gender'))])),
                                                ('Tenure',
                                                 Pipeline(steps=[('selector',
           

In [13]:
#наши прогнозы для тестовой выборки
preds = pipeline.predict_proba(X_test)[:, 1]
preds[:10]

array([0.07, 0.02, 0.27, 0.15, 0.12, 0.01, 0.11, 0.34, 0.42, 0.32])

Также нам нужно от вероятностей перейти к меткам классов. Для этого нужно подобрать порог, после которого мы считаем, что объект можно отнести к классу 1 (если вероятность больше порога - размечаем объект как класс 1, если нет - класс 0)

In [14]:
from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix

In [15]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)

fscore = (2 * precision * recall) / (precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))
thresholds_rf = thresholds[ix]
fscore_rf = fscore[ix]
precision_rf = precision[ix]
recall_rf = recall[ix]

Best Threshold=0.410000, F-Score=0.611, Precision=0.681, Recall=0.553


### 1 - бустинг,

In [16]:
from catboost import CatBoostClassifier


In [17]:
pipeline_cb = Pipeline([
    ('features',feats),
    ('classifier', CatBoostClassifier(random_state = 42)),
])

In [18]:
#обучим наш пайплайн
pipeline_cb.fit(X_train, y_train)

Learning rate set to 0.023648
0:	learn: 0.6740742	total: 102ms	remaining: 1m 41s
1:	learn: 0.6565105	total: 116ms	remaining: 57.8s
2:	learn: 0.6425377	total: 130ms	remaining: 43.2s
3:	learn: 0.6262563	total: 140ms	remaining: 34.8s
4:	learn: 0.6138627	total: 153ms	remaining: 30.4s
5:	learn: 0.5980912	total: 173ms	remaining: 28.6s
6:	learn: 0.5837919	total: 195ms	remaining: 27.6s
7:	learn: 0.5720954	total: 213ms	remaining: 26.4s
8:	learn: 0.5603974	total: 222ms	remaining: 24.5s
9:	learn: 0.5486632	total: 234ms	remaining: 23.1s
10:	learn: 0.5375073	total: 243ms	remaining: 21.9s
11:	learn: 0.5267146	total: 254ms	remaining: 20.9s
12:	learn: 0.5178099	total: 273ms	remaining: 20.7s
13:	learn: 0.5099433	total: 291ms	remaining: 20.5s
14:	learn: 0.5033366	total: 308ms	remaining: 20.2s
15:	learn: 0.4952356	total: 327ms	remaining: 20.1s
16:	learn: 0.4885766	total: 335ms	remaining: 19.4s
17:	learn: 0.4824193	total: 355ms	remaining: 19.4s
18:	learn: 0.4754619	total: 374ms	remaining: 19.3s
19:	learn:

179:	learn: 0.3187932	total: 2.04s	remaining: 9.3s
180:	learn: 0.3186612	total: 2.06s	remaining: 9.32s
181:	learn: 0.3184974	total: 2.07s	remaining: 9.3s
182:	learn: 0.3183030	total: 2.08s	remaining: 9.3s
183:	learn: 0.3182108	total: 2.09s	remaining: 9.28s
184:	learn: 0.3180004	total: 2.11s	remaining: 9.29s
185:	learn: 0.3179204	total: 2.12s	remaining: 9.27s
186:	learn: 0.3177868	total: 2.14s	remaining: 9.29s
187:	learn: 0.3176496	total: 2.15s	remaining: 9.3s
188:	learn: 0.3175065	total: 2.16s	remaining: 9.28s
189:	learn: 0.3173839	total: 2.19s	remaining: 9.32s
190:	learn: 0.3172320	total: 2.21s	remaining: 9.35s
191:	learn: 0.3169850	total: 2.22s	remaining: 9.33s
192:	learn: 0.3168190	total: 2.22s	remaining: 9.3s
193:	learn: 0.3167294	total: 2.23s	remaining: 9.27s
194:	learn: 0.3165902	total: 2.24s	remaining: 9.24s
195:	learn: 0.3164434	total: 2.25s	remaining: 9.22s
196:	learn: 0.3161603	total: 2.25s	remaining: 9.19s
197:	learn: 0.3159913	total: 2.26s	remaining: 9.16s
198:	learn: 0.315

342:	learn: 0.2961868	total: 4.07s	remaining: 7.8s
343:	learn: 0.2961740	total: 4.08s	remaining: 7.79s
344:	learn: 0.2959884	total: 4.1s	remaining: 7.78s
345:	learn: 0.2958077	total: 4.11s	remaining: 7.77s
346:	learn: 0.2956645	total: 4.12s	remaining: 7.76s
347:	learn: 0.2955566	total: 4.14s	remaining: 7.75s
348:	learn: 0.2954425	total: 4.16s	remaining: 7.75s
349:	learn: 0.2952151	total: 4.17s	remaining: 7.74s
350:	learn: 0.2950993	total: 4.18s	remaining: 7.73s
351:	learn: 0.2949016	total: 4.19s	remaining: 7.71s
352:	learn: 0.2947458	total: 4.2s	remaining: 7.7s
353:	learn: 0.2945367	total: 4.21s	remaining: 7.68s
354:	learn: 0.2943891	total: 4.22s	remaining: 7.67s
355:	learn: 0.2942867	total: 4.23s	remaining: 7.65s
356:	learn: 0.2942025	total: 4.24s	remaining: 7.63s
357:	learn: 0.2941513	total: 4.25s	remaining: 7.61s
358:	learn: 0.2940537	total: 4.26s	remaining: 7.61s
359:	learn: 0.2939753	total: 4.27s	remaining: 7.6s
360:	learn: 0.2938840	total: 4.28s	remaining: 7.58s
361:	learn: 0.293

502:	learn: 0.2774787	total: 7.07s	remaining: 6.99s
503:	learn: 0.2773804	total: 7.09s	remaining: 6.98s
504:	learn: 0.2772033	total: 7.12s	remaining: 6.98s
505:	learn: 0.2770880	total: 7.14s	remaining: 6.97s
506:	learn: 0.2769412	total: 7.16s	remaining: 6.96s
507:	learn: 0.2768183	total: 7.18s	remaining: 6.95s
508:	learn: 0.2767189	total: 7.23s	remaining: 6.97s
509:	learn: 0.2766343	total: 7.26s	remaining: 6.97s
510:	learn: 0.2765509	total: 7.29s	remaining: 6.98s
511:	learn: 0.2764891	total: 7.3s	remaining: 6.96s
512:	learn: 0.2763487	total: 7.31s	remaining: 6.94s
513:	learn: 0.2762068	total: 7.32s	remaining: 6.92s
514:	learn: 0.2760602	total: 7.33s	remaining: 6.9s
515:	learn: 0.2760151	total: 7.34s	remaining: 6.89s
516:	learn: 0.2758910	total: 7.35s	remaining: 6.87s
517:	learn: 0.2758115	total: 7.37s	remaining: 6.86s
518:	learn: 0.2756998	total: 7.38s	remaining: 6.84s
519:	learn: 0.2755809	total: 7.39s	remaining: 6.83s
520:	learn: 0.2755290	total: 7.41s	remaining: 6.81s
521:	learn: 0.

666:	learn: 0.2598723	total: 9.31s	remaining: 4.65s
667:	learn: 0.2597823	total: 9.33s	remaining: 4.64s
668:	learn: 0.2596899	total: 9.35s	remaining: 4.63s
669:	learn: 0.2595792	total: 9.38s	remaining: 4.62s
670:	learn: 0.2594716	total: 9.4s	remaining: 4.61s
671:	learn: 0.2594065	total: 9.43s	remaining: 4.6s
672:	learn: 0.2593125	total: 9.44s	remaining: 4.59s
673:	learn: 0.2592413	total: 9.45s	remaining: 4.57s
674:	learn: 0.2591341	total: 9.47s	remaining: 4.56s
675:	learn: 0.2589617	total: 9.48s	remaining: 4.55s
676:	learn: 0.2588583	total: 9.51s	remaining: 4.54s
677:	learn: 0.2587676	total: 9.52s	remaining: 4.52s
678:	learn: 0.2586979	total: 9.53s	remaining: 4.51s
679:	learn: 0.2586303	total: 9.54s	remaining: 4.49s
680:	learn: 0.2584891	total: 9.57s	remaining: 4.48s
681:	learn: 0.2583798	total: 9.58s	remaining: 4.47s
682:	learn: 0.2583015	total: 9.59s	remaining: 4.45s
683:	learn: 0.2582320	total: 9.6s	remaining: 4.43s
684:	learn: 0.2581581	total: 9.61s	remaining: 4.42s
685:	learn: 0.2

826:	learn: 0.2448054	total: 10.9s	remaining: 2.28s
827:	learn: 0.2447159	total: 10.9s	remaining: 2.27s
828:	learn: 0.2446533	total: 10.9s	remaining: 2.26s
829:	learn: 0.2445431	total: 11s	remaining: 2.24s
830:	learn: 0.2444887	total: 11s	remaining: 2.23s
831:	learn: 0.2443873	total: 11s	remaining: 2.21s
832:	learn: 0.2442964	total: 11s	remaining: 2.2s
833:	learn: 0.2442189	total: 11s	remaining: 2.19s
834:	learn: 0.2441549	total: 11s	remaining: 2.17s
835:	learn: 0.2440210	total: 11s	remaining: 2.16s
836:	learn: 0.2439188	total: 11s	remaining: 2.15s
837:	learn: 0.2438486	total: 11s	remaining: 2.13s
838:	learn: 0.2437578	total: 11s	remaining: 2.12s
839:	learn: 0.2437128	total: 11.1s	remaining: 2.1s
840:	learn: 0.2436606	total: 11.1s	remaining: 2.09s
841:	learn: 0.2435461	total: 11.1s	remaining: 2.08s
842:	learn: 0.2434846	total: 11.1s	remaining: 2.06s
843:	learn: 0.2434158	total: 11.1s	remaining: 2.05s
844:	learn: 0.2433507	total: 11.1s	remaining: 2.04s
845:	learn: 0.2432585	total: 11.1s

986:	learn: 0.2316977	total: 12.3s	remaining: 162ms
987:	learn: 0.2315963	total: 12.3s	remaining: 150ms
988:	learn: 0.2315191	total: 12.3s	remaining: 137ms
989:	learn: 0.2314065	total: 12.4s	remaining: 125ms
990:	learn: 0.2313393	total: 12.4s	remaining: 112ms
991:	learn: 0.2312391	total: 12.4s	remaining: 99.8ms
992:	learn: 0.2311793	total: 12.4s	remaining: 87.3ms
993:	learn: 0.2311225	total: 12.4s	remaining: 74.8ms
994:	learn: 0.2310431	total: 12.4s	remaining: 62.3ms
995:	learn: 0.2309718	total: 12.4s	remaining: 49.8ms
996:	learn: 0.2309018	total: 12.4s	remaining: 37.3ms
997:	learn: 0.2308096	total: 12.4s	remaining: 24.9ms
998:	learn: 0.2307089	total: 12.4s	remaining: 12.4ms
999:	learn: 0.2305877	total: 12.4s	remaining: 0us


Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('Geography',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Geography')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Geography'))])),
                                                ('Gender',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Gender')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Gender'))])),
                                                ('Tenure',
                                                 Pipeline(steps=[('selector',
           

In [19]:
#наши прогнозы для тестовой выборки
preds_cb = pipeline_cb.predict_proba(X_test)[:, 1]
preds_cb[:10]

array([0.04989862, 0.03429373, 0.12175275, 0.15592254, 0.06786912,
       0.00864872, 0.06798309, 0.31159861, 0.236512  , 0.3672608 ])

In [20]:
precision, recall, thresholds = precision_recall_curve(y_test, preds_cb)

fscore = (2 * precision * recall) / (precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))
thresholds_cb = thresholds[ix]
fscore_cb = fscore[ix]
precision_cb = precision[ix]
recall_cb = recall[ix]

Best Threshold=0.376941, F-Score=0.636, Precision=0.683, Recall=0.594


### 2 - логистическая регрессия 

In [64]:
from sklearn.linear_model import LogisticRegression

In [65]:
from sklearn.preprocessing import StandardScaler

In [66]:
#разделим данные на train/test
X_train, X_test, y_train, y_test = train_test_split(df, df['Exited'], test_size = 0.3, random_state=42)

In [67]:
##################################333
final_transformers = list()

for cat_col in categorical_columns:
    cat_transformer = Pipeline([
                ('selector', FeatureSelector(column=cat_col)),
                ('ohe', OHEEncoder(key=cat_col))
            ])
    final_transformers.append((cat_col, cat_transformer))
    
for cont_col in continuous_columns:
    cont_transformer = Pipeline([
                ('selector', NumberSelector(key=cont_col)),
                ('scaler', StandardScaler())
            ])
    final_transformers.append((cont_col, cont_transformer))

In [68]:
# Объединим все это в единый пайплайн

feats = FeatureUnion(final_transformers)

feature_processing = Pipeline([('feats', feats)])

In [69]:
pipeline_lr = Pipeline([
    ('features',feats),
    ('classifier', LogisticRegression(random_state = 42)),
])

In [70]:
#обучим наш пайплайн
pipeline_lr.fit(X_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('Geography',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Geography')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Geography'))])),
                                                ('Gender',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Gender')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Gender'))])),
                                                ('Tenure',
                                                 Pipeline(steps=[('selector',
           

In [71]:
#наши прогнозы для тестовой выборки
preds_lr = pipeline_lr.predict_proba(X_test)[:, 1]
preds_lr[:10]

array([0.2593683 , 0.08657959, 0.31768977, 0.44112458, 0.06354299,
       0.04599764, 0.27854161, 0.20118673, 0.32522912, 0.31827133])

In [72]:
precision, recall, thresholds = precision_recall_curve(y_test, preds_lr)

fscore = (2 * precision * recall) / (precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))
thresholds_lr = thresholds[ix]
fscore_lr = fscore[ix]
precision_lr = precision[ix]
recall_lr = recall[ix]

Best Threshold=0.913209, F-Score=nan, Precision=0.000, Recall=0.000


  This is separate from the ipykernel package so we can avoid doing imports until


In [73]:
print(fscore)

[0.3354394  0.33496122 0.33505747 ... 0.00341297        nan 0.        ]


In [74]:
ix = np.argmax(fscore)
ix

2897

In [53]:
fscore[2897]

nan

In [52]:
max(fscore)

0.4919043947571319

### 2. Отобрать лучшую модель по метрикам (кстати, какая по вашему мнению здесь наиболее подходящая DS-метрика)

Лучшие результаты показал CatBoost. F-Score наиболее подходящая метрика.

### 3.Для отобранной модели (на отложенной выборке) сделать оценку экономической эффективности при тех же вводных, как в вопросе 2 (1 доллар на привлечение, 2 доллара - с каждого правильно классифицированного (True Positive) удержанного). (подсказка) нужно посчитать FP/TP/FN/TN для выбранного оптимального порога вероятности и посчитать выручку и траты. 