In [1]:
import warnings
from tqdm import tqdm
from typing import List, Tuple

import numpy as np
import pandas as pd

import catboost as cb
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_rel

from sklearn.metrics import f1_score, roc_auc_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, cross_val_score
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score
warnings.simplefilter("ignore")
%matplotlib inline

In [2]:
train = pd.read_csv('assignment_2_train.csv')
test = pd.read_csv('assignment_2_test.csv')

print("gf_train.shape = {} rows, {} cols".format(*train.shape))
print("df_test.shape = {} rows, {} cols".format(*test.shape))
train.head(2)

gf_train.shape = 180000 rows, 394 cols
df_test.shape = 100001 rows, 394 cols


Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,


**Задание 1:** сделать Hold-Out валидацию с разбиением, размер которого будет адеквтаным, по вашему мнению; разбиение проводить по id-транзакции (TransactionID), обучать модель градиетного бустинга любой реализации с подбором числа деревьев по early_stopping критерию до достижения сходимости. Оценить качество модели на валидационной выборке, оценить расхождение по сравнению с качеством на обучающей выборке и валидационной выборке. Оценить качество на ЛБ, сравнить с качеством на обучении и валидации. Сделать выводы.

In [3]:
target = train['isFraud']
data = train.set_index('TransactionID')
data.drop(['isFraud'], axis=1, inplace=True)

In [4]:
numerical_features = data.select_dtypes(include=[np.number])
print(f'Всего числовых признаков {numerical_features.shape[1]}')
numerical_features = numerical_features.columns.tolist()

Всего числовых признаков 378


In [5]:
categorical_features = data.select_dtypes(include=[np.object])
print(f'Всего категориальных признаков {categorical_features.shape[1]}')
categorical_features = categorical_features.columns.tolist()

Всего категориальных признаков 14


In [6]:
for col in categorical_features:
    data[col] = data[col].astype(str)

In [7]:
test_target = test['isFraud']
test_data = test.set_index('TransactionID')

for col in categorical_features:
    test_data[col] = test_data[col].astype(str)

In [8]:
x_train, x_valid, y_train, y_valid = train_test_split(
    data, target, train_size=0.75, shuffle=True, random_state=25,
)
print("x_train.shape = {} rows, {} cols".format(*x_train.shape))
print("x_valid.shape = {} rows, {} cols".format(*x_valid.shape))

x_train.shape = 135000 rows, 392 cols
x_valid.shape = 45000 rows, 392 cols


In [9]:
train_pool = cb.Pool(x_train, y_train, cat_features=categorical_features)
valid_pool = cb.Pool(x_valid, y_valid, cat_features=categorical_features)
test_pool = cb.Pool(test_data, cat_features=categorical_features)

In [10]:
cb_params = {
    "n_estimators": 2000,
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "task_type": "CPU",
    "max_bin": 50,
    "verbose": 100,
    "max_depth": 6,
    "l2_leaf_reg": 100,
    "early_stopping_rounds": 50,
    "thread_count": 6,
    "random_seed": 42,
    "cat_features": categorical_features
}

In [11]:
model = cb.CatBoostClassifier(**cb_params)
model.fit(train_pool, eval_set=valid_pool)

0:	test: 0.6440887	best: 0.6440887 (0)	total: 291ms	remaining: 9m 42s
100:	test: 0.8678437	best: 0.8678437 (100)	total: 22.4s	remaining: 7m 1s
200:	test: 0.8838956	best: 0.8838956 (200)	total: 45.1s	remaining: 6m 43s
300:	test: 0.8913990	best: 0.8913995 (299)	total: 1m 6s	remaining: 6m 18s
400:	test: 0.8969465	best: 0.8969465 (400)	total: 1m 29s	remaining: 5m 55s
500:	test: 0.8987352	best: 0.8987361 (499)	total: 1m 50s	remaining: 5m 32s
600:	test: 0.9012847	best: 0.9012996 (596)	total: 2m 12s	remaining: 5m 9s
700:	test: 0.9035310	best: 0.9035451 (697)	total: 2m 35s	remaining: 4m 47s
800:	test: 0.9048830	best: 0.9048830 (800)	total: 2m 57s	remaining: 4m 25s
900:	test: 0.9060386	best: 0.9060386 (900)	total: 3m 19s	remaining: 4m 2s
1000:	test: 0.9067870	best: 0.9067870 (1000)	total: 3m 40s	remaining: 3m 40s
1100:	test: 0.9082953	best: 0.9082953 (1100)	total: 4m 3s	remaining: 3m 18s
1200:	test: 0.9098445	best: 0.9098445 (1200)	total: 4m 25s	remaining: 2m 56s
1300:	test: 0.9108992	best: 0.9

<catboost.core.CatBoostClassifier at 0x240b197e608>

In [12]:
predictions_train = model.predict(train_pool)
predictions_valid = model.predict(valid_pool)
predictions_lb = model.predict(test_pool)

In [16]:
print(f'train: {round(roc_auc_score(y_train, predictions_train), 4)}')
print(f'test: {round(roc_auc_score(y_valid, predictions_valid), 4)}')
print(f'l_board: {round(roc_auc_score(test_target, predictions_lb), 4)}')

train: 0.712
test: 0.6955
l_board: 0.6478


**Вывод**
* Модель переобучена, значение метрик падает на валидации и еще сильнее на лидерборде. 

**Задание 2:** сделать Hold-Out валидацию с разбиением на 3 выборки, разбиение проводить по id-транзакции (TransactionID), размер каждой выборки подобрать самостоятельно. Повторить процедуру из п.1. для каждой выборки.

In [17]:
x_train, x_valid, y_train, y_valid = train_test_split(
    data, target, train_size=0.7, shuffle=True, random_state=25,
)

x_valid, x_test, y_valid, y_test = train_test_split(
    x_valid, y_valid, train_size=0.7, shuffle=True, random_state=25
)

print("x_train.shape = {} rows, {} cols".format(*x_train.shape))
print("x_valid.shape = {} rows, {} cols".format(*x_valid.shape))
print("x_test.shape = {} rows, {} cols".format(*x_test.shape))

x_train.shape = 125999 rows, 392 cols
x_valid.shape = 37800 rows, 392 cols
x_test.shape = 16201 rows, 392 cols


In [18]:
train_pool = cb.Pool(x_train, y_train, cat_features=categorical_features)
valid_pool = cb.Pool(x_valid, y_valid, cat_features=categorical_features)
test_pool = cb.Pool(x_test, y_test, cat_features=categorical_features)
lb_pool = cb.Pool(test_data, cat_features=categorical_features)

In [19]:
model = cb.CatBoostClassifier(**cb_params)
model.fit(train_pool, eval_set=valid_pool)

0:	test: 0.6045153	best: 0.6045153 (0)	total: 220ms	remaining: 7m 19s
100:	test: 0.8679537	best: 0.8680797 (99)	total: 21.4s	remaining: 6m 41s
200:	test: 0.8832491	best: 0.8832491 (200)	total: 42.8s	remaining: 6m 23s
300:	test: 0.8917414	best: 0.8917414 (300)	total: 1m 4s	remaining: 6m 3s
400:	test: 0.8954093	best: 0.8954093 (400)	total: 1m 24s	remaining: 5m 37s
500:	test: 0.8957401	best: 0.8957584 (486)	total: 1m 44s	remaining: 5m 13s
600:	test: 0.8997151	best: 0.8997151 (600)	total: 2m 5s	remaining: 4m 51s
700:	test: 0.9023949	best: 0.9023960 (698)	total: 2m 26s	remaining: 4m 30s
800:	test: 0.9038382	best: 0.9038382 (800)	total: 2m 46s	remaining: 4m 9s
900:	test: 0.9051102	best: 0.9051102 (900)	total: 3m 7s	remaining: 3m 48s
1000:	test: 0.9061223	best: 0.9061223 (1000)	total: 3m 27s	remaining: 3m 27s
1100:	test: 0.9078589	best: 0.9078589 (1100)	total: 3m 48s	remaining: 3m 6s
1200:	test: 0.9089206	best: 0.9089206 (1200)	total: 4m 9s	remaining: 2m 46s
1300:	test: 0.9094590	best: 0.9094

<catboost.core.CatBoostClassifier at 0x240afd5a488>

In [24]:
predictions_train = model.predict(train_pool)
predictions_valid = model.predict(valid_pool)
predictions_test = model.predict(test_pool)
predictions_lb = model.predict(lb_pool)

In [23]:
print(f'train: {round(roc_auc_score(y_train, predictions_train), 4)}')
print(f'valid: {round(roc_auc_score(y_valid, predictions_valid), 4)}')
print(f'test: {round(roc_auc_score(y_test, predictions_test), 4)}')
print(f'l_board: {round(roc_auc_score(test_target, predictions_lb), 4)}')

train: 0.701
valid: 0.6889
test: 0.6921
l_board: 0.6413


**Задание 3:** построить доверительный интервал на данных из п.2 на основе бутстреп выборок, оценить качество модели на ЛБ относительно полученного доверительного интервала. Сделать выводы.

In [27]:
def create_bootstrap_samples(data: np.array, n_samples: int = 1000) -> np.array:
    
    bootstrap_idx = np.random.randint(
        low=0, high=len(data), size=(n_samples, len(data))
    )
    return bootstrap_idx


def create_bootstrap_metrics(y_true: np.array,
                             y_pred: np.array,
                             metric: callable,
                             n_samlpes: int = 1000) -> List[float]:
   
    scores = []

    if isinstance(y_true, pd.Series):
        y_true = y_true.values

    bootstrap_idx = create_bootstrap_samples(y_true)
    for idx in bootstrap_idx:
        y_true_bootstrap = y_true[idx]
        y_pred_bootstrap = y_pred[idx]

        score = metric(y_true_bootstrap, y_pred_bootstrap)
        scores.append(score)

    return scores


def calculate_confidence_interval(scores: list, conf_interval: float = 0.95) -> Tuple[float]:
   
    left_bound = np.percentile(
        scores, ((1 - conf_interval) / 2) * 100
    )
    right_bound = np.percentile(
        scores, (conf_interval + ((1 - conf_interval) / 2)) * 100
    )

    return left_bound, right_bound

In [35]:
np.random.seed(27)
scores = create_bootstrap_metrics(test_target, predictions_lb, roc_auc_score)

calculate_confidence_interval(scores)

(0.6340762396143949, 0.6484384033200514)

**Вывод** 
* Значение метрики на лидерборде попадает в доверительный интервал, что показывает неплохое качество валидации. Валидация устойчивая. Ширина доверительного интервала допустима.

**Задание 4:** выполнить Adversarial Validation, подобрать объекты из обучающей выборки, которые сильно похожи на объекты из assignment_2_test.csv, и использовать их в качестве валидационного набора. Оценить качество модели на ЛБ, сделать выводы о полученных результатах.

In [29]:
x_adv = pd.concat([
    data, test_data], axis=0
)
y_adv = np.hstack((np.zeros(data.shape[0]), np.ones(test_data.shape[0])))
assert x_adv.shape[0] == y_adv.shape[0]

In [30]:
model = xgb.XGBClassifier(n_estimators=100)
model.fit(x_adv[numerical_features], y_adv)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [31]:
y_pred_adv = model.predict_proba(x_adv[numerical_features])
score = roc_auc_score(y_adv, y_pred_adv[:, 1])
print(round(score, 4))

1.0


In [32]:
y_pred = model.predict_proba(x_train[numerical_features])
y_pred

array([[9.9999470e-01, 5.2982914e-06],
       [9.9999470e-01, 5.2982914e-06],
       [9.9999470e-01, 5.2982914e-06],
       ...,
       [9.9999470e-01, 5.2982914e-06],
       [9.9999470e-01, 5.2982914e-06],
       [9.9999470e-01, 5.2982914e-06]], dtype=float32)

In [33]:
pd.cut(
    y_pred[:, 1], bins=np.arange(0, 1.01, 0.1)
).value_counts().sort_index()

(0.0, 0.1]    125999
(0.1, 0.2]         0
(0.2, 0.3]         0
(0.3, 0.4]         0
(0.4, 0.5]         0
(0.5, 0.6]         0
(0.6, 0.7]         0
(0.7, 0.8]         0
(0.8, 0.9]         0
(0.9, 1.0]         0
dtype: int64

**Вывод**
* Данные на трейне и тесте совсем не совпадают, что подтверждает roc_auc_score = 1.0 Модель идеально отличает данные трейна и теста

**Задание 5:** сделать KFold / StratifiedKFold валидацию (на ваше усмотрение), оценить получаемые качество и разброс по метрике качества. Сделать выводы об устойчивости кросс-валидации, сходимости оценки на кросс-валидации и отложенном наборе данных; Оценить качество на ЛБ, сделать выводы.

In [36]:
cv_data = cb.cv(
    cb.Pool(data, target, cat_features=categorical_features),
    cb_params,
)

0:	test: 0.6518597	best: 0.6518597 (0)	total: 895ms	remaining: 29m 49s
100:	test: 0.8631351	best: 0.8631351 (100)	total: 1m 14s	remaining: 23m 21s
200:	test: 0.8797378	best: 0.8797378 (200)	total: 2m 28s	remaining: 22m 7s
300:	test: 0.8874365	best: 0.8874365 (300)	total: 3m 39s	remaining: 20m 40s
400:	test: 0.8919986	best: 0.8919986 (400)	total: 4m 52s	remaining: 19m 25s
500:	test: 0.8934776	best: 0.8934776 (500)	total: 6m 2s	remaining: 18m 4s
600:	test: 0.8942905	best: 0.8942905 (600)	total: 7m 11s	remaining: 16m 44s
700:	test: 0.8953205	best: 0.8953206 (697)	total: 8m 19s	remaining: 15m 25s
800:	test: 0.8960420	best: 0.8960422 (799)	total: 9m 27s	remaining: 14m 9s
900:	test: 0.8970305	best: 0.8970305 (900)	total: 10m 34s	remaining: 12m 54s
1000:	test: 0.8991212	best: 0.8991246 (999)	total: 11m 45s	remaining: 11m 44s
1100:	test: 0.9008175	best: 0.9008175 (1100)	total: 12m 57s	remaining: 10m 35s
1200:	test: 0.9014755	best: 0.9014755 (1200)	total: 14m 7s	remaining: 9m 23s
1300:	test: 0.