## Задание 0

Задание 0: выбрать любую модель машнного обучения и зафиксировать любой тип валидации. Обучить базовую модель и зафиксировать базовое качество модели. В каждом следующем задании нужно будет обучить выбранную модель и оценивать ее качество на зафиксированной схеме валидации. После каждого задания, требуется сделать вывод о достигаемом качестве модели, по сравнению с качестом из предыдущего шага.

In [1]:
import warnings
from tqdm import tqdm
from typing import List, Tuple

import numpy as np
import pandas as pd
#import seaborn as sns
import xgboost as xgb
import catboost as cb
#import matplotlib.pyplot as plt
from scipy.stats import ttest_rel

from sklearn.metrics import r2_score
#from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score

warnings.simplefilter("ignore")
#%matplotlib inline

In [2]:
train = pd.read_csv("./data/assignment_2_train.csv")
test = pd.read_csv("./data/assignment_2_test.csv")

## Catboost

In [3]:
def fit_catboost(x_train, y_train, model_params, categorical, *args):
    """
    Обучение модели CatBoostClassifier.
    
    Parametes
    x_train: pandas.core.frame.DataFrame
        Матрица признаков для обучения модели.
    y_valid: pandas.core.frame.Series
        Вектор целевой переменной для валидации
    model_params: dict
        Словарь со значением гиперпараметров модели
    categorical: List[str]
        Список с названием категориальных признаков
    
    Returns
    model: catboost.core.CatBoostClassifier
        Экземпляр обученного классификатора.
    """
    eval_set = [(x_train, y_train)]
    
    if args == 2:
        eval_set.append((args[0], args[1]))
    
    model = cb.CatBoostClassifier(**model_params)
    model.fit(
        X=x_train,
        y=y_train,
        eval_set=eval_set,
        cat_features=categorical
    )
    
    return model

In [4]:
def evaluate_model(model, *args):
    """
    Оценка качества модели.
    
    Parameters
    model: catboost.core.CatBoostClassifier
        Экземпляр обученного классификатора.
        
    args: pandas.core.frame.DataFrame
        Пары из обучающей выборки и вектора истинных значений.
        Опциональные параметры.
    """
    eval_data = [(args[i], args[i+1]) for i in range(0, len(args), 2)]
    if eval_data:
        for sample, target in eval_data:
            y_pred = model.predict_proba(sample)[:,1]
            score = roc_auc_score(target, y_pred)
            print(f"score = {round(score,6)}")

In [10]:
def prepare_data(X, categorical, to_drop):
    """
    Преобразование данных для передачи в модель.
    Parameters
    X: pandas.core.frame.DataFrame
        Матрица признаков для передачи в модель.
    categorical: List[str]
        Список с названием категориальных признаков.
    drop_features: List[str]
        Список с названием признаков, которые не должны 
        участвововать в обучении.
    Returns
    X_transformed: pandas.core.frame.DataFrame
        Матрица признаков, подготовленная для передачи в модель.
    """
    X_transformed = X.copy()
    to_drop = set(X.columns) & set(to_drop)
    
    if to_drop:
        X_transformed = X_transformed.drop(to_drop, axis=1)
    
    X_transformed[categorical] = X_transformed[categorical].astype(str)
    return X_transformed

In [11]:
to_drop = [
    "TransactionID",
    "TransactionDT",
    "isFraud",
]

categorical = train.select_dtypes(include=["object"]).columns.tolist()

In [12]:
x_train, x_valid = train_test_split(
    train, train_size=0.7, random_state=42, shuffle=True
)

y_train, y_valid = train_test_split(
    train["isFraud"], train_size=0.7, random_state=42, shuffle=True)

In [13]:
x_train = prepare_data(x_train, categorical=categorical, to_drop=to_drop)
x_valid = prepare_data(x_valid, categorical=categorical, to_drop=to_drop)
x_public_lb = prepare_data(test, categorical=categorical, to_drop=to_drop)

print("x_train.shape = {} rows, {} cols".format(*x_train.shape))
print("x_valid.shape = {} rows, {} cols".format(*x_valid.shape))
print("x_test.shape = {} rows, {} cols".format(*x_public_lb.shape))

x_train.shape = 125999 rows, 391 cols
x_valid.shape = 54001 rows, 391 cols
x_test.shape = 100001 rows, 391 cols


In [14]:
cb_params_1000 = {
    "n_estimators": 1000,
    "learning_rate": 0.01,
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "task_type": "CPU",
    "max_bin": 20,
    "verbose": 10,
    "max_depth": 6,
    "l2_leaf_reg": 100,
    "early_stopping_rounds":50,
    "thread_count": 6,
    "random_seed": 42
}

In [15]:

model = fit_catboost(x_train, y_train, cb_params_1000, categorical, x_valid, y_valid)

0:	test: 0.6220983	best: 0.6220983 (0)	total: 1.26s	remaining: 20m 54s
10:	test: 0.7901006	best: 0.7901006 (10)	total: 11.1s	remaining: 16m 35s
20:	test: 0.7957635	best: 0.7975446 (16)	total: 20.4s	remaining: 15m 50s
30:	test: 0.8159232	best: 0.8159232 (30)	total: 29.7s	remaining: 15m 27s
40:	test: 0.8267817	best: 0.8267854 (39)	total: 39s	remaining: 15m 11s
50:	test: 0.8352521	best: 0.8358712 (47)	total: 48.8s	remaining: 15m 8s
60:	test: 0.8409616	best: 0.8409616 (60)	total: 59.5s	remaining: 15m 16s
70:	test: 0.8417169	best: 0.8427111 (67)	total: 1m 10s	remaining: 15m 25s
80:	test: 0.8448751	best: 0.8450100 (78)	total: 1m 20s	remaining: 15m 18s
90:	test: 0.8473364	best: 0.8473364 (90)	total: 1m 30s	remaining: 15m 7s
100:	test: 0.8486445	best: 0.8487345 (98)	total: 1m 40s	remaining: 14m 57s
110:	test: 0.8507777	best: 0.8507777 (110)	total: 1m 50s	remaining: 14m 46s
120:	test: 0.8519341	best: 0.8519341 (120)	total: 2m	remaining: 14m 33s
130:	test: 0.8530806	best: 0.8534658 (129)	total: 

In [16]:
evaluate_model(model, x_train, y_train, x_valid, y_valid, x_public_lb, test["isFraud"])

score = 0.89237
score = 0.885213
score = 0.863983


bestTest = 0.8923702909
bestIteration = 999

score = 0.89237
score = 0.885213
score = 0.863983

## Задание 1

Задание 1: признак TransactionDT - это смещение в секундах относительно базовой даты. Базовая дата - 2017-12-01, преобразовать признак TransactionDT в datetime, прибавив к базовой дате исходное значение признака. Из полученного признака выделить год, месяц, день недели, час, день.

In [17]:
import datetime as dt
from datetime import timedelta

In [18]:
train['TransactionDT'].head()

0    86400
1    86401
2    86469
3    86499
4    86506
Name: TransactionDT, dtype: int64

In [19]:
base_date = '2017-12-01'


In [20]:
base_date = dt.datetime.strptime(base_date, '%Y-%m-%d')

In [21]:
train['TransactionDT'].dtypes

dtype('int64')

In [22]:
train['TransactionDT'] = train['TransactionDT'].apply(lambda x: base_date + timedelta(seconds=x))

In [23]:
train['TransactionDT'].head()

0   2017-12-02 00:00:00
1   2017-12-02 00:00:01
2   2017-12-02 00:01:09
3   2017-12-02 00:01:39
4   2017-12-02 00:01:46
Name: TransactionDT, dtype: datetime64[ns]

In [None]:
#Из полученного признака выделить год, месяц, день недели, час, день.

In [24]:
train['year'] = train['TransactionDT'].dt.year
train['month'] = train['TransactionDT'].dt.month
train['day_of_week'] = train['TransactionDT'].dt.weekday
train['day'] = train['TransactionDT'].dt.day
train['hour'] = train['TransactionDT'].dt.hour

train.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V335,V336,V337,V338,V339,year,month,day_of_week,day,hour
0,2987000,0,2017-12-02 00:00:00,68.5,W,13926,,150.0,discover,142.0,...,,,,,,2017,12,5,2,0
1,2987001,0,2017-12-02 00:00:01,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,2017,12,5,2,0
2,2987002,0,2017-12-02 00:01:09,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,2017,12,5,2,0
3,2987003,0,2017-12-02 00:01:39,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,2017,12,5,2,0
4,2987004,0,2017-12-02 00:01:46,50.0,H,4497,514.0,150.0,mastercard,102.0,...,0.0,0.0,0.0,0.0,0.0,2017,12,5,2,0


In [29]:
test['TransactionDT'] = test['TransactionDT'].apply(lambda x: base_date + timedelta(seconds=x))

In [30]:
test['year'] = test['TransactionDT'].dt.year
test['month'] = test['TransactionDT'].dt.month
test['day_of_week'] = test['TransactionDT'].dt.weekday
test['day'] = test['TransactionDT'].dt.day
test['hour'] = test['TransactionDT'].dt.hour


In [25]:
to_drop = [
    "TransactionID",
    #"TransactionDT",
    "isFraud",
]

categorical = train.select_dtypes(include=["object"]).columns.tolist()

In [26]:
categorical

['ProductCD',
 'card4',
 'card6',
 'P_emaildomain',
 'R_emaildomain',
 'M1',
 'M2',
 'M3',
 'M4',
 'M5',
 'M6',
 'M7',
 'M8',
 'M9']

In [27]:
x_train, x_valid = train_test_split(
    train, train_size=0.7, random_state=42, shuffle=True
)

y_train, y_valid = train_test_split(
    train["isFraud"], train_size=0.7, random_state=42, shuffle=True)

In [31]:
x_train = prepare_data(x_train, categorical=categorical, to_drop=to_drop)
x_valid = prepare_data(x_valid, categorical=categorical, to_drop=to_drop)
x_public_lb = prepare_data(test, categorical=categorical, to_drop=to_drop)

print("x_train.shape = {} rows, {} cols".format(*x_train.shape))
print("x_valid.shape = {} rows, {} cols".format(*x_valid.shape))
print("x_test.shape = {} rows, {} cols".format(*x_public_lb.shape))

x_train.shape = 125999 rows, 397 cols
x_valid.shape = 54001 rows, 397 cols
x_test.shape = 100001 rows, 397 cols


In [32]:
cb_params_1000 = {
    "n_estimators": 1000,
    "learning_rate": 0.01,
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "task_type": "CPU",
    "max_bin": 20,
    "verbose": 10,
    "max_depth": 6,
    "l2_leaf_reg": 100,
    "early_stopping_rounds":50,
    "thread_count": 6,
    "random_seed": 42
}

In [33]:
model = fit_catboost(x_train, y_train, cb_params_1000, categorical, x_valid, y_valid)

0:	test: 0.5931492	best: 0.5931492 (0)	total: 1.22s	remaining: 20m 19s
10:	test: 0.7436064	best: 0.7461541 (8)	total: 12.7s	remaining: 19m 3s
20:	test: 0.7692455	best: 0.7692455 (20)	total: 23.5s	remaining: 18m 13s
30:	test: 0.7904620	best: 0.7911865 (27)	total: 34.2s	remaining: 17m 49s
40:	test: 0.7956304	best: 0.7956304 (40)	total: 44.9s	remaining: 17m 29s
50:	test: 0.8047363	best: 0.8047580 (49)	total: 56.1s	remaining: 17m 23s
60:	test: 0.8189490	best: 0.8189490 (60)	total: 1m 7s	remaining: 17m 23s
70:	test: 0.8222860	best: 0.8223531 (67)	total: 1m 18s	remaining: 17m 8s
80:	test: 0.8299961	best: 0.8299961 (80)	total: 1m 29s	remaining: 16m 55s
90:	test: 0.8349988	best: 0.8349988 (90)	total: 1m 40s	remaining: 16m 42s
100:	test: 0.8403358	best: 0.8403358 (100)	total: 1m 51s	remaining: 16m 33s
110:	test: 0.8415116	best: 0.8415116 (110)	total: 2m 2s	remaining: 16m 20s
120:	test: 0.8461814	best: 0.8461814 (120)	total: 2m 13s	remaining: 16m 11s
130:	test: 0.8478019	best: 0.8478019 (130)	to

In [34]:
evaluate_model(model, x_train, y_train, x_valid, y_valid, x_public_lb, test["isFraud"])

score = 0.896947
score = 0.891243
score = 0.858023


bestTest = 0.8969465533
bestIteration = 999

score = 0.896947
score = 0.891243
score = 0.858023

In [None]:
# Добавление новых признаков на основе даты не увеличило точность предсказания модели на лидерборде, по сравнению с исходными признаками
# Но увеличелась точность на valid. Test и train из разных временных промежутков.

## Задание 2

Задание 2: сделать конкатенацию признаков
* card1 + card2;
* card1 + card2 + card_3 + card_5;
* card1 + card2 + card_3 + card_5 + addr1 + addr2

Рассматривать их как категориальных признаки.

In [41]:
train = pd.read_csv("./data/assignment_2_train.csv")
test = pd.read_csv("./data/assignment_2_test.csv")

In [42]:
train['card1_card2'] = str(train['card1']) + "|" + str(train['card2'])

In [44]:
train['card_1_2_3_5'] = str(train['card1_card2']) + "|" + str(train['card3']) + "|" + str(train['card5'])

In [45]:
train['card_1235_addr_1_2'] = str(train['card_1_2_3_5']) + str(train['addr1']) + str(train['addr2'])

In [46]:
train['card1_card2'] = train.card1_card2.astype('object')
train['card_1_2_3_5'] = train.card_1_2_3_5.astype('object')
train['card_1235_addr_1_2'] = train.card_1235_addr_1_2.astype('object')

In [49]:
test['card1_card2'] = str(test['card1']) + "|" + str(test['card2'])
test['card_1_2_3_5'] = str(test['card1_card2']) + "|" + str(test['card3']) + "|" + str(test['card5'])
test['card_1235_addr_1_2'] = str(test['card_1_2_3_5']) + str(test['addr1']) + str(test['addr2'])

test['card1_card2'] = test.card1_card2.astype('object')
test['card_1_2_3_5'] = test.card_1_2_3_5.astype('object')
test['card_1235_addr_1_2'] = test.card_1235_addr_1_2.astype('object')

In [50]:
to_drop = [
    "TransactionID",
    "TransactionDT",
    "isFraud",
]

categorical = train.select_dtypes(include=["object"]).columns.tolist()

In [51]:
x_train, x_valid = train_test_split(
    train, train_size=0.7, random_state=42, shuffle=True
)

y_train, y_valid = train_test_split(
    train["isFraud"], train_size=0.7, random_state=42, shuffle=True)

In [52]:
x_train = prepare_data(x_train, categorical=categorical, to_drop=to_drop)
x_valid = prepare_data(x_valid, categorical=categorical, to_drop=to_drop)
x_public_lb = prepare_data(test, categorical=categorical, to_drop=to_drop)

print("x_train.shape = {} rows, {} cols".format(*x_train.shape))
print("x_valid.shape = {} rows, {} cols".format(*x_valid.shape))
print("x_test.shape = {} rows, {} cols".format(*x_public_lb.shape))

x_train.shape = 125999 rows, 394 cols
x_valid.shape = 54001 rows, 394 cols
x_test.shape = 100001 rows, 394 cols


In [53]:
cb_params_1000 = {
    "n_estimators": 1000,
    "learning_rate": 0.01,
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "task_type": "CPU",
    "max_bin": 20,
    "verbose": 10,
    "max_depth": 6,
    "l2_leaf_reg": 100,
    "early_stopping_rounds":50,
    "thread_count": 6,
    "random_seed": 42
}

In [54]:
model = fit_catboost(x_train, y_train, cb_params_1000, categorical, x_valid, y_valid)

0:	test: 0.6220983	best: 0.6220983 (0)	total: 1.03s	remaining: 17m 9s
10:	test: 0.7901006	best: 0.7901006 (10)	total: 12.1s	remaining: 18m 4s
20:	test: 0.7957635	best: 0.7975446 (16)	total: 22.9s	remaining: 17m 45s
30:	test: 0.8159232	best: 0.8159232 (30)	total: 33.5s	remaining: 17m 27s
40:	test: 0.8267817	best: 0.8267854 (39)	total: 44.2s	remaining: 17m 14s
50:	test: 0.8352521	best: 0.8358712 (47)	total: 55.4s	remaining: 17m 10s
60:	test: 0.8409616	best: 0.8409616 (60)	total: 1m 6s	remaining: 16m 57s
70:	test: 0.8417169	best: 0.8427111 (67)	total: 1m 17s	remaining: 16m 50s
80:	test: 0.8448751	best: 0.8450100 (78)	total: 1m 27s	remaining: 16m 38s
90:	test: 0.8473364	best: 0.8473364 (90)	total: 1m 38s	remaining: 16m 28s
100:	test: 0.8486445	best: 0.8487345 (98)	total: 1m 49s	remaining: 16m 17s
110:	test: 0.8507777	best: 0.8507777 (110)	total: 2m	remaining: 16m 8s
120:	test: 0.8519341	best: 0.8519341 (120)	total: 2m 11s	remaining: 15m 58s
130:	test: 0.8530806	best: 0.8534658 (129)	total:

In [56]:
evaluate_model(model, x_train, y_train, x_valid, y_valid, x_public_lb, test["isFraud"])

score = 0.89237
score = 0.885213
score = 0.863983


bestTest = 0.8923702909
bestIteration = 999

score = 0.89237
score = 0.885213
score = 0.863983

In [None]:
# Конкатенация не повлияла на точность модели

## Задание 3

Задание 3: Сделать FrequencyEncoder для признаков card1 - card6, addr1, addr2.

In [57]:
train = pd.read_csv("./data/assignment_2_train.csv")
test = pd.read_csv("./data/assignment_2_test.csv")

In [62]:
for col_name in ['card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2']:
    freq_encoder = train[col_name].value_counts(normalize=True)
    train[f'{col_name}_freq_enc'] = train[col_name].map(freq_encoder)
    train[f'{col_name}_freq_enc'] = train[f'{col_name}_freq_enc'].astype('object')
    


In [64]:
for col_name in ['card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2']:
    freq_encoder = test[col_name].value_counts(normalize=True)
    test[f'{col_name}_freq_enc'] = test[col_name].map(freq_encoder)
    test[f'{col_name}_freq_enc'] = test[f'{col_name}_freq_enc'].astype('object')

In [60]:
train.head(5)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [65]:
to_drop = [
    "TransactionID",
    "TransactionDT",
    "isFraud",
]

categorical = train.select_dtypes(include=["object"]).columns.tolist()

In [66]:
categorical

['ProductCD',
 'card4',
 'card6',
 'P_emaildomain',
 'R_emaildomain',
 'M1',
 'M2',
 'M3',
 'M4',
 'M5',
 'M6',
 'M7',
 'M8',
 'M9',
 'card1_freq_enc',
 'card2_freq_enc',
 'card3_freq_enc',
 'card4_freq_enc',
 'card5_freq_enc',
 'card6_freq_enc',
 'addr1_freq_enc',
 'addr2_freq_enc']

In [67]:
x_train, x_valid = train_test_split(
    train, train_size=0.7, random_state=42, shuffle=True
)

y_train, y_valid = train_test_split(
    train["isFraud"], train_size=0.7, random_state=42, shuffle=True)

In [68]:
x_train = prepare_data(x_train, categorical=categorical, to_drop=to_drop)
x_valid = prepare_data(x_valid, categorical=categorical, to_drop=to_drop)
x_public_lb = prepare_data(test, categorical=categorical, to_drop=to_drop)

print("x_train.shape = {} rows, {} cols".format(*x_train.shape))
print("x_valid.shape = {} rows, {} cols".format(*x_valid.shape))
print("x_test.shape = {} rows, {} cols".format(*x_public_lb.shape))

x_train.shape = 125999 rows, 399 cols
x_valid.shape = 54001 rows, 399 cols
x_test.shape = 100001 rows, 399 cols


In [69]:
model = fit_catboost(x_train, y_train, cb_params_1000, categorical, x_valid, y_valid)

0:	test: 0.6535578	best: 0.6535578 (0)	total: 1.81s	remaining: 30m 6s
10:	test: 0.7430234	best: 0.7441400 (9)	total: 17.2s	remaining: 25m 45s
20:	test: 0.7986847	best: 0.7986847 (20)	total: 31.7s	remaining: 24m 39s
30:	test: 0.8073731	best: 0.8073731 (30)	total: 47.1s	remaining: 24m 31s
40:	test: 0.8219596	best: 0.8219596 (40)	total: 1m 2s	remaining: 24m 11s
50:	test: 0.8240857	best: 0.8244230 (48)	total: 1m 17s	remaining: 24m 4s
60:	test: 0.8355746	best: 0.8355746 (60)	total: 1m 32s	remaining: 23m 49s
70:	test: 0.8386192	best: 0.8393172 (66)	total: 1m 48s	remaining: 23m 34s
80:	test: 0.8430040	best: 0.8430040 (80)	total: 2m 3s	remaining: 23m 19s
90:	test: 0.8452464	best: 0.8452464 (90)	total: 2m 18s	remaining: 23m 5s
100:	test: 0.8487669	best: 0.8487669 (100)	total: 2m 35s	remaining: 23m 1s
110:	test: 0.8501928	best: 0.8501928 (110)	total: 2m 52s	remaining: 23m
120:	test: 0.8512691	best: 0.8512691 (120)	total: 3m 8s	remaining: 22m 50s
130:	test: 0.8530500	best: 0.8530500 (130)	total: 

In [70]:
evaluate_model(model, x_train, y_train, x_valid, y_valid, x_public_lb, test["isFraud"])

score = 0.968176
score = 0.930302
score = 0.859754


bestTest = 0.9681756235
bestIteration = 993
score = 0.968176
score = 0.930302
score = 0.859754

In [None]:
# FrequencyEncoder учеличил точность модели на train, но на лидерборде точность ухудшилась.

## Задание 4

Задание 4: Создать признаки на основе отношения: TransactionAmt к вычисленной статистике. Статистика - среднее значение / стандартное отклонение TransactionAmt, сгруппированное по card1 - card6, addr1, addr2, и по признакам, созданным в задании 2.