## Домашнее задание 5
Ссылка на данные - https://drive.google.com/file/d/1gMEVl47pIoV1-AseB9doQ6DZNJrY3NkW/view?usp=sharing

Продолжим работу с данными, которые были использованы в ДЗ2 и 3, продолжим решать задачу обнаружения мошеннических транзакций, что позволит получить полное решение задачи / полный пайплайн.

In [1]:
import warnings
from tqdm import tqdm
from typing import List, Tuple

import datetime as dt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
import matplotlib.pyplot as plt
from scipy.stats import ttest_rel

from sklearn.metrics import r2_score, roc_auc_score, roc_curve
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, cross_val_score
warnings.simplefilter("ignore")
%matplotlib inline

import os
from copy import deepcopy
from inspect import signature
from typing import List, Optional, Union
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from category_encoders.cat_boost import CatBoostEncoder

In [2]:
def get_date(ts, base_date_s=None):
    """
    Меняет дату с секунд (от базовой даты) на datetime
    """
    if base_date_s is None:
        base_date_s = dt.datetime.timestamp(
            dt.datetime(year=2017, month=12, day=1))
    return dt.datetime.fromtimestamp(ts + base_date_s)

In [3]:
def transform_x(dfs: list,
                column,
                func,
                new_column=None):
    """
    Создает новый признак new_column на основе признака column, 
    пропущенного через функцию func
    Отрабатывает для всех датасетов в dfs
    """
    if new_column is None:
        new_column = column
    for df in dfs:
        df[new_column] = df[column].apply(func)

In [4]:
class CategoricalEncoder(BaseEstimator, TransformerMixin):
    """
    Обработка категориальных признаков и подготовка матрицы
    признаков для передачи в любой алгоритм машинного обучения.

    Parameters
    ----------
    encoder: callable
        Объект для обработки категориальных признаков;
        объект должен поддерживать sklearn-API.

    categorical_features: List[str]
        Список с названием категориальных признаков.

    na_value: float or str, optional, default = None
        Значение для заполнения пропусков.
        Опциональный параметр, по умолчанию, не используется.

    """
    def __init__(self,
                 encoder: callable,
                 categorical_features: List[str],
                 na_value: Union[int, float, str] = None) -> None:
        self.encoder = encoder
        self.categorical_features = categorical_features
        self.na_value = na_value

        sig = signature(encoder.fit)
        self.nparams = len(sig.parameters)
        self.encoders = None

    def _prepare_data(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Подготовка данных для передачи в модель.

        """
        if self.na_value:
            return X[self.categorical_features].fillna(self.na_value)

        return X

    def _check_unique_values(self, y: pd.Series) -> pd.Series:
        """
        Проверка на ранее не встречающиеся значения признака.
        Если значения встречаются - заменяем на self.na_value,
        если self.na_value - не задан, то заменяем на моду.

        """
        encoder = self.encoders[y.name]
        missed_values = list(
            set(y.unique()) - set(encoder.classes_)
        )

        if missed_values:
            mask = y.isin(missed_values)
            if self.na_value:
                y[mask] = self.na_value
            else:
                mode = y.value_counts()[0].index
                y[mask] = mode

        return y

    def _fit_label_encoder(self, X: pd.DataFrame) -> None:
        """
        Обучение LabelEncoder'ов для всех признаков;

        """
        self.encoders = {}
        for feature in self.categorical_features:
            x = X[feature].astype(str)
            encoder = deepcopy(self.encoder)
            self.encoders[feature] = encoder.fit(x)

        return self

    def _transform_label_encoder(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Применение LabelEncoder'ов для всех признаков;

        """
        for feature in self.categorical_features:
            x = X[feature].astype(str)
            x = self._check_unique_values(x)
            encoder = self.encoders.get(feature)
            X[feature] = encoder.transform(x)

        return X

    def fit(self, X, y=None):
        """
        Обучение encoder'а категориальных признаков.

        Parameters
        ----------
        X: pandas.core.frame.DataFrame
            Матрица признаков.

        y: pandas.core.frame.Series
            Вектор целевой переменной.
            Опциональный параметр, по умолчанию, не требуется.

        Returns
        -------
        self
        """
        X = self._prepare_data(X)
        if self.nparams == 1:
            self._fit_label_encoder(X=X)
        else:
            self.encoder.fit(
                X[self.categorical_features].astype(str), y
            )
        return self

    def transform(self, X, y=None):
        """
        Преобразование категориальных признаков.

        Parameters
        ----------
        X: pandas.core.frame.DataFrame
            Матрица признаков.

        Returns
        -------
        X_transformed: pandas.core.frame.DataFrame
            Преобразованная матрица признаков с
            обработанными категориальными признаками.

        """
        X = self._prepare_data(X)
        if self.encoders:
            categorical = self._transform_label_encoder(X)
        else:
            categorical = self.encoder.transform(
                X[self.categorical_features].astype(str)
            )

        X = X.drop(self.categorical_features, axis=1)
        X = pd.concat([X, categorical], axis=1)

        return X

In [5]:
data = pd.read_csv('./assignment_2_train.csv')
lb_data = pd.read_csv('./assignment_2_test.csv')

In [6]:
data.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
target_name = 'isFraud'

In [8]:
data.shape, lb_data.shape

((180000, 394), (100001, 394))

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180000 entries, 0 to 179999
Columns: 394 entries, TransactionID to V339
dtypes: float64(376), int64(4), object(14)
memory usage: 541.1+ MB


In [10]:
target = data[target_name]

In [11]:
numerical_features = data.select_dtypes(exclude=["object"]).columns.tolist()
categorical_features = data.select_dtypes(include=["object"]).columns.tolist()

In [12]:
data[categorical_features] = data[categorical_features].astype(str)
lb_data[categorical_features] = lb_data[categorical_features].astype(str)

In [13]:
x_train, x_valid = train_test_split(
    data.drop(["TransactionID", target_name], axis=1), train_size=0.8, shuffle=True, random_state=1,
)
y_train, y_valid = train_test_split(
    data[target_name], train_size=0.8, shuffle=True, random_state=1,
)

x_valid, x_test = train_test_split(
    x_valid, train_size=0.8, shuffle=True, random_state=27
)
y_valid, y_test = train_test_split(
    y_valid, train_size=0.8, shuffle=True, random_state=27
)

print("x_train.shape = {} rows, {} cols".format(*x_train.shape))
print("x_valid.shape = {} rows, {} cols".format(*x_valid.shape))
print("x_test.shape = {} rows, {} cols".format(*x_test.shape))

x_train.shape = 144000 rows, 392 cols
x_valid.shape = 28800 rows, 392 cols
x_test.shape = 7200 rows, 392 cols


In [14]:
x_LB = lb_data.drop(["TransactionID", target_name], axis=1)
y_LB = lb_data[target_name]

In [15]:
models = dict()
model_count = 0
def evaluate(x_train, 
         x_valid, 
         x_test,
         model, 
         x_LB=None,
         metric=roc_auc_score, 
         name=None):
    preds = {
        'pred_train': model.predict_proba(x_train)[:,1],
        'pred_valid': model.predict_proba(x_valid)[:,1],
        'pred_test': model.predict_proba(x_test)[:,1],
    }
    if not x_LB is None:
        preds['pred_LB'](model.predict_proba(x_LB)[:,1])
        
    global model_count
    if name is None:
        name = f'model_{model_count}'
    models[name] = {
        'scores': {
            'train': (metric(y_train, preds['pred_train'])),
            'valid': (metric(y_valid, preds['pred_valid'])),
            'test': (metric(y_test, preds['pred_test'])),
            },
        'preds': preds,
        'model': model
    }
    if not x_LB is None:
        scores[name][scores]['LB']: (metric(y_LB, preds['pred_LB']))
    model_count += 1
    return f'\n{name}:\n{models[name]["scores"]}'

#### Задание 0: 
выбрать любую модель машнного обучения и зафиксировать любой тип валидации. Обучить базовую модель и зафиксировать базовое качество модели. В каждом следующем задании нужно будет обучить выбранную модель и оценивать ее качество на зафиксированной схеме валидации. После каждого задания, требуется сделать вывод о достигаемом качестве модели, по сравнению с качестом из предыдущего шага.

In [16]:
encoder = CategoricalEncoder(
    encoder=CatBoostEncoder(random_state=27),
    categorical_features=categorical_features,
)

x_train_ce = encoder.fit_transform(x_train, y_train)
x_valid_ce = encoder.transform(x_valid)
x_test_ce = encoder.transform(x_test)
x_LB_ce = encoder.transform(x_LB)

In [17]:
xgb_params = {
    "subsample": 0.5,
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "n_estimators": 1000,
    "learning_rate": 0.1,
    "reg_lambda": 10,
    "max_depth": 4,
    "gamma": 10,
#     "nthread": 6,
    "seed": 27,
    'predictor': 'gpu_predictor',
    'tree_method': 'gpu_hist'
}

model = xgb.XGBClassifier(**xgb_params)

In [18]:
def model_fit(model, 
              x_train, 
              y_train, 
              x_valid,
              y_valid,
              name=None):
    eval_sets = [
        (x_train, y_train),
        (x_valid, y_valid)
    ]

    model.fit(
        y=y_train,
        X=x_train,
        early_stopping_rounds=50,
        eval_set=eval_sets,
        eval_metric="auc",
        verbose=100
    )
    out = evaluate(x_train_ce, x_valid_ce, x_test_ce, model, name=name)
    print(out)

In [19]:
model_fit(model, 
          x_train_ce, 
          y_train, 
          x_valid_ce,
          y_valid,
          name='base_model')

[0]	validation_0-auc:0.65329	validation_1-auc:0.65424
[100]	validation_0-auc:0.91608	validation_1-auc:0.90411
[200]	validation_0-auc:0.92218	validation_1-auc:0.90940
[300]	validation_0-auc:0.92499	validation_1-auc:0.91152
[400]	validation_0-auc:0.92522	validation_1-auc:0.91155
[500]	validation_0-auc:0.92627	validation_1-auc:0.91251
[600]	validation_0-auc:0.92647	validation_1-auc:0.91279
[700]	validation_0-auc:0.92699	validation_1-auc:0.91313
[800]	validation_0-auc:0.92725	validation_1-auc:0.91335
[854]	validation_0-auc:0.92754	validation_1-auc:0.91341

base_model:
{'train': 0.9275598015956472, 'valid': 0.913433342932293, 'test': 0.9085548911243222}


#### Задание 1: 
признак TransactionDT - это смещение в секундах относительно базовой даты. Базовая дата - 2017-12-01, преобразовать признак TransactionDT в datetime, прибавив к базовой дате исходное значение признака. Из полученного признака выделить год, месяц, день недели, час, день.

In [20]:
new_features = ['TransactionDT_dt']

In [21]:
transform_x(dfs=[x_train_ce, x_valid_ce, x_test_ce, x_LB_ce],
            column='TransactionDT',
            func=get_date,
            new_column='TransactionDT_dt')
x_train_ce['TransactionDT_dt'].head(3)

5849     2017-12-03 02:57:41
113764   2017-12-26 16:10:44
118516   2017-12-27 19:14:01
Name: TransactionDT_dt, dtype: datetime64[ns]

In [22]:
current_features = ['year', 'month', 'dayofweek', 'hour', 'day']

In [23]:
for feature in current_features:
    transform_x(dfs=[x_train_ce, x_valid_ce, x_test_ce, x_LB_ce],
            column='TransactionDT_dt',
            func=lambda x: eval(f'x.{feature}'),
            new_column=f'TransactionDT_{feature}')

In [24]:
current_features = list(map(lambda x: 'TransactionDT_' + x, current_features))

In [25]:
x_train_ce[current_features].head(3)

Unnamed: 0,TransactionDT_year,TransactionDT_month,TransactionDT_dayofweek,TransactionDT_hour,TransactionDT_day
5849,2017,12,6,2,3
113764,2017,12,1,16,26
118516,2017,12,2,19,27


In [26]:
new_features += current_features

In [27]:
transform_x(dfs=[x_train_ce, x_valid_ce, x_test_ce, x_LB_ce],
            column='TransactionDT_dt',
            func=dt.datetime.timestamp)

In [28]:
model_fit(model, 
          x_train_ce, 
          y_train, 
          x_valid_ce,
          y_valid)

[0]	validation_0-auc:0.65329	validation_1-auc:0.65424
[100]	validation_0-auc:0.91607	validation_1-auc:0.90449
[200]	validation_0-auc:0.92419	validation_1-auc:0.91048
[300]	validation_0-auc:0.92612	validation_1-auc:0.91180
[400]	validation_0-auc:0.92704	validation_1-auc:0.91301
[500]	validation_0-auc:0.92762	validation_1-auc:0.91314
[600]	validation_0-auc:0.92783	validation_1-auc:0.91353
[700]	validation_0-auc:0.92839	validation_1-auc:0.91376
[739]	validation_0-auc:0.92845	validation_1-auc:0.91373

model_1:
{'train': 0.9283931351031662, 'valid': 0.9137600689942209, 'test': 0.9095193430847819}


In [29]:
models['base_model']['scores']

{'train': 0.9275598015956472,
 'valid': 0.913433342932293,
 'test': 0.9085548911243222}

Прирост всего 0,001. Может быть случайным

####  Задание 2: 
сделать конкатенацию признаков

* card1 + card2;

* card1 + card2 + card_3 + card_5;

* card1 + card2 + card_3 + card_5 + addr1 + addr2

Рассматривать их как категориальных признаки.

In [30]:
current_features = ['card1_2', 'card1_2_3_5', 'card1_2_3_5_addr1_2']

In [31]:
def temp_func_concat(dfs, lst, name, inplace=False):
    out_dfs = []
    for df in dfs:
        tmp_df = pd.DataFrame()
        tmp_df[name] = ['' for _ in range(df.shape[0])]
        for feature in lst:
            tmp_df[name] += '_' + df[feature].astype(str)
        if inplace:
            df[name] = tmp_df[name]
        else:
            out_dfs.append(tmp_df[name])
    if not inplace:
        return out_dfs

In [32]:
temp_func_concat(dfs=[x_train_ce, x_valid_ce, x_test_ce, x_LB_ce], 
                 lst=['card1', 'card2'], 
                 name='card1_2', 
                 inplace=True)

In [33]:
temp_func_concat(dfs=[x_train_ce, x_valid_ce, x_test_ce, x_LB_ce], 
                 lst=['card1', 'card2', 'card3', 'card5'], 
                 name='card1_2_3_5', 
                 inplace=True)

In [34]:
temp_func_concat(dfs=[x_train_ce, x_valid_ce, x_test_ce, x_LB_ce], 
                 lst=['card1', 'card2', 'card3', 'card5', 'addr1', 'addr2'], 
                 name='card1_2_3_5_addr1_2', 
                 inplace=True)

In [35]:
x_train_ce[current_features].head(3)

Unnamed: 0,card1_2,card1_2_3_5,card1_2_3_5_addr1_2
5849,_7919_194.0,_7919_194.0_150.0_166.0,_7919_194.0_150.0_166.0_325.0_87.0
113764,_3109_390.0,_3109_390.0_150.0_224.0,_3109_390.0_150.0_224.0_498.0_87.0
118516,_6019_583.0,_6019_583.0_150.0_226.0,_6019_583.0_150.0_226.0_126.0_87.0


In [36]:
categorical_features += current_features

In [37]:
encoder = CategoricalEncoder(
    encoder=CatBoostEncoder(random_state=27),
    categorical_features=categorical_features,
)

x_train_ce = encoder.fit_transform(x_train_ce, y_train)
x_valid_ce = encoder.transform(x_valid_ce)
x_test_ce = encoder.transform(x_test_ce)
x_LB_ce = encoder.transform(x_LB_ce)

In [38]:
new_features += current_features

In [39]:
model_fit(model, 
          x_train_ce, 
          y_train, 
          x_valid_ce,
          y_valid)

[0]	validation_0-auc:0.74689	validation_1-auc:0.61405
[100]	validation_0-auc:0.96182	validation_1-auc:0.88500
[200]	validation_0-auc:0.96519	validation_1-auc:0.88973
[300]	validation_0-auc:0.96618	validation_1-auc:0.89189
[400]	validation_0-auc:0.96669	validation_1-auc:0.89284
[500]	validation_0-auc:0.96717	validation_1-auc:0.89290
[502]	validation_0-auc:0.96717	validation_1-auc:0.89290

model_2:
{'train': 0.9669344962919636, 'valid': 0.8933799484021945, 'test': 0.8885643055143918}


In [40]:
models['model_1']['scores']

{'train': 0.9283931351031662,
 'valid': 0.9137600689942209,
 'test': 0.9095193430847819}

In [41]:
models['base_model']['scores']

{'train': 0.9275598015956472,
 'valid': 0.913433342932293,
 'test': 0.9085548911243222}

Стало хуже предыдущих моделей

#### Задание 3: 
Сделать FrequencyEncoder для признаков card1 - card6, addr1, addr2.

In [42]:
current_features = ['card1', 'card2', 'card3', 
                    'card4', 'card5', 'card6', 
                    'addr1', 'addr2']

In [43]:
def temp_func_Frequency(dfs, lst, inplace=False):
    out_dfs = []
    for df in dfs:
        tmp_df = pd.DataFrame()
        for feature in lst:
            freq_encoder = df[feature].value_counts(normalize=True)
            new_name = f'{feature}_frequency'
            tmp_df[new_name] = df[feature].map(freq_encoder)
            if inplace:
                df[new_name] = tmp_df[new_name]
        if not inplace:
            out_dfs.append(tmp_df)
    if not inplace:
        return out_dfs

In [44]:
temp_func_Frequency(dfs=[x_train_ce, x_valid_ce, x_test_ce, x_LB_ce], 
                    lst=current_features,
                    inplace=True)

In [45]:
current_features = list(map(lambda x: x + '_frequency', current_features))

In [46]:
x_test_ce[current_features].head(3)

Unnamed: 0,card1_frequency,card2_frequency,card3_frequency,card4_frequency,card5_frequency,card6_frequency,addr1_frequency,addr2_frequency
54894,0.000278,0.075689,0.878333,0.306111,0.05678,0.318333,0.011564,0.982654
8611,0.007222,0.012662,0.098472,0.306111,0.129046,0.681389,,
119389,0.006806,0.036579,0.878333,0.655,0.511998,0.318333,0.002657,0.982654


In [47]:
new_features += current_features

In [48]:
model_fit(model, 
          x_train_ce, 
          y_train, 
          x_valid_ce,
          y_valid)

[0]	validation_0-auc:0.74689	validation_1-auc:0.61405
[100]	validation_0-auc:0.96227	validation_1-auc:0.88608
[200]	validation_0-auc:0.96627	validation_1-auc:0.89160
[300]	validation_0-auc:0.96732	validation_1-auc:0.89423
[400]	validation_0-auc:0.96762	validation_1-auc:0.89487
[500]	validation_0-auc:0.96812	validation_1-auc:0.89526
[547]	validation_0-auc:0.96816	validation_1-auc:0.89533

model_3:
{'train': 0.9680987679067595, 'valid': 0.8955527528843812, 'test': 0.892033682595582}


In [49]:
models['model_2']['scores']

{'train': 0.9669344962919636,
 'valid': 0.8933799484021945,
 'test': 0.8885643055143918}

Пока, всё хуже первых двух, но, наверное, нужно просто выкидывать некоторые признаки

#### Задание 4: 
Создать признаки на основе отношения: TransactionAmt к вычисленной статистике. Статистика - среднее значение / стандартное отклонение TransactionAmt, сгруппированное по card1 - card6, addr1, addr2, и по признакам, созданным в задании 2.

In [50]:
current_features = ['card1', 'card2', 'card3', 
                    'card4', 'card5', 'card6', 
                    'addr1', 'addr2', 'card1_2', 
                    'card1_2_3_5', 'card1_2_3_5_addr1_2']

In [51]:
def temp_func_aggregate(dfs, features, stat_feature):
    out_dfs = []
    for df in dfs:
        out_df = df.copy()
        for feature in features:
            tmp_df = pd.DataFrame()
            new_name = f'{feature}_{stat_feature}_stat'
            feature_groupby = df.groupby(feature, as_index=False)
            tmp_df = feature_groupby[stat_feature].mean()
            tmp_df = tmp_df / feature_groupby[stat_feature].std()
            tmp_df = tmp_df.rename(columns={stat_feature: new_name})
            out_df = out_df.merge(tmp_df, how="left", on=feature)   
        out_dfs.append(out_df)
    return out_dfs

In [52]:
x_train_ce, x_valid_ce, x_test_ce, x_LB_ce = temp_func_aggregate(
        dfs=[x_train_ce, x_valid_ce, x_test_ce, x_LB_ce], 
        features=current_features, 
        stat_feature='TransactionAmt')

In [53]:
current_features = list(map(lambda x: x + '_TransactionAmt_stat', current_features))

In [54]:
new_features += current_features

In [55]:
model_fit(model, 
          x_train_ce, 
          y_train, 
          x_valid_ce,
          y_valid)

[0]	validation_0-auc:0.74689	validation_1-auc:0.61405
[100]	validation_0-auc:0.96227	validation_1-auc:0.88608
[200]	validation_0-auc:0.96627	validation_1-auc:0.89160
[300]	validation_0-auc:0.96732	validation_1-auc:0.89423
[400]	validation_0-auc:0.96762	validation_1-auc:0.89487
[500]	validation_0-auc:0.96812	validation_1-auc:0.89526
[547]	validation_0-auc:0.96816	validation_1-auc:0.89533

model_4:
{'train': 0.9680987679067595, 'valid': 0.8955527528843812, 'test': 0.892033682595582}


In [56]:
models['model_3']['scores']

{'train': 0.9680987679067595,
 'valid': 0.8955527528843812,
 'test': 0.892033682595582}

На уровне предыдущей модели

#### Задание 5: 
Создать признаки на основе отношения: D15 к вычисленной статистике. Статистика - среднее значение / стандартное отклонение D15, сгруппированное по card1 - card6, addr1, addr2, и по признакам, созданным в задании 2.

In [57]:
current_features = ['card1', 'card2', 'card3', 
                    'card4', 'card5', 'card6', 
                    'addr1', 'addr2', 'card1_2', 
                    'card1_2_3_5', 'card1_2_3_5_addr1_2']

In [58]:
x_train_ce, x_valid_ce, x_test_ce, x_LB_ce = temp_func_aggregate(
        dfs=[x_train_ce, x_valid_ce, x_test_ce, x_LB_ce], 
        features=current_features, 
        stat_feature='D15')

In [59]:
current_features = list(map(lambda x: x + '_TransactionAmt_stat', current_features))

In [60]:
new_features += current_features

In [61]:
model_fit(model, 
          x_train_ce, 
          y_train, 
          x_valid_ce,
          y_valid)

[0]	validation_0-auc:0.74689	validation_1-auc:0.61405
[100]	validation_0-auc:0.96227	validation_1-auc:0.88608
[200]	validation_0-auc:0.96627	validation_1-auc:0.89160
[300]	validation_0-auc:0.96732	validation_1-auc:0.89423
[400]	validation_0-auc:0.96762	validation_1-auc:0.89487
[500]	validation_0-auc:0.96812	validation_1-auc:0.89526
[548]	validation_0-auc:0.96816	validation_1-auc:0.89533

model_5:
{'train': 0.9680987679067595, 'valid': 0.8955527528843812, 'test': 0.892033682595582}


In [62]:
models['model_4']['scores']

{'train': 0.9680987679067595,
 'valid': 0.8955527528843812,
 'test': 0.892033682595582}

Не видно изменений

#### Задание 6: 
выделить дробную часть и целую часть признака TransactionAmt в два отдельных признака. После создать отдельных признак - логарифм от TransactionAmt

In [63]:
def get_int_part(x):
    return int(x)
def get_float_part(x):
    return round(x%1, 2) * 100
def get_log(x):
    return np.log(x)

In [64]:
current_features = ['get_int_part', 'get_float_part', 'get_log']

In [65]:
for feature in current_features:
    transform_x(dfs=[x_train_ce, x_valid_ce, x_test_ce, x_LB_ce],
            column='TransactionAmt',
            func=eval(f'{feature}'),
            new_column=f'TransactionAmt_{feature}')

In [66]:
current_features = list(map(lambda x: 'TransactionAmt_' + x, current_features))

In [67]:
x_train_ce[current_features].head(3)

Unnamed: 0,TransactionAmt_get_int_part,TransactionAmt_get_float_part,TransactionAmt_get_log
0,50,0.0,3.912023
1,30,95.0,3.432373
2,300,0.0,5.703782


In [68]:
new_features += current_features

In [69]:
model_fit(model, 
          x_train_ce, 
          y_train, 
          x_valid_ce,
          y_valid)

[0]	validation_0-auc:0.74689	validation_1-auc:0.61405
[100]	validation_0-auc:0.96253	validation_1-auc:0.88624
[200]	validation_0-auc:0.96624	validation_1-auc:0.89335
[300]	validation_0-auc:0.96732	validation_1-auc:0.89511
[400]	validation_0-auc:0.96762	validation_1-auc:0.89539
[402]	validation_0-auc:0.96762	validation_1-auc:0.89539

model_6:
{'train': 0.9675140089516863, 'valid': 0.8954644176634416, 'test': 0.887677260761171}


In [70]:
models['model_5']['scores']

{'train': 0.9680987679067595,
 'valid': 0.8955527528843812,
 'test': 0.892033682595582}

По сравнению с последними оценками, эта, конечно, чуть лучше

#### Задание 7 (опция): 
выполнить предварительную подготовку / очистку признаков P_emaildomain и R_emaildomain (что и как делать - остается на ваше усмотрение) и сделать Frequency Encoding для очищенных признаков.