# Линейные модели

## Импорты

In [60]:
import os

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.cluster import DBSCAN
from sklearn.svm import OneClassSVM
from sklearn.model_selection import GridSearchCV, StratifiedKFold, RandomizedSearchCV

from xgboost import XGBClassifier

from imblearn.over_sampling import SMOTE, ADASYN

In [None]:
import warnings
# Blanket filter: hides every warning category for the rest of the session.
# NOTE(review): this may also hide solver convergence warnings from the models below.
warnings.filterwarnings("ignore")

In [61]:
# Shared seed handed to the splitters, samplers and models below as `random_state`.
random_state = 42

In [3]:
# Read creditcard.csv from the ../data directory (path is relative to the notebook).
csv_path = os.path.join('..', 'data', 'creditcard.csv')
df = pd.read_csv(csv_path)

In [4]:
def print_metrics(clf, data_train, target_train, data_test, target_test):
    """Print F1 and ROC AUC of a fitted classifier on the train and test splits.

    Args:
        clf: fitted estimator exposing ``predict`` and ``predict_proba``.
        data_train: features of the training split.
        target_train: true labels of the training split.
        data_test: features of the test split.
        target_test: true labels of the test split.

    Returns:
        None. Four lines are printed: F1 (train, test), then ROC AUC (train, test).
    """
    hard_train = clf.predict(data_train)
    hard_test = clf.predict(data_test)

    # Column 1 of predict_proba is used as the ranking score for ROC AUC
    # (presumably the positive class; confirm against clf.classes_).
    soft_train = clf.predict_proba(data_train)[:, 1]
    soft_test = clf.predict_proba(data_test)[:, 1]

    f1_train = f1_score(target_train, hard_train)
    print(f'f1 score train - {f1_train}')
    f1_test = f1_score(target_test, hard_test)
    print(f'f1 score test - {f1_test}')

    auc_train = roc_auc_score(target_train, soft_train)
    print(f'rocauc score train - {auc_train}')
    auc_test = roc_auc_score(target_test, soft_test)
    print(f'rocauc score test - {auc_test}')

## Предобработка

In [5]:
# Log-transform the amount. The 1e-9 offset keeps log() finite for a zero
# amount, which then maps to roughly -20.7.
# NOTE(review): np.log1p would map zero to 0 instead; switching changes the
# feature values and therefore every score reported below.
df['Amount_log'] = np.log(df['Amount'] + 1e-9)

# The raw amount is replaced by its log; Time is not used as a feature.
df = df.drop(columns=['Amount', 'Time'])

# Features are all remaining columns except the `Class` label.
data, target = df.drop(columns=['Class']), df['Class']

# 80/20 hold-out split, stratified on the label so both parts keep the class ratio.
data_train, data_test, target_train, target_test = train_test_split(
    data,
    target,
    test_size=.2,
    stratify=target,
    random_state=random_state,  # was misspelled `random_stat`, which raised NameError
)

StratifiedKFold для грид сёрча

In [None]:
# 5 shuffled, seeded, label-stratified folds, reused as `cv` by every hyperparameter search.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)

## Бейзлайн

Будем предсказывать случайно. Во время EDA мы обнаружили, что только 0.1 процента данных — аномальные. Заполним массив нулями и случайным образом пометим $5\%$ объектов как аномальные — это и будет наш бейзлайн

In [4]:
# Random baseline: flag a fixed share of rows as anomalies at random positions.
baseline_positive_share = .05

train_pred = np.zeros_like(target_train)
test_pred = np.zeros_like(target_test)

# Mark the leading share of each array as 1; the shuffle below scatters them.
train_pred[:int(train_pred.shape[0] * baseline_positive_share)] = 1
test_pred[:int(test_pred.shape[0] * baseline_positive_share)] = 1

# np.random.shuffle() takes no `random_state` argument (the original call
# raised TypeError); a seeded Generator gives a reproducible in-place shuffle.
baseline_rng = np.random.default_rng(random_state)
baseline_rng.shuffle(train_pred)
baseline_rng.shuffle(test_pred)

In [5]:
# F1 of the random baseline labels against the true labels on each split.
f1_train = f1_score(target_train, train_pred)
print(f'f1 score train - {f1_train}')
f1_test = f1_score(target_test, test_pred)
print(f'f1 score test - {f1_test}')

f1 score train - 0.0027150856948922448
f1 score test - 0.006109979633401223


Результат получился крайне плохим

## Логистическая регрессия

In [26]:
# Logistic regression with default hyperparameters, used as the reference model.
lg_base = LogisticRegression(random_state=random_state).fit(data_train, target_train)

print_metrics(
    lg_base,
    data_train, target_train,
    data_test, target_test,
)

f1 score train - 0.7194029850746269
f1 score test - 0.7349397590361446
rocauc score train - 0.9757468708550089
rocauc score test - 0.9823495802372721


In [27]:
# Exhaustive search over C and warm_start, scored by F1 on the `skf` folds.
# NOTE(review): warm_start presumably has no effect here, since the search fits
# each candidate on a fresh clone of the estimator; confirm, then drop it to
# halve the number of fits.
params = {
    'C': np.linspace(0.1, 1, 15),
    'warm_start': (True, False)
}
lg = LogisticRegression(random_state=random_state)

gs = GridSearchCV(
    estimator=lg,
    param_grid=params,
    scoring='f1',
    cv=skf,
    n_jobs=-1,
    verbose=5,
)
gs.fit(data_train, target_train);

Fitting 5 folds for each of 30 candidates, totalling 150 fits


In [28]:
# Best hyperparameter combination found by the search and its mean cross-validated F1.
gs.best_params_, gs.best_score_

({'C': 0.7428571428571429, 'warm_start': True}, 0.7190732025476552)

In [29]:
# Refit on the full training split with the hyperparameters chosen by the search.
lg = LogisticRegression(random_state=random_state, **gs.best_params_)
lg = lg.fit(data_train, target_train)

print_metrics(
    lg,
    data_train, target_train,
    data_test, target_test,
)

f1 score train - 0.7194029850746269
f1 score test - 0.7349397590361446
rocauc score train - 0.9758331280679475
rocauc score test - 0.9823998254338313


Можно наблюдать, что обыкновенная логистическая регрессия дала отличный результат для определения аномальных данных. Посмотрим, сможем ли мы ещё улучшить метрику

## Over-sampling

In [33]:
# Oversample the training split only with SMOTE; the test split is left untouched.
smote = SMOTE(random_state=random_state)
data_train_resampled, target_train_resampled = smote.fit_resample(data_train, target_train)

# Same tuned hyperparameters as before, now fit on the resampled data.
lg = LogisticRegression(random_state=random_state, **gs.best_params_)
lg = lg.fit(data_train_resampled, target_train_resampled)

# Train metrics are reported on the resampled set, test metrics on the original test split.
print_metrics(
    lg,
    data_train_resampled, target_train_resampled,
    data_test, target_test,
)

f1 score train - 0.9434484445491277
f1 score test - 0.10650887573964497
rocauc score train - 0.98874841980504
rocauc score test - 0.9857170850895226


In [34]:
# Oversample the training split only with ADASYN; the test split is left untouched.
adasyn = ADASYN(random_state=random_state)
data_train_resampled, target_train_resampled = adasyn.fit_resample(data_train, target_train)

# Same tuned hyperparameters as before, now fit on the resampled data.
lg = LogisticRegression(random_state=random_state, **gs.best_params_)
lg = lg.fit(data_train_resampled, target_train_resampled)

# Train metrics are reported on the resampled set, test metrics on the original test split.
print_metrics(
    lg,
    data_train_resampled, target_train_resampled,
    data_test, target_test,
)

f1 score train - 0.884847638189143
f1 score test - 0.035583059369561014
rocauc score train - 0.9617903690970029
rocauc score test - 0.9771648142937535


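Как и следовало ожидать, техника over-sampling'а не помогла: аномальных данных слишком мало, и алгоритм переобучается на синтетических объектах — F1 на тесте резко падает, хотя ROC AUC остаётся на прежнем уровне (то есть страдает качество при пороге по умолчанию, а не ранжирование). Пробовать under-sampling по тем же соображениям нет смысла — данных будет слишком мало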

## Elliptic Envelope

Один из классических методов определения аномалий. Предполагаем, что распределение обычных данных нам известно (в случае Elliptic Envelope — гауссово), и пытаемся выделить аномалии как объекты, выбивающиеся из него

In [25]:
# Unsupervised fit on the features only; labels are not used.
# contamination=.01 is the assumed share of outliers in the training data.
envelope = EllipticEnvelope(
    contamination=.01,
    random_state=random_state,
)
envelope = envelope.fit(data_train)



In [26]:
# Raw detector labels for both splits (re-encoded to 0/1 in the next cell).
train_pred, test_pred = envelope.predict(data_train), envelope.predict(data_test)

In [27]:
# scikit-learn outlier detectors label outliers as -1 and inliers as +1.
# Re-encode so that 1 marks a predicted outlier and 0 an inlier: every
# non-positive label becomes 1, every positive label becomes 0 (dtype kept).
train_pred = (train_pred <= 0).astype(train_pred.dtype)
test_pred = (test_pred <= 0).astype(test_pred.dtype)

In [28]:
# F1 of the re-encoded detector labels against the true labels on each split.
f1_train = f1_score(target_train, train_pred)
print(f'f1 score train - {f1_train}')
f1_test = f1_score(target_test, test_pred)
print(f'f1 score test - {f1_test}')

f1 score train - 0.24766180321735876
f1 score test - 0.19578313253012045


Результат даже близко не дотягивает до линейной модели

## Isolation Forest

In [16]:
# Unsupervised fit on the features only; labels are not used.
# 500 trees, all cores, assumed outlier share of 1%.
isol_for = IsolationForest(
    n_estimators=500,
    contamination=.01,
    n_jobs=-1,
    random_state=random_state,
)
isol_for = isol_for.fit(data_train)



In [17]:
# Raw detector labels for both splits (re-encoded to 0/1 in the next cell).
train_pred, test_pred = isol_for.predict(data_train), isol_for.predict(data_test)

In [18]:
# scikit-learn outlier detectors label outliers as -1 and inliers as +1.
# Re-encode so that 1 marks a predicted outlier and 0 an inlier: every
# non-positive label becomes 1, every positive label becomes 0 (dtype kept).
train_pred = (train_pred <= 0).astype(train_pred.dtype)
test_pred = (test_pred <= 0).astype(test_pred.dtype)

In [19]:
# F1 of the re-encoded detector labels against the true labels on each split.
f1_train = f1_score(target_train, train_pred)
print(f'f1 score train - {f1_train}')
f1_test = f1_score(target_test, test_pred)
print(f'f1 score test - {f1_test}')

f1 score train - 0.1848110736999626
f1 score test - 0.15759312320916904


Результат также очень плохой

## One-class SVM

In [5]:
# One-class SVM with a linear kernel, fit on the features only.
# All other hyperparameters are left at the library defaults and no seed is passed.
svm = OneClassSVM(
    kernel='linear',
)
svm = svm.fit(data_train)

In [None]:
# Raw detector labels for both splits (re-encoded to 0/1 in the next cell).
train_pred, test_pred = svm.predict(data_train), svm.predict(data_test)

In [None]:
# scikit-learn outlier detectors label outliers as -1 and inliers as +1.
# Re-encode so that 1 marks a predicted outlier and 0 an inlier: every
# non-positive label becomes 1, every positive label becomes 0 (dtype kept).
train_pred = (train_pred <= 0).astype(train_pred.dtype)
test_pred = (test_pred <= 0).astype(test_pred.dtype)

In [None]:
# F1 of the re-encoded detector labels against the true labels on each split.
f1_train = f1_score(target_train, train_pred)
print(f'f1 score train - {f1_train}')
f1_test = f1_score(target_test, test_pred)
print(f'f1 score test - {f1_test}')

f1 score train - 0.18930041152263372
f1 score test - 0.19402985074626866


## DBSCAN

In [52]:
# Density clustering of the training features; labels are not used.
dbscan = DBSCAN(
    eps=.99,
    min_samples=2,
    n_jobs=-1,
)

# fit_predict returns one cluster id per row.
train_pred = dbscan.fit_predict(data_train)

In [53]:
# Peek at the raw cluster ids (DBSCAN uses -1 for points it treats as noise).
train_pred

array([-1,  0, -1, ..., 37, -1, -1], dtype=int64)

In [54]:
# Distinct cluster ids and how many rows fall into each one.
np.unique(train_pred, return_counts=True)

(array([  -1,    0,    1, ..., 9836, 9837, 9838], dtype=int64),
 array([106062,   1697,      4, ...,      2,      2,      2], dtype=int64))

Методы кластеризации справляются очень плохо: DBSCAN помечает около половины обучающей выборки (106 062 объекта) как шум.

## Random Forest

In [55]:
# Search over tree depth only, scored by F1 on the `skf` folds.
params = {
    'max_depth': [None, 5, 7, 10]
}
ran_for = RandomForestClassifier(n_jobs=-1, random_state=random_state)

gs = GridSearchCV(
    estimator=ran_for,
    param_grid=params,
    scoring='f1',
    cv=skf,
    n_jobs=-1,
    verbose=5,
)
gs.fit(data_train, target_train);

Fitting 5 folds for each of 4 candidates, totalling 20 fits


In [56]:
# Best hyperparameter combination found by the search and its mean cross-validated F1.
gs.best_params_, gs.best_score_

({'max_depth': None}, 0.8701841190074735)

In [58]:
# Final forest: depth taken from the search, but with 500 trees.
# NOTE(review): the search itself ran with the default number of trees.
ran_for = RandomForestClassifier(
    n_estimators=500,
    n_jobs=-1,
    random_state=random_state,
    **gs.best_params_,
)
ran_for = ran_for.fit(data_train, target_train)

print_metrics(
    ran_for,
    data_train, target_train,
    data_test, target_test,
)

f1 score train - 1.0
f1 score test - 0.7283950617283952
rocauc score train - 1.0
rocauc score test - 0.9214882017100595


Bagging показал довольно неплохой результат, но всё равно не смог побить линейную модель

## Gradient Boosting

In [62]:
# Randomised search (10 sampled combinations) over XGBoost hyperparameters,
# scored by F1 on the `skf` folds.
params = {
    'min_child_weight': [1, 5, 10],
    'gamma': [0.5, 1, 1.5, 2, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'max_depth': [3, 4, 5]
}
xgb = XGBClassifier(random_state=random_state)

gs = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=params,
    n_iter=10,
    scoring='f1',
    cv=skf,
    n_jobs=-1,
    verbose=3,
    random_state=random_state,
)
gs.fit(data_train, target_train);

Fitting 3 folds for each of 5 candidates, totalling 15 fits


In [63]:
# Best sampled hyperparameter combination and its mean cross-validated F1.
gs.best_params_, gs.best_score_

({'subsample': 0.8,
  'min_child_weight': 10,
  'max_depth': 3,
  'gamma': 5,
  'colsample_bytree': 1.0},
 0.8727520219494093)

In [66]:
# Final booster: searched hyperparameters plus 1000 trees at a learning rate of 0.03.
# NOTE(review): n_estimators and learning_rate were not part of the search.
xgb = XGBClassifier(
    n_estimators=1000,
    learning_rate=.03,
    n_jobs=-1,
    random_state=random_state,
    **gs.best_params_,
)
xgb.fit(data_train, target_train)

print_metrics(
    xgb,
    data_train, target_train,
    data_test, target_test,
)

f1 score train - 0.908108108108108
f1 score test - 0.7393939393939395
rocauc score train - 0.9986418785084675
rocauc score test - 0.9701118242738852


Бустинг выглядит более многообещающе