# Линейные модели

## Импорты

In [1]:
import os

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.model_selection import GridSearchCV, StratifiedKFold

from imblearn.over_sampling import SMOTE, ADASYN

In [2]:
df = pd.read_csv(os.path.join('..', 'data', 'creditcard.csv'))

In [3]:
def print_metrics(clf, data_train, target_train, data_test, target_test):
    train_pred = clf.predict(data_train)
    test_pred = clf.predict(data_test)

    train_pred_proba = clf.predict_proba(data_train)[:, 1]
    test_pred_proba = clf.predict_proba(data_test)[:, 1]
    

    print(f'f1 score train - {f1_score(target_train, train_pred)}')
    print(f'f1 score test - {f1_score(target_test, test_pred)}')

    print(f'rocauc score train - {roc_auc_score(target_train, train_pred_proba)}')
    print(f'rocauc score test - {roc_auc_score(target_test, test_pred_proba)}')

## Предобработка

In [4]:
df['Amount_log'] = np.log(df['Amount'] + 1e-9)
df.drop(columns=['Amount', 'Time'], axis=1, inplace=True)
data, target = df.drop(columns=['Class'], axis=1), df['Class']
data_train, data_test, target_train, target_test = train_test_split(data, target, test_size=.2, stratify=target)

## Бейзлайн

Будем предсказывать случайно. Во время EDA мы обнаружили, что только 0.1 процент данных - аномальные. заполним массив нулями и случайно инициализируем $5\%$ аномальных данных - это и будет наш бейзлайн

In [4]:
train_pred = np.zeros_like(target_train)
test_pred = np.zeros_like(target_test)

train_pred[:int(train_pred.shape[0] * .05)] = 1
test_pred[:int(test_pred.shape[0] * .05)] = 1

np.random.shuffle(train_pred)
np.random.shuffle(test_pred)

In [5]:
print(f'f1 score train - {f1_score(target_train, train_pred)}')
print(f'f1 score test - {f1_score(target_test, test_pred)}')

f1 score train - 0.0027150856948922448
f1 score test - 0.006109979633401223


## Логистическая регрессия

In [26]:
lg_base = LogisticRegression()
lg_base.fit(data_train, target_train)

print_metrics(lg_base, data_train, target_train, data_test, target_test)

f1 score train - 0.7194029850746269
f1 score test - 0.7349397590361446
rocauc score train - 0.9757468708550089
rocauc score test - 0.9823495802372721


In [27]:
lg = LogisticRegression()

params = {
    'C': np.linspace(0.1, 1, 15),
    'warm_start': (True, False)
}

skf = StratifiedKFold(5)

skf.get_n_splits(data_train, target_train)

gs = GridSearchCV(lg, params, scoring='f1', cv=skf, n_jobs=-1, verbose=5)

gs.fit(data_train, target_train);

Fitting 5 folds for each of 30 candidates, totalling 150 fits


In [28]:
gs.best_params_, gs.best_score_

({'C': 0.7428571428571429, 'warm_start': True}, 0.7190732025476552)

In [29]:
lg = LogisticRegression(**gs.best_params_)
lg.fit(data_train, target_train)

print_metrics(lg, data_train, target_train, data_test, target_test)

f1 score train - 0.7194029850746269
f1 score test - 0.7349397590361446
rocauc score train - 0.9758331280679475
rocauc score test - 0.9823998254338313


## Over-sampling

In [33]:
data_train_resampled, target_train_resampled = SMOTE().fit_resample(data_train, target_train)

lg = LogisticRegression(**gs.best_params_)
lg.fit(data_train_resampled, target_train_resampled)

print_metrics(lg, data_train_resampled, target_train_resampled, data_test, target_test)

f1 score train - 0.9434484445491277
f1 score test - 0.10650887573964497
rocauc score train - 0.98874841980504
rocauc score test - 0.9857170850895226


In [34]:
data_train_resampled, target_train_resampled = ADASYN().fit_resample(data_train, target_train)

lg = LogisticRegression(**gs.best_params_)
lg.fit(data_train_resampled, target_train_resampled)

print_metrics(lg, data_train_resampled, target_train_resampled, data_test, target_test)

f1 score train - 0.884847638189143
f1 score test - 0.035583059369561014
rocauc score train - 0.9617903690970029
rocauc score test - 0.9771648142937535


## Isolation Forest

In [101]:
isol_for = IsolationForest(contamination=.01)

isol_for.fit(data_train);



In [102]:
train_pred = isol_for.predict(data_train)
test_pred = isol_for.predict(data_test)

In [103]:
train_pred = np.clip(train_pred, 0, 1)
train_pred = np.where((train_pred == 0) | (train_pred == 1), train_pred^1, train_pred)

test_pred = np.clip(test_pred, 0, 1)
test_pred = np.where((test_pred == 0) | (test_pred == 1), test_pred^1, test_pred)

In [104]:
print(f'f1 score train - {f1_score(target_train, train_pred)}')
print(f'f1 score test - {f1_score(target_test, test_pred)}')

f1 score train - 0.18930041152263372
f1 score test - 0.19402985074626866


## One-class SVM

In [5]:
svm = OneClassSVM(kernel='linear')

svm.fit(data_train);

In [None]:
train_pred = svm.predict(data_train)
test_pred = svm.predict(data_test)

In [None]:
train_pred = np.clip(train_pred, 0, 1)
train_pred = np.where((train_pred == 0) | (train_pred == 1), train_pred^1, train_pred)

test_pred = np.clip(test_pred, 0, 1)
test_pred = np.where((test_pred == 0) | (test_pred == 1), test_pred^1, test_pred)

In [None]:
print(f'f1 score train - {f1_score(target_train, train_pred)}')
print(f'f1 score test - {f1_score(target_test, test_pred)}')

f1 score train - 0.18930041152263372
f1 score test - 0.19402985074626866
