# Линейные модели

## Импорты

In [105]:
import os

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import IsolationForest

In [14]:
df = pd.read_csv(os.path.join('..', 'data', 'creditcard.csv'))

## Предобработка

In [15]:
df['Amount_log'] = np.log(df['Amount'] + 1e-9)
df.drop(columns=['Amount', 'Time'], axis=1, inplace=True)
data, target = df.drop(columns=['Class'], axis=1), df['Class']
data_train, data_test, target_train, target_test = train_test_split(data, target, test_size=.2, stratify=target)

## Бейзлайн

Будем предсказывать случайно. Во время EDA мы обнаружили, что только 0.1 процент данных - аномальные. заполним массив нулями и случайно инициализируем $5\%$ аномальных данных - это и будет наш бейзлайн

In [29]:
train_pred = np.zeros_like(target_train)
test_pred = np.zeros_like(target_test)

train_pred[:int(train_pred.shape[0] * .05)] = 1
test_pred[:int(test_pred.shape[0] * .05)] = 1

np.random.shuffle(train_pred)
np.random.shuffle(test_pred)

In [30]:
print(f'f1 score train - {f1_score(target_train, train_pred)}')
print(f'f1 score test - {f1_score(target_test, test_pred)}')

f1 score train - 0.004072628542338367
f1 score test - 0.0006788866259334691


## Логистическая регрессия

In [111]:
lg = LogisticRegression(C=.7)
lg.fit(data_train, target_train)

train_pred = lg.predict(data_train)
test_pred = lg.predict(data_test)

train_pred_proba = lg.predict_proba(data_train)[:, 1]
test_pred_proba = lg.predict_proba(data_test)[:, 1]

In [112]:
print(f'f1 score train - {f1_score(target_train, train_pred)}')
print(f'f1 score test - {f1_score(target_test, test_pred)}')

f1 score train - 0.7377777777777778
f1 score test - 0.7176470588235294


In [113]:
print(f'rocauc score train - {roc_auc_score(target_train, train_pred_proba)}')
print(f'rocauc score test - {roc_auc_score(target_test, test_pred_proba)}')

rocauc score train - 0.9770306861653049
rocauc score test - 0.9779734030640956


## Isolation Forest

In [101]:
isol_for = IsolationForest(contamination=.01)

isol_for.fit(data_train);



In [102]:
train_pred = isol_for.predict(data_train)
test_pred = isol_for.predict(data_test)

In [103]:
train_pred = np.clip(train_pred, 0, 1)
train_pred = np.where((train_pred == 0) | (train_pred == 1), train_pred^1, train_pred)

test_pred = np.clip(test_pred, 0, 1)
test_pred = np.where((test_pred == 0) | (test_pred == 1), test_pred^1, test_pred)

In [104]:
print(f'f1 score train - {f1_score(target_train, train_pred)}')
print(f'f1 score test - {f1_score(target_test, test_pred)}')

f1 score train - 0.18930041152263372
f1 score test - 0.19402985074626866
