In [5]:
import pandas as pd
import numpy as np
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv('../data/train.csv')
valid = pd.read_csv('../data/valid.csv')
test = pd.read_csv('../data/test.csv')

In [3]:
X_train, y_train = train.drop('Class', axis=1), train['Class']
X_valid, y_valid = valid.drop('Class', axis=1), valid['Class']
X_test, y_test = test.drop('Class', axis=1), test['Class']

In [7]:
# 거리를 계산해야하기 때문에 scale을 맞춰준다
scaler = StandardScaler()
scaler.fit(X_train)


In [8]:
scaled_X_train = scaler.transform(X_train)
scaled_X_valid = scaler.transform(X_valid)
scaled_X_test = scaler.transform(X_test)

In [11]:
lof = LocalOutlierFactor(n_neighbors=10, novelty=True) # predict를 사용하기 위해서는 novelty=True가 필요
lof.fit(scaled_X_train)

- LOF 값은 prediction이 불가능 -> 따라서 train set 의 LOF 만 위의 모델로 알 수 있다
- LOF는 주어진 데이터셋에서 이상치탐지 비지도학습 알고리즘!

In [12]:
pred_train = lof.predict(scaled_X_train)
pred_valid = lof.predict(scaled_X_valid)
pred_test = lof.predict(scaled_X_test)

In [16]:
# pred 결과가 1, -1 이라 이를 변환
pred_train = np.where(pred_train == -1, 1, 0)
pred_valid = np.where(pred_valid == -1, 1, 0)
pred_test = np.where(pred_test == -1, 1, 0)

In [17]:
from sklearn.metrics import precision_score, recall_score, f1_score

p_train = precision_score(y_train, pred_train)
r_train = recall_score(y_train, pred_train)
f1_train = f1_score(y_train, pred_train)

p_valid = precision_score(y_valid, pred_valid)
r_valid = recall_score(y_valid, pred_valid)
f1_valid = f1_score(y_valid, pred_valid)

In [18]:
print(p_train, r_train, f1_train)

0.003964882469555367 0.11244979919678715 0.007659690876761046


In [19]:
print(p_valid, r_valid, f1_valid)

0.0035174839644113386 0.1504424778761062 0.0068742418115649
