# Setting

## Library

In [853]:
# 데이터 다루기
import pandas as pd
import numpy as np

# 전처리
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.decomposition import PCA, KernelPCA, SparsePCA, TruncatedSVD, IncrementalPCA
from sklearn.cluster import KMeans
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection

# 모델링
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

# 기타
import os
import random

In [2]:
# Silence every warning for the rest of the session: no category or module
# filter is given, so nothing is exempt.
import warnings
warnings.filterwarnings(action='ignore')

## Fixed Random Seed

In [4]:
def seed_everything(seed):
    """Apply one seed to every random source this notebook controls.

    Seeds Python's ``random`` module and NumPy's global generator, and
    exports the same value as the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Integer seed shared by all three settings.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)


seed_everything(42)  # fix the seed for the whole notebook

# Load Data Set

## Google Drive Mount

In [5]:
# Colab-only: attach Google Drive at /content/drive so the competition archive
# stored there can be unzipped by the next cell.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Unzip File

In [6]:
# Extract the competition archive from the mounted Drive. No -d target is given;
# the following cell reads the CSVs from /content, which is presumably the
# working directory the files land in — confirm when running outside Colab.
!unzip --qq '/content/drive/MyDrive/머신러닝 엔지니어링/데이콘/사기거래/data/사기거래.zip'

## Load Train / Val / Test Set

In [7]:
# Read the three competition splits in one pass; the unpacking order matches
# the order of the paths.
train, val, test = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/train.csv', '/content/val.csv', '/content/test.csv')
)

In [11]:
# Class balance of the labelled validation split (0 = normal, 1 = fraud, per the
# label mapping in get_pred_label). The counts shown here match the 30/28432
# ratio passed as `contamination` to IsolationForest.
val['Class'].value_counts()

0    28432
1       30
Name: Class, dtype: int64

# Preprocessing

## Feature Selection

In [639]:
# 'ID' is dropped from every split before modelling; the validation split's
# 'Class' column is split off as the evaluation target.
X_train = train.drop(columns='ID')

y_val = val['Class']
X_val = val.drop(columns=['ID', 'Class'])

X_test = test.drop(columns='ID')

In [640]:
# Subset of the V* columns used as base features, declared once so the three
# splits cannot drift apart.
SELECTED_FEATURES = ['V3', 'V4', 'V7', 'V8', 'V9', 'V10', 'V11',
                     'V12', 'V14', 'V16', 'V17', 'V18']

# .copy() gives each split its own independent frame, so the derived-feature
# cell below adds columns to a real frame rather than to a slice (which pandas
# flags with SettingWithCopyWarning).
X_train = X_train[SELECTED_FEATURES].copy()
X_val = X_val[SELECTED_FEATURES].copy()
X_test = X_test[SELECTED_FEATURES].copy()

In [641]:
# Number of feature columns after the selection step.
X_train.shape[1]

12

## 파생 변수

In [642]:
# (new column, first raw column, second raw column) for each derived feature.
PAIR_MEAN_FEATURES = (
    ('V7_V29_mean', 'V7', 'V29'),
    ('V3_V30_mean', 'V3', 'V30'),
    ('V5_V29_mean', 'V5', 'V29'),
)


def _add_pair_means(features, raw):
    """Add the row-wise mean of each configured column pair to ``features``.

    Inputs are always read from ``raw`` (the unfiltered split), because V5, V29
    and V30 are not among the selected feature columns. ``features`` is
    modified in place; columns are added in the order of PAIR_MEAN_FEATURES.

    Args:
        features: Feature frame of one split; receives the new columns.
        raw: The full frame the split was loaded as, sharing ``features``' index.
    """
    for new_col, first, second in PAIR_MEAN_FEATURES:
        features[new_col] = (raw[first] + raw[second]) / 2


_add_pair_means(X_train, train)
_add_pair_means(X_val, val)
_add_pair_means(X_test, test)

In [643]:
# Number of feature columns once the derived features have been added.
X_train.shape[1]

15

## Scaling

In [644]:
# Alternatives tried earlier: StandardScaler(), MinMaxScaler().
# RobustScaler is configured with a narrow 45th-55th percentile range.
scaler = RobustScaler(quantile_range=(45.0, 55.0))

# The scaler is fitted on the training split only; validation and test reuse
# the fitted statistics. Each result is wrapped back into a DataFrame, which
# assigns default integer column labels.
scaled_train = pd.DataFrame(scaler.fit_transform(X_train))
scaled_val = pd.DataFrame(scaler.transform(X_val))
scaled_test = pd.DataFrame(scaler.transform(X_test))

## Dimension Reduction

In [844]:
# Alternatives tried earlier:
#   PCA(n_components=15)
#   SparseRandomProjection(n_components=15, eps=0.5, random_state=42)
#
# random_state is pinned so the fitted components do not depend on how much of
# the global NumPy RNG was consumed before this cell ran (e.g. when cells are
# re-executed or run out of order).
dimesion_reducer = SparsePCA(n_components=15, alpha=0.01, random_state=42)

# Fit on the scaled training split only, then project validation and test with
# the same fitted transform. Results are wrapped back into DataFrames.
reduced_train = pd.DataFrame(dimesion_reducer.fit_transform(scaled_train))
reduced_val = pd.DataFrame(dimesion_reducer.transform(scaled_val))
reduced_test = pd.DataFrame(dimesion_reducer.transform(scaled_test))

## Clustering

In [845]:
# Two-cluster KMeans, refitted by each of the following cells on a different
# feature space. random_state is pinned so every refit picks the same initial
# centroids regardless of execution order or prior use of the global RNG.
cluster = KMeans(n_clusters=2, random_state=42)

In [846]:
# Variant 1: cluster labels learned on the unscaled features. The shared
# `cluster` estimator is refitted here, then applied to validation and test.
clusted_train_1 = pd.DataFrame(cluster.fit_predict(X_train))
clusted_val_1 = pd.DataFrame(cluster.predict(X_val))
clusted_test_1 = pd.DataFrame(cluster.predict(X_test))

In [847]:
# Variant 2: cluster labels learned on the scaled features. This refits the
# shared `cluster` estimator, replacing the fit from the previous cell.
clusted_train_2 = pd.DataFrame(cluster.fit_predict(scaled_train))
clusted_val_2 = pd.DataFrame(cluster.predict(scaled_val))
clusted_test_2 = pd.DataFrame(cluster.predict(scaled_test))

In [848]:
# Variant 3: cluster labels learned on the dimension-reduced features. After
# this cell the shared `cluster` estimator holds the fit on reduced_train.
clusted_train_3 = pd.DataFrame(cluster.fit_predict(reduced_train))
clusted_val_3 = pd.DataFrame(cluster.predict(reduced_val))
clusted_test_3 = pd.DataFrame(cluster.predict(reduced_test))

In [849]:
# Final model input: the dimension-reduced components plus the cluster label
# computed on the scaled features (clusted_*_2). The labels from the unscaled
# (clusted_*_1) and reduced (clusted_*_3) spaces are computed above but are
# deliberately left out of the feature set.
preprocessed_train = pd.concat([reduced_train, clusted_train_2], axis=1)
preprocessed_val = pd.concat([reduced_val, clusted_val_2], axis=1)
preprocessed_test = pd.concat([reduced_test, clusted_test_2], axis=1)

In [850]:
# Both concatenated inputs carried default integer labels, so the label 0
# appears twice after the concat; relabel the columns 0..n-1 to make them unique.
preprocessed_train.columns = list(range(preprocessed_train.shape[1]))
preprocessed_val.columns = list(range(preprocessed_val.shape[1]))
preprocessed_test.columns = list(range(preprocessed_test.shape[1]))

# Modeling

In [863]:
# Isolation Forest anomaly detector.
#   max_samples   : set to the number of training rows
#   contamination : 30 / 28432, the fraud count over the normal count seen in
#                   the validation labels
model = IsolationForest(
    random_state=42,
    n_estimators=500,
    max_samples=len(X_train),
    max_features=0.8,
    contamination=30/28432,
    verbose=0,
)

In [862]:
# Alternative detector kept for comparison. It is bound to its own name so that
# running the notebook top to bottom still fits and evaluates the
# IsolationForest defined in the previous cell, which is the model the recorded
# fit output and validation score belong to. To evaluate LOF instead, assign
# `model = lof_model` before the fit cell.
lof_model = LocalOutlierFactor(
    n_neighbors=10,
    p=1,  # Minkowski order: 1 is the Manhattan distance, 2 is the Euclidean distance
    algorithm='auto',
    contamination='auto',
    novelty=True,  # needed so predict() can be called on rows not seen during fit
)

In [864]:
# Unsupervised fit: only the preprocessed training features are passed, no labels.
model.fit(preprocessed_train)

IsolationForest(contamination=0.0010551491277433877, max_features=0.8,
                max_samples=113842, n_estimators=500, random_state=42)

# Evaluation

In [865]:
def get_pred_label(model_pred):
    """Convert detector output to the competition's label convention.

    The detector emits 1 for normal rows and -1 for anomalous (fraud) rows;
    the returned array uses 0 for normal and 1 for fraud. Any value other
    than 1 or -1 is passed through unchanged.

    Args:
        model_pred: Array of raw predictions from the fitted model.

    Returns:
        Array of the same shape with the relabelled predictions.
    """
    inliers_zeroed = np.where(model_pred == 1, 0, model_pred)
    return np.where(inliers_zeroed == -1, 1, inliers_zeroed)

In [866]:
# Predict on the validation features and map the raw 1/-1 output to 0/1 labels.
pred_val = get_pred_label(model.predict(preprocessed_val))

In [867]:
# Macro-averaged F1 on the validation split: both classes count equally, so the
# score is not dominated by the far larger normal class.
val_score = f1_score(y_val, pred_val, average='macro')

In [868]:
# Presumably validation scores from earlier configurations, kept as a log
# (the last one equals the score printed below):
# 0.7879157743510873
# 0.7928924258723169
# 0.7980433877878794
# 0.8089062742283302
print(f'Validation F1 Score : [{val_score}]')
print(classification_report(y_val, pred_val))

Validation F1 Score : [0.8089062742283302]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28432
           1       0.68      0.57      0.62        30

    accuracy                           1.00     28462
   macro avg       0.84      0.78      0.81     28462
weighted avg       1.00      1.00      1.00     28462



# Inference

In [860]:
# Predict on the test features and map the raw 1/-1 output to 0/1 labels.
pred_test = get_pred_label(model.predict(preprocessed_test))

# Submission

In [463]:
# Load the submission template; the path is relative to the current working
# directory, unlike the absolute /content paths used for the data splits.
submit = pd.read_csv('./sample_submission.csv')

In [464]:
# Write the test predictions into the template's 'Class' column. The assignment
# is positional, so it assumes the template rows are in the same order as
# test.csv — TODO confirm.
submit['Class'] = pred_test
submit.head()

Unnamed: 0,ID,Class
0,AAAA0x1,0
1,AAAA0x2,0
2,AAAA0x5,0
3,AAAA0x7,0
4,AAAA0xc,0


In [465]:
# Save the submission without the DataFrame index column.
submit.to_csv('./submission_0.csv', index=False)