# Setting

## Library

In [37]:
# 데이터 다루기
import pandas as pd
import numpy as np
from itertools import combinations

# 전처리
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.decomposition import PCA, KernelPCA, SparsePCA, TruncatedSVD, IncrementalPCA
from sklearn.cluster import KMeans
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection

# 모델링
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.covariance import EllipticEnvelope
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

# 기타
import os
import random

In [2]:
import warnings
# Ignore every warning category for the rest of the session. This keeps cell
# output short, but it also hides deprecation notices emitted by the libraries.
warnings.filterwarnings(action='ignore')

## Fixed Random Seed

In [3]:
def seed_everything(seed):
    """Seed the random-number sources used by this notebook.

    Seeds Python's ``random`` module and NumPy's global generator, and writes
    the seed into the ``PYTHONHASHSEED`` environment variable.

    NOTE(review): the running interpreter presumably read ``PYTHONHASHSEED``
    at start-up, so the environment write likely only matters for child
    processes -- confirm.

    Args:
        seed: Integer seed applied to every source.
    """
    random.seed(seed)
    os.environ.update(PYTHONHASHSEED=str(seed))
    np.random.seed(seed)


seed_everything(42)  # fix the seed once for the whole notebook

# Load Data Set

## Google Drive Mount

In [4]:
# Colab-only step: mount Google Drive so the dataset archive read in the next
# step is reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Unzip File

In [5]:
# Extract the competition archive from the mounted Drive into the current
# working directory (no target directory is given to unzip).
!unzip --qq '/content/drive/MyDrive/머신러닝 엔지니어링/데이콘/사기거래/data/사기거래.zip'

## Load Train / Val / Test Set

In [6]:
# Read the three extracted splits in one pass: train, validation, test.
train, val, test = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/train.csv', '/content/val.csv', '/content/test.csv')
)

In [7]:
# Class balance of the labelled validation split.
val['Class'].value_counts()

0    28432
1       30
Name: Class, dtype: int64

# Preprocessing

## Feature Selection

In [193]:
# Labels are taken from the validation split; the ID column is dropped from
# every split so that it is not used as a feature.
y_val = val['Class']
X_val = val.drop(['ID', 'Class'], axis='columns')

X_train = train.drop('ID', axis='columns')
X_test = test.drop('ID', axis='columns')

In [194]:
# The twelve feature columns kept as model inputs, declared once so the three
# splits cannot drift apart.
SELECTED_FEATURES = ['V3', 'V4', 'V7', 'V8', 'V9', 'V10', 'V11',
                     'V12', 'V14', 'V16', 'V17', 'V18']

X_train = X_train[SELECTED_FEATURES]
X_val = X_val[SELECTED_FEATURES]
X_test = X_test[SELECTED_FEATURES]

In [195]:
# Sanity check: number of feature columns left after the selection above.
len(X_train.columns)

12

## Scaling

In [196]:
# Scaling stage 1: robust scaling over the 25th-75th percentile range. The
# scaler is fitted on the training split only and then applied unchanged to
# the validation and test splits.
# Alternative scalers kept for reference: StandardScaler(), MinMaxScaler()
scaler = RobustScaler(quantile_range=(25.0, 75.0))
scaler.fit(X_train)

# Re-wrap each transformed array as a DataFrame (columns become 0..n-1).
scaled_train, scaled_val, scaled_test = (
    pd.DataFrame(scaler.transform(split))
    for split in (X_train, X_val, X_test)
)

In [197]:
# Scaling stage 2: min-max scaling applied on top of the already scaled
# frames, again fitted on the training split only.
# Alternative scalers kept for reference: StandardScaler(),
# RobustScaler(quantile_range=(45.0, 55.0))
# NOTE(review): min-max scaling is unaffected by a preceding per-feature
# affine rescale, so the earlier robust-scaling stage may change nothing
# beyond floating-point rounding -- confirm.
scaler = MinMaxScaler()
scaler.fit(scaled_train)

# The input tuple is built before the names are rebound, so each split is
# transformed from its stage-1 values.
scaled_train, scaled_val, scaled_test = (
    pd.DataFrame(scaler.transform(split))
    for split in (scaled_train, scaled_val, scaled_test)
)

## 파생 변수

In [198]:
# Base feature columns and every unordered pair that can be formed from them.
main_columns = [*scaled_train.columns]
com_main = [*combinations(main_columns, 2)]

print(len(com_main))
print(com_main)

66
[(0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6), (0, 7), (0, 8), (0, 9), (0, 10), (0, 11), (1, 2), (1, 3), (1, 4), (1, 5), (1, 6), (1, 7), (1, 8), (1, 9), (1, 10), (1, 11), (2, 3), (2, 4), (2, 5), (2, 6), (2, 7), (2, 8), (2, 9), (2, 10), (2, 11), (3, 4), (3, 5), (3, 6), (3, 7), (3, 8), (3, 9), (3, 10), (3, 11), (4, 5), (4, 6), (4, 7), (4, 8), (4, 9), (4, 10), (4, 11), (5, 6), (5, 7), (5, 8), (5, 9), (5, 10), (5, 11), (6, 7), (6, 8), (6, 9), (6, 10), (6, 11), (7, 8), (7, 9), (7, 10), (7, 11), (8, 9), (8, 10), (8, 11), (9, 10), (9, 11), (10, 11)]


In [199]:
# Derived features: for each column pair, add the element-wise mean of the two
# columns to every split under the same '<a>_<b>_mean' name.
# The base columns keep their integer labels, so the frames end up with a mix
# of int and str column labels.
for left, right in com_main:
    mean_name = f'{left}_{right}_mean'
    for split in (scaled_train, scaled_val, scaled_test):
        split[mean_name] = (split[left] + split[right]) / 2

In [200]:
# Sanity check: base columns plus the pairwise-mean columns added above.
len(scaled_train.columns)

78

## Dimension Reduction

In [201]:
# Project the feature matrix onto its principal axes. n_components follows the
# current column count (78 at this point), so every component is kept and the
# cell keeps working if the feature set above changes.
dimesion_reducer = PCA(n_components=scaled_train.shape[1])
# Alternative reducers kept for reference:
#dimesion_reducer = SparsePCA(n_components=78, alpha=0.01)
#dimesion_reducer = SparseRandomProjection(n_components=15, eps=0.5, random_state=42)

# Pass plain arrays rather than the DataFrames: the frames carry a mix of int
# and str column labels, which scikit-learn >= 1.2 rejects with a TypeError
# during feature-name validation. The numeric values are unchanged.
# The reducer is fitted on the training split only.
reduced_train = pd.DataFrame(dimesion_reducer.fit_transform(scaled_train.to_numpy()))
reduced_val = pd.DataFrame(dimesion_reducer.transform(scaled_val.to_numpy()))
reduced_test = pd.DataFrame(dimesion_reducer.transform(scaled_test.to_numpy()))

# Modeling

In [202]:
# Isolation-forest hyper-parameters, gathered in one mapping for readability.
iforest_params = dict(
    n_estimators=500,
    # Sample size requested per tree equals the number of training rows.
    max_samples=len(X_train),
    max_features=0.8,
    # 30 / 28432 is the fraud count over the normal count of the validation
    # split. NOTE(review): contamination is documented as the proportion of
    # outliers in the data, which would be 30 / 28462 -- confirm the intent.
    contamination=30/28432,
    random_state=42,
    verbose=0,
)

model = IsolationForest(**iforest_params)

In [203]:
# Unsupervised fit: only the reduced training features are passed, no labels.
model.fit(reduced_train)

IsolationForest(contamination=0.0010551491277433877, max_features=0.8,
                max_samples=113842, n_estimators=500, random_state=42)

# Evaluation

In [204]:
def get_pred_label(model_pred):
    """Convert IsolationForest predictions to the competition's class labels.

    The model outputs 1 for normal rows and -1 for anomalous (fraud) rows;
    the target encoding is 0 for normal and 1 for fraud. Any other value is
    passed through unchanged.

    Args:
        model_pred: Array of raw model predictions.

    Returns:
        Array of the same shape with 1 mapped to 0 and -1 mapped to 1.
    """
    is_normal = model_pred == 1
    is_fraud = model_pred == -1
    # The two masks are disjoint, so the order of the replacements is irrelevant.
    relabelled = np.where(is_fraud, 1, model_pred)
    return np.where(is_normal, 0, relabelled)

In [205]:
# Predict on the validation features and convert the +1/-1 output to 0/1 labels.
pred_val = get_pred_label(model.predict(reduced_val))

In [206]:
# Macro averaging takes the unweighted mean of the per-class F1 scores, so the
# rare fraud class counts as much as the normal class.
val_score = f1_score(y_val, pred_val, average='macro')

In [207]:
# Scores noted by hand; presumably validation macro-F1 values from earlier
# runs of this notebook -- TODO confirm which settings produced each one.
# 0.7879157743510873
# 0.7928924258723169
# 0.7980433877878794
# 0.8033779981712035
# 0.8089062742283302
print(f'Validation F1 Score : [{val_score}]')
print(classification_report(y_val, pred_val))

Validation F1 Score : [0.8033779981712035]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28432
           1       0.65      0.57      0.61        30

    accuracy                           1.00     28462
   macro avg       0.83      0.78      0.80     28462
weighted avg       1.00      1.00      1.00     28462



# Inference

In [208]:
# Predict on the test features and convert the +1/-1 output to 0/1 labels.
pred_test = get_pred_label(model.predict(reduced_test))

# Submission

In [209]:
# Load the submission template; the relative path is resolved against the
# current working directory.
submit = pd.read_csv('./sample_submission.csv')

In [210]:
# pred_test is a plain array, so it is assigned by position.
# Assumes sample_submission.csv lists rows in the same order as test.csv -- TODO confirm.
submit['Class'] = pred_test
submit.head()

Unnamed: 0,ID,Class
0,AAAA0x1,0
1,AAAA0x2,0
2,AAAA0x5,0
3,AAAA0x7,0
4,AAAA0xc,0


In [211]:
# index=False keeps the DataFrame's row index out of the written file.
submit.to_csv('./submission_3.csv', index=False)