# Setting

## Library

In [1]:
# 데이터 다루기
import pandas as pd
import numpy as np
from itertools import combinations

# 전처리
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.decomposition import PCA, KernelPCA, SparsePCA, TruncatedSVD, FastICA
from sklearn.cluster import KMeans
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection

# 모델링
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.covariance import EllipticEnvelope

from sklearn.metrics import f1_score
from sklearn.metrics import recall_score # 재현율
from sklearn.metrics import precision_score # 정밀도
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# 기타
import os
import random

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides library deprecation warnings that may matter when versions change.
warnings.filterwarnings(action='ignore')

## Fixed Random Seed

In [4]:
def seed_everything(seed):
    """Seed the random sources this notebook relies on.

    Sets the ``PYTHONHASHSEED`` environment variable and seeds both Python's
    ``random`` module and NumPy's legacy global generator with ``seed``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

seed_everything(42)  # fix the seed

# Load Data Set

## Google Drive Mount

In [5]:
# Colab-only step: mount Google Drive so the archive under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Unzip File

In [6]:
# Extract the competition archive from Drive (no -d target is given, so files land in the working directory).
# NOTE(review): unzip's quiet flag is normally spelled `-qq`; confirm `--qq` behaves as intended.
!unzip --qq '/content/drive/MyDrive/머신러닝 엔지니어링/데이콘/사기거래/data/사기거래.zip'

## Load Train / Val / Test Set

In [7]:
# Read the three CSV splits. Only `val` is used with a 'Class' label further down;
# `train` is fed to the model without labels.
train = pd.read_csv('/content/train.csv')
val = pd.read_csv('/content/val.csv')
test = pd.read_csv('/content/test.csv')

In [8]:
# Class balance of the validation split (0 = normal, 1 = fraud, per the label note in get_pred_label).
val['Class'].value_counts()

0    28432
1       30
Name: Class, dtype: int64

# Preprocessing

## Feature Selection

In [17]:
# Drop the row identifier so it is not used as a feature.
X_train = train.drop(columns=['ID']) 

# Validation split: separate the features from the ground-truth 'Class' column.
X_val = val.drop(columns=['ID', 'Class']) 
y_val = val['Class']

X_test = test.drop(columns=['ID'])

In [18]:
# Feature subset used for modelling. One shared list keeps the three splits on
# identical columns in identical order.
selected_features = ['V3', 'V4', 'V9', 'V10', 'V11', 'V12', 'V14', 'V16', 'V17', 'V18']

X_train = X_train[selected_features]
X_val = X_val[selected_features]
X_test = X_test[selected_features]

In [19]:
# Confirm how many feature columns remain after selection.
len(X_train.columns)

10

## Scaling

In [20]:
# Scaling stage 1: RobustScaler with a narrow 45th-55th percentile range.
# (StandardScaler and MinMaxScaler were the alternatives left commented out here.)
# The scaler is fitted on the training features only and then applied to val/test.
scaler = RobustScaler(quantile_range=(45.0, 55.0))

# Re-wrap the returned arrays as DataFrames; columns become integer positions.
scaled_train = pd.DataFrame(scaler.fit_transform(X_train))
scaled_val = pd.DataFrame(scaler.transform(X_val))
scaled_test = pd.DataFrame(scaler.transform(X_test))

In [21]:
# Scaling stage 2: MinMaxScaler applied on top of the robust-scaled frames.
# (StandardScaler and RobustScaler were the alternatives left commented out here.)
# As before, the scaler is fitted on train only and reused for val/test.
scaler = MinMaxScaler()

scaled_train = pd.DataFrame(scaler.fit_transform(scaled_train))
scaled_val = pd.DataFrame(scaler.transform(scaled_val))
scaled_test = pd.DataFrame(scaler.transform(scaled_test))

## 파생 변수

In [22]:
# The scaled frames were rebuilt from bare arrays, so their column labels are
# the integer positions of the ten selected features.
main_columns = list(scaled_train.columns)

# Every unordered 5-column subset of those ten columns: C(10, 5) = 252.
com_main = list(combinations(main_columns, 5))
print(len(com_main))

252


In [23]:
# For every 5-column combination, add one derived column holding the mean of
# those five scaled features; the same column is added to all three splits.
for com in com_main:
  x, y, z, w, v = com
  mean_col = f'{x}_{y}_{z}_{w}_{v}_mean'

  for frame in (scaled_train, scaled_val, scaled_test):
    frame[mean_col] = (frame[x] + frame[y] + frame[z] + frame[w] + frame[v]) / 5

In [24]:
# 10 scaled features + 252 combination means = 262 columns.
print(len(scaled_train.columns))
# NOTE(review): n_components is not read anywhere else in the visible notebook;
# dimension_isolate_forest uses its own 262 rather than this value.
n_components = len(scaled_train.columns)

262


## Dimension Reduction

In [25]:
def dimension_isolate_forest(train, val, test, random_state, n_components=262):
  """Project the three splits with one sparse random projection fitted on train.

  Args:
    train: 2-D array-like used to fit the projection.
    val: 2-D array-like transformed with the fitted projection.
    test: 2-D array-like transformed with the fitted projection.
    random_state: seed for the random projection matrix.
    n_components: output dimensionality; defaults to 262, the value that was
      previously hard-coded.

  Returns:
    Tuple ``(train, val, test)`` of DataFrames holding the projected data with
    default integer column labels.
  """
  # eps is kept from the original call; per the scikit-learn docs it is only
  # consulted when n_components='auto'.
  # Alternative the author left commented out:
  #   SparsePCA(n_components=128, alpha=0.001, random_state=random_state)
  dimension_reducer = SparseRandomProjection(n_components=n_components, eps=0.1, random_state=random_state)

  # Hand plain arrays to scikit-learn. The engineered frames mix integer and
  # string column labels, which scikit-learn >= 1.2 rejects with a TypeError
  # during feature-name validation; arrays carry no labels to validate.
  train = pd.DataFrame(dimension_reducer.fit_transform(np.asarray(train)))
  val = pd.DataFrame(dimension_reducer.transform(np.asarray(val)))
  test = pd.DataFrame(dimension_reducer.transform(np.asarray(test)))

  return train, val, test


# Modeling

In [26]:
def get_pred_label(model_pred):
    """Convert IsolationForest-style predictions to 0/1 labels.

    The model emits 1 for normal rows and -1 for anomalous (fraud) rows; this
    maps 1 -> 0 (normal) and -1 -> 1 (fraud). Any other value is passed through
    unchanged.

    Args:
        model_pred: array-like of predictions (ndarray, list, tuple, ...).

    Returns:
        numpy.ndarray of the same shape holding the converted labels.
    """
    # Coerce first: comparing a plain list with `== 1` yields a single False,
    # which would skip the 1 -> 0 mapping and label every row as fraud.
    preds = np.asarray(model_pred)
    return np.where(preds == -1, 1, np.where(preds == 1, 0, preds))

In [None]:
# Vote counters: one slot per validation / test row, incremented each time a
# seeded model labels that row as fraud.
result_val = np.zeros(val.shape[0])
result_test = np.zeros(test.shape[0])

# Ensemble over 20 seeds; the same seed drives both the random projection and
# the IsolationForest.
for rs in range(20):

  reduced_train, reduced_val, reduced_test = dimension_isolate_forest(scaled_train,
                                                                      scaled_val,
                                                                      scaled_test,
                                                                      rs)
  
  # contamination = 30 / 28432 matches the fraud / normal row counts of the
  # validation split shown by val['Class'].value_counts().
  # NOTE(review): that is fraud-to-normal, not fraud-to-all-rows — confirm this is intended.
  model = IsolationForest(n_estimators=300,
                        max_samples=0.80,
                        max_features=0.4,
                        contamination=30/28432,
                        random_state=rs,
                        verbose=0)
  
  # Fit without labels on the projected training features.
  model.fit(reduced_train)

  pred_val = model.predict(reduced_val) # model prediction
  pred_val = get_pred_label(pred_val)

  # Add this seed's 0/1 validation labels to the running vote count.
  result_val += pred_val


  pred_test = model.predict(reduced_test) # model prediction
  pred_test = get_pred_label(pred_test)

  result_test += pred_test

  # Per-seed macro F1 on the validation split (this single model, not the ensemble).
  val_score = f1_score(y_val, pred_val, average='macro')

  print(f'"{rs}" Marco F1 Score : {val_score}')

"0" Marco F1 Score : 0.8460131793934016
"1" Marco F1 Score : 0.8271056766827749
"2" Marco F1 Score : 0.8394731804484362
"3" Marco F1 Score : 0.8394731804484362


# Evaluation

In [None]:
def check(result, ts, val, y_true=None):
  """Threshold accumulated ensemble votes into 0/1 labels.

  Args:
    result: iterable of per-row vote counts (how many models flagged the row).
    ts: minimum vote count for a row to be labelled 1.
    val: when truthy, print macro F1, recall, precision and a classification
      report for the thresholded labels.
    y_true: ground-truth labels used for the report; when omitted, the
      notebook-level ``y_val`` is used.

  Returns:
    list of int labels (1 where the vote count >= ts, else 0), in input order.
  """
  pred_result = [1 if votes >= ts else 0 for votes in result]

  if val:
    if y_true is None:
      # Fall back to the validation labels defined earlier in the notebook.
      y_true = y_val

    val_score = f1_score(y_true, pred_result, average='macro')
    recall = recall_score(y_true, pred_result)
    precision = precision_score(y_true, pred_result)

    print(f'Marco F1 Score : {val_score}\n')
    print(f'Recall : {recall}\n')
    print(f'Precision : {precision}\n')

    print(classification_report(y_true, pred_result))

  return pred_result

In [None]:
# ts=1: a validation row is labelled fraud when at least one seeded model flagged it.
pred_val = check(result_val, 1, val=True)

Marco F1 Score : 0.7810037812170243

Recall : 0.6

Precision : 0.5294117647058824

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28432
           1       0.53      0.60      0.56        30

    accuracy                           1.00     28462
   macro avg       0.76      0.80      0.78     28462
weighted avg       1.00      1.00      1.00     28462



# Inference

In [None]:
# Same vote threshold as used for validation; val=False skips the metric report (test has no labels here).
pred_test = check(result_test, 1, val=False)

# Submission

In [None]:
# Submission template; the relative path resolves against the current working directory.
submit = pd.read_csv('./sample_submission.csv')

In [None]:
# pred_test is a plain list, so it is assigned to the column by position.
# NOTE(review): assumes sample_submission.csv rows are in the same order as test.csv — confirm.
submit['Class'] = pred_test
submit.head()

Unnamed: 0,ID,Class
0,AAAA0x1,0
1,AAAA0x2,0
2,AAAA0x5,0
3,AAAA0x7,0
4,AAAA0xc,0


In [None]:
# Write the submission file without the DataFrame index column.
submit.to_csv('./submission_16.csv', index=False)