# Setting

## Library

In [1]:
# 데이터 다루기
import pandas as pd
import numpy as np
from itertools import combinations

# 전처리
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.decomposition import PCA, KernelPCA, SparsePCA, TruncatedSVD, FastICA
from sklearn.cluster import KMeans
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection

# 모델링
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.covariance import EllipticEnvelope

from sklearn.metrics import f1_score
from sklearn.metrics import recall_score # 재현율
from sklearn.metrics import precision_score # 정밀도
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# 기타
import os
import random

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
import warnings
warnings.filterwarnings(action='ignore')

# Load Data Set

## Load Train / Val / Test Set

In [4]:
train = pd.read_csv('./train.csv')
val = pd.read_csv('./val.csv')
test = pd.read_csv('./test.csv')

In [5]:
val['Class'].value_counts()

0    28432
1       30
Name: Class, dtype: int64

# Preprocessing

## Feature Selection

In [6]:
X_train = train.drop(columns=['ID']) 

X_val = val.drop(columns=['ID', 'Class']) 
y_val = val['Class']

X_test = test.drop(columns=['ID'])

In [7]:
X_train = X_train[['V3', 'V4', 'V9', 'V10', 'V11',
                   'V12', 'V14', 'V16', 'V17', 'V18']]

X_val = X_val[['V3', 'V4', 'V9', 'V10', 'V11',
               'V12', 'V14', 'V16', 'V17', 'V18']]

X_test = X_test[['V3', 'V4', 'V9', 'V10', 'V11',
                 'V12', 'V14', 'V16', 'V17', 'V18']]

In [8]:
len(X_train.columns)

10

## Scaling


In [9]:
#scaler = StandardScaler()
#scaler = MinMaxScaler()
scaler = RobustScaler(quantile_range=(45.0, 55.0))

scaled_train = scaler.fit_transform(X_train)
scaled_val = scaler.transform(X_val)
scaled_test = scaler.transform(X_test)

scaled_train = pd.DataFrame(scaled_train)
scaled_val = pd.DataFrame(scaled_val)
scaled_test = pd.DataFrame(scaled_test)

## 파생 변수

In [10]:
main_columns = list(scaled_train.columns)

com_main = list(combinations(main_columns, 5))
print(len(com_main))

252


In [11]:
for com in com_main:

  x = com[0]
  y = com[1]
  z = com[2]
  w = com[3]
  v = com[-1]

  scaled_train[f'{x}_{y}_{z}_{w}_{v}_mean'] = (scaled_train[x] + \
                                               scaled_train[y] + \
                                               scaled_train[z] + \
                                               scaled_train[w] + \
                                               scaled_train[v]) / 5
    
  scaled_val[f'{x}_{y}_{z}_{w}_{v}_mean'] = (scaled_val[x] + \
                                             scaled_val[y] + \
                                             scaled_val[z] + \
                                             scaled_val[w] + \
                                             scaled_val[v]) / 5

  scaled_test[f'{x}_{y}_{z}_{w}_{v}_mean'] = (scaled_test[x] + \
                                              scaled_test[y] + \
                                              scaled_test[z] + \
                                              scaled_test[w] + \
                                              scaled_test[v]) / 5

In [12]:
print(len(scaled_train.columns))
n_components = len(scaled_train.columns)

262


## Dimension Reduction

In [13]:
def dimension_reduction(train, val, test, dimesion_reducer,  random_state):

  if dimesion_reducer == 'Sparse_Random_Projection':

    reducer = SparseRandomProjection(n_components=262,
                                     eps=0.1,
                                     random_state=random_state)
    
  if dimesion_reducer == 'Sparse_PCA':

    reducer = SparsePCA(n_components=262,
                        alpha=0.001,
                        random_state=random_state)

  train = reducer.fit_transform(train)
  val = reducer.transform(val)
  test = reducer.transform(test)

  train = pd.DataFrame(train)
  val = pd.DataFrame(val)
  test = pd.DataFrame(test)

  return train, val, test

# Modeling

In [14]:
def get_pred_label(model_pred):
    # IsolationForest 모델 출력 (1:정상, -1:불량(사기)) 이므로 (0:정상, 1:불량(사기))로 Label 변환
    model_pred = np.where(model_pred == 1, 0, model_pred)
    model_pred = np.where(model_pred == -1, 1, model_pred)
    return model_pred

In [15]:
def modeling(result_val, result_test, num_root, dimesion_reducer, model, train, val, test):

  result_val = np.zeros(val.shape[0])
  result_test = np.zeros(test.shape[0])

  for random_state in range(num_root):

    reduced_train, reduced_val, reduced_test = dimension_reduction(
                                                                   train=train,
                                                                   val=val,
                                                                   test=test,
                                                                   dimesion_reducer=dimesion_reducer,
                                                                   random_state=random_state
                                                                   )
    
    if model == 'Isolate_Forest':

      model = IsolationForest(n_estimators=300,
                              max_samples=0.80,
                              max_features=0.4,
                              contamination=30/28432,
                              random_state=random_state,
                              verbose=0)
      
    if model == 'Elliptic_Envelope':

      model = EllipticEnvelope(support_fraction=0.994,
                               contamination=30/28432,
                               random_state=random_state) 


    
    model.fit(reduced_train)

    pred_val = model.predict(reduced_val) # model prediction
    pred_val = get_pred_label(pred_val)

    result_val += pred_val

    pred_test = model.predict(reduced_test) # model prediction
    pred_test = get_pred_label(pred_test)

    result_test += pred_test

    val_score = f1_score(y_val, pred_val, average='macro')

    print(f'"{random_state}" Marco F1 Score : {val_score}')

  return result_val, result_test

## Sparse PCA + Isolate Forest

In [16]:
result_val = np.zeros(val.shape[0])
result_test = np.zeros(test.shape[0])

In [17]:
result_val_PCA_ISF,  result_test_PCA_ISF = modeling(result_val=result_val,
                                                    result_test=result_test,
                                                    num_root=4,
                                                    dimesion_reducer='Sparse_PCA',
                                                    model='Isolate_Forest',
                                                    train=scaled_train,
                                                    val=scaled_val,
                                                    test=scaled_test)

"0" Marco F1 Score : 0.8212527256101849
"1" Marco F1 Score : 0.851711180144449
"2" Marco F1 Score : 0.8453050791372196
"3" Marco F1 Score : 0.8453050791372196


## Sparse Random Projection + Isolate Forest

In [18]:
result_val_SRP_ISF,  result_test_SRP_ISF = modeling(result_val=result_val,
                                                    result_test=result_test,
                                                    num_root=4,
                                                    dimesion_reducer='Sparse_Random_Projection',
                                                    model='Isolate_Forest',
                                                    train=scaled_train,
                                                    val=scaled_val,
                                                    test=scaled_test)

"0" Marco F1 Score : 0.8331750776625051
"1" Marco F1 Score : 0.8394731804484362
"2" Marco F1 Score : 0.8598769209128951
"3" Marco F1 Score : 0.8394731804484362


## Sparse PCA + Elliptic Envelope

In [19]:
result_val_PCA_EE,  result_test_PCA_EE = modeling(result_val=result_val,
                                                  result_test=result_test,
                                                  num_root=4,
                                                  dimesion_reducer='Sparse_PCA',
                                                  model='Elliptic_Envelope',
                                                  train=scaled_train,
                                                  val=scaled_val,
                                                  test=scaled_test)

"0" Marco F1 Score : 0.8858506104888013
"1" Marco F1 Score : 0.8858506104888013
"2" Marco F1 Score : 0.8858506104888013
"3" Marco F1 Score : 0.8858506104888013


## Sparse Random Projection + Elliptic Envelope

In [20]:
result_val_SRP_EE,  result_test_SRP_EE = modeling(result_val=result_val,
                                                  result_test=result_test,
                                                  num_root=4,
                                                  dimesion_reducer='Sparse_Random_Projection',
                                                  model='Elliptic_Envelope',
                                                  train=scaled_train,
                                                  val=scaled_val,
                                                  test=scaled_test)

"0" Marco F1 Score : 0.8858506104888013
"1" Marco F1 Score : 0.8858506104888013
"2" Marco F1 Score : 0.8858506104888013
"3" Marco F1 Score : 0.8858506104888013


# Ensemble

In [21]:
result_val = result_val_PCA_ISF + \
             result_val_SRP_ISF + \
             result_val_PCA_EE + \
             result_val_SRP_EE

result_test = result_test_PCA_ISF + \
              result_test_SRP_ISF + \
              result_test_PCA_EE + \
              result_test_SRP_EE

In [22]:
def check(result, ts, val):

  pred_result = []

  for i in result:

    if i >= ts:

      i = 1

    else:

      i = 0

    pred_result.append(i)

  if val == True:

    val_score = f1_score(y_val, pred_result, average='macro')
    recall = recall_score(y_val, pred_result)
    precision = precision_score(y_val, pred_result)

    print(f'Marco F1 Score : {val_score}\n')
    print(f'Recall : {recall}\n')
    print(f'Precision : {precision}\n')

    print(classification_report(y_val, pred_result))

  return pred_resul

In [71]:
pred_val = check(result_val_SRP_EE, 4, val=True)

Marco F1 Score : 0.8858506104888013

Recall : 0.7333333333333333

Precision : 0.8148148148148148

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28432
           1       0.81      0.73      0.77        30

    accuracy                           1.00     28462
   macro avg       0.91      0.87      0.89     28462
weighted avg       1.00      1.00      1.00     28462



In [63]:
pred_test = check(result_test, 8, val=False)

# Submission

In [64]:
submit = pd.read_csv('./sample_submission.csv')

In [65]:
submit['Class'] = pred_test
submit.head()

Unnamed: 0,ID,Class
0,AAAA0x1,0
1,AAAA0x2,0
2,AAAA0x5,0
3,AAAA0x7,0
4,AAAA0xc,0


In [66]:
submit.to_csv('./submission_20.csv', index=False)