# Setting

## Install

In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m58.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.0-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m77.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.0 tokenizers-0.13.2 transformers-4.26.1


## Library

In [174]:
# 데이터 다루기
import pandas as pd
import numpy as np
from itertools import combinations

# 전처리
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.decomposition import PCA, KernelPCA, SparsePCA, TruncatedSVD, IncrementalPCA
from sklearn.cluster import KMeans
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection

# 모델링
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.covariance import EllipticEnvelope

from sklearn.metrics import f1_score
from sklearn.metrics import recall_score # 재현율
from sklearn.metrics import precision_score # 정밀도
from sklearn.metrics import classification_report

# 기타
import os
import random
from tqdm.auto import tqdm
import math

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.optim import Optimizer, AdamW
from torch.optim.lr_scheduler import LambdaLR, CyclicLR, OneCycleLR

In [4]:
import warnings
warnings.filterwarnings(action='ignore')

In [5]:
device = torch.device('cuda:0' if torch.cuda.is_available else 'cpu')

## Fixed Random Seed

In [6]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(42) # Seed 고정

# Load Data Set

## Googel Drive Mount

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Unzip File

In [8]:
!unzip --qq '/content/drive/MyDrive/머신러닝 엔지니어링/데이콘/사기거래/data/사기거래.zip'

## Load Train / Val / Test Set

In [153]:
train = pd.read_csv('/content/train.csv')
val = pd.read_csv('/content/val.csv')
test = pd.read_csv('/content/test.csv')

# Preprocessing

## Feature Selection

In [159]:
X_train = train.drop(columns=['ID']) 

X_val = val.drop(columns=['ID', 'Class']) 
y_val = val['Class']

X_test = test.drop(columns=['ID'])

In [160]:
X_train = X_train[['V3', 'V4', 'V9', 'V10', 'V11',
                   'V12', 'V14', 'V16', 'V17', 'V18']]

X_val = X_val[['V3', 'V4', 'V9', 'V10', 'V11',
               'V12', 'V14', 'V16', 'V17', 'V18']]

X_test = X_test[['V3', 'V4', 'V9', 'V10', 'V11',
                 'V12', 'V14', 'V16', 'V17', 'V18']]

In [161]:
len(X_train.columns)

10

## Scaling

In [162]:
#scaler = StandardScaler()
#scaler = MinMaxScaler()
scaler = RobustScaler(quantile_range=(45.0, 55.0))

scaled_train = scaler.fit_transform(X_train)
scaled_val = scaler.transform(X_val)
scaled_test = scaler.transform(X_test)

scaled_train = pd.DataFrame(scaled_train)
scaled_val = pd.DataFrame(scaled_val)
scaled_test = pd.DataFrame(scaled_test)

In [163]:
#scaler = StandardScaler()
scaler = MinMaxScaler()
#scaler = RobustScaler(quantile_range=(45.0, 55.0))

scaled_train = scaler.fit_transform(scaled_train)
scaled_val = scaler.transform(scaled_val)
scaled_test = scaler.transform(scaled_test)

scaled_train = pd.DataFrame(scaled_train)
scaled_val = pd.DataFrame(scaled_val)
scaled_test = pd.DataFrame(scaled_test)

## 파생 변수

In [164]:
main_columns = list(scaled_train.columns)

com_main = list(combinations(main_columns, 5))
print(len(com_main))

252


In [165]:
for com in com_main:

  x = com[0]
  y = com[1]
  z = com[2]
  w = com[3]
  v = com[-1]

  scaled_train[f'{x}_{y}_{z}_{w}_{v}_mean'] = (scaled_train[x] + scaled_train[y] + scaled_train[z] + scaled_train[w] + scaled_train[v]) / 5
  scaled_val[f'{x}_{y}_{z}_{w}_{v}_mean'] = (scaled_val[x] + scaled_val[y] + scaled_val[z] + scaled_val[w] + scaled_val[v]) / 5
  scaled_test[f'{x}_{y}_{z}_{w}_{v}_mean'] = (scaled_test[x] + scaled_test[y] + scaled_test[z] + scaled_test[w] + scaled_test[v]) / 5

In [166]:
print(len(scaled_train.columns))
n_components = len(scaled_train.columns)

262


## Dimension Reduction

In [167]:
#dimesion_reducer = PCA(n_components=N_COMPONETS)
#dimesion_reducer = SparsePCA(n_components=78, alpha=0.01)
dimesion_reducer = SparseRandomProjection(n_components=262, eps=0.1, random_state=42)


reduced_train = dimesion_reducer.fit_transform(scaled_train)
reduced_val = dimesion_reducer.transform(scaled_val)
reduced_test = dimesion_reducer.transform(scaled_test)

reduced_train = pd.DataFrame(reduced_train)
reduced_val = pd.DataFrame(reduced_val)
reduced_test = pd.DataFrame(reduced_test)

In [168]:
reduced_train = reduced_train.sample(frac=0.9, random_state=42)
reduced_val = reduced_val.sample(frac=0.9, random_state=42)

# Make Data Set for Pytorch

In [169]:
class MyDataset(Dataset):
    def __init__(self, df, eval_mode):
        self.df = df
        self.eval_mode = eval_mode
        if self.eval_mode:
            self.labels = self.df['Class'].values
            self.df = self.df.drop(columns=['Class']).values
        else:
            self.df = self.df.values
        
    def __getitem__(self, index):
        if self.eval_mode:
            x = torch.from_numpy(self.df[index]).type(torch.FloatTensor)
            y = torch.FloatTensor([self.labels[index]])
            return x, y
            #self.x = self.df[index]
            #self.y = self.labels[index]
            #return torch.Tensor(self.x), self.y
        else:
            self.x = self.df[index]
            return torch.Tensor(self.x)
        
    def __len__(self):
        return len(self.df)

In [170]:
train_dataset = MyDataset(df=reduced_train, eval_mode=False)
train_loader = DataLoader(train_dataset, batch_size=2**13, shuffle=True, num_workers=6)

reduced_val=pd.concat([reduced_val, val[['Class']]], axis=1)
val_dataset = MyDataset(df = reduced_val, eval_mode=True)
val_loader = DataLoader(val_dataset, batch_size=2**13, shuffle=False, num_workers=6)

# Modeling

## 모델 및 학습 정의

In [171]:
class AutoEncoder(nn.Module):
    def __init__(self):
        super(AutoEncoder, self).__init__()
        self.Encoder = nn.Sequential(
            nn.Linear(262,512),
            nn.BatchNorm1d(512),
            nn.LeakyReLU(),
            nn.Linear(512,1024),
            nn.BatchNorm1d(1024),
            nn.LeakyReLU(),
            nn.Linear(1024,2048),
            nn.BatchNorm1d(2048),
            nn.LeakyReLU(),
            nn.Linear(2048,4096),
            nn.BatchNorm1d(4096),
            nn.LeakyReLU()
        )
        self.Decoder = nn.Sequential(
            nn.Linear(4096,2048),
            nn.BatchNorm1d(2048),
            nn.LeakyReLU(),
            nn.Linear(2048,1024),
            nn.BatchNorm1d(1024),
            nn.LeakyReLU(),
            nn.Linear(1024,512),
            nn.BatchNorm1d(512),
            nn.LeakyReLU(),
            nn.Linear(512,262),
        )
        
    def forward(self, x):
        x = self.Encoder(x)
        x = self.Decoder(x)
        return x

In [172]:
class Trainer():
    def __init__(self, model, optimizer, train_loader, val_loader, scheduler, device):
        self.model = model
        self.optimizer = optimizer
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.scheduler = scheduler
        self.device = device
        # Loss Function
        self.criterion = nn.L1Loss().to(self.device)
        
    def fit(self):
        self.model.to(self.device)
        best_score = 0
        avg = 1
        for epoch in range(50):
            self.model.train()
            train_loss = []
            for x in iter(self.train_loader):
                x = x.float().to(self.device)
                self.optimizer.zero_grad()

                _x = self.model(x)
                loss = self.criterion(x, _x)

                loss.backward()
                self.optimizer.step()

                train_loss.append(loss.item())

            score = self.validation(self.model, 0.95)
            print(f'Epoch : [{epoch}] Train loss : [{np.mean(train_loss)}] Val Score : [{score}])')

            if self.scheduler is not None:
                self.scheduler.step(score)

            if best_score <= score and avg > np.mean(train_loss):
                best_score = score
                avg = np.mean(train_loss)
                torch.save(model.module.state_dict(), './best_model.pth', _use_new_zipfile_serialization=False)
    
    def validation(self, eval_model, thr):
        cos = nn.CosineSimilarity(dim=1, eps=1e-6)
        eval_model.eval()
        pred = []
        true = []
        with torch.no_grad():
            for x, y in iter(self.val_loader):
                x = x.float().to(self.device)

                _x = self.model(x)
                diff = cos(x, _x).cpu().tolist()
                batch_pred = np.where(np.array(diff)<thr, 1, 0).tolist()
                pred += batch_pred
                true += y.tolist()

        return f1_score(true, pred, average='macro')

## 모델 학습

In [173]:
model = nn.DataParallel(AutoEncoder())
model.eval()
optimizer = torch.optim.Adam(params = model.parameters(), lr = 1e-2)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=10, threshold_mode='abs', min_lr=1e-8, verbose=True)

trainer = Trainer(model, optimizer, train_loader, val_loader, scheduler, device)
trainer.fit()

Epoch : [0] Train loss : [0.31101111379953533] Val Score : [0.092118388263202])
Epoch : [1] Train loss : [0.1438565540772218] Val Score : [0.5806114997347178])
Epoch : [2] Train loss : [0.0860010886994692] Val Score : [0.870247282626393])
Epoch : [3] Train loss : [0.0426597331578915] Val Score : [0.9097393418694286])
Epoch : [4] Train loss : [0.04093006167274255] Val Score : [0.9097393418694286])
Epoch : [5] Train loss : [0.042776656695283376] Val Score : [0.8844834793761085])
Epoch : [6] Train loss : [0.057154672363629706] Val Score : [0.8786471773914175])
Epoch : [7] Train loss : [0.04824763464813049] Val Score : [0.8786471773914175])
Epoch : [8] Train loss : [0.036914290143893316] Val Score : [0.9031202878275757])
Epoch : [9] Train loss : [0.029229298520546693] Val Score : [0.8786471773914175])
Epoch : [10] Train loss : [0.03569290572060989] Val Score : [0.8879277916133668])
Epoch : [11] Train loss : [0.0307303242958509] Val Score : [0.8786471773914175])
Epoch : [12] Train loss : [0

## 최고의 모델

In [175]:
model = AutoEncoder()
model.load_state_dict(torch.load('./best_model.pth'))
model = nn.DataParallel(model)
model.eval()

DataParallel(
  (module): AutoEncoder(
    (Encoder): Sequential(
      (0): Linear(in_features=262, out_features=512, bias=True)
      (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): LeakyReLU(negative_slope=0.01)
      (3): Linear(in_features=512, out_features=1024, bias=True)
      (4): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (5): LeakyReLU(negative_slope=0.01)
      (6): Linear(in_features=1024, out_features=2048, bias=True)
      (7): BatchNorm1d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (8): LeakyReLU(negative_slope=0.01)
      (9): Linear(in_features=2048, out_features=4096, bias=True)
      (10): BatchNorm1d(4096, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (11): LeakyReLU(negative_slope=0.01)
    )
    (Decoder): Sequential(
      (0): Linear(in_features=4096, out_features=2048, bias=True)
      (1): BatchNorm1d(2048, eps=

# Evaluation

In [176]:
def prediction(model, thr, test_loader, device):
    model.to(device)
    model.eval()
    cos = nn.CosineSimilarity(dim=1, eps=1e-6)
    pred = []
    with torch.no_grad():
        for x in iter(test_loader):
            x = x.float().to(device)
            _x = model(x)
            
            diff = cos(x, _x).cpu().tolist()
            batch_pred = np.where(np.array(diff)<thr, 1,0).tolist()
            pred += batch_pred
    return pred

In [177]:
reduced_val = reduced_val.drop(columns=['Class'])
val_dataset = MyDataset(reduced_val, False)
val_loader = DataLoader(val_dataset, batch_size=2**13, shuffle=False, num_workers=6)

In [193]:
pred_val = prediction(model, 0.95, val_loader, device)

In [194]:
val_score = f1_score(y_val, pred_val, average='macro')
recall = recall_score(y_val, pred_val)
precision = precision_score(y_val, pred_val)

In [195]:
print(f'Marco F1 Score : {val_score}\n')
print(f'Recall : {recall}\n')
print(f'Precision : {precision}\n')
print(classification_report(y_val, pred_val))

Marco F1 Score : 0.9236496787663914

Recall : 0.8333333333333334

Precision : 0.8620689655172413

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28432
           1       0.86      0.83      0.85        30

    accuracy                           1.00     28462
   macro avg       0.93      0.92      0.92     28462
weighted avg       1.00      1.00      1.00     28462



# Inference

In [97]:
test_dataset = MyDataset(reduced_test, False)
test_loader = DataLoader(test_dataset, batch_size=2**13, shuffle=False, num_workers=6)

In [100]:
pred_test = prediction(model, 0.95, test_loader, device)

# Submission

In [None]:
submit = pd.read_csv('./sample_submission.csv')

In [None]:
submit['Class'] = pred_test
submit.head()

Unnamed: 0,ID,Class
0,AAAA0x1,0
1,AAAA0x2,0
2,AAAA0x5,0
3,AAAA0x7,0
4,AAAA0xc,0


In [None]:
submit.to_csv('./submission_3.csv', index=False)