---
# 1. 필요 라이브러리 불러오기
---

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import scatter_matrix
from imblearn.over_sampling import *
from sklearn.preprocessing import *
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from imblearn.combine import *
from sklearn.metrics import f1_score
from sklearn import *
import torch 
import torch.nn as nn # nn : neural netwroks (define class) attribute를 활용해 state를 저장하고 활용
import torch.optim as optim # 최적화 알고리즘
import torch.nn.functional as F # (define function) 인스턴스화 시킬 필요없이 사용 가능
from PIL import Image
from torchvision import transforms, datasets # transforms : 데이터를 조작하고 학습에 적합하게 만듦.
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm
import torchsummary

---
# 2. 데이터 불러오기
---

In [2]:
dataset_link = '/content/drive/MyDrive/Numble_Challenge/data/datast_final.csv'
model_save_link = "/content/drive/MyDrive/Numble_Challenge/Numble_model.pt"

In [None]:
data = pd.read_csv(dataset_link)
del data['Unnamed: 0']

---
# 3. 데이터 증강 및 정규화
---

In [4]:
# 데이터 분리
data_y = data['target']
data_x = data.drop(['target'], axis = 1)

# 불균형 처리
smoteenn = SMOTEENN(random_state=42)
data_x , data_y = smoteenn.fit_resample(data_x, data_y)

# 데이터 정규화
scaler = StandardScaler()
scaler.fit(data_x)
data_x = scaler.transform(data_x)

# 데이터 분리
train_x, test_x, train_y, test_y = train_test_split(data_x, data_y, test_size = 0.2, random_state= 415)

# 로지스틱 회귀분석
model = LogisticRegression()
model.fit(train_x, train_y)

LogisticRegression()

In [5]:
print('train 정확도:',round(model.score(train_x, train_y),3))
print('test 정확도:',round(model.score(test_x, test_y),3))
print('데이터 개수 :', len(data_x))

train 정확도: 0.745
test 정확도: 0.742
데이터 개수 : 107031


---
# 4. 모델 구현
---

In [6]:
class CustomDataset(Dataset):

  def __init__(self, data_x, data_y):
    self.data_x = torch.FloatTensor(data_x)
    self.data_y = torch.FloatTensor(data_y)
  
  def __getitem__(self,index):
    return self.data_x[index], self.data_y[index]

  def __len__(self):
    return self.data_x.shape[0]

In [7]:
batch_size = 64
# Dataset
train_dataset = CustomDataset(train_x , train_y.values)
val_dataset = CustomDataset(test_x, test_y.values)

In [8]:
# Dataloader
train_dataloader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               num_workers=0,
                                               pin_memory=True,
                                               drop_last = True)

test_dataloader = torch.utils.data.DataLoader(dataset=val_dataset,
                                             batch_size=batch_size,
                                             shuffle=False,
                                             num_workers=0,
                                             pin_memory=True,
                                             drop_last = False)

In [9]:
class Net(nn.Module):
  def __init__(self,):
    super(Net,self).__init__()

    self.layer1 = nn.Sequential(
                      nn.Linear(16,512),
                      nn.ReLU(),
                      nn.Dropout(p = 0.3)
                                ) 

    self.layer2 = nn.Sequential(
                      nn.Linear(512,256),
                      nn.ReLU(),
                      nn.Dropout(p = 0.3)
                                ) 

    self.layer3 = nn.Sequential(
                      nn.Linear(256,128),
                      nn.ReLU(),
                      nn.Dropout(p = 0.3)
                                )     

    self.layer4 = nn.Sequential(
                      nn.Linear(128,64),
                      nn.ReLU(),
                      nn.Dropout(p = 0.3)
                                )   

    self.layer5 = nn.Sequential(
                      nn.Linear(64,32),
                      nn.ReLU(),
                      nn.Dropout(p = 0.3)
                                )           
    
    self.layer6 = nn.Sequential(
                      nn.Linear(32,2),
                      nn.ReLU(),
                      nn.Dropout(p = 0.3)
                                )  
  
  def forward(self,x):
    out = self.layer1(x)
    out = self.layer2(out)
    out = self.layer3(out)
    out = self.layer4(out)
    out = self.layer5(out)
    out = self.layer6(out)
    return out



In [10]:
class FocalLoss(nn.Module): # for imbalanced data
  def __init__(self, alpha=1, gamma=2, logits=False, reduction='sum'):
    super(FocalLoss, self).__init__()
    self.alpha = alpha
    self.gamma = gamma
    self.reduction = reduction
    
    if logits:
      raise ValueError 

  def forward(self, inputs, targets):    
    ce_loss = nn.CrossEntropyLoss(reduction='none')(inputs, targets)

    pt = torch.exp(-ce_loss)
    F_loss = self.alpha * (1-pt)**self.gamma * ce_loss

    if self.reduction == 'sum':
      return torch.sum(F_loss)
    else:
      return F_loss

In [12]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
deep_model = Net().to(device)
deep_model.load_state_dict(torch.load(model_save_link))
criterion = FocalLoss()
optimizer = optim.Adam(deep_model.parameters(), lr= 0.00001)

---
# 5. Inference
---

In [13]:
test_loss = 0
test_accuracy = 0

deep_model.eval()
with torch.no_grad():
  for batch_idx, (data, target) in enumerate(tqdm(test_dataloader)):
  
    # data, target을 device에 올림
    target = target.type(torch.LongTensor)
    data, target = data.to(device), target.to(device)

    # Forward_propagation
    output = deep_model(data)

    # 손실함수를 통해 Loss 계산
    test_loss += criterion(output, target).item() * data.size(0)

    ############################# Accuracy #############################
    # max probability와 max index를 반환
    # max probability는 무시하고, max index는 pred에 저장하여 label 값과 대조하여 정확도를 도출
    _, preds = torch.max(output,1)
    test_accuracy += torch.sum(preds == target)
    ####################################################################
    F1_score = metrics.f1_score(preds.cpu().data, target.cpu(), average = 'weighted')
    test_acc = test_accuracy.item() / len(test_dataloader.dataset)
    test_lo = test_loss / len(test_dataloader.dataset)
    
print('#'*30,'Test','#'*30)
print(f"Test_Accuracy : {test_acc :.3f}", '\n')
print(f"F1_score : {F1_score :0.3f}")
print('#'*66)


  0%|          | 0/335 [00:00<?, ?it/s]

############################## Test ##############################
Test_Accuracy : 0.859 

F1_score : 0.833
##################################################################
