<a href="https://colab.research.google.com/github/pitapatat/wanted_pre_onboarding/blob/main/help_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1. 첨부된 'Week2_3_assginment.ipynb'를 다운로드 받아서 문제를 푸시고, help.py와 최종 코드 파일(.ipynb)을 제출하세요.
    help.py에 포함되어야 할 함수와 클래스는 아래와 같습니다.
스크립트 내 포함해야하는 함수
- set_device()
- custom_collate_fn()
포함해야하는 클래스
- CustomDataset
- CustomClassifier

In [None]:
### library
import os
import sys
import pandas as pd
import numpy as np 
import torch
import random
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler, random_split
!pip install transformers
from transformers import BertTokenizer, BertModel
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss

In [None]:
## 데이터 다운로드
!wget https://raw.githubusercontent.com/ChristinaROK/PreOnboarding_AI_assets/e56006adfac42f8a2975db0ebbe60eacbe1c6b11/data/sample_df.csv

In [None]:
## set_device()
def set_device():
    """Choose the torch device to compute on.

    When CUDA is available, prints the number of visible GPUs and the name
    reported by ``torch.cuda.get_device_name()`` and returns the ``"cuda"``
    device. Otherwise returns the ``"cpu"`` device without printing anything.
    """
    # Guard clause: the CPU path has nothing to report.
    if not torch.cuda.is_available():
        return torch.device("cpu")

    print(f"# available GPUs : {torch.cuda.device_count()}")
    print(f"GPU name : {torch.cuda.get_device_name()}")
    return torch.device("cuda")

###################################################################################################
## CustomDataSet 
class CustomDataset(Dataset):
    """Map-style dataset that pairs each input text with its label.

    - input_data: list of string
    - target_data: list of int

    Raises:
        ValueError: if ``input_data`` and ``target_data`` differ in length.
    """

    def __init__(self, input_data: list, target_data: list) -> None:
        # Fail fast on misaligned inputs: __len__ is driven by the labels, so a
        # mismatch would otherwise silently ignore trailing texts or surface
        # later as an IndexError while batches are being fetched.
        if len(input_data) != len(target_data):
            raise ValueError(
                "input_data and target_data must have the same length, "
                f"got {len(input_data)} and {len(target_data)}"
            )
        self.X = input_data
        self.Y = target_data

    def __len__(self) -> int:
        """Return the number of (text, label) samples."""
        return len(self.Y)

    def __getitem__(self, index):
        """Return the ``(text, label)`` tuple stored at ``index``."""
        return self.X[index], self.Y[index]

# Module-level tokenizer loaded from the "klue/bert-base" checkpoint.
# custom_collate_fn reads this name as a global when it tokenizes a batch,
# so it must be defined before any DataLoader using that collate function runs.
tokenizer_bert = BertTokenizer.from_pretrained("klue/bert-base")

###################################################################################################
## custom_collate_fn
def custom_collate_fn(batch):
    """Collate a list of ``(text, label)`` samples into model-ready tensors.

    Args:
        batch: list of tuples ``(input_data (string), target_data (int))``.

    Returns:
        A 2-tuple ``(tensorized_input, tensorized_label)``: the output of the
        module-level ``tokenizer_bert`` for the batch texts, and a
        ``torch.tensor`` built from the batch labels.
    """
    # Split the samples into parallel lists; each sample must unpack into
    # exactly two fields.
    texts = [text for text, _ in batch]
    labels = [label for _, label in batch]

    # Pad to the longest sentence in the batch, truncate anything beyond the
    # tokenizer's maximum length, and return PyTorch tensors.
    encoded = tokenizer_bert(
        texts,
        padding='longest',
        truncation=True,
        return_tensors='pt',
    )

    return encoded, torch.tensor(labels)

###################################################################################################
## CustomClassifier
class CustomClassifier(nn.Module):
    """BERT encoder followed by a small feed-forward classification head.

    Args:
        hidden_size: Input width of the classification head. It must match the
            width of the encoder's ``pooler_output``, which is what the head
            is applied to in ``forward``.
        n_label: Number of output classes (logits produced per sample).
        dropout_rate: Dropout probability used inside the head. Defaults to
            0.1, the value that was previously hard-coded.
        linear_layer_hidden_size: Width of the head's intermediate linear
            layer. Defaults to 32, the value that was previously hard-coded.
        pretrained_model_name: Checkpoint passed to
            ``BertModel.from_pretrained``. Defaults to ``'klue/bert-base'``.
    """

    def __init__(
        self,
        hidden_size: int,
        n_label: int,
        dropout_rate: float = 0.1,
        linear_layer_hidden_size: int = 32,
        pretrained_model_name: str = 'klue/bert-base',
    ):
        super().__init__()

        self.bert = BertModel.from_pretrained(pretrained_model_name)
        self.hidden_size = hidden_size
        self.n_label = n_label

        # Head: Linear -> ReLU -> Dropout -> Linear, ending in n_label logits.
        self.classifier = nn.Sequential(
            nn.Linear(self.hidden_size, linear_layer_hidden_size),
            nn.ReLU(),
            nn.Dropout(p=dropout_rate),
            nn.Linear(linear_layer_hidden_size, self.n_label),
        )

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None):
        """Run the encoder and return the classification logits.

        The keyword names match the arguments forwarded to ``self.bert``, so a
        tokenizer output mapping can be unpacked straight into this call.
        """
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

        # NOTE: this reads `pooler_output`, not the raw last-layer [CLS] vector.
        # Per the transformers BertModel docs, `pooler_output` is the [CLS]
        # hidden state passed through an extra dense + tanh layer, with an
        # expected shape of (batch_size, hidden_size). Confirm against the
        # installed transformers version.
        pooled_output = outputs['pooler_output']
        logits = self.classifier(pooled_output)

        return logits

In [None]:
## 모델 훈련 함수
def train(model, data_loader):
    """Train ``model`` for one pass over ``data_loader``.

    Relies on two module-level names that must exist before the call:
    ``loss_fct`` (the loss callable) and ``optimizer`` (the optimizer whose
    ``step()`` is invoked after each backward pass). The device is chosen by
    ``set_device()`` and the model is moved to it.

    Args:
        model: Module called as ``model(**batch_input)`` to produce logits.
        data_loader: Iterable yielding ``(batch_input, batch_label)`` pairs;
            both items must support ``.to(device)`` and ``batch_input`` must be
            unpackable with ``**``.

    Prints a running average loss at every step that is a non-zero multiple
    of 10, then the mean loss over all batches. If ``data_loader`` yields no
    batches, the mean is reported as ``nan`` instead of raising.
    """
    device = set_device()

    # Accumulators: running window (reset after each report) and whole pass.
    total_loss = 0.0
    window_loss, window_count = 0.0, 0
    n_batches = 0

    # Put the model in train mode and move it to the selected device.
    model.train()
    model.to(device)

    for step, batch in enumerate(data_loader):
        n_batches += 1
        window_count += 1

        # Move every element of the batch to the device before computing.
        batch_input, batch_label = tuple(item.to(device) for item in batch)

        # Clear gradients left over from the previous batch.
        model.zero_grad()

        # Forward pass and loss.
        logits = model(**batch_input)
        loss = loss_fct(logits, batch_label)
        loss_value = loss.item()
        window_loss += loss_value
        total_loss += loss_value

        # Backward pass computes parameter gradients; then update parameters.
        loss.backward()
        optimizer.step()

        # Report the average loss since the previous report, then reset the
        # window. The first window also contains step 0, hence the explicit
        # count rather than a fixed divisor.
        if step % 10 == 0 and step != 0:
            print(f"Step : {step}, Avg Loss : {window_loss / window_count:.4f}")
            window_loss, window_count = 0.0, 0

    # Use an explicit batch counter: the loop variable is unbound when the
    # loader is empty, which previously made this line raise.
    mean_loss = total_loss / n_batches if n_batches else float("nan")
    print(f"Mean Loss : {mean_loss:.4f}")
    print("Train Finished")

In [None]:
## Model implementation
# Load the sample CSV (downloaded by the wget cell earlier in this notebook)
# into a DataFrame; its `document` and `label` columns feed the dataset below.
sample_df = pd.read_csv('sample_df.csv')

In [None]:
# Dataset: wrap the texts and labels, then hold out a fixed-size validation split.
dataset = CustomDataset(list(sample_df.document.values), list(sample_df.label.values))
n_valid = 1000
# Derive the train size from the data so the split lengths always sum to
# len(dataset), as random_split requires (9000 for the 10,000-row sample).
n_train = len(dataset) - n_valid
train_dataset, valid_dataset = random_split(dataset, [n_train, n_valid])

# Dataloaders: shuffled batches for training, in-order batches for validation.
batch_size = 32
random_sampler = RandomSampler(train_dataset)
seq_sampler = SequentialSampler(valid_dataset)
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    sampler=random_sampler,
    collate_fn=custom_collate_fn,
)
valid_dataloader = DataLoader(
    valid_dataset,
    batch_size=64,
    sampler=seq_sampler,
    collate_fn=custom_collate_fn,
)

# Model instance
model = CustomClassifier(hidden_size=768, n_label=2)

# Loss and optimizer; train() reads both of these as module-level names.
loss_fct = CrossEntropyLoss()
optimizer = AdamW(
    model.parameters(),
    lr=2e-5,
    eps=1e-8,
)

# Start training
train(model, train_dataloader)

Some weights of the model checkpoint at klue/bert-base were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# available GPUs : 1
GPU name : Tesla P100-PCIE-16GB
Step : 10, Avg Loss : 0.6824
Step : 20, Avg Loss : 0.6265
Step : 30, Avg Loss : 0.5405
Step : 40, Avg Loss : 0.4647
Step : 50, Avg Loss : 0.4084
Step : 60, Avg Loss : 0.4684
Step : 70, Avg Loss : 0.4018
Step : 80, Avg Loss : 0.3706
Step : 90, Avg Loss : 0.4090
Step : 100, Avg Loss : 0.3917
Step : 110, Avg Loss : 0.4049
Step : 120, Avg Loss : 0.4052
Step : 130, Avg Loss : 0.3913
Step : 140, Avg Loss : 0.3930
Step : 150, Avg Loss : 0.3624
Step : 160, Avg Loss : 0.3956
Step : 170, Avg Loss : 0.3488
Step : 180, Avg Loss : 0.3180
Step : 190, Avg Loss : 0.3495
Step : 200, Avg Loss : 0.3614
Step : 210, Avg Loss : 0.3810
Step : 220, Avg Loss : 0.3451
Step : 230, Avg Loss : 0.3843
Step : 240, Avg Loss : 0.3109
Step : 250, Avg Loss : 0.3550
Step : 260, Avg Loss : 0.3312
Step : 270, Avg Loss : 0.3184
Step : 280, Avg Loss : 0.3822
Mean Loss : 0.4041
Train Finished
