# Start

In [None]:
# Mount Google Drive so files stored there are reachable under /content/drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## transformers & Huggingface

transformer 강의 (https://youtu.be/Yk1tV_cXMMU)

huggingface <br>
: transformer model과 학습 스크립트를 제공하는 모듈 <br>
huggingface를 사용하면 bert, gpt 등 transformer 모델 사용시 layer, model 등을 선언하거나 구현하지 않아도 됨 

In [None]:
# Install the Hugging Face transformers package (IPython shell command)
!pip install transformers

In [None]:
# Imports
import pandas as pd
import numpy as np
import random
import os
import math
import easydict
import warnings
warnings.filterwarnings('ignore') # silence every warning for the rest of the session

from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold

from tqdm import tqdm
tqdm.pandas()

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
from torch.optim.optimizer import Optimizer
from torch.utils.data import Dataset, DataLoader

from transformers import AutoTokenizer, BertModel
from transformers import get_linear_schedule_with_warmup

# Switch the working directory to the project folder on Google Drive so that
# the relative paths used later (train file, ./models/...) resolve inside it
root_dir = "/content/drive/MyDrive/"
project_folder = "project"
os.chdir(os.path.join(root_dir,project_folder))

In [None]:
# Fix every random seed so that runs are reproducible
def seed_everything(seed: int = 42, contain_cuda: bool = False):
    """Seed the Python, NumPy and PyTorch generators and pin cuDNN settings.

    Args:
        seed: Value handed to every random number generator.
        contain_cuda: Kept for interface compatibility; this function never
            reads it (the CUDA generators are seeded unconditionally).
    """
    # cuDNN flags: request deterministic algorithms and switch benchmarking off
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    os.environ['PYTHONHASHSEED'] = str(seed)

    # Python, NumPy, then the torch CPU / current-GPU / all-GPU generators
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    print(f"Seed set as {seed}")

# Arguments

In [None]:
# Choose the compute device: CUDA when a GPU is available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'current device : {device}')

# Experiment configuration
args = easydict.EasyDict({
        "seed":42,  # passed to seed_everything below
        "warmup_steps":500,  # warm-up steps given to the LR scheduler
        "cycle_mult":1.2,  # not read by any code in this notebook section
        "seq_max_len":128,  # max_length used when tokenizing
        "batch_size": 32,  # DataLoader batch size
        "epochs": 10,  # maximum number of epochs per fold
        "patience":3,  # epochs without F1 improvement before early stopping
        "n_splits" : 5,  # number of StratifiedKFold folds
        "lr": 1e-05,  # AdamW learning rate
        "num_workers":2,  # DataLoader worker processes
        "smoothing": 0.1,  # label-smoothing factor of the loss
        "dp": 0.1,  # dropout rate of the classification head
        "train_file":'train.csv',  # CSV read by load_data (relative to the working directory)
    })

# Both names are stored on args; the code in this section does not read them back
project_name = "project"
args.update(
            {
                "project_name":project_name,
                "model_name":project_name,
             }
            )

# Seed all RNGs before any model or data loader is created
seed_everything(args.seed)

current device : cuda
Seed set as 42


# Data Processing
: 자연어 csv 형태의 data를 bert model의 input 형태로 변경하는 과정

In [None]:
# Dataset wrapper around the tokenizer output
class NewsDataset(Dataset):
    """Map-style dataset pairing tokenized inputs with their class labels.

    Args:
        tokenized_dataset: Mapping (anything with ``.items()``) from feature
            name to a tensor that is indexed by sample position.
        label: Indexable sequence of class labels; its length defines the
            dataset length.
    """

    def __init__(self, tokenized_dataset, label):
        self.tokenized_dataset = tokenized_dataset
        self.label = label

    def __getitem__(self, idx):
        """Return a dict with a copy of every feature at ``idx`` plus 'label'."""
        item = {key: val[idx].clone().detach() for key, val in self.tokenized_dataset.items()}
        # Labels are class indices. A label column that once contained NaN is
        # float64 in pandas even after those rows are dropped, and the loss
        # uses the target as a scatter_ index, which must be int64 -- so the
        # integer dtype is forced here.
        item['label'] = torch.tensor(self.label[idx], dtype=torch.long)
        return item

    def __len__(self):
        """Return the number of samples (number of labels)."""
        return len(self.label)

In [None]:
# Basic data cleaning
def preprocessing_dataset(args, dataset):
    """Drop unlabeled rows and duplicate samples, then renumber the index.

    Args:
        args: Configuration object; not read by this function.
        dataset: DataFrame with 'title', 'content' and 'label' columns.

    Returns:
        A new DataFrame holding only rows whose label is present, without
        rows repeating the same (title, content, label), indexed 0..n-1.
    """
    has_label = dataset.label.notnull()
    labeled_rows = dataset[has_label]
    unique_rows = labeled_rows.drop_duplicates(subset=['title', 'content', 'label'])
    return unique_rows.reset_index(drop=True)

# Load the data
def load_data(args, dataset_dir):
    """Read the CSV at ``dataset_dir`` and return it after basic cleaning.

    Args:
        args: Configuration object, forwarded to ``preprocessing_dataset``.
        dataset_dir: Path of the CSV file to read.

    Returns:
        The DataFrame produced by ``preprocessing_dataset``.
    """
    raw_frame = pd.read_csv(dataset_dir)
    return preprocessing_dataset(args, raw_frame)

# Tokenize the text columns into BERT inputs
def tokenized_dataset(args, dataset, tokenizer):
    """Encode (title, content) pairs with the given tokenizer.

    Args:
        args: Configuration object; ``args.seq_max_len`` is the maximum
            sequence length.
        dataset: DataFrame with 'title' and 'content' columns.
        tokenizer: Callable taking two lists of texts plus encoding options.

    Returns:
        Whatever ``tokenizer`` returns for the whole dataset.
    """
    titles = dataset['title'].tolist()
    contents = dataset['content'].tolist()

    encode_options = {
        "return_tensors": "pt",            # ask for PyTorch tensors
        # per the Hugging Face tokenizer docs, True pads to the longest
        # sequence in this call rather than to max_length
        "padding": True,
        "truncation": True,                # cut sequences longer than max_length
        "max_length": args.seq_max_len,
        "add_special_tokens": True,        # add the model's special tokens
    }
    return tokenizer(titles, contents, **encode_options)

In [None]:
# DataLoader: slices the full data into mini-batches of batch_size
def get_trainLoader(args, train_data, valid_data, train_label, valid_label, tokenizer):
    """Build the training and validation DataLoaders for one split.

    Args:
        args: Configuration object; ``batch_size`` and ``num_workers`` are
            read here and the object is forwarded to ``tokenized_dataset``.
        train_data: DataFrame holding the training rows.
        valid_data: DataFrame holding the validation rows.
        train_label: Labels matching ``train_data``.
        valid_label: Labels matching ``valid_data``.
        tokenizer: Tokenizer forwarded to ``tokenized_dataset``.

    Returns:
        ``(trainloader, validloader)``; only the training loader shuffles.
    """
    # Tokenize both splits (training first), then wrap them as datasets
    encoded_train = tokenized_dataset(args, train_data, tokenizer)
    encoded_valid = tokenized_dataset(args, valid_data, tokenizer)
    train_set = NewsDataset(encoded_train, train_label)
    valid_set = NewsDataset(encoded_valid, valid_label)

    # Options shared by both loaders
    shared_options = {
        "batch_size": args.batch_size,
        "num_workers": args.num_workers,
    }
    trainloader = DataLoader(train_set, shuffle=True, **shared_options)
    validloader = DataLoader(valid_set, shuffle=False, **shared_options)

    return trainloader, validloader

# Optimizer
: loss 최적화 알고리즘 선택

In [None]:
def get_optimizer(model, args):
    """Create the AdamW optimizer for every parameter of ``model``.

    Args:
        model: Module whose ``parameters()`` are optimized.
        args: Configuration object; ``args.lr`` is the learning rate.

    Returns:
        An ``AdamW`` instance (weight decay 0.01) whose gradients have been
        cleared once.
    """
    adamw_settings = {
        "lr": args.lr,
        "weight_decay": 0.01,  # decoupled weight decay coefficient
    }
    optimizer = AdamW(model.parameters(), **adamw_settings)
    # Start from a clean gradient state
    optimizer.zero_grad()
    return optimizer

# Scheduler
: 학습 과정에서 lr 조절하는 scheduler 선택

In [None]:
def get_scheduler(optimizer, args, total_batch_):
    """Create the learning-rate scheduler for one training run.

    Args:
        optimizer: Optimizer whose learning rate is scheduled.
        args: Configuration object; ``warmup_steps`` and ``epochs`` are read.
        total_batch_: Number of optimizer steps (batches) per epoch.

    Returns:
        The scheduler built by ``get_linear_schedule_with_warmup``.
    """
    # Total number of scheduler steps: batches per epoch times epochs
    total_steps = int(total_batch_ * args.epochs)
    return get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=total_steps,
    )

# Model
: transformers의 BertModel 구조와 한국어 data로 사전학습된 parameter를 불러온 후, <br> classification에 사용하기 위해 추가적인 layer를 쌓음.

BERT 강의 (https://youtu.be/IwtexRHoWG0)

In [None]:
class kobert_Classifier(nn.Module):
    """Encoder followed by a tanh pooling layer and a linear classifier.

    Args:
        bert: Encoder called with ``input_ids``, ``attention_mask`` and
            ``token_type_ids``; element 0 of its output is used.
        hidden_size: Width of the encoder features.
        num_classes: Number of output logits.
        dr_rate: Dropout probability; a falsy value skips dropout entirely.
    """

    def __init__(self, bert, hidden_size=768, num_classes=2, dr_rate=0.0):
        super().__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        # Creation order is kept as pooler -> classifier so that seeded
        # parameter initialisation stays the same.
        self.pooler = nn.Linear(hidden_size, hidden_size)
        self.classifier = nn.Linear(hidden_size, num_classes)
        nn.init.xavier_uniform_(self.classifier.weight)
        self.dropout = nn.Dropout(p=dr_rate)

    def forward(self, token_ids, attention_mask, segment_ids):
        """Return the classification logits for a batch."""
        encoder_outputs = self.bert(
            input_ids=token_ids,
            attention_mask=attention_mask,
            token_type_ids=segment_ids,
        )
        # Features at sequence position 0 (presumably the [CLS] token)
        first_token = encoder_outputs[0][:, 0, :]
        pooled = torch.tanh(self.pooler(first_token))

        if self.dr_rate:
            pooled = self.dropout(pooled)

        return self.classifier(pooled)

In [None]:
# Load the pretrained tokenizer
def get_tokenizer(args):
    """Return the tokenizer of the "kykim/bert-kor-base" checkpoint.

    Args:
        args: Configuration object; not read by this function.
    """
    return AutoTokenizer.from_pretrained("kykim/bert-kor-base")

In [None]:
# Load the pretrained encoder and wrap it in the classifier
def get_model(args) :
    """Build a ``kobert_Classifier`` on top of "kykim/bert-kor-base".

    Args:
        args: Configuration object; ``args.dp`` is the dropout rate of the
            classification head.

    Returns:
        A new ``kobert_Classifier`` wrapping the pretrained ``BertModel``.
    """
    return kobert_Classifier(
        BertModel.from_pretrained("kykim/bert-kor-base"),
        dr_rate=args.dp,
    )

# Loss
: 손실함수 선택

In [None]:
# Label smoothing turns hard labels (0 or 1) into soft labels (values between 0 and 1).
# Cross-entropy loss with label smoothing applied.
class LabelSmoothingLoss(nn.Module):
    """Cross-entropy against a label-smoothed target distribution.

    The true class receives probability ``1 - smoothing``; the remaining
    ``smoothing`` mass is split evenly over the other ``classes - 1`` classes.

    Args:
        classes: Number of classes; must match the size of ``pred`` along
            ``dim``. ``classes=1`` divides by zero in ``forward``.
        smoothing: Probability mass moved away from the true class.
        dim: Dimension of ``pred`` that indexes the classes.
    """

    def __init__(self, classes=2, smoothing=0.0, dim=-1):
        super().__init__()
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.cls = classes
        self.dim = dim

    def forward(self, pred, target):
        """Return the mean smoothed cross-entropy.

        Args:
            pred: Raw logits with the class scores along ``self.dim``.
            target: Integer (int64) class indices shaped like ``pred`` with
                the class dimension removed.

        Returns:
            Scalar tensor: the loss averaged over all non-class positions.
        """
        log_probs = pred.log_softmax(dim=self.dim)
        with torch.no_grad():
            # Every class starts with the smoothed "off" probability ...
            true_dist = torch.full_like(log_probs, self.smoothing / (self.cls - 1))
            # ... and the true class is overwritten with the confidence.
            # Scattering along self.dim (rather than a fixed dim 1) keeps
            # this consistent with the log_softmax / sum dimension, so
            # inputs of rank > 2 also work.
            true_dist.scatter_(self.dim, target.unsqueeze(self.dim), self.confidence)
        return torch.mean(torch.sum(-true_dist * log_probs, dim=self.dim))

def get_criterion(args):
    """Return the training loss: ``LabelSmoothingLoss`` with ``args.smoothing``.

    The number of classes is left at the loss's default.
    """
    return LabelSmoothingLoss(smoothing=args.smoothing)

# Train
- StratifiedKFold를 사용해 data 분할
- F1 Score 사용

In [None]:
def _run_train_epoch(model, trainloader, criterion, optimizer, scheduler, epoch):
    """Run one optimisation pass over ``trainloader`` and print running metrics."""
    model.train()
    total_batches = len(trainloader)
    epoch_perform, window_perform = np.zeros(2), np.zeros(2)
    print()
    progress_bar = tqdm(enumerate(trainloader), total=total_batches, leave=True, position=0,)
    for step, batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()

        # Only logits are returned. token_type_ids is passed as None, i.e. the
        # batch's segment ids are not forwarded to the encoder.
        # NOTE(review): confirm that ignoring the segment ids is intended.
        output = model(input_ids, attention_mask, None)

        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()  # the scheduler advances once per batch

        predict = output.argmax(dim=-1).detach().cpu().numpy()
        # Per-batch F1 is used for progress reporting only
        f1 = f1_score(labels.detach().cpu().numpy(), predict)

        step_perform = np.array([loss.item(), f1])
        window_perform += step_perform
        epoch_perform += step_perform

        if (step + 1) % 50 == 0:
            print(
                f"Epoch {epoch:#04d} #{step + 1:#03d} -- loss: {window_perform[0] / 50:#.5f}, f1: {window_perform[1] / 50:#.4f}"
                )
            window_perform = np.zeros(2)
    print()
    print(
        f"Epoch {epoch:#04d} loss: {epoch_perform[0] / total_batches:#.5f}, f1: {epoch_perform[1] / total_batches:#.2f}"
        )


def _run_validation(model, validloader, criterion):
    """Evaluate ``model`` on ``validloader``; return (mean batch loss, F1 over all samples)."""
    model.eval()
    n_batches = len(validloader)
    loss_sum = 0.0
    all_predict, all_labels = [], []

    with torch.no_grad():
        for batch in tqdm(validloader, total=n_batches, leave=True, position=0,):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            valid_labels = batch['label'].to(device)

            valid_output = model(input_ids, attention_mask, None)
            loss_sum += criterion(valid_output, valid_labels).item()

            all_predict.extend(valid_output.argmax(dim=-1).cpu().numpy())
            all_labels.extend(valid_labels.cpu().numpy())

    val_loss = loss_sum / n_batches if n_batches else float('nan')
    # F1 is computed once over the whole validation set: averaging per-batch
    # F1 scores is not the F1 of the set and depends on the batch composition.
    val_f1 = f1_score(all_labels, all_predict)
    return val_loss, val_f1


def train(args, fold_lst=None):
    """Fine-tune the classifier with stratified K-fold cross-validation.

    For every selected fold a fresh model, optimizer and scheduler are built
    and trained for up to ``args.epochs`` epochs. Whenever the validation F1
    improves by more than 5e-4 the weights are written to
    ``./models/<fold>-fold/best.pt``; training of a fold stops early after
    ``args.patience`` consecutive epochs without such an improvement.

    Args:
        args: Configuration object (``train_file``, ``n_splits``, ``epochs``,
            ``patience`` and everything the helper factories read).
        fold_lst: 1-based fold numbers to train. ``None`` (default) trains
            all ``args.n_splits`` folds. The default is resolved at call time
            from the ``args`` passed in, instead of being a list built once
            from a global at definition time.

    Returns:
        None. Progress and the per-fold / mean best F1 scores are printed.
    """
    if fold_lst is None:
        fold_lst = list(range(1, args.n_splits + 1))

    criterion = get_criterion(args)
    tokenizer = get_tokenizer(args)
    all_dataset = load_data(args, dataset_dir = f'{args.train_file}')
    all_label = all_dataset['label'].values

    kf = StratifiedKFold(n_splits=args.n_splits, random_state=42, shuffle=True)
    best_val_f1_list = []
    for fold_idx, (train_index, test_index) in enumerate(kf.split(all_dataset, all_label), start=1):
        # Folds that were not requested are skipped but still counted
        if fold_idx not in fold_lst:
            continue

        os.makedirs(f'./models/{fold_idx}-fold', exist_ok=True)
        ### Model Select
        model = get_model(args)
        print('===================get model===================')
        model.to(device)

        train_data, valid_data = all_dataset.iloc[train_index], all_dataset.iloc[test_index]
        train_label, valid_label = all_label[train_index], all_label[test_index]

        print(f"len(train_label) : {len(train_label)}")
        print(f"len(valid_label) : {len(valid_label)}")

        trainloader, validloader = get_trainLoader(args, train_data, valid_data, train_label, valid_label, tokenizer)

        ### Optimizer
        optimizer = get_optimizer(model, args)

        ### Scheduler
        scheduler = get_scheduler(optimizer, args, len(trainloader))

        best_val_f1 = 0
        early_stopping_counter = 0

        print(f"---------------------------------- {fold_idx} fold----------------------------------")
        for epoch in tqdm(range(1, args.epochs+1)):
            _run_train_epoch(model, trainloader, criterion, optimizer, scheduler, epoch)

            ###### Validation
            val_total_loss, val_total_f1 = _run_validation(model, validloader, criterion)

            # Report before the save / early-stop decision so that the epoch
            # that triggers early stopping is reported as well
            print()
            print(
                f">>>> Validation loss: {val_total_loss:#.5f}, f1: {val_total_f1:#.4f}"
                )
            print()

            ###### Model save
            if val_total_f1 > best_val_f1 + 5e-04:
                print(f"New best model for val f1uracy : {val_total_f1:#.4f}! saving the best model..")
                torch.save(model.state_dict(), f"./models/{fold_idx}-fold/best.pt")

                best_val_f1 = val_total_f1
                early_stopping_counter = 0
            else:
                # No improvement over the best score: count towards early stopping
                early_stopping_counter += 1
                if early_stopping_counter >= args.patience:
                    print(
                        f"EarlyStopping counter: {early_stopping_counter} out of {args.patience}"
                    )
                    break

        best_val_f1_list.append(best_val_f1)
    print('='*50)
    print(f"{args.n_splits}-fold best_val_f1_list : {best_val_f1_list}")
    print('='*15, f'{args.n_splits}-fold Final Score(f1) : {np.mean(best_val_f1_list)}', '='*15)

# Training

In [None]:
# Display the current configuration
args

In [None]:
# Run cross-validated training on every fold (1 .. n_splits)
train(args, fold_lst=list(range(1, args.n_splits + 1)))

In [None]:
# Build a new classifier with get_model and pickle the whole module object.
# NOTE(review): this instance is created after training and never loads the
# fine-tuned weights that train() writes to ./models/<fold>-fold/best.pt --
# confirm that saving this freshly constructed model is intended.
model = get_model(args)
torch.save(model, './models/model.pt')