In [None]:
!pip install transformers
!pip install colorama

In [None]:
import numpy as np
import pandas as pd
import copy
import gc
import os
import re
import time
from tqdm import tqdm
from time import sleep

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset,  RandomSampler, SequentialSampler
import torch.nn.functional as F

import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig, DataCollatorWithPadding, RobertaPreTrainedModel

from colorama import Fore, Style
# ANSI colour shortcuts used to highlight console log lines.
b_ = Fore.BLUE
y_ = Fore.YELLOW
sr_ = Style.RESET_ALL  # switches the terminal back to its default style

### CONFIG

In [None]:
# Central run configuration shared by every cell below.
CONFIG = dict(
    seed=42,
    epochs=3,
    model_name="huggingface/CodeBERTa-small-v1",
    train_bsize=32,
    val_bsize=128,
    max_length=256,
    learning_rate=1e-4,
    scheduler='get_linear_schedule_with_warmup',
    weight_decay=0.01,
    n_fold=3,
    device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
)

# The tokenizer is stored alongside the other settings; `truncation_side='left'`
# makes over-long inputs lose tokens from their beginning rather than their end.
CONFIG["tokenizer"] = AutoTokenizer.from_pretrained(
    CONFIG['model_name'],
    truncation_side='left',
)
print(CONFIG['device'])

In [None]:
def seed_everything(seed):
    """Seed every random number generator this notebook can draw from.

    Covers Python's ``random`` module, NumPy's legacy global generator and
    PyTorch (CPU plus all CUDA devices), and switches cuDNN to deterministic
    kernels so repeated runs with the same seed are comparable.

    Args:
        seed: Integer seed applied to all generators.
    """
    # Local import: `random` is not part of the notebook's top-level imports.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # covers the multi-GPU case
    # Trade a little speed for run-to-run reproducibility of cuDNN kernels.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Seed all generators once, using the seed stored in the run configuration.
seed_everything(CONFIG['seed'])

### Loss function

In [None]:
# Cross-entropy over the raw class scores (logits) produced by the model;
# targets are integer class indices taken from the `similar` column.
criterion = nn.CrossEntropyLoss()

### Step function

In [None]:
# Scores greater than or equal to this value are mapped to class 1.
THRESHOLD = 0.5


def step_function(value):
    """Binarize a tensor of scores against ``THRESHOLD``.

    Args:
        value: Tensor of scores with any shape; it is flattened before the
            comparison.

    Returns:
        A 1-D ``torch.int32`` tensor on the same device as ``value`` holding
        1 where the score is >= ``THRESHOLD`` and 0 elsewhere.
    """
    # Comparing against the Python scalar keeps the result on `value`'s own
    # device, so no explicit device transfer of the threshold is needed.
    return (value.view(-1) >= THRESHOLD).int()

### 모델 저장 디렉토리 설정

In [None]:
# Build the checkpoint directory path from the model name (slashes removed)
# and a free-form run tag, then make sure the directory exists.
OPTION='testff'
CONFIG['save_path'] = "/content/drive/MyDrive/{}_{}_model/".format(re.sub("/","",CONFIG['model_name']), OPTION)
# A shell `!mkdir CONFIG['save_path']` would not expand the Python expression,
# so the directory is created from Python instead. `exist_ok=True` keeps the
# cell safe to re-run.
os.makedirs(CONFIG['save_path'], exist_ok=True)
CONFIG['save_path']

### Dataset df

In [None]:
# Load the training pairs. The preview below shows the columns `code1`,
# `code2` (source-code strings) and the label column `similar`.
df = pd.read_csv('./drive/MyDrive/data/train.csv')
df.head()

Unnamed: 0,code1,code2,similar
0,"import sys\ndef MI(): return map(int,sys.stdin...","M1, D1 = [int(x) for x in input().split()]\nM2...",1
1,"a = input()\nif a == ""1"":\n print(""0"")\nelse:...",import os\nimport sys\nimport math\nimport hea...,1
2,S = input()\nT = input()\nl = len(S)\ncount = ...,S = input()\nT = input()\nN = len(S)\ncnt = 0\...,1
3,import collections\nn = int(input())\nd = list...,"n = int(input())\nd = list(map(int, input().sp...",1
4,a = input().split()\nif int(a[0]) <= int(a[1])...,"import math\na,b = map(int,input().split())\np...",1


### fold 설정

In [None]:
# Tag every row with the index of the stratified fold in which it serves as
# validation data.
from sklearn.model_selection import StratifiedKFold, KFold

skf = StratifiedKFold(
    n_splits=CONFIG['n_fold'],
    shuffle=True,
    random_state=CONFIG['seed'],
)

for fold_id, split in enumerate(skf.split(X=df, y=df.similar)):
    held_out_rows = split[1]  # second element holds the validation indices
    df.loc[held_out_rows, "kfold"] = int(fold_id)

# The column is created through partial assignment, so cast it to integers
# once every row has received its fold id.
df["kfold"] = df["kfold"].astype(int)
df.head()

Unnamed: 0,code1,code2,similar,kfold
0,"import sys\ndef MI(): return map(int,sys.stdin...","M1, D1 = [int(x) for x in input().split()]\nM2...",1,1
1,"a = input()\nif a == ""1"":\n print(""0"")\nelse:...",import os\nimport sys\nimport math\nimport hea...,1,0
2,S = input()\nT = input()\nl = len(S)\ncount = ...,S = input()\nT = input()\nN = len(S)\ncnt = 0\...,1,0
3,import collections\nn = int(input())\nd = list...,"n = int(input())\nd = list(map(int, input().sp...",1,0
4,a = input().split()\nif int(a[0]) <= int(a[1])...,"import math\na,b = map(int,input().split())\np...",1,2


In [None]:
# Tokenizer
def tokenizing(dataset):
    """Tokenize both code columns of ``dataset`` and collect its labels.

    Args:
        dataset: DataFrame-like object with ``code1``, ``code2`` and
            ``similar`` columns.

    Returns:
        A ``(tokenized, labels)`` tuple. ``tokenized`` is the tokenizer output
        for ``code1`` extended with the ``code2`` tensors under the same key
        names suffixed with ``"2"``; ``labels`` is a plain list.
    """
    first_codes, second_codes, labels = (
        dataset[column].tolist() for column in ('code1', 'code2', 'similar')
    )

    # Both sides are padded/truncated to the same fixed length.
    encode_kwargs = dict(
        return_tensors='pt',
        padding='max_length',
        truncation=True,
        max_length=CONFIG['max_length'],
    )
    tokenizer = CONFIG['tokenizer']

    tokenized = tokenizer(first_codes, **encode_kwargs)
    second_encoding = tokenizer(second_codes, **encode_kwargs)

    # Merge the second encoding into the first under "<key>2".
    for name in list(second_encoding.keys()):
        tokenized[name + "2"] = second_encoding[name]

    return tokenized, labels

# Dataset definition.
class CustomDataset(Dataset):
    """Map-style dataset pairing pre-tokenized tensors with their labels.

    ``tokenized_dataset`` is a mapping from feature name to an indexable
    container with one entry per sample; ``labels`` is an indexable sequence
    of the same length.
    """

    def __init__(self, tokenized_dataset, labels):
        self.labels = labels
        self.tokenized_dataset = tokenized_dataset

    def __len__(self):
        # The number of samples is defined by the label sequence.
        return len(self.labels)

    def __getitem__(self, idx):
        # Slice every feature at `idx`, then attach the label as a tensor.
        item = {}
        for key, val in self.tokenized_dataset.items():
            item[key] = val[idx]
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    
# dataloader
def pro_dataset(dataset, batch_size, state):
    """Tokenize ``dataset`` and wrap it in a ``DataLoader``.

    Args:
        dataset: DataFrame-like object accepted by ``tokenizing``.
        batch_size: Number of samples per batch.
        state: ``'train'`` for randomly shuffled batches or ``'val'`` for
            batches in the original row order.

    Returns:
        A ``DataLoader`` over a ``CustomDataset`` built from ``dataset``.

    Raises:
        ValueError: If ``state`` is neither ``'train'`` nor ``'val'``.
    """
    # Validate first so a typo fails immediately instead of after the
    # (expensive) tokenization with an unrelated unbound-variable error.
    if state not in ('train', 'val'):
        raise ValueError(
            "state must be 'train' or 'val', got {!r}".format(state)
        )

    tokenized, labels = tokenizing(dataset)
    custom_dataset = CustomDataset(tokenized, labels)

    if state == 'train':
        sampler = RandomSampler(custom_dataset)
    else:
        sampler = SequentialSampler(custom_dataset)

    # NOTE: the incomplete final batch is dropped in both modes, so up to
    # `batch_size - 1` rows per loader are never yielded.
    return DataLoader(
        custom_dataset,
        sampler=sampler,
        batch_size=batch_size,
        drop_last=True,
    )

In [None]:
# Customized model
class CustomModel(nn.Module):
    """Siamese encoder that classifies a pair of code snippets.

    Both snippets are passed through the same pretrained encoder, mean-pooled
    over their real (non-padding) tokens, and compared with cosine similarity.
    The scalar similarity is then mapped to two class logits by a small head.
    """

    def __init__(self, config):
        """Load the pretrained encoder named in ``CONFIG['model_name']``.

        Args:
            config: Model configuration object forwarded to
                ``AutoModel.from_pretrained``.
        """
        super(CustomModel, self).__init__()
        self.model = AutoModel.from_pretrained(CONFIG['model_name'], config=config)
        self.similarity_fn = nn.CosineSimilarity()
        self.sequential = nn.Sequential(
            nn.Linear(1, 64),
            nn.BatchNorm1d(64),
            nn.Linear(64, 2)
        )

    @staticmethod
    def _masked_mean(hidden_states, attention_mask=None):
        """Average token embeddings over the sequence axis, skipping padding.

        Args:
            hidden_states: Tensor indexed as (batch, sequence, hidden).
            attention_mask: Optional (batch, sequence) tensor with 1 for real
                tokens and 0 for padding. When ``None`` every position is
                averaged.

        Returns:
            Tensor of shape (batch, hidden).
        """
        if attention_mask is None:
            return hidden_states.mean(dim=1)
        mask = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
        summed = (hidden_states * mask).sum(dim=1)
        # Clamp so a fully masked row yields zeros instead of dividing by zero.
        counts = mask.sum(dim=1).clamp(min=1.0)
        return summed / counts

    def forward(self, input_ids=None, attention_mask=None,
                input_ids2=None, attention_mask2=None, labels=None):
        """Return two class logits per input pair.

        Args:
            input_ids, attention_mask: Token ids and mask of the first snippet.
            input_ids2, attention_mask2: Token ids and mask of the second one.
            labels: Accepted so a whole batch dict can be splatted into the
                call; not used inside the model.

        Returns:
            Tensor of shape (batch, 2) with unnormalized class scores.
        """
        outputs1 = self.model(
            input_ids, attention_mask=attention_mask
        )
        outputs2 = self.model(
            input_ids2, attention_mask=attention_mask2
        )

        # Element 0 of the encoder output is the per-token hidden state.
        # Inputs are padded to a fixed length, so padding positions must be
        # excluded from the mean or short snippets get diluted embeddings.
        pooled1 = self._masked_mean(outputs1[0], attention_mask)
        pooled2 = self._masked_mean(outputs2[0], attention_mask2)

        # Normalize
        a_norm = F.normalize(pooled1, p=2, dim=1)
        b_norm = F.normalize(pooled2, p=2, dim=1)

        # (batch,) similarity -> (batch, 1) feature for the classification head.
        sim_score = self.similarity_fn(a_norm, b_norm).unsqueeze(-1)
        return self.sequential(sim_score)

In [None]:
def run_training(model, train_loader, val_loader, optimizer, scheduler, fold):
  """Train ``model`` for ``CONFIG['epochs']`` epochs, keeping the best checkpoint.

  After every epoch the model is evaluated on ``val_loader``. Whenever the
  sample-weighted mean validation loss improves, the weights are written to
  ``CONFIG['save_path'] + "Loss-Fold-{fold}.pt"``.

  Args:
    model: Module called as ``model(**batch)`` that returns class logits.
    train_loader: Iterable of dict batches that include a ``'labels'`` entry.
    val_loader: Same format as ``train_loader``; used for model selection.
    optimizer: Optimizer over ``model``'s parameters.
    scheduler: Learning-rate scheduler stepped once per training batch.
    fold: Fold identifier, used only in the checkpoint file name.

  Returns:
    ``model`` with the best-validation-loss weights loaded. If validation
    never produced a finite loss, the last-epoch weights are returned.
  """
  best_model_wts = None
  best_val_loss = np.inf

  train_log_interval = 100
  valid_log_interval = 25

  # torch.save does not create missing directories.
  os.makedirs(CONFIG['save_path'], exist_ok=True)

  for epoch in range(1, CONFIG['epochs'] + 1):
    print("\n==== epoch {} ====".format(epoch))
    model.train()

    # Running sums over the current logging window.
    window_loss = 0.0
    window_acc = 0.0

    bar = tqdm(enumerate(train_loader), total=len(train_loader))
    for idx, items in bar:
      item = {key: val.to(CONFIG['device']) for key, val in items.items()}
      labels = item['labels'].view(-1)
      optimizer.zero_grad()

      outputs = model(**item)
      loss = criterion(outputs, labels)

      loss.backward()
      optimizer.step()
      scheduler.step()

      preds = torch.argmax(outputs, dim=-1)
      window_loss += loss.item()
      # Use the real batch size rather than the configured one.
      window_acc += (preds == labels).float().mean().item()
      if (idx + 1) % train_log_interval == 0:
        print("Loss: {:3f}   |    Accuracy: {:3f}".format(
            window_loss / train_log_interval, window_acc / train_log_interval))
        window_loss = 0.0
        window_acc = 0.0

    print("---- Validation.... ----")
    model.eval()

    # Epoch-level totals; every validation batch contributes, not only the
    # ones that fall into a complete logging window.
    val_loss_sum = 0.0
    val_correct = 0
    val_count = 0
    window_loss = 0.0
    window_acc = 0.0

    with torch.no_grad():
      for idx, items in enumerate(val_loader):
        item = {key: val.to(CONFIG['device']) for key, val in items.items()}
        labels = item['labels'].view(-1)
        outputs = model(**item)

        preds = torch.argmax(outputs, dim=-1)
        loss = criterion(outputs, labels)

        batch_size = labels.numel()
        batch_correct = (preds == labels).sum().item()

        # The criterion returns a batch mean, so weight it by the batch size.
        val_loss_sum += loss.item() * batch_size
        val_correct += batch_correct
        val_count += batch_size

        window_loss += loss.item()
        window_acc += batch_correct / batch_size
        if (idx + 1) % valid_log_interval == 0:
          print("Loss: {:3f}   |    Accuracy: {:3f}".format(
              window_loss / valid_log_interval, window_acc / valid_log_interval))
          window_loss = 0.0
          window_acc = 0.0

    if val_count == 0:
      # Nothing to select on; avoid dividing by zero.
      print("Validation loader produced no batches; skipping model selection.")
      continue

    epoch_val_loss = val_loss_sum / val_count
    epoch_val_acc = val_correct / val_count
    print("Best Loss: {:3f}    |    This epoch Loss: {:3f}".format(best_val_loss, epoch_val_loss))
    print("This epoch Accuracy: {:3f}".format(epoch_val_acc))

    if epoch_val_loss < best_val_loss:
      # Record the improvement so later, worse epochs do not overwrite it.
      best_val_loss = epoch_val_loss
      best_model_wts = copy.deepcopy(model.state_dict())
      PATH = CONFIG['save_path']+f"Loss-Fold-{fold}.pt"
      torch.save(best_model_wts, PATH)

  if best_model_wts is not None:
    model.load_state_dict(best_model_wts)

  return model

### 실행

In [None]:
# Build the encoder configuration here so this cell does not depend on a
# variable left over from another (possibly deleted) cell.
MODEL_CONFIG = AutoConfig.from_pretrained(CONFIG['model_name'])

for fold in range(0, CONFIG['n_fold']):
  print(f"{y_}====== Fold: {fold} ======{sr_}")

  # Rows tagged with the current fold id form the validation set; all other
  # rows are used for training.
  df_train = df[df.kfold != fold].reset_index(drop=True)
  df_val = df[df.kfold == fold].reset_index(drop=True)

  train_loader = pro_dataset(df_train, CONFIG['train_bsize'], 'train')
  valid_loader = pro_dataset(df_val, CONFIG['val_bsize'], 'val')

  # A fresh model, optimizer and schedule per fold.
  model = CustomModel(config=MODEL_CONFIG).to(CONFIG['device'])
  optimizer = optim.Adam(
      model.parameters(),
      lr=CONFIG['learning_rate'],
      weight_decay=CONFIG['weight_decay'],
      eps=1e-8
      )
  scheduler = transformers.get_linear_schedule_with_warmup(
      optimizer,
      num_warmup_steps=500,
      num_training_steps=len(train_loader)* CONFIG["epochs"],
      last_epoch=-1
      )

  model = run_training(model, train_loader, valid_loader,optimizer, scheduler,fold=fold)

  # The optimizer and scheduler also reference the parameters, so drop them
  # together with the model before asking PyTorch to release cached memory.
  del model, optimizer, scheduler, train_loader, valid_loader
  gc.collect()
  if torch.cuda.is_available():
    torch.cuda.empty_cache()
  print()