In [None]:
# Mount Google Drive at /content/drive; the data and project paths used later
# in this notebook all live under that mount point.
from google.colab import drive
# force_remount=True asks Colab to redo the mount if one already exists.
drive.mount('/content/drive', force_remount=True)

# pip, import

In [None]:
!pip install transformers
!pip install transformers datasets
!pip install wandb

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import random
import copy
from pprint import pprint
from tqdm import tqdm, tqdm_notebook
from collections import defaultdict, Counter, deque
import re
from itertools import chain
from importlib import import_module
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss, MSELoss
from torch.utils.data import (Dataset,
                              DataLoader, 
                              RandomSampler, 
                              SequentialSampler, 
                              TensorDataset)
from transformers import TrainingArguments, Trainer
from transformers import (AutoConfig, 
                          AutoTokenizer, 
                          RobertaForSequenceClassification,
                          Trainer,
                          TrainingArguments,
                          DataCollatorWithPadding,
                          EarlyStoppingCallback)
from transformers import AdamW
from transformers import (get_scheduler, 
                          get_cosine_with_hard_restarts_schedule_with_warmup,
                          get_linear_schedule_with_warmup)
from torch.optim.lr_scheduler import ReduceLROnPlateau, _LRScheduler
from tqdm.auto import tqdm
from datasets import load_metric, load_dataset, Dataset, concatenate_datasets
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import (accuracy_score, 
                             precision_recall_curve,
                             f1_score,
                             auc)
from sklearn.model_selection import StratifiedKFold
from torch.optim import Adam, AdamW
from torch.optim.optimizer import Optimizer, required
import math
import easydict
import wandb

# 시드 고정

In [None]:
def seed_everything(seed: int = 42, contain_cuda: bool = False):
  """Seed every random number generator used in this notebook.

  Sets PYTHONHASHSEED, seeds Python's `random`, NumPy and PyTorch (CPU and
  CUDA), and puts cuDNN into deterministic, non-benchmarking mode.

  Args:
    seed: Value handed to each generator.
    contain_cuda: Accepted for interface compatibility only; it is never read
      and the CUDA seeding calls are always made.
  """
  os.environ['PYTHONHASHSEED'] = str(seed)

  # Host-side generators first.
  for host_seed_fn in (random.seed, np.random.seed):
    host_seed_fn(seed)

  # Ask cuDNN for deterministic kernels and disable its autotuner.
  cudnn = torch.backends.cudnn
  cudnn.deterministic = True
  cudnn.benchmark = False

  # PyTorch generators: CPU, current CUDA device, then all CUDA devices.
  for torch_seed_fn in (torch.manual_seed,
                        torch.cuda.manual_seed,
                        torch.cuda.manual_seed_all):
    torch_seed_fn(seed)

  print(f"Seed set as {seed}")

# Seed all generators here so the cells below start from a known RNG state.
seed = 42
seed_everything(seed)

# 경로 설정 및 디바이스 정의

In [None]:
# Work from the project folder on the mounted Drive; the relative
# ./models/... paths used by the training cells resolve against it.
root_dir = '/content/drive/MyDrive'
project_folder = "DACON"
os.chdir(os.path.join(root_dir,project_folder))

# Use the first CUDA device when one is available, otherwise fall back to CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# wandb에 잘못 분류하는 코드 Pair 기록

In [None]:
def wrong_batch_for_wandb(tokenizer,
                          wrong_sample_index,
                          input_ids,
                          valid_labels,
                          valid_predict,
                          valid_output,
                          ):
  """Collect the misclassified samples of one validation batch for logging.

  Args:
    tokenizer: Object exposing `decode(ids, skip_special_tokens=...)`.
    wrong_sample_index: Row indices of the samples to report. When `None`, the
      indices are derived as the rows where `valid_labels != valid_predict`.
      (Previously this argument was ignored and always recomputed.)
    input_ids: Batch of token ids, indexable by an integer index array.
    valid_labels: NumPy array of ground-truth class ids (0 or 1).
    valid_predict: NumPy array of predicted class ids (0 or 1).
    valid_output: Per-class model outputs with a `.tolist()` method; column 0
      is reported as "diff" and column 1 as "same". Callers in this notebook
      pass raw logits, so despite the `*_prob` names the values are whatever
      scale `valid_output` is on.

  Returns:
    Tuple `(texts, labels, preds, diff_prob, same_prob)` of equally long lists:
    decoded inputs (special tokens kept), label names, predicted label names,
    and the column-0 / column-1 values of `valid_output`.
  """
  num_to_label_dict = {0:'diff', 1:'same',}

  # Honour the caller-supplied indices; only fall back to recomputing them.
  if wrong_sample_index is None:
    wrong_sample_index = np.where(valid_labels!=valid_predict)[0]

  wrong_sample_text = [tokenizer.decode(element, skip_special_tokens=False)
                       for element in input_ids[wrong_sample_index]]
  wrong_sample_label = [num_to_label_dict[int(lab)] for lab in valid_labels[wrong_sample_index]]
  wrong_sample_pred = [num_to_label_dict[int(pred)] for pred in valid_predict[wrong_sample_index]]
  wrong_sample_output = valid_output[wrong_sample_index].tolist()

  # Split the two output columns into separate lists for the wandb table.
  diff_prob = [row[0] for row in wrong_sample_output]
  same_prob = [row[1] for row in wrong_sample_output]

  return wrong_sample_text, wrong_sample_label, wrong_sample_pred, diff_prob, same_prob

# Optimizer

In [None]:
class AdamP(Optimizer):
  """Adam-style optimizer with a projection step on the update.

  For parameters with more than one dimension, the Adam update is tested
  against the weight: when the gradient is almost orthogonal to the weight
  (absolute cosine similarity below `delta / sqrt(dim)`), the component of the
  update that is parallel to the weight is removed and the weight decay is
  multiplied by `wd_ratio`.

  Args:
    params: Iterable of parameters or parameter-group dicts.
    lr: Learning rate.
    betas: Coefficients for the running averages of the gradient and of its
      square.
    eps: Term added to denominators for numerical stability.
    weight_decay: Decoupled weight-decay coefficient (applied multiplicatively
      to the weights before the update).
    delta: Threshold used in the cosine-similarity test.
    wd_ratio: Factor applied to the weight decay when the projection is used.
    nesterov: When True, use the Nesterov-style look-ahead numerator.
  """
  def __init__(
      self,
      params,
      lr=1e-3,
      betas=(0.9, 0.999),
      eps=1e-8,
      weight_decay=0,
      delta=0.1,
      wd_ratio=0.1,
      nesterov=False,
      ):
    defaults = dict(
        lr=lr,
        betas=betas,
        eps=eps,
        weight_decay=weight_decay,
        delta=delta,
        wd_ratio=wd_ratio,
        nesterov=nesterov,
        )
    super(AdamP, self).__init__(params, defaults)

  def _channel_view(self, x):
    """Flatten `x` to (first_dim, rest).

    `reshape` is used instead of `view` so non-contiguous tensors (for
    example transposed weights) do not raise.
    """
    return x.reshape(x.size(0), -1)

  def _layer_view(self, x):
    """Flatten `x` to a single row (1, numel); `reshape` tolerates any strides."""
    return x.reshape(1, -1)

  def _cosine_similarity(self, x, y, eps, view_func):
    """Absolute row-wise cosine similarity of `x` and `y` under `view_func`."""
    x = view_func(x)
    y = view_func(y)

    return F.cosine_similarity(x, y, dim=1, eps=eps).abs_()

  def _projection(self, p, grad, perturb, delta, wd_ratio, eps):
    """Project `perturb` away from the weight direction when applicable.

    The test is tried per output channel first and then for the whole layer;
    the first view that passes the test is used.

    Returns:
      `(perturb, wd)` where `wd` is `wd_ratio` if a projection was applied and
      1 otherwise. `perturb` is modified in place when projecting.
    """
    wd = 1
    # Shape that broadcasts a per-row scalar back over the parameter.
    expand_size = [-1] + [1] * (len(p.shape) - 1)
    for view_func in [self._channel_view, self._layer_view]:

      cosine_sim = self._cosine_similarity(grad, p.data, eps, view_func)

      if cosine_sim.max() < delta / math.sqrt(view_func(p.data).size(1)):
        # Unit-norm weight (per row of the chosen view).
        p_n = p.data / view_func(p.data).norm(dim=1).view(expand_size).add_(eps)
        # Remove the component of the update that is parallel to the weight.
        perturb -= p_n * view_func(p_n * perturb).sum(dim=1).view(expand_size)
        wd = wd_ratio
        return perturb, wd

    return perturb, wd

  def step(self, closure=None):
    """Perform one optimization step.

    Args:
      closure: Optional callable that re-evaluates the model and returns the
        loss.

    Returns:
      The value returned by `closure`, or None when no closure is given.
    """
    loss = None
    if closure is not None:
      loss = closure()

    for group in self.param_groups:
      for p in group["params"]:
        if p.grad is None:
          continue

        grad = p.grad.data
        beta1, beta2 = group["betas"]
        nesterov = group["nesterov"]

        state = self.state[p]

        # State initialization
        if len(state) == 0:
          state["step"] = 0
          state["exp_avg"] = torch.zeros_like(p.data)
          state["exp_avg_sq"] = torch.zeros_like(p.data)

        # Adam
        exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]

        state["step"] += 1
        bias_correction1 = 1 - beta1 ** state["step"]
        bias_correction2 = 1 - beta2 ** state["step"]

        exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
        exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)

        denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(
            group["eps"]
            )
        step_size = group["lr"] / bias_correction1

        if nesterov:
          perturb = (beta1 * exp_avg + (1 - beta1) * grad) / denom
        else:
          perturb = exp_avg / denom

        # Projection (only for parameters with more than one dimension)
        wd_ratio = 1
        if len(p.shape) > 1:
          perturb, wd_ratio = self._projection(
              p,
              grad,
              perturb,
              group["delta"],
              group["wd_ratio"],
              group["eps"],
              )

        # Weight decay, scaled by wd_ratio when the projection was applied
        if group["weight_decay"] > 0:
          p.data.mul_(1 - group["lr"] * group["weight_decay"] * wd_ratio)

        # Step
        p.data.add_(perturb, alpha=-step_size)

    return loss

def get_optimizer(model, args):
  """Build the optimizer named by `args.optimizer` for `model`.

  Supported names are "Adam", "AdamW" and "AdamP". Every choice uses
  `args.lr` as learning rate and a weight decay of 0.01.

  Raises:
    NotImplementedError: If `args.optimizer` is not one of the names above.

  Returns:
    The optimizer, after `zero_grad()` has been called on it.
  """
  choice = args.optimizer
  if choice not in ("Adam", "AdamW", "AdamP"):
    raise NotImplementedError('Optimizer not available')

  if choice == "AdamP":
    optimizer = AdamP(
        model.parameters(),
        lr=args.lr,
        betas=(0.9, 0.999),
        weight_decay=0.01,
        delta=0.1,
        wd_ratio=0.1,
        nesterov=False,
        )
  else:
    optimizer_cls = Adam if choice == "Adam" else AdamW
    optimizer = optimizer_cls(model.parameters(), lr=args.lr, weight_decay=0.01)

  # Clear the gradients of every parameter before handing the optimizer back.
  optimizer.zero_grad()

  return optimizer

# Scheduler

In [None]:
class CosineAnnealingWarmupRestarts(_LRScheduler):
  """Cosine annealing with linear warmup and restarts.

  Each cycle warms the learning rate up linearly from `min_lr` to the cycle's
  maximum over `warmup_steps` steps, then anneals it back to `min_lr` along a
  half cosine. After every cycle the maximum is multiplied by `gamma` and the
  annealing part of the cycle is stretched by `cycle_mult`.

    optimizer (Optimizer): Wrapped optimizer.
    first_cycle_steps (int): First cycle step size.
    cycle_mult(float): Cycle steps magnification. Default: 1.
    max_lr(float): First cycle's max learning rate. Default: 0.1.
    min_lr(float): Min learning rate. Default: 0.001.
    warmup_steps(int): Linear warmup step size. Default: 0.
    gamma(float): Decrease rate of max learning rate by cycle. Default: 1.
    last_epoch (int): The index of last epoch. Default: -1.
  """
  def __init__(self,
               optimizer : torch.optim.Optimizer,
               first_cycle_steps : int,
               cycle_mult : float = 1.,
               max_lr : float = 0.1,
               min_lr : float = 0.001,
               warmup_steps : int = 0,
               gamma : float = 1.,
               last_epoch : int = -1
               ):
    assert warmup_steps < first_cycle_steps

    self.first_cycle_steps = first_cycle_steps # first cycle step size
    self.cycle_mult = cycle_mult # cycle steps magnification
    self.base_max_lr = max_lr # first max learning rate
    self.max_lr = max_lr # max learning rate in the current cycle
    self.min_lr = min_lr # min learning rate
    self.warmup_steps = warmup_steps # warmup step size
    self.gamma = gamma # decrease rate of max learning rate by cycle

    self.cur_cycle_steps = first_cycle_steps # length of the current cycle
    self.cycle = 0 # cycle count
    self.step_in_cycle = last_epoch # position inside the current cycle

    # The base-class constructor performs one step(), so every attribute read
    # by step()/get_lr() must already exist at this point.
    super(CosineAnnealingWarmupRestarts, self).__init__(optimizer, last_epoch)

    # set learning rate min_lr
    self.init_lr()

  def init_lr(self):
    """Reset every parameter group (and `base_lrs`) to `min_lr`."""
    self.base_lrs = []
    for param_group in self.optimizer.param_groups:
      param_group['lr'] = self.min_lr
      self.base_lrs.append(self.min_lr)
    # Keep get_last_lr() in sync with the values just written.
    self._last_lr = [group['lr'] for group in self.optimizer.param_groups]

  def get_lr(self):
    """Learning rates for the current position inside the current cycle."""
    if self.step_in_cycle == -1:
      return self.base_lrs
    elif self.step_in_cycle < self.warmup_steps:
      # Linear warmup from base_lr towards max_lr.
      return [(self.max_lr - base_lr)*self.step_in_cycle / self.warmup_steps + base_lr for base_lr in self.base_lrs]
    else:
      # Half-cosine decay from max_lr back to base_lr.
      return [base_lr + (self.max_lr - base_lr) \
              * (1 + math.cos(math.pi * (self.step_in_cycle-self.warmup_steps) \
                              / (self.cur_cycle_steps - self.warmup_steps))) / 2
              for base_lr in self.base_lrs]

  def step(self, epoch=None):
    """Advance the schedule and write the new learning rate to the optimizer.

    Args:
      epoch: When None, advance by one step from the previous position. When
        given, jump to that absolute step index and derive the cycle and the
        position inside it in closed form.
    """
    if epoch is None:
      epoch = self.last_epoch + 1
      self.step_in_cycle = self.step_in_cycle + 1
      if self.step_in_cycle >= self.cur_cycle_steps:
        # Restart: begin the next cycle and stretch its annealing part.
        self.cycle += 1
        self.step_in_cycle = self.step_in_cycle - self.cur_cycle_steps
        self.cur_cycle_steps = int((self.cur_cycle_steps - self.warmup_steps) * self.cycle_mult) + self.warmup_steps
    else:
      if epoch >= self.first_cycle_steps:
        if self.cycle_mult == 1.:
          self.step_in_cycle = epoch % self.first_cycle_steps
          self.cycle = epoch // self.first_cycle_steps
        else:
          # Geometric-series inversion: which cycle does `epoch` fall into?
          n = int(math.log((epoch / self.first_cycle_steps * (self.cycle_mult - 1) + 1), self.cycle_mult))
          self.cycle = n
          self.step_in_cycle = epoch - int(self.first_cycle_steps * (self.cycle_mult ** n - 1) / (self.cycle_mult - 1))
          self.cur_cycle_steps = self.first_cycle_steps * self.cycle_mult ** (n)
      else:
        self.cur_cycle_steps = self.first_cycle_steps
        self.step_in_cycle = epoch

    # These updates must run on every call, including the step that triggers
    # a restart and calls that pass an explicit epoch.
    self.max_lr = self.base_max_lr * (self.gamma**self.cycle)
    self.last_epoch = math.floor(epoch)
    for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()):
      param_group['lr'] = lr
    self._last_lr = [group['lr'] for group in self.optimizer.param_groups]


def get_scheduler(optimizer, args, total_batch_):
  """Build the learning-rate scheduler named by `args.scheduler`.

  Args:
    optimizer: Optimizer to wrap.
    args: Namespace read for `scheduler`, and depending on the choice
      `warmup_steps`, `epochs`, `cycle_mult` and `lr`.
    total_batch_: Number of training batches per epoch; used to size the
      linear schedule.

  Raises:
    NotImplementedError: If `args.scheduler` is not "plateau", "linear" or
      "cosine".
  """
  kind = args.scheduler

  if kind == "plateau":
    return ReduceLROnPlateau(
        optimizer, mode="max", factor=0.85, patience=2, verbose=True
    )

  if kind == "linear":
    # Alternative that was tried: warm up over 10% of all training steps,
    # i.e. num_warmup_steps=int(total_batch_*args.epochs*0.1).
    total_steps = int(total_batch_*args.epochs)
    return get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=total_steps,
    )

  if kind == "cosine":
    return CosineAnnealingWarmupRestarts(
        optimizer,
        first_cycle_steps=200,
        cycle_mult=args.cycle_mult,
        max_lr=args.lr,
        min_lr=args.lr * 0.01,
        warmup_steps=args.warmup_steps,
        gamma=0.9,
    )

  raise NotImplementedError('LR Scheduler not available')

# Loss

In [None]:
class FocalLoss(nn.Module):
  """Focal loss for multi-class classification on raw logits.

  The log-probability of each class is scaled by `(1 - p) ** gamma` before
  the negative log-likelihood is taken, so confidently classified samples
  contribute less.

  Args:
    weight: Optional per-class weights forwarded to `F.nll_loss`.
    gamma: Exponent of the modulating factor.
    reduction: Reduction mode forwarded to `F.nll_loss`.
  """
  def __init__(self, weight=None,
               gamma=2., reduction='mean'):
    super().__init__()
    self.weight = weight
    self.gamma = gamma
    self.reduction = reduction

  def forward(self, input_tensor, target_tensor):
    """Return the focal loss of logits `input_tensor` against class ids `target_tensor`."""
    log_p = torch.log_softmax(input_tensor, dim=-1)
    # Down-weight classes that already receive a high probability.
    modulating = (1 - log_p.exp()).pow(self.gamma)
    return F.nll_loss(
        modulating * log_p,
        target_tensor,
        weight=self.weight,
        reduction=self.reduction,
        )

class LabelSmoothingLoss(nn.Module):
  """Cross entropy against a label-smoothed target distribution.

  The true class receives `1 - smoothing`; every other class receives
  `smoothing / (classes - 1)`.

  Args:
    classes: Number of classes used to spread the smoothing mass.
    smoothing: Probability mass moved away from the true class.
    dim: Dimension over which the softmax and the final sum are taken.
  """
  def __init__(self, classes=3, smoothing=0.0, dim=-1):
    super().__init__()
    self.confidence = 1.0 - smoothing
    self.smoothing = smoothing
    self.cls = classes
    self.dim = dim

  def forward(self, pred, target):
    """Return the batch-mean smoothed cross entropy of logits `pred`."""
    log_probs = pred.log_softmax(dim=self.dim)
    with torch.no_grad():
      # Start from the off-target value everywhere, then overwrite the true
      # class of each row (column index taken from `target`).
      off_target = self.smoothing / (self.cls - 1)
      true_dist = torch.full_like(log_probs, off_target)
      true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
    per_sample = (-true_dist * log_probs).sum(dim=self.dim)
    return per_sample.mean()


def get_criterion(args):
  """Build the loss function selected by `args.criterion`.

  Supported values are 'smoothing' (only together with a non-zero
  `args.smoothing`), 'cross' and 'focal'.

  The label-smoothing loss is built with an explicit class count:
  `args.num_labels` when present, otherwise 2, which is the number of labels
  the classifier in this notebook is configured with. Relying on the loss's
  own default of 3 classes made the smoothed target sum to less than 1 for a
  two-class model.

  Raises:
    NotImplementedError: For any other combination, including 'smoothing'
      with `args.smoothing == 0`.
  """
  if args.smoothing!=0 and args.criterion == 'smoothing':
    criterion = LabelSmoothingLoss(classes=getattr(args, 'num_labels', 2),
                                   smoothing=args.smoothing)
  elif args.criterion == 'cross':
    criterion = nn.CrossEntropyLoss()
  elif args.criterion == 'focal':
    criterion = FocalLoss(gamma=2.0)
  else:
    raise NotImplementedError('Criterion not available')
  return criterion

# Tokenize

In [None]:
# The data had previously been split into train/valid files; merge them back
# so the folds can be cut from the full set.
# NOTE(review): the original comment speaks of 5 folds while args.n_splits is 6 — confirm.
tdataset = load_dataset("csv", data_files='/content/drive/MyDrive/train_data_lv1.csv')['train']
vdataset = load_dataset("csv", data_files='/content/drive/MyDrive/valid_data_lv1.csv')['train']
rawdataset = concatenate_datasets([tdataset, vdataset])
tokenizer = AutoTokenizer.from_pretrained("microsoft/graphcodebert-base")
# When a pair exceeds max_length, drop tokens from the left side.
tokenizer.truncation_side = 'left'

def example_fn(examples):
    """Tokenize a code pair and attach its label.

    Encodes `code1` and `code2` together, padded and truncated to 512 tokens,
    and copies the `similar` column into `labels`. Reads the module-level
    `tokenizer`.
    """
    outputs = tokenizer(examples['code1'], examples['code2'], padding='max_length', max_length=512, truncation=True)
    outputs['labels'] = examples['similar']
    return outputs

# Replace the raw text and label columns with the tokenizer outputs and `labels`.
dataset = rawdataset.map(example_fn, remove_columns=['code1', 'code2', 'similar'])

# Arguments 설정

In [None]:
# Re-derive the device here so this cell also works when run on its own.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(f'current device : {device}')

# Hyperparameters for the run, with attribute-style access (args.lr, ...).
args = easydict.EasyDict({
    "seed":42,  # passed to seed_everything at the end of this cell
    "optimizer":"AdamW",  # branch taken in get_optimizer
    "scheduler":"linear",  # branch taken in get_scheduler
    "warmup_steps":500,  # warmup length for the linear and cosine schedulers
    "cycle_mult":1.2,  # only read by the cosine scheduler
    "batch_size": 16,
    "patience":5,  # NOTE(review): not read anywhere in the visible cells
    "n_splits":6,  # the dataset is cut into this many contiguous slices (see `gap`)
    "epochs":3,  # used to size the linear schedule (total_batch_ * epochs)
    "lr": 2e-05,
    "criterion":'cross',  # branch taken in get_criterion
    "smoothing": 0.0,  # only relevant when criterion == 'smoothing'
    "model": "microsoft/graphcodebert-base",
    "logging_wrong_samples":True,  # collect misclassified pairs during validation
    })

# The same string is used as the wandb project name and as the checkpoint folder name.
project_name = "graphcodebert_Bs16_OptAdamW_ScduLinear_Sm0.0"
args.update(
            {
                "project_name":project_name,
                "model_name":project_name,
             }
            )

seed_everything(args.seed)

# Train

In [None]:
# Collect unreferenced Python objects and release PyTorch's cached CUDA memory
# before the model is created.
import gc
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Loss function chosen by args.criterion.
criterion = get_criterion(args)
# Two-label sequence classifier on top of the pretrained GraphCodeBERT encoder.
config =  AutoConfig.from_pretrained("microsoft/graphcodebert-base")
config.num_labels = 2
model = RobertaForSequenceClassification.from_pretrained("microsoft/graphcodebert-base", config=config)
model.to(device)
# NOTE(review): the model is created once here and the fold cells below do not
# re-create it, so each fold continues from the weights of the previous fold
# and has already trained on rows it later validates on — confirm this is intended.

# One entry is appended per fold by the validation cells.
best_val_acc_list = []
# Size of one validation slice; rows beyond n_splits * gap never land in a validation slice.
gap = int(len(dataset) / args.n_splits)

In [None]:
# Authenticate with Weights & Biases before any run is started.
wandb.login()

## fold 1

In [None]:
# Each fold is run from its own cells because of Google Colab runtime limits.
f = 1

print(f"---------------------------------- {f} fold----------------------------------")

# One wandb run per fold, named after the model and the fold number.
run = wandb.init(project=args.project_name)
wandb.run.name = f'{args.model_name}/{f}-fold'
wandb.config.update(args)
os.makedirs(f'./models/{args.model_name}/{f}-fold', exist_ok=True)

total_size = len(dataset)
total_ids = list(range(total_size))
# Fold f validates on the f-th contiguous slice of `gap` rows and trains on
# every other row.
del_ids = list(range((f-1)*gap, f*gap))
held_out_ids = set(del_ids)
# Built as an ordered list so the row order no longer depends on set iteration.
training_ids = [idx for idx in total_ids if idx not in held_out_ids]

training_dset = dataset.select(training_ids)
eval_dset = dataset.select(del_ids)

collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Batch size comes from args so the value logged to wandb is the one in use
# (it was hard-coded to 16 before, which equals the current args.batch_size).
trainloader = DataLoader(training_dset,
                          batch_size=args.batch_size,
                          shuffle=True,
                          collate_fn = collator
                          )

validloader = DataLoader(eval_dset,
                          batch_size=args.batch_size,
                          shuffle=False,
                          collate_fn = collator
                          )

total_batch_ = len(trainloader)
valid_batch_ = len(validloader)

# A fresh optimizer and scheduler are created for every fold.
optimizer = get_optimizer(model, args)
scheduler = get_scheduler(optimizer, args, total_batch_)

## epoch 1

In [None]:
f = 1
e = 1

print(f"------------------------------ {f} fold {e} epoch------------------------------")

model.train()
# Running [loss, accuracy] sums: over the whole epoch and over the current 50-batch window.
epoch_perform, batch_perform = np.zeros(2), np.zeros(2)
print()
progress_bar = tqdm(enumerate(trainloader), total=len(trainloader), leave=True, position=0,)
for j, v in progress_bar:
  input_ids, attention_mask, labels = v['input_ids'].to(device), v['attention_mask'].to(device), v['labels'].to(device)

  optimizer.zero_grad()

  # Labels are not passed to the model, so only the logits are produced and
  # the loss is computed with the criterion chosen above.
  outputs = model(input_ids, attention_mask)
  output = outputs.logits # The outputs object is a SequenceClassifierOutput
  loss = criterion(output, labels)
  loss.backward()
  optimizer.step()
  # NOTE(review): the "plateau" scheduler needs a metric in step(), so this
  # loop only supports the per-batch schedulers ("linear", "cosine").
  scheduler.step()
  # Log the learning rate the optimizer actually holds after the scheduler
  # step, instead of re-deriving it through scheduler.get_lr() outside step().
  for param_group in optimizer.param_groups:
    wandb.log({"learning_rate": param_group["lr"]})

  predict = output.argmax(dim=-1)
  predict = predict.detach().cpu().numpy()
  labels = labels.detach().cpu().numpy()
  acc = accuracy_score(labels, predict)

  batch_perform += np.array([loss.item(), acc])
  epoch_perform += np.array([loss.item(), acc])

  # Report the running averages every 50 batches, then reset the window.
  if (j + 1) % 50 == 0:
    print(
        f"Epoch {e} #{j + 1} -- loss: {batch_perform[0] / 50}, acc: {batch_perform[1] / 50}"
    )
    batch_perform = np.zeros(2)
print()
print(
    f"Epoch {e} loss: {epoch_perform[0] / total_batch_}, acc: {epoch_perform[1] / total_batch_}"
    )
wandb.log({
    "epoch": e,
    "Train epoch Loss": epoch_perform[0] / total_batch_,
    "Train epoch Acc": epoch_perform[1] / total_batch_}
    )
# Checkpoint the weights so the validation cell can reload them.
torch.save(model.state_dict(), f"./models/{args.model_name}/{f}-fold/train.pt")

In [None]:
f = 1
e = 1
# Reset on every run of this cell, so the "new best" branch below compares
# against an accuracy of 0.
best_val_loss, best_val_acc, = np.inf, 0
# Validation: reload the checkpoint written by the training cell.
load_path = f'./models/{args.model_name}/{f}-fold/train.pt'
model.load_state_dict(torch.load(load_path,map_location=device))
model.to(device)
model.eval()
# Running sums over samples: [sum of per-sample loss, number of correct predictions].
valid_perform = np.zeros(2)

all_valid_predict_lst = []
all_valid_labels_lst = []

# Misclassified code pairs, collected for the wandb table.
wrong_sample_dict = defaultdict(list)

with torch.no_grad():
  for v in validloader:
    input_ids, attention_mask, valid_labels = v["input_ids"].to(device), v["attention_mask"].to(device), v["labels"].to(device)

    valid_outputs = model(input_ids, attention_mask)
    valid_output = valid_outputs.logits
    valid_loss = criterion(valid_output, valid_labels)

    valid_predict = valid_output.argmax(dim=-1)
    valid_predict = valid_predict.detach().cpu().numpy()
    valid_labels = valid_labels.detach().cpu().numpy()

    # Record the misclassified code pairs of this batch.
    if args.logging_wrong_samples:
      wrong_sample_index = np.where(valid_labels!=valid_predict)[0]
      if len(wrong_sample_index)>0:
        wrong_sample_text, wrong_sample_label, wrong_sample_pred, diff_logit, same_logit = wrong_batch_for_wandb(tokenizer, wrong_sample_index, input_ids, valid_labels, valid_predict, valid_output)

        wrong_sample_dict['입력 코드 Pair'] += wrong_sample_text
        wrong_sample_dict['실제값'] += wrong_sample_label
        wrong_sample_dict['예측값'] += wrong_sample_pred
        wrong_sample_dict['diff_logit'] += diff_logit
        wrong_sample_dict['same_logit'] += same_logit

    # Weight the batch-mean loss by the batch size and count correct
    # predictions, so a shorter final batch is not over-weighted.
    # Assumes the criterion returns a batch mean, as every loss built by
    # get_criterion in this notebook does.
    batch_n = len(valid_labels)
    valid_perform += np.array([valid_loss.item() * batch_n, (valid_labels == valid_predict).sum()])

    all_valid_predict_lst += list(valid_predict)
    all_valid_labels_lst += list(valid_labels)

# Save the model

# Per-sample averages over the whole validation slice (guarded against an empty loader).
n_valid = max(len(all_valid_labels_lst), 1)
val_total_loss = valid_perform[0] / n_valid
val_total_acc = valid_perform[1] / n_valid
best_val_loss = min(best_val_loss, val_total_loss)


if val_total_acc > best_val_acc:
    print(f"New best model for val accuracy : {val_total_acc}! saving the best model..")
    torch.save(model.state_dict(), f"./models/{args.model_name}/{f}-fold/best.pt")

    best_val_acc = val_total_acc

    # Log a confusion matrix to wandb.
    class_names = ['diff','same'] # (0,1)
    wandb.log({f"{e}_epoch_conf_mat" : wandb.plot.confusion_matrix(probs=None,
                                                                      y_true=all_valid_labels_lst, preds=all_valid_predict_lst,
                                                                      class_names=class_names)})

    # The table of wrong samples is only stored once accuracy exceeds 0.91.
    if args.logging_wrong_samples and val_total_acc > 0.91:

      wrong_sample_df = pd.DataFrame(wrong_sample_dict)
      wrong_sample_df.to_csv(f"./models/{args.model_name}/{f}-fold/wrong_df.csv",index=False)
      print('='*15,f'Fold{f} Wrong DataFrame Saved','='*15)
      text_table = wandb.Table(data = wrong_sample_df)
      run.log({f"{f}_fold_wrong_samples" : text_table})

print()
print(
    f">>>> Validation loss: {val_total_loss}, Acc: {val_total_acc}"
    )
print()
wandb.log({
    "epoch": e,
    "Last_Valid Loss": val_total_loss,
    "Last_Valid Acc": val_total_acc,
    })
# Running list of per-fold best accuracies; its mean is reported after each fold.
best_val_acc_list.append(best_val_acc)
print('='*50)
print(f"{f}fold best_val_acc_list : {best_val_acc_list}")
print('='*15, f'{f}fold Final Score(ACC) : {np.mean(best_val_acc_list)}', '='*15)
wandb.log({
f"Total Mean ACC ({f}fold)": np.mean(best_val_acc_list)}
)

## fold 2

In [None]:
# Each fold is run from its own cells because of Google Colab runtime limits.
f = 2

print(f"---------------------------------- {f} fold----------------------------------")

# One wandb run per fold, named after the model and the fold number.
run = wandb.init(project=args.project_name)
wandb.run.name = f'{args.model_name}/{f}-fold'
wandb.config.update(args)
os.makedirs(f'./models/{args.model_name}/{f}-fold', exist_ok=True)

total_size = len(dataset)
total_ids = list(range(total_size))
# Fold f validates on the f-th contiguous slice of `gap` rows and trains on
# every other row.
del_ids = list(range((f-1)*gap, f*gap))
held_out_ids = set(del_ids)
# Built as an ordered list so the row order no longer depends on set iteration.
training_ids = [idx for idx in total_ids if idx not in held_out_ids]

training_dset = dataset.select(training_ids)
eval_dset = dataset.select(del_ids)

collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Batch size comes from args so the value logged to wandb is the one in use
# (it was hard-coded to 16 before, which equals the current args.batch_size).
trainloader = DataLoader(training_dset,
                          batch_size=args.batch_size,
                          shuffle=True,
                          collate_fn = collator
                          )

validloader = DataLoader(eval_dset,
                          batch_size=args.batch_size,
                          shuffle=False,
                          collate_fn = collator
                          )

total_batch_ = len(trainloader)
valid_batch_ = len(validloader)

# A fresh optimizer and scheduler are created for every fold.
# NOTE(review): `model` itself is not re-created in this cell, so training
# continues from whatever weights it currently holds — confirm this is intended.
optimizer = get_optimizer(model, args)
scheduler = get_scheduler(optimizer, args, total_batch_)

## epoch 1

In [None]:
f = 2
e = 1

print(f"------------------------------ {f} fold {e} epoch------------------------------")

model.train()
# Running [loss, accuracy] sums: over the whole epoch and over the current 50-batch window.
epoch_perform, batch_perform = np.zeros(2), np.zeros(2)
print()
progress_bar = tqdm(enumerate(trainloader), total=len(trainloader), leave=True, position=0,)
for j, v in progress_bar:
  input_ids, attention_mask, labels = v['input_ids'].to(device), v['attention_mask'].to(device), v['labels'].to(device)

  optimizer.zero_grad()

  # Labels are not passed to the model, so only the logits are produced and
  # the loss is computed with the criterion chosen above.
  outputs = model(input_ids, attention_mask)
  output = outputs.logits # The outputs object is a SequenceClassifierOutput
  loss = criterion(output, labels)
  loss.backward()
  optimizer.step()
  # NOTE(review): the "plateau" scheduler needs a metric in step(), so this
  # loop only supports the per-batch schedulers ("linear", "cosine").
  scheduler.step()
  # Log the learning rate the optimizer actually holds after the scheduler
  # step, instead of re-deriving it through scheduler.get_lr() outside step().
  for param_group in optimizer.param_groups:
    wandb.log({"learning_rate": param_group["lr"]})

  predict = output.argmax(dim=-1)
  predict = predict.detach().cpu().numpy()
  labels = labels.detach().cpu().numpy()
  acc = accuracy_score(labels, predict)

  batch_perform += np.array([loss.item(), acc])
  epoch_perform += np.array([loss.item(), acc])

  # Report the running averages every 50 batches, then reset the window.
  if (j + 1) % 50 == 0:
    print(
        f"Epoch {e} #{j + 1} -- loss: {batch_perform[0] / 50}, acc: {batch_perform[1] / 50}"
    )
    batch_perform = np.zeros(2)
print()
print(
    f"Epoch {e} loss: {epoch_perform[0] / total_batch_}, acc: {epoch_perform[1] / total_batch_}"
    )
wandb.log({
    "epoch": e,
    "Train epoch Loss": epoch_perform[0] / total_batch_,
    "Train epoch Acc": epoch_perform[1] / total_batch_}
    )
# Checkpoint the weights so the validation cell can reload them.
torch.save(model.state_dict(), f"./models/{args.model_name}/{f}-fold/train.pt")

In [None]:
f = 2
e = 1
# Reset on every run of this cell, so the "new best" branch below compares
# against an accuracy of 0.
best_val_loss, best_val_acc, = np.inf, 0
# Validation: reload the checkpoint written by the training cell.
load_path = f'./models/{args.model_name}/{f}-fold/train.pt'
model.load_state_dict(torch.load(load_path,map_location=device))
model.to(device)
model.eval()
# Running sums over samples: [sum of per-sample loss, number of correct predictions].
valid_perform = np.zeros(2)

all_valid_predict_lst = []
all_valid_labels_lst = []

# Misclassified code pairs, collected for the wandb table.
wrong_sample_dict = defaultdict(list)

with torch.no_grad():
  for v in validloader:
    input_ids, attention_mask, valid_labels = v["input_ids"].to(device), v["attention_mask"].to(device), v["labels"].to(device)

    valid_outputs = model(input_ids, attention_mask)
    valid_output = valid_outputs.logits
    valid_loss = criterion(valid_output, valid_labels)

    valid_predict = valid_output.argmax(dim=-1)
    valid_predict = valid_predict.detach().cpu().numpy()
    valid_labels = valid_labels.detach().cpu().numpy()

    # Record the misclassified code pairs of this batch.
    if args.logging_wrong_samples:
      wrong_sample_index = np.where(valid_labels!=valid_predict)[0]
      if len(wrong_sample_index)>0:
        wrong_sample_text, wrong_sample_label, wrong_sample_pred, diff_logit, same_logit = wrong_batch_for_wandb(tokenizer, wrong_sample_index, input_ids, valid_labels, valid_predict, valid_output)

        wrong_sample_dict['입력 코드 Pair'] += wrong_sample_text
        wrong_sample_dict['실제값'] += wrong_sample_label
        wrong_sample_dict['예측값'] += wrong_sample_pred
        wrong_sample_dict['diff_logit'] += diff_logit
        wrong_sample_dict['same_logit'] += same_logit

    # Weight the batch-mean loss by the batch size and count correct
    # predictions, so a shorter final batch is not over-weighted.
    # Assumes the criterion returns a batch mean, as every loss built by
    # get_criterion in this notebook does.
    batch_n = len(valid_labels)
    valid_perform += np.array([valid_loss.item() * batch_n, (valid_labels == valid_predict).sum()])

    all_valid_predict_lst += list(valid_predict)
    all_valid_labels_lst += list(valid_labels)

# Save the model

# Per-sample averages over the whole validation slice (guarded against an empty loader).
n_valid = max(len(all_valid_labels_lst), 1)
val_total_loss = valid_perform[0] / n_valid
val_total_acc = valid_perform[1] / n_valid
best_val_loss = min(best_val_loss, val_total_loss)


if val_total_acc > best_val_acc:
    print(f"New best model for val accuracy : {val_total_acc}! saving the best model..")
    torch.save(model.state_dict(), f"./models/{args.model_name}/{f}-fold/best.pt")

    best_val_acc = val_total_acc

    # Log a confusion matrix to wandb.
    class_names = ['diff','same'] # (0,1)
    wandb.log({f"{e}_epoch_conf_mat" : wandb.plot.confusion_matrix(probs=None,
                                                                      y_true=all_valid_labels_lst, preds=all_valid_predict_lst,
                                                                      class_names=class_names)})

    # The table of wrong samples is only stored once accuracy exceeds 0.91.
    if args.logging_wrong_samples and val_total_acc > 0.91:

      wrong_sample_df = pd.DataFrame(wrong_sample_dict)
      wrong_sample_df.to_csv(f"./models/{args.model_name}/{f}-fold/wrong_df.csv",index=False)
      print('='*15,f'Fold{f} Wrong DataFrame Saved','='*15)
      text_table = wandb.Table(data = wrong_sample_df)
      run.log({f"{f}_fold_wrong_samples" : text_table})

print()
print(
    f">>>> Validation loss: {val_total_loss}, Acc: {val_total_acc}"
    )
print()
wandb.log({
    "epoch": e,
    "Last_Valid Loss": val_total_loss,
    "Last_Valid Acc": val_total_acc,
    })
# Running list of per-fold best accuracies; its mean is reported after each fold.
best_val_acc_list.append(best_val_acc)
print('='*50)
print(f"{f}fold best_val_acc_list : {best_val_acc_list}")
print('='*15, f'{f}fold Final Score(ACC) : {np.mean(best_val_acc_list)}', '='*15)
wandb.log({
f"Total Mean ACC ({f}fold)": np.mean(best_val_acc_list)}
)

## fold 3

In [None]:
# Each fold is run from its own cells because of Google Colab runtime limits.
f = 3

print(f"---------------------------------- {f} fold----------------------------------")

# One wandb run per fold, named after the model and the fold number.
run = wandb.init(project=args.project_name)
wandb.run.name = f'{args.model_name}/{f}-fold'
wandb.config.update(args)
os.makedirs(f'./models/{args.model_name}/{f}-fold', exist_ok=True)

total_size = len(dataset)
total_ids = list(range(total_size))
# Fold f validates on the f-th contiguous slice of `gap` rows and trains on
# every other row.
del_ids = list(range((f-1)*gap, f*gap))
held_out_ids = set(del_ids)
# Built as an ordered list so the row order no longer depends on set iteration.
training_ids = [idx for idx in total_ids if idx not in held_out_ids]

training_dset = dataset.select(training_ids)
eval_dset = dataset.select(del_ids)

collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Batch size comes from args so the value logged to wandb is the one in use
# (it was hard-coded to 16 before, which equals the current args.batch_size).
trainloader = DataLoader(training_dset,
                          batch_size=args.batch_size,
                          shuffle=True,
                          collate_fn = collator
                          )

validloader = DataLoader(eval_dset,
                          batch_size=args.batch_size,
                          shuffle=False,
                          collate_fn = collator
                          )

total_batch_ = len(trainloader)
valid_batch_ = len(validloader)

# A fresh optimizer and scheduler are created for every fold.
# NOTE(review): `model` itself is not re-created in this cell, so training
# continues from whatever weights it currently holds — confirm this is intended.
optimizer = get_optimizer(model, args)
scheduler = get_scheduler(optimizer, args, total_batch_)

## epoch 1

In [None]:
f = 3
e = 1

print(f"------------------------------ {f} fold {e} epoch------------------------------")

model.train()
# Running [loss, accuracy] sums: over the whole epoch and over the current 50-batch window.
epoch_perform, batch_perform = np.zeros(2), np.zeros(2)
print()
progress_bar = tqdm(enumerate(trainloader), total=len(trainloader), leave=True, position=0,)
for j, v in progress_bar:
  input_ids, attention_mask, labels = v['input_ids'].to(device), v['attention_mask'].to(device), v['labels'].to(device)

  optimizer.zero_grad()

  # Labels are not passed to the model, so only the logits are produced and
  # the loss is computed with the criterion chosen above.
  outputs = model(input_ids, attention_mask)
  output = outputs.logits # The outputs object is a SequenceClassifierOutput
  loss = criterion(output, labels)
  loss.backward()
  optimizer.step()
  # NOTE(review): the "plateau" scheduler needs a metric in step(), so this
  # loop only supports the per-batch schedulers ("linear", "cosine").
  scheduler.step()
  # Log the learning rate the optimizer actually holds after the scheduler
  # step, instead of re-deriving it through scheduler.get_lr() outside step().
  for param_group in optimizer.param_groups:
    wandb.log({"learning_rate": param_group["lr"]})

  predict = output.argmax(dim=-1)
  predict = predict.detach().cpu().numpy()
  labels = labels.detach().cpu().numpy()
  acc = accuracy_score(labels, predict)

  batch_perform += np.array([loss.item(), acc])
  epoch_perform += np.array([loss.item(), acc])

  # Report the running averages every 50 batches, then reset the window.
  if (j + 1) % 50 == 0:
    print(
        f"Epoch {e} #{j + 1} -- loss: {batch_perform[0] / 50}, acc: {batch_perform[1] / 50}"
    )
    batch_perform = np.zeros(2)
print()
print(
    f"Epoch {e} loss: {epoch_perform[0] / total_batch_}, acc: {epoch_perform[1] / total_batch_}"
    )
wandb.log({
    "epoch": e,
    "Train epoch Loss": epoch_perform[0] / total_batch_,
    "Train epoch Acc": epoch_perform[1] / total_batch_}
    )
# Checkpoint the weights so the validation cell can reload them.
torch.save(model.state_dict(), f"./models/{args.model_name}/{f}-fold/train.pt")

In [None]:
f = 3
e = 1
# Reset on every run of this cell, so the "new best" branch below compares
# against an accuracy of 0.
best_val_loss, best_val_acc, = np.inf, 0
# Validation: reload the checkpoint written by the training cell.
load_path = f'./models/{args.model_name}/{f}-fold/train.pt'
model.load_state_dict(torch.load(load_path,map_location=device))
model.to(device)
model.eval()
# Running sums over samples: [sum of per-sample loss, number of correct predictions].
valid_perform = np.zeros(2)

all_valid_predict_lst = []
all_valid_labels_lst = []

# Misclassified code pairs, collected for the wandb table.
wrong_sample_dict = defaultdict(list)

with torch.no_grad():
  for v in validloader:
    input_ids, attention_mask, valid_labels = v["input_ids"].to(device), v["attention_mask"].to(device), v["labels"].to(device)

    valid_outputs = model(input_ids, attention_mask)
    valid_output = valid_outputs.logits
    valid_loss = criterion(valid_output, valid_labels)

    valid_predict = valid_output.argmax(dim=-1)
    valid_predict = valid_predict.detach().cpu().numpy()
    valid_labels = valid_labels.detach().cpu().numpy()

    # Record the misclassified code pairs of this batch.
    if args.logging_wrong_samples:
      wrong_sample_index = np.where(valid_labels!=valid_predict)[0]
      if len(wrong_sample_index)>0:
        wrong_sample_text, wrong_sample_label, wrong_sample_pred, diff_logit, same_logit = wrong_batch_for_wandb(tokenizer, wrong_sample_index, input_ids, valid_labels, valid_predict, valid_output)

        wrong_sample_dict['입력 코드 Pair'] += wrong_sample_text
        wrong_sample_dict['실제값'] += wrong_sample_label
        wrong_sample_dict['예측값'] += wrong_sample_pred
        wrong_sample_dict['diff_logit'] += diff_logit
        wrong_sample_dict['same_logit'] += same_logit

    # Weight the batch-mean loss by the batch size and count correct
    # predictions, so a shorter final batch is not over-weighted.
    # Assumes the criterion returns a batch mean, as every loss built by
    # get_criterion in this notebook does.
    batch_n = len(valid_labels)
    valid_perform += np.array([valid_loss.item() * batch_n, (valid_labels == valid_predict).sum()])

    all_valid_predict_lst += list(valid_predict)
    all_valid_labels_lst += list(valid_labels)

# Save the model

# Per-sample averages over the whole validation slice (guarded against an empty loader).
n_valid = max(len(all_valid_labels_lst), 1)
val_total_loss = valid_perform[0] / n_valid
val_total_acc = valid_perform[1] / n_valid
best_val_loss = min(best_val_loss, val_total_loss)


if val_total_acc > best_val_acc:
    print(f"New best model for val accuracy : {val_total_acc}! saving the best model..")
    torch.save(model.state_dict(), f"./models/{args.model_name}/{f}-fold/best.pt")

    best_val_acc = val_total_acc

    # Log a confusion matrix to wandb.
    class_names = ['diff','same'] # (0,1)
    wandb.log({f"{e}_epoch_conf_mat" : wandb.plot.confusion_matrix(probs=None,
                                                                      y_true=all_valid_labels_lst, preds=all_valid_predict_lst,
                                                                      class_names=class_names)})

    # The table of wrong samples is only stored once accuracy exceeds 0.91.
    if args.logging_wrong_samples and val_total_acc > 0.91:

      wrong_sample_df = pd.DataFrame(wrong_sample_dict)
      wrong_sample_df.to_csv(f"./models/{args.model_name}/{f}-fold/wrong_df.csv",index=False)
      print('='*15,f'Fold{f} Wrong DataFrame Saved','='*15)
      text_table = wandb.Table(data = wrong_sample_df)
      run.log({f"{f}_fold_wrong_samples" : text_table})

print()
print(
    f">>>> Validation loss: {val_total_loss}, Acc: {val_total_acc}"
    )
print()
wandb.log({
    "epoch": e,
    "Last_Valid Loss": val_total_loss,
    "Last_Valid Acc": val_total_acc,
    })
# Running list of per-fold best accuracies; its mean is reported after each fold.
best_val_acc_list.append(best_val_acc)
print('='*50)
print(f"{f}fold best_val_acc_list : {best_val_acc_list}")
print('='*15, f'{f}fold Final Score(ACC) : {np.mean(best_val_acc_list)}', '='*15)
wandb.log({
f"Total Mean ACC ({f}fold)": np.mean(best_val_acc_list)}
)

## fold 4

In [None]:
# The run is split into one cell per step because of Google Colab runtime issues.
f = 4

print(f"---------------------------------- {f} fold----------------------------------")

run = wandb.init(project=args.project_name)
wandb.run.name = f'{args.model_name}/{f}-fold'
wandb.config.update(args)
os.makedirs(f'./models/{args.model_name}/{f}-fold', exist_ok=True)

# Hold out the f-th contiguous slice of `gap` rows for validation and train on the
# rest. The slice end is clamped so it can never index past the dataset.
total_size = len(dataset)
total_ids = list(range(total_size))
del_ids = list(range((f-1)*gap, min(f*gap, total_size)))
training_ids = set(total_ids) - set(del_ids)

training_dset = dataset.select(list(training_ids))
eval_dset = dataset.select(del_ids)

collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainloader = DataLoader(training_dset,
                          batch_size=16,
                          shuffle=True,
                          collate_fn = collator
                          )

validloader = DataLoader(eval_dset,
                          batch_size=16,
                          shuffle=False,
                          collate_fn = collator
                          )

total_batch_ = len(trainloader)
valid_batch_ = len(validloader)

# Start this fold from the pretrained checkpoint. Otherwise `model` still carries the
# weights fine-tuned in the previous fold, whose training split contained this fold's
# validation rows, so the validation score would be inflated by leakage.
# This is the same checkpoint the inference cell loads each fold's best.pt into.
model = RobertaForSequenceClassification.from_pretrained("microsoft/graphcodebert-base")
model.to(device)

# NOTE(review): this call signature does not match transformers.get_scheduler imported
# at the top of the notebook; presumably a notebook-local helper of the same name
# shadows it. Confirm.
optimizer = get_optimizer(model, args)
scheduler = get_scheduler(optimizer, args, total_batch_)

## epoch 1

In [None]:
f = 4
e = 1

print(f"------------------------------ {f} fold {e} epoch------------------------------")

model.train()
# Running sums of [loss, accuracy]: one for the whole epoch, one for the current
# 50-batch reporting window.
epoch_perform, batch_perform = np.zeros(2), np.zeros(2)
print()
progress_bar = tqdm(enumerate(trainloader), total=len(trainloader), leave=True, position=0,)
for j, v in progress_bar:
    input_ids, attention_mask, labels = v['input_ids'].to(device), v['attention_mask'].to(device), v['labels'].to(device)

    optimizer.zero_grad()

    # Labels are not passed to the model, so only logits come back; the loss is
    # computed with the notebook's own criterion.
    outputs = model(input_ids, attention_mask)
    output = outputs.logits # The outputs object is a SequenceClassifierOutput
    loss = criterion(output, labels)
    loss.backward()
    optimizer.step()
    scheduler.step()
    # get_last_lr() is the supported accessor for the rate(s) set by the last step();
    # calling get_lr() outside step() is discouraged and can report wrong values for
    # schedulers that compute the rate incrementally.
    for learning_rate in scheduler.get_last_lr():
        wandb.log({"learning_rate": learning_rate})

    predict = output.argmax(dim=-1)
    predict = predict.detach().cpu().numpy()
    labels = labels.detach().cpu().numpy()
    acc = accuracy_score(labels, predict)

    # Read the scalar loss once and reuse it for both accumulators.
    step_perform = np.array([loss.item(), acc])
    batch_perform += step_perform
    epoch_perform += step_perform

    if (j + 1) % 50 == 0:
        print(
            f"Epoch {e} #{j + 1} -- loss: {batch_perform[0] / 50}, acc: {batch_perform[1] / 50}"
        )
        batch_perform = np.zeros(2)
print()
print(
    f"Epoch {e} loss: {epoch_perform[0] / total_batch_}, acc: {epoch_perform[1] / total_batch_}"
    )
wandb.log({
    "epoch": e,
    "Train epoch Loss": epoch_perform[0] / total_batch_,
    "Train epoch Acc": epoch_perform[1] / total_batch_}
    )
# The validation cell reloads this file.
torch.save(model.state_dict(), f"./models/{args.model_name}/{f}-fold/train.pt")

In [None]:
f = 4
e = 1
# The best-so-far trackers are reset in this cell, so the comparison further down
# succeeds for any non-zero accuracy and this pass is saved as the fold's best model.
best_val_loss, best_val_acc = np.inf, 0

# Validation: reload the weights the training cell saved to train.pt for this fold.
load_path = f'./models/{args.model_name}/{f}-fold/train.pt'
model.load_state_dict(torch.load(load_path, map_location=device))
model.to(device)
model.eval()
valid_perform = np.zeros(2)  # running sums of [batch loss, batch accuracy]

all_valid_predict_lst = []
all_valid_labels_lst = []

# Misclassified code pairs, collected for the wandb table / CSV written below.
wrong_sample_dict = defaultdict(list)

with torch.no_grad():
    for v in validloader:
        input_ids = v["input_ids"].to(device)
        attention_mask = v["attention_mask"].to(device)
        valid_labels = v["labels"].to(device)

        valid_outputs = model(input_ids, attention_mask)
        valid_output = valid_outputs.logits
        valid_loss = criterion(valid_output, valid_labels)

        valid_predict = valid_output.argmax(dim=-1)
        valid_predict = valid_predict.detach().cpu().numpy()
        valid_labels = valid_labels.detach().cpu().numpy()

        # Record the misclassified code pairs of this batch.
        if args.logging_wrong_samples:
            wrong_sample_index = np.where(valid_labels != valid_predict)[0]
            if len(wrong_sample_index) > 0:
                # NOTE(review): the helper's 4th/5th return values are stored under
                # 'diff_logit' / 'same_logit'; confirm that order against
                # wrong_batch_for_wandb, which is defined outside this cell.
                (wrong_sample_text, wrong_sample_label, wrong_sample_pred,
                 entailment_prob, contradiction_prob) = wrong_batch_for_wandb(
                    tokenizer, wrong_sample_index, input_ids,
                    valid_labels, valid_predict, valid_output)

                wrong_sample_dict['입력 코드 Pair'] += wrong_sample_text
                wrong_sample_dict['실제값'] += wrong_sample_label
                wrong_sample_dict['예측값'] += wrong_sample_pred
                wrong_sample_dict['diff_logit'] += entailment_prob
                wrong_sample_dict['same_logit'] += contradiction_prob

        valid_acc = accuracy_score(valid_labels, valid_predict)
        valid_perform += np.array([valid_loss.item(), valid_acc])

        all_valid_predict_lst += list(valid_predict)
        all_valid_labels_lst += list(valid_labels)

# Save the model

# The loss stays a mean of the per-batch criterion values.
val_total_loss = valid_perform[0] / valid_batch_
# Accuracy is computed over every validation sample. Averaging the per-batch
# accuracies (valid_perform[1] / valid_batch_) would give a short final batch the
# same weight as a full one and bias the score.
val_total_acc = accuracy_score(all_valid_labels_lst, all_valid_predict_lst)
best_val_loss = min(best_val_loss, val_total_loss)


if val_total_acc > best_val_acc:
    print(f"New best model for val accuracy : {val_total_acc}! saving the best model..")
    torch.save(model.state_dict(), f"./models/{args.model_name}/{f}-fold/best.pt")

    best_val_acc = val_total_acc

    # Log a confusion matrix to wandb.
    class_names = ['diff','same'] # (0,1)
    wandb.log({f"{e}_epoch_conf_mat" : wandb.plot.confusion_matrix(probs=None,
                                                                      y_true=all_valid_labels_lst, preds=all_valid_predict_lst,
                                                                      class_names=class_names)})

    # The misclassified-sample table is only exported above this accuracy threshold.
    if args.logging_wrong_samples and val_total_acc > 0.91:

        wrong_sample_df = pd.DataFrame(wrong_sample_dict)
        wrong_sample_df.to_csv(f"./models/{args.model_name}/{f}-fold/wrong_df.csv",index=False)
        print('='*15,f'Fold{f} Wrong DataFrame Saved','='*15)
        text_table = wandb.Table(data = wrong_sample_df)
        run.log({f"{f}_fold_wrong_samples" : text_table})

print()
print(
    f">>>> Validation loss: {val_total_loss}, Acc: {val_total_acc}"
    )
print()
wandb.log({
    "epoch": e,
    "Last_Valid Loss": val_total_loss,
    "Last_Valid Acc": val_total_acc,
    })
# NOTE: best_val_acc_list is created outside this cell; re-running this cell appends
# the fold's score a second time and skews the mean reported below.
best_val_acc_list.append(best_val_acc)
print('='*50)
print(f"{f}fold best_val_acc_list : {best_val_acc_list}")
print('='*15, f'{f}fold Final Score(ACC) : {np.mean(best_val_acc_list)}', '='*15)
wandb.log({
f"Total Mean ACC ({f}fold)": np.mean(best_val_acc_list)}
)

## fold 5

In [None]:
# The run is split into one cell per step because of Google Colab runtime issues.
f = 5

print(f"---------------------------------- {f} fold----------------------------------")

run = wandb.init(project=args.project_name)
wandb.run.name = f'{args.model_name}/{f}-fold'
wandb.config.update(args)
os.makedirs(f'./models/{args.model_name}/{f}-fold', exist_ok=True)

# Hold out the f-th contiguous slice of `gap` rows for validation and train on the
# rest. The slice end is clamped so it can never index past the dataset.
total_size = len(dataset)
total_ids = list(range(total_size))
del_ids = list(range((f-1)*gap, min(f*gap, total_size)))
training_ids = set(total_ids) - set(del_ids)

training_dset = dataset.select(list(training_ids))
eval_dset = dataset.select(del_ids)

collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainloader = DataLoader(training_dset,
                          batch_size=16,
                          shuffle=True,
                          collate_fn = collator
                          )

validloader = DataLoader(eval_dset,
                          batch_size=16,
                          shuffle=False,
                          collate_fn = collator
                          )

total_batch_ = len(trainloader)
valid_batch_ = len(validloader)

# Start this fold from the pretrained checkpoint. Otherwise `model` still carries the
# weights fine-tuned in the previous fold, whose training split contained this fold's
# validation rows, so the validation score would be inflated by leakage.
# This is the same checkpoint the inference cell loads each fold's best.pt into.
model = RobertaForSequenceClassification.from_pretrained("microsoft/graphcodebert-base")
model.to(device)

# NOTE(review): this call signature does not match transformers.get_scheduler imported
# at the top of the notebook; presumably a notebook-local helper of the same name
# shadows it. Confirm.
optimizer = get_optimizer(model, args)
scheduler = get_scheduler(optimizer, args, total_batch_)

## epoch 1

In [None]:
f = 5
e = 1

print(f"------------------------------ {f} fold {e} epoch------------------------------")

model.train()
# Running sums of [loss, accuracy]: one for the whole epoch, one for the current
# 50-batch reporting window.
epoch_perform, batch_perform = np.zeros(2), np.zeros(2)
print()
progress_bar = tqdm(enumerate(trainloader), total=len(trainloader), leave=True, position=0,)
for j, v in progress_bar:
    input_ids, attention_mask, labels = v['input_ids'].to(device), v['attention_mask'].to(device), v['labels'].to(device)

    optimizer.zero_grad()

    # Labels are not passed to the model, so only logits come back; the loss is
    # computed with the notebook's own criterion.
    outputs = model(input_ids, attention_mask)
    output = outputs.logits # The outputs object is a SequenceClassifierOutput
    loss = criterion(output, labels)
    loss.backward()
    optimizer.step()
    scheduler.step()
    # get_last_lr() is the supported accessor for the rate(s) set by the last step();
    # calling get_lr() outside step() is discouraged and can report wrong values for
    # schedulers that compute the rate incrementally.
    for learning_rate in scheduler.get_last_lr():
        wandb.log({"learning_rate": learning_rate})

    predict = output.argmax(dim=-1)
    predict = predict.detach().cpu().numpy()
    labels = labels.detach().cpu().numpy()
    acc = accuracy_score(labels, predict)

    # Read the scalar loss once and reuse it for both accumulators.
    step_perform = np.array([loss.item(), acc])
    batch_perform += step_perform
    epoch_perform += step_perform

    if (j + 1) % 50 == 0:
        print(
            f"Epoch {e} #{j + 1} -- loss: {batch_perform[0] / 50}, acc: {batch_perform[1] / 50}"
        )
        batch_perform = np.zeros(2)
print()
print(
    f"Epoch {e} loss: {epoch_perform[0] / total_batch_}, acc: {epoch_perform[1] / total_batch_}"
    )
wandb.log({
    "epoch": e,
    "Train epoch Loss": epoch_perform[0] / total_batch_,
    "Train epoch Acc": epoch_perform[1] / total_batch_}
    )
# The validation cell reloads this file.
torch.save(model.state_dict(), f"./models/{args.model_name}/{f}-fold/train.pt")

In [None]:
f = 5
e = 1
# The best-so-far trackers are reset in this cell, so the comparison further down
# succeeds for any non-zero accuracy and this pass is saved as the fold's best model.
best_val_loss, best_val_acc = np.inf, 0

# Validation: reload the weights the training cell saved to train.pt for this fold.
load_path = f'./models/{args.model_name}/{f}-fold/train.pt'
model.load_state_dict(torch.load(load_path, map_location=device))
model.to(device)
model.eval()
valid_perform = np.zeros(2)  # running sums of [batch loss, batch accuracy]

all_valid_predict_lst = []
all_valid_labels_lst = []

# Misclassified code pairs, collected for the wandb table / CSV written below.
wrong_sample_dict = defaultdict(list)

with torch.no_grad():
    for v in validloader:
        input_ids = v["input_ids"].to(device)
        attention_mask = v["attention_mask"].to(device)
        valid_labels = v["labels"].to(device)

        valid_outputs = model(input_ids, attention_mask)
        valid_output = valid_outputs.logits
        valid_loss = criterion(valid_output, valid_labels)

        valid_predict = valid_output.argmax(dim=-1)
        valid_predict = valid_predict.detach().cpu().numpy()
        valid_labels = valid_labels.detach().cpu().numpy()

        # Record the misclassified code pairs of this batch.
        if args.logging_wrong_samples:
            wrong_sample_index = np.where(valid_labels != valid_predict)[0]
            if len(wrong_sample_index) > 0:
                # NOTE(review): the helper's 4th/5th return values are stored under
                # 'diff_logit' / 'same_logit'; confirm that order against
                # wrong_batch_for_wandb, which is defined outside this cell.
                (wrong_sample_text, wrong_sample_label, wrong_sample_pred,
                 entailment_prob, contradiction_prob) = wrong_batch_for_wandb(
                    tokenizer, wrong_sample_index, input_ids,
                    valid_labels, valid_predict, valid_output)

                wrong_sample_dict['입력 코드 Pair'] += wrong_sample_text
                wrong_sample_dict['실제값'] += wrong_sample_label
                wrong_sample_dict['예측값'] += wrong_sample_pred
                wrong_sample_dict['diff_logit'] += entailment_prob
                wrong_sample_dict['same_logit'] += contradiction_prob

        valid_acc = accuracy_score(valid_labels, valid_predict)
        valid_perform += np.array([valid_loss.item(), valid_acc])

        all_valid_predict_lst += list(valid_predict)
        all_valid_labels_lst += list(valid_labels)

# Save the model

# The loss stays a mean of the per-batch criterion values.
val_total_loss = valid_perform[0] / valid_batch_
# Accuracy is computed over every validation sample. Averaging the per-batch
# accuracies (valid_perform[1] / valid_batch_) would give a short final batch the
# same weight as a full one and bias the score.
val_total_acc = accuracy_score(all_valid_labels_lst, all_valid_predict_lst)
best_val_loss = min(best_val_loss, val_total_loss)


if val_total_acc > best_val_acc:
    print(f"New best model for val accuracy : {val_total_acc}! saving the best model..")
    torch.save(model.state_dict(), f"./models/{args.model_name}/{f}-fold/best.pt")

    best_val_acc = val_total_acc

    # Log a confusion matrix to wandb.
    class_names = ['diff','same'] # (0,1)
    wandb.log({f"{e}_epoch_conf_mat" : wandb.plot.confusion_matrix(probs=None,
                                                                      y_true=all_valid_labels_lst, preds=all_valid_predict_lst,
                                                                      class_names=class_names)})

    # The misclassified-sample table is only exported above this accuracy threshold.
    if args.logging_wrong_samples and val_total_acc > 0.91:

        wrong_sample_df = pd.DataFrame(wrong_sample_dict)
        wrong_sample_df.to_csv(f"./models/{args.model_name}/{f}-fold/wrong_df.csv",index=False)
        print('='*15,f'Fold{f} Wrong DataFrame Saved','='*15)
        text_table = wandb.Table(data = wrong_sample_df)
        run.log({f"{f}_fold_wrong_samples" : text_table})

print()
print(
    f">>>> Validation loss: {val_total_loss}, Acc: {val_total_acc}"
    )
print()
wandb.log({
    "epoch": e,
    "Last_Valid Loss": val_total_loss,
    "Last_Valid Acc": val_total_acc,
    })
# NOTE: best_val_acc_list is created outside this cell; re-running this cell appends
# the fold's score a second time and skews the mean reported below.
best_val_acc_list.append(best_val_acc)
print('='*50)
print(f"{f}fold best_val_acc_list : {best_val_acc_list}")
print('='*15, f'{f}fold Final Score(ACC) : {np.mean(best_val_acc_list)}', '='*15)
wandb.log({
f"Total Mean ACC ({f}fold)": np.mean(best_val_acc_list)}
)

# Inference

## 테스트 데이터 전처리

In [None]:
def _strip_comments_and_blank_lines(source):
    """Drop comment-only and blank lines, cut trailing `#` comments, and turn each
    run of four spaces into a tab. Returns the kept lines joined by newlines."""
    kept_lines = []
    for line in source.split('\n'):
        if line.lstrip().startswith('#'):  # comment-only line
            continue
        line = line.rstrip()
        if '#' in line:
            # Keep only the text before the first '#'. Whitespace in front of the
            # '#' is deliberately left in place (no second rstrip).
            line = line[:line.index('#')]
        line = line.replace('    ', '\t')  # four spaces -> one tab
        if line == '':  # nothing left after cleaning
            continue
        kept_lines.append(line)
    return '\n'.join(kept_lines)


def _clean_code(source):
    """Apply the full cleaning pipeline to one code snippet and return the result."""
    cleaned = _strip_comments_and_blank_lines(source)
    # Raw strings keep the patterns identical while avoiding the invalid-escape
    # warnings that '\w', '\W', '\/' and '\.' raise in normal string literals.
    cleaned = re.sub(r'("""[\w\W]*?""")', '<str>', cleaned)
    cleaned = re.sub(r"('''[\w\W]*?''')", '<str>', cleaned)
    # NOTE(review): this pattern carries JavaScript-style /.../ delimiters, so the
    # '^' after the literal '/' can never match and the substitution is a no-op.
    # It is kept unchanged on purpose: presumably the training data went through
    # the same preprocessing, and "fixing" it only here would make inference
    # inputs differ from training inputs. Confirm before changing.
    cleaned = re.sub(r'/^(http?|https?):\/\/([a-z0-9-]+\.)+[a-z0-9]{2,4}.*$/', '', cleaned)
    return cleaned


def preprocess_script(code):
    """Clean the two code snippets of one example in place.

    Args:
        code: mapping with string entries 'code1' and 'code2'.

    Returns:
        The same mapping object, with 'code1' and 'code2' replaced by their
        cleaned versions (comment lines and blank lines removed, trailing
        comments cut, four-space runs turned into tabs, triple-quoted strings
        replaced by '<str>'). Other entries are left untouched.
    """
    code['code1'] = _clean_code(code['code1'])
    code['code2'] = _clean_code(code['code2'])
    return code


def example_fn(examples):
    """Tokenize the 'code1' / 'code2' pair with the notebook-level `tokenizer`,
    padded and truncated to 512 tokens, and return the tokenizer's output."""
    return tokenizer(
        examples['code1'],
        examples['code2'],
        padding='max_length',
        max_length=512,
        truncation=True,
    )

## inference 및 voting

In [None]:
testdataset = load_dataset("csv", data_files='/content/drive/MyDrive/test.csv')['train']
tokenizer = AutoTokenizer.from_pretrained("microsoft/graphcodebert-base")
tokenizer.truncation_side = 'left'


preprocessed = testdataset.map(preprocess_script)
test_dataset = preprocessed.map(example_fn, remove_columns=['code1', 'code2'])
collator = DataCollatorWithPadding(tokenizer=tokenizer)

testloader = DataLoader(test_dataset,
                          batch_size=16,
                          shuffle=False,
                         collate_fn = collator
                          )

# Sum of the per-fold logits (soft voting). It is sized from the dataset instead of a
# hard-coded row count, so a different test file does not break the accumulation.
all_fold_logits = np.zeros((len(test_dataset), 2))  # rows of df, target labels
# One row of argmax predictions per fold (hard voting).
fold_prediction_rows = []
for idx in tqdm(range(1, args.n_splits+1)):
    if idx == 6: # the 6th model is loaded from a separately saved checkpoint
        checkpoint_path = f'./models/{args.model_name}/{idx}-fold/checkpoint-66000'
        config =  AutoConfig.from_pretrained(checkpoint_path)
        model = RobertaForSequenceClassification.from_pretrained(checkpoint_path, config=config)
        model = model.to(device)
    else:
        model = RobertaForSequenceClassification.from_pretrained("microsoft/graphcodebert-base")
        load_path = f'./models/{args.model_name}/{idx}-fold/best.pt'
        model.load_state_dict(torch.load(load_path,map_location=torch.device('cpu')))
        model.to(device)

    model.eval()
    # Batches are moved to the CPU as they are produced and concatenated once at the
    # end, instead of re-concatenating a growing tensor on the device at every step.
    batch_logits = []
    progress_bar = tqdm(enumerate(testloader), total=len(testloader), leave=True, position=0,)
    for i, data in progress_bar:
        with torch.no_grad():
            logits = model(
                        data['input_ids'].to(device),
                        data['attention_mask'].to(device),
                        )
            logits = logits.logits
        batch_logits.append(logits.detach().cpu())

    # Convert to numpy for saving. No squeeze here: squeezing dim 0 would drop the
    # row axis when the test set has exactly one row.
    one_fold_logits = torch.cat(batch_logits, dim=0).numpy()
    # Save this fold's logits as a numpy array.
    np.save(f'./models/{args.model_name}/{idx}-fold/numpy_logits', one_fold_logits)

    all_fold_logits += one_fold_logits
    fold_prediction_rows.append(np.argmax(one_fold_logits, axis=1))

# Always 2-D (n_folds, n_rows), including when only a single fold is used, so the
# transpose below yields one vector of votes per test row.
all_fold_predictions = np.vstack(fold_prediction_rows)

soft_output = list(np.argmax(all_fold_logits, axis=1))
# Majority vote per row; on a tie, the label that appears first among the folds wins.
hard_output = [max(Counter(votes).items(), key=lambda kv: kv[1])[0] for votes in all_fold_predictions.T]

# Submission

In [None]:
submission_path = "/content/drive/MyDrive/sample_submission.csv"

# Read the sample submission once per voting strategy and fill in the 'similar'
# column with that strategy's predictions.
submissionsoft = pd.read_csv(submission_path).assign(similar=soft_output)
submissionhard = pd.read_csv(submission_path).assign(similar=hard_output)

# Write both submission files without the index column.
for frame, destination in (
    (submissionsoft, '/content/drive/MyDrive/submissionsoft.csv'),
    (submissionhard, '/content/drive/MyDrive/submissionhard.csv'),
):
    frame.to_csv(destination, index=False)