In [None]:
!pip install adamp

In [None]:
!pip install transformers

In [None]:
# Mount Google Drive at /content/drive; the CSV and checkpoint paths used
# later in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
import os

import transformers
from transformers import AutoTokenizer, AdamW, RobertaForSequenceClassification
from transformers import get_linear_schedule_with_warmup

import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset

from tqdm.notebook import tqdm, tqdm_notebook

import random
import torch.backends.cudnn as cudnn

from sklearn.model_selection import StratifiedKFold

from adamp import AdamP

In [None]:
# All competition CSVs live in one Drive folder; keeping the folder in a single
# constant means relocating the data only requires changing this line.
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/dacon/nli'

train_1 = pd.read_csv(os.path.join(DATA_DIR, 'train_data.csv'))   # main training set
train_2 = pd.read_csv(os.path.join(DATA_DIR, 'plus_data.csv'))    # additional training rows
test = pd.read_csv(os.path.join(DATA_DIR, 'test_data.csv'))
submission = pd.read_csv(os.path.join(DATA_DIR, 'sample_submission.csv'))


In [None]:
# Stack both training sources. ignore_index=True gives the result a fresh
# 0..n-1 index; without it the two frames' row labels collide, which makes any
# label-based lookup or alignment on `train` ambiguous.
train = pd.concat([train_1, train_2], ignore_index=True)

In [None]:
# Encode the string labels as integer class ids.
# Values that already are a valid id pass through unchanged, so re-running this
# cell is a no-op. The previous np.select version fell back to 0 for anything
# it did not recognise, which silently turned unknown/missing labels (and, on a
# second run, every label) into class 0.
label_to_id = {"entailment": 0, "contradiction": 1, "neutral": 2}
encoded_labels = train['label'].map(lambda value: label_to_id.get(value, value))

unknown_mask = ~encoded_labels.isin(list(label_to_id.values()))
if unknown_mask.any():
    raise ValueError(
        "unexpected label value(s) in train: "
        + repr(encoded_labels[unknown_mask].unique().tolist())
    )

# Assign positionally (plain array) so the result does not depend on the index.
train['label'] = encoded_labels.astype(int).to_numpy()

# Keep only the columns the datasets read, in the order they are read.
train=train[['premise','hypothesis','label']]
test=test[['premise','hypothesis']]

In [None]:
class TRAINDataset(Dataset):
  """Map-style dataset yielding tokenized (premise, hypothesis) pairs with labels.

  Args:
    data: DataFrame whose first three columns are, in order, the premise text,
      the hypothesis text and the integer class label. Columns are read by
      position, not by name.
    tokenizer: Optional ready-made tokenizer. When omitted, the
      "klue/roberta-large" tokenizer is loaded, as before.
    max_length: Fixed sequence length; every pair is truncated or padded to it.
  """

  def __init__(self, data, tokenizer=None, max_length=100):
    self.dataset = data
    # Accepting a tokenizer lets callers share one instance instead of
    # reloading it for every dataset that is created.
    if tokenizer is None:
      tokenizer = AutoTokenizer.from_pretrained("klue/roberta-large")
    self.tokenizer = tokenizer
    self.max_length = max_length

  def __len__(self):
    """Number of rows in the wrapped DataFrame."""
    return len(self.dataset)

  def __getitem__(self, idx):
    """Return (input_ids, attention_mask, label) for the row at position idx."""
    sentence1, sentence2, y = self.dataset.iloc[idx, 0:3].values
    inputs = self.tokenizer(
        sentence1,
        sentence2,
        truncation=True,
        return_token_type_ids=False,
        # Replaces the deprecated `pad_to_max_length=True`.
        padding="max_length",
        add_special_tokens=True,
        max_length=self.max_length,
    )

    # Build int64 tensors explicitly; np.asarray's integer width is
    # platform-dependent.
    input_ids = torch.tensor(inputs['input_ids'], dtype=torch.long)
    attention_mask = torch.tensor(inputs['attention_mask'], dtype=torch.long)

    return input_ids, attention_mask, int(y)

In [None]:
class TESTDataset(Dataset):
  """Map-style dataset yielding tokenized (premise, hypothesis) pairs, no labels.

  Args:
    data: DataFrame whose first two columns are, in order, the premise text
      and the hypothesis text. Columns are read by position, not by name.
    tokenizer: Optional ready-made tokenizer. When omitted, the
      "klue/roberta-large" tokenizer is loaded, as before.
    max_length: Fixed sequence length; every pair is truncated or padded to it.
  """

  def __init__(self, data, tokenizer=None, max_length=100):
    self.dataset = data
    # Accepting a tokenizer lets callers share one instance instead of
    # reloading it for every dataset that is created.
    if tokenizer is None:
      tokenizer = AutoTokenizer.from_pretrained("klue/roberta-large")
    self.tokenizer = tokenizer
    self.max_length = max_length

  def __len__(self):
    """Number of rows in the wrapped DataFrame."""
    return len(self.dataset)

  def __getitem__(self, idx):
    """Return (input_ids, attention_mask) for the row at position idx."""
    sentence1, sentence2 = self.dataset.iloc[idx, 0:2].values
    inputs = self.tokenizer(
        sentence1,
        sentence2,
        truncation=True,
        return_token_type_ids=False,
        # Replaces the deprecated `pad_to_max_length=True`.
        padding="max_length",
        add_special_tokens=True,
        max_length=self.max_length,
    )

    # Build int64 tensors explicitly; np.asarray's integer width is
    # platform-dependent.
    input_ids = torch.tensor(inputs['input_ids'], dtype=torch.long)
    attention_mask = torch.tensor(inputs['attention_mask'], dtype=torch.long)

    return input_ids, attention_mask

In [None]:
def calc_accuracy(X,Y):
    """Fraction of rows in X whose highest-scoring column equals the target in Y.

    X holds one row of scores per sample and Y the matching class indices; the
    result is the number of matches divided by the number of rows in X.
    """
    _, predicted = X.max(dim=1)
    n_correct = (predicted == Y).sum().detach().cpu().numpy()
    return n_correct / predicted.size(0)

In [None]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs (slowly)
# on a runtime without CUDA instead of failing at the first `.to(device)`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Hyperparameters: number of training epochs per fold and the mini-batch size
# shared by the training, validation and test loaders.
epochs, batch_size = 20, 16

In [None]:
# Train and validate the model for one fold
def training(train_dataset,val_dataset, fold):
  """Fine-tune klue/roberta-large on one fold and keep its best checkpoint.

  Args:
    train_dataset: DataFrame of training rows, wrapped in a TRAINDataset.
    val_dataset: DataFrame of validation rows, wrapped in a TRAINDataset.
    fold: Fold identifier; it becomes part of the checkpoint file name.

  Returns:
    The best validation accuracy reached over all epochs (-1.0 if no epoch ran).
  """
  # Starts below any reachable accuracy so the first epoch is always saved.
  best_acc = -1.0
  checkpoint_path = '/content/drive/MyDrive/Colab Notebooks/dacon/nli/model'+str(fold)+'.pt'

  model = RobertaForSequenceClassification.from_pretrained("klue/roberta-large", num_labels=3).to(device)

  dataset_train = TRAINDataset(train_dataset)
  dataset_val = TRAINDataset(val_dataset)

  train_loader = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)
  valid_loader = DataLoader(dataset_val, batch_size=batch_size, shuffle=False)

  optimizer = AdamP(model.parameters(), lr=1e-5, betas=(0.9, 0.999), weight_decay=1e-2)

  # The scheduler is stepped once per batch, so it needs the total batch count.
  total_steps = len(train_loader) * epochs

  # Linear decay of the learning rate with no warm-up phase
  scheduler = get_linear_schedule_with_warmup(optimizer,
                                              num_warmup_steps = 0,
                                              num_training_steps = total_steps)

  for e in range(epochs):
    # ---- training pass ----
    model.train()
    train_correct, train_seen = 0.0, 0
    for token_ids, attention_masks, label in tqdm(train_loader, total=len(train_loader)):
      optimizer.zero_grad()
      token_ids = token_ids.to(device)
      attention_masks = attention_masks.to(device)
      label = label.to(device)
      out = model(token_ids, attention_masks)[0]
      loss = F.cross_entropy(out, label)
      loss.backward()
      optimizer.step()
      scheduler.step()
      # Weight each batch by its size so a short final batch is not
      # over-represented in the epoch accuracy.
      train_correct += calc_accuracy(out, label) * label.size(0)
      train_seen += label.size(0)

    print("epoch {} train acc {}".format(e+1, train_correct / max(train_seen, 1)))

    # ---- validation pass ----
    model.eval()
    valid_correct, valid_seen = 0.0, 0
    # No gradients are needed here; skipping the autograd graph saves GPU memory.
    with torch.no_grad():
      for token_ids, attention_masks, label in tqdm(valid_loader, total=len(valid_loader)):
        token_ids = token_ids.to(device)
        attention_masks = attention_masks.to(device)
        label = label.to(device)
        out = model(token_ids, attention_masks)[0]
        valid_correct += calc_accuracy(out, label) * label.size(0)
        valid_seen += label.size(0)
    valid_acc = float(valid_correct / max(valid_seen, 1))
    print("epoch {} valid acc {}".format(e+1, valid_acc))

    # Keep only the best epoch. The whole module is pickled (not just the
    # state_dict) because the inference step loads it back with torch.load.
    if valid_acc > best_acc:
      best_acc = valid_acc
      torch.save(model, checkpoint_path)

  return best_acc

In [None]:
# Cross-validation
def main():
    """Seed every RNG, build 5 stratified folds of `train` and train each fold."""
    seed = 2021  # fixed seed for reproducibility
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # Stratify on the third column (the label); the first two columns are the
    # sentence pair.
    features = train.iloc[:, 0:2]
    targets = train.iloc[:, 2]
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=2021)
    kfold = [
        (train.iloc[fit_idx, :], train.iloc[holdout_idx, :])
        for fit_idx, holdout_idx in splitter.split(features, targets)
    ]

    for fold, (train_datasets, valid_datasets) in enumerate(kfold):
        print(f'fold{fold} 학습중...')
        training(train_dataset=train_datasets, val_dataset=valid_datasets, fold=fold)

In [None]:
# Run the cross-validated training defined in main().
main()

In [None]:
# Prediction
def inference(model, dataset_test):
    """Return the model's softmax outputs for every row of dataset_test.

    The rows are wrapped in a TESTDataset and fed through the model in order
    (no shuffling) with gradients disabled. The result is a list with one
    NumPy array of class probabilities per row.
    """
    loader = DataLoader(TESTDataset(dataset_test), batch_size=batch_size, shuffle=False)
    model.eval()

    output_pred = []
    with torch.no_grad():
        for ids, masks in tqdm(loader, total=len(loader)):
            ids = ids.long().to(device)
            masks = masks.long().to(device)
            scores = model(ids, masks)[0]
            probabilities = torch.softmax(scores, dim=1).cpu().numpy()
            output_pred.extend(probabilities)
    return output_pred

In [None]:
# Class name -> integer id, in the same order used when encoding the labels.
label_dict = dict(zip(("entailment", "contradiction", "neutral"), range(3)))

In [None]:
# Produce the final predictions
def inference_main(n_folds=5):
  """Average the per-fold class probabilities and fill submission["label"].

  Args:
    n_folds: Number of fold checkpoints (model0.pt .. model{n_folds-1}.pt) to
      ensemble. Defaults to 5.
  """
  # Invert the mapping explicitly instead of relying on dict insertion order
  # happening to match the class ids.
  id_to_label = {class_id: name for name, class_id in label_dict.items()}

  res = np.zeros((len(test), len(label_dict)))
  for i in range(n_folds):
    print(f'fold{i} 모델 추론중...')
    checkpoint_path = '/content/drive/MyDrive/Colab Notebooks/dacon/nli/model'+str(i)+'.pt'

    # The checkpoints are whole pickled modules, which newer PyTorch refuses to
    # load unless weights_only=False is passed. Older releases do not know that
    # keyword and raise TypeError, hence the fallback.
    # NOTE: unpickling executes arbitrary code - only load checkpoints produced
    # by this notebook.
    try:
      model = torch.load(checkpoint_path, map_location=device, weights_only=False)
    except TypeError:
      model = torch.load(checkpoint_path, map_location=device)

    pred_answer = inference(model, test)

    # Each fold contributes an equal share of the ensemble average.
    res += np.array(pred_answer) / n_folds

  ans = np.argmax(res, axis=-1)
  submission["label"] = [id_to_label[int(class_id)] for class_id in ans]

In [None]:
# Run the fold ensemble over the test set; this fills submission["label"].
inference_main()

In [None]:
# Write the submission file to the current working directory, without the index column.
submission.to_csv("FOLD5(20)_submission.csv", index = False)