<a href="https://colab.research.google.com/github/nazzang49/dacon/blob/main/dacon_nli_competition_practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Dacon NLI Competition Practice**
https://dacon.io/competitions/official/235875/overview/description

In [1]:
# !pip install gluonnlp transformers kobert-transformers mxnet

In [2]:
# !pip install git+https://github.com/SKTBrain/KoBERT.git

In [3]:
import pandas as pd
from glob import glob
import os
import numpy as np
from tqdm import tqdm, tqdm_notebook

import torch
from torch import nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import gluonnlp as nlp
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model
from transformers import AdamW, BertModel, BertPreTrainedModel, AutoTokenizer, AutoModelForSequenceClassification, AutoConfig

### 데이터 불러오기

In [4]:
train = pd.read_csv("/content/train_data.csv")
test = pd.read_csv("/content/test_data.csv")
submission = pd.read_csv("/content/sample_submission.csv")

In [5]:
train.head()

Unnamed: 0,index,premise,hypothesis,label
0,0,"씨름은 상고시대로부터 전해져 내려오는 남자들의 대표적인 놀이로서, 소년이나 장정들이...",씨름의 여자들의 놀이이다.,contradiction
1,1,"삼성은 자작극을 벌인 2명에게 형사 고소 등의 법적 대응을 검토 중이라고 하였으나,...",자작극을 벌인 이는 3명이다.,contradiction
2,2,이를 위해 예측적 범죄예방 시스템을 구축하고 고도화한다.,예측적 범죄예방 시스템 구축하고 고도화하는 것은 목적이 있기 때문이다.,entailment
3,3,광주광역시가 재개발 정비사업 원주민들에 대한 종합대책을 마련하는 등 원주민 보호에 ...,원주민들은 종합대책에 만족했다.,neutral
4,4,"진정 소비자와 직원들에게 사랑 받는 기업으로 오래 지속되고 싶으면, 이런 상황에서는...",이런 상황에서 책임 있는 모습을 보여주는 기업은 아주 드물다.,neutral


In [6]:
# pandas profiling

### 데이터셋 만들기

In [7]:
max_len = 70
batch_size = 64
warmup_ratio = 0.1
num_epochs = 5
max_grad_norm = 1
log_interval = 200
learning_rate = 5e-5

device = torch.device("cuda:0")

## **Model1**

In [8]:
# bertmodel, vocab = get_pytorch_kobert_model(cachedir = ".cache")
# tokenizer = get_tokenizer()
# tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower = False)

## **Model2**

In [9]:
config = AutoConfig.from_pretrained('klue/roberta-base')
config.num_labels = 3
bertmodel = AutoModelForSequenceClassification.from_config(config)
tok = AutoTokenizer.from_pretrained('klue/roberta-base')

In [10]:
bertmodel

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

## **Tokenization**

In [11]:
label_dict = {"entailment" : 0, "contradiction" : 1, "neutral" : 2}

In [12]:
train["premise_"] = "[CLS]" + train["premise"] + "[SEP]"
train["hypothesis_"] = train["hypothesis"] + "[SEP]"

test["premise_"] = "[CLS]" + test["premise"] + "[SEP]"
test["hypothesis_"] = test["hypothesis"] + "[SEP]"

train["text_sum"] = train.premise_ + " " + train.hypothesis_
test["text_sum"] = test.premise_ + " " + test.hypothesis_

train_content = []
test_content = []

for i, text in enumerate(train.text_sum):
    train_content.append(list([text, label_dict[train.label[i]]]))
    
for i, text in enumerate(test.text_sum):
    test_content.append([text])
    
dataset_train = train_content[:20000]
dataset_valid = train_content[20000:]
dataset_test = test_content

In [13]:
dataset_train_sentences = [row[0] for row in dataset_train]
dataset_train_labels = [row[1] for row in dataset_train]

dataset_valid_sentences = [row[0] for row in dataset_valid]
dataset_valid_labels = [row[1] for row in dataset_valid]

dataset_test_sentences = [row[0] for row in dataset_test]

In [14]:
tokenized_train_sentences = tok(
    dataset_train_sentences,
    add_special_tokens=True,
    return_token_type_ids=False, # roberta = False
    padding=True,
    truncation=True,
    max_length=100,
    return_tensors="pt"
)

tokenized_valid_sentences = tok(
    dataset_valid_sentences,
    add_special_tokens=True,
    return_token_type_ids=False, # roberta = False
    padding=True,
    truncation=True,
    max_length=100,
    return_tensors="pt"
)

tokenized_test_sentences = tok(
    dataset_test_sentences,
    add_special_tokens=True,
    return_token_type_ids=False, # roberta = False
    padding=True,
    truncation=True,
    max_length=100,
    return_tensors="pt"
)

In [15]:
tokenized_train_sentences['input_ids'][0]

tensor([    0,     0, 14441,  2073, 12382, 13169,  2200,  3797, 21505,  9005,
         2259,  3997,  2031,  2079,  3661, 31221,  5845,  2200,  2112,    16,
         5950, 15351, 17788,  7285,   748,  2088, 22048,  2470,  1132, 21893,
        15351,  6481, 27135,  5417,  4084,  1972,  2145, 17524,  2138, 15526,
         2259,   575, 28674,    18,     2, 14441,  2079,  3883,  2031,  2079,
         5845, 28674,    18,     2,     2,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1])

In [16]:
class BERTDataset(Dataset):
    def __init__(self, tokenized_examples, labels=None):
        self.tokenized_examples = tokenized_examples
        self.labels = labels

    def __getitem__(self, i):
        item = {key: value[i] for key, value in self.tokenized_examples.items()}
        if self.labels:
            item['labels'] = torch.tensor(self.labels[i])
        return item
    
    def __len__(self):
        return len(self.labels)


In [17]:
# create dataset
data_train = BERTDataset(tokenized_train_sentences, dataset_train_labels)
data_valid = BERTDataset(tokenized_valid_sentences, dataset_valid_labels)
data_test = BERTDataset(tokenized_test_sentences)

train_dataloader = torch.utils.data.DataLoader(data_train, batch_size = batch_size, num_workers = 5)
valid_dataloader = torch.utils.data.DataLoader(data_valid, batch_size = batch_size, num_workers = 5)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size = batch_size, num_workers = 5)

  cpuset_checked))


In [18]:
data_train[0]

{'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0]),
 'input_ids': tensor([    0,     0, 14441,  2073, 12382, 13169,  2200,  3797, 21505,  9005,
          2259,  3997,  2031,  2079,  3661, 31221,  5845,  2200,  2112,    16,
          5950, 15351, 17788,  7285,   748,  2088, 22048,  2470,  1132, 21893,
         15351,  6481, 27135,  5417,  4084,  1972,  2145, 17524,  2138, 15526,
          2259,   575, 28674,    18,     2, 14441,  2079,  3883,  2031,  2079,
          5845, 28674,    18,     2,     2,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     

In [19]:
len(train_dataloader)

313

### 모델 만들기

In [20]:
model = bertmodel.to(device)

no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss().to(device)



In [21]:
def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    return train_acc

### 모델 학습

In [22]:
import gc
from torch.cuda.amp import GradScaler, autocast

scaler = GradScaler()

In [None]:
# train and eval for each epochs
for e in range(num_epochs):
    train_acc = 0.0
    valid_acc = 0.0
    model.train()
    for batch_id, batch in tqdm(enumerate(train_dataloader), total=len(train_dataloader)):
        optimizer.zero_grad()
        
        token_ids = batch['input_ids'].to(device)
        segment_ids = batch['attention_mask'].to(device)
        label = batch['labels'].to(device)

        # use amp
        with autocast():
            out = model(token_ids, segment_ids)
            loss = loss_fn(out.logits, label) # caution => out.logits
        
        scaler.scale(loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        scaler.step(optimizer)
        scaler.update()
        
        train_acc += calc_accuracy(out.logits, label)

    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))

    gc.collect()
    torch.cuda.empty_cache()

    model.eval()
    # no gradient calculation
    with torch.no_grad():
        for batch_id, batch in tqdm(enumerate(valid_dataloader), total=len(valid_dataloader)):
            token_ids = batch['input_ids'].to(device)
            segment_ids = batch['attention_mask'].to(device)
            label = batch['labels'].to(device)

            out = model(token_ids, segment_ids)

        valid_acc += calc_accuracy(out.logits, label)
    print("epoch {} valid acc {}".format(e+1, valid_acc / (batch_id+1)))

  cpuset_checked))
100%|██████████| 313/313 [03:09<00:00,  1.65it/s]


epoch 1 train acc 0.3350139776357827


100%|██████████| 79/79 [00:14<00:00,  5.29it/s]

epoch 1 valid acc 0.002109704641350211



 43%|████▎     | 136/313 [01:22<01:46,  1.66it/s]

### 추론

In [None]:
# inference
result = []
model.eval()
with torch.no_grad():
    for batch_id, batch in tqdm(enumerate(test_dataloader), total=len(test_dataloader)):
        token_ids = batch['input_ids'].to(device)
        segment_ids = batch['attention_mask'].to(device)
        out = model(token_ids, segment_ids)
        result.append(out.logits)

In [None]:
result_ = []
for i in result:
    for j in i:
        result_.append(int(torch.argmax(j)))
        
out = [list(label_dict.keys())[_] for _ in result_]
submission["label"] = out
submission.to_csv("sample_submission.csv", index = False)