<a href="https://colab.research.google.com/github/nazzang49/dacon/blob/main/dacon_nli_competition_practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Dacon NLI Competition Practice**
https://dacon.io/competitions/official/235875/overview/description

In [1]:
# !pip install gluonnlp transformers kobert-transformers mxnet

In [2]:
# !pip install git+https://github.com/SKTBrain/KoBERT.git

In [107]:
import pandas as pd
from glob import glob
import os
import numpy as np
from tqdm import tqdm, tqdm_notebook

import torch
from torch import nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import gluonnlp as nlp
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model
from transformers import AdamW, BertModel, BertPreTrainedModel, AutoTokenizer, AutoModelForSequenceClassification, AutoConfig

### 데이터 불러오기

In [4]:
train = pd.read_csv("/content/train_data.csv")
test = pd.read_csv("/content/test_data.csv")
submission = pd.read_csv("/content/sample_submission.csv")

In [5]:
train.head()

Unnamed: 0,index,premise,hypothesis,label
0,0,"씨름은 상고시대로부터 전해져 내려오는 남자들의 대표적인 놀이로서, 소년이나 장정들이...",씨름의 여자들의 놀이이다.,contradiction
1,1,"삼성은 자작극을 벌인 2명에게 형사 고소 등의 법적 대응을 검토 중이라고 하였으나,...",자작극을 벌인 이는 3명이다.,contradiction
2,2,이를 위해 예측적 범죄예방 시스템을 구축하고 고도화한다.,예측적 범죄예방 시스템 구축하고 고도화하는 것은 목적이 있기 때문이다.,entailment
3,3,광주광역시가 재개발 정비사업 원주민들에 대한 종합대책을 마련하는 등 원주민 보호에 ...,원주민들은 종합대책에 만족했다.,neutral
4,4,"진정 소비자와 직원들에게 사랑 받는 기업으로 오래 지속되고 싶으면, 이런 상황에서는...",이런 상황에서 책임 있는 모습을 보여주는 기업은 아주 드물다.,neutral


In [6]:
# pandas profiling

### 데이터셋 만들기

In [7]:
max_len = 70
batch_size = 128
warmup_ratio = 0.1
num_epochs = 5
max_grad_norm = 1
log_interval = 200
learning_rate = 5e-5

device = torch.device("cuda:0")

## **Model1**

In [41]:
# bertmodel, vocab = get_pytorch_kobert_model(cachedir = ".cache")
# tokenizer = get_tokenizer()
# tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower = False)

## **Model2**

In [108]:
config = AutoConfig.from_pretrained('klue/roberta-base')
config.num_labels = 3
bertmodel = AutoModelForSequenceClassification.from_config(config)
tok = AutoTokenizer.from_pretrained('klue/roberta-base')

In [None]:
bertmodel

## **Tokenization**

In [33]:
label_dict = {"entailment" : 0, "contradiction" : 1, "neutral" : 2}

In [42]:
train["premise_"] = "[CLS]" + train["premise"] + "[SEP]"
train["hypothesis_"] = train["hypothesis"] + "[SEP]"

test["premise_"] = "[CLS]" + test["premise"] + "[SEP]"
test["hypothesis_"] = test["hypothesis"] + "[SEP]"

train["text_sum"] = train.premise_ + " " + train.hypothesis_
test["text_sum"] = test.premise_ + " " + test.hypothesis_

train_content = []
test_content = []

for i, text in enumerate(train.text_sum):
    train_content.append(list([text, label_dict[train.label[i]]]))
    
for i, text in enumerate(test.text_sum):
    test_content.append([text])
    
dataset_train = train_content[:20000]
dataset_valid = train_content[20000:]
dataset_test = test_content

In [49]:
dataset_train_sentences = [row[0] for row in dataset_train]
dataset_train_labels = [row[1] for row in dataset_train]

dataset_valid_sentences = [row[0] for row in dataset_valid]
dataset_valid_labels = [row[1] for row in dataset_valid]

dataset_test_sentences = [row[0] for row in dataset_test]

In [55]:
tokenized_train_sentences = tok(
    dataset_train_sentences,
    add_special_tokens=True,
    return_token_type_ids=False, # roberta = False
    padding=True,
    truncation=True,
    max_length=100,
    return_tensors="pt"
)

tokenized_valid_sentences = tok(
    dataset_valid_sentences,
    add_special_tokens=True,
    return_token_type_ids=False, # roberta = False
    padding=True,
    truncation=True,
    max_length=100,
    return_tensors="pt"
)

tokenized_test_sentences = tok(
    dataset_test_sentences,
    add_special_tokens=True,
    return_token_type_ids=False, # roberta = False
    padding=True,
    truncation=True,
    max_length=100,
    return_tensors="pt"
)

In [81]:
tokenized_train_sentences['input_ids'][0]

tensor([    0,     0, 14441,  2073, 12382, 13169,  2200,  3797, 21505,  9005,
         2259,  3997,  2031,  2079,  3661, 31221,  5845,  2200,  2112,    16,
         5950, 15351, 17788,  7285,   748,  2088, 22048,  2470,  1132, 21893,
        15351,  6481, 27135,  5417,  4084,  1972,  2145, 17524,  2138, 15526,
         2259,   575, 28674,    18,     2, 14441,  2079,  3883,  2031,  2079,
         5845, 28674,    18,     2,     2,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1])

In [184]:
class BERTDataset(Dataset):
    def __init__(self, tokenized_examples, labels=None):
        self.tokenized_examples = tokenized_examples
        self.labels = labels

    def __getitem__(self, i):
        item = {key: value[i] for key, value in self.tokenized_examples.items()}
        if self.labels:
            item['labels'] = torch.tensor(self.labels[i])
        return item
    
    def __len__(self):
        return len(self.labels)


In [155]:
# class BERTDataset(Dataset):
#     def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len, pad, pair, mode = "train"):
#         self.mode = mode
#         transform = nlp.data.BERTSentenceTransform(bert_tokenizer, max_seq_length = max_len, pad = pad, pair = pair)

#         # train
#         if self.mode == "train":
#             self.sentences = [transform([i[sent_idx]]) for i in dataset]
#             self.labels = [np.int32(i[label_idx]) for i in dataset]

#         # test
#         else:
#             self.sentences = [transform(i) for i in dataset]
        
#     def __getitem__(self, i):
#         if self.mode == 'train':
#             return (self.sentences[i] + (self.labels[i], ))
#         else:
#             return self.sentences[i]
    
#     def __len__(self):
#         return (len(self.sentences))

In [185]:
# create dataset
data_train = BERTDataset(tokenized_train_sentences, dataset_train_labels)
data_valid = BERTDataset(tokenized_valid_sentences, dataset_valid_labels)
data_test = BERTDataset(tokenized_test_sentences)

train_dataloader = torch.utils.data.DataLoader(data_train, batch_size = batch_size, num_workers = 5)
valid_dataloader = torch.utils.data.DataLoader(data_valid, batch_size = batch_size, num_workers = 5)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size = batch_size, num_workers = 5)

  cpuset_checked))


In [186]:
data_train[0]

{'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0]),
 'input_ids': tensor([    0,     0, 14441,  2073, 12382, 13169,  2200,  3797, 21505,  9005,
          2259,  3997,  2031,  2079,  3661, 31221,  5845,  2200,  2112,    16,
          5950, 15351, 17788,  7285,   748,  2088, 22048,  2470,  1132, 21893,
         15351,  6481, 27135,  5417,  4084,  1972,  2145, 17524,  2138, 15526,
          2259,   575, 28674,    18,     2, 14441,  2079,  3883,  2031,  2079,
          5845, 28674,    18,     2,     2,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     

In [187]:
len(train_dataloader)

157

### 모델 만들기

In [159]:
# class BERTClassifier(nn.Module):
#     def __init__(self, bert, hidden_size = 768, num_classes=3, dr_rate=None, params=None):
#         super(BERTClassifier, self).__init__()
#         self.bert = bert
#         self.dr_rate = dr_rate
                 
#         self.classifier = nn.Linear(hidden_size , num_classes)
#         if dr_rate:
#             self.dropout = nn.Dropout(p=dr_rate)
    
#     def gen_attention_mask(self, token_ids, valid_length):
#         attention_mask = torch.zeros_like(token_ids)
#         for i, v in enumerate(valid_length):
#             attention_mask[i][:v] = 1
#         return attention_mask.float()

#     def forward(self, token_ids, valid_length, segment_ids):
#         attention_mask = self.gen_attention_mask(token_ids, valid_length)
        
#         _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
#         if self.dr_rate:
#             out = self.dropout(pooler)
#         return self.classifier(out)

In [188]:
model = bertmodel.to(device)

no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss().to(device)



In [189]:
def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    return train_acc

### 모델 학습

In [190]:
# train and eval for each epochs
for e in range(num_epochs):
    train_acc = 0.0
    valid_acc = 0.0
    model.train()
    for batch_id, batch in tqdm(enumerate(train_dataloader), total=len(train_dataloader)):
        optimizer.zero_grad()
        
        token_ids = batch['input_ids'].to(device)
        segment_ids = batch['attention_mask'].to(device)
        label = batch['labels'].to(device)

        out = model(token_ids, segment_ids)

        loss = loss_fn(out.logits, label) # caution => out.logits
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        
        train_acc += calc_accuracy(out.logits, label)

    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))
    model.eval()
    for batch_id, batch in tqdm(enumerate(valid_dataloader), total=len(valid_dataloader)):
        token_ids = batch['input_ids'].to(device)
        segment_ids = batch['attention_mask'].to(device)
        label = batch['labels'].to(device)
        
        out = model(token_ids, segment_ids)

        valid_acc += calc_accuracy(out.logits, label)
    print("epoch {} valid acc {}".format(e+1, valid_acc / (batch_id+1)))

  cpuset_checked))
100%|██████████| 157/157 [02:53<00:00,  1.11s/it]

epoch 1 train acc 0.33031449044585987



  2%|▎         | 1/40 [00:00<00:18,  2.06it/s]


RuntimeError: ignored

In [167]:
# Example of target with class indices
loss = nn.CrossEntropyLoss()
input = torch.randn(3, 5, requires_grad=True)
target = torch.empty(3, dtype=torch.long).random_(5)
output = loss(input, target)
output.backward()

# Example of target with class probabilities
# input = torch.randn(3, 5, requires_grad=True)
# target = torch.randn(3, 5).softmax(dim=1)
# output = loss(input, target)
# output.backward()

In [168]:
input

tensor([[ 1.4129, -0.1879, -0.2339,  1.0621, -1.1530],
        [-0.3082,  0.3359, -0.2390,  0.2534, -0.4925],
        [-0.7778, -0.6368,  0.6370,  0.2805, -0.0027]], requires_grad=True)

In [169]:
target

tensor([2, 2, 4])

### 추론

In [None]:
# inference
result = []
model.eval()
with torch.no_grad():
    for batch_id, (token_ids, valid_length, segment_ids) in tqdm(enumerate(test_dataloader), total=len(test_dataloader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        result.append(model(token_ids, valid_length, segment_ids))

In [None]:
result_ = []
for i in result:
    for j in i:
        result_.append(int(torch.argmax(j)))
        
out = [list(label_dict.keys())[_] for _ in result_]
submission["label"] = out
submission.to_csv("sample_submission.csv", index = False)

In [None]:
submission.sample(3)