# DataLoad

In [None]:
import pandas as pd
# Path of the training CSV (under /content/drive — presumably a mounted Google Drive in Colab; confirm).
train_csv_path = "/content/drive/MyDrive/AIHUB/감성대화/Training/감성대화말뭉치_Training.csv"
# Later cells read the Q and A columns of this frame.
df_train = pd.read_csv(train_csv_path, encoding='utf-8')

# Importing

In [None]:
# Install the third-party packages imported further down (mxnet, gluonnlp, transformers,
# sentencepiece) and KoBERT from the master branch of its GitHub repository.
# NOTE(review): no versions are pinned, so a fresh run may resolve different releases
# than the ones this notebook was developed against — confirm and pin.
!pip install mxnet
!pip install gluonnlp
!pip install transformers
!pip install sentencepiece
!pip install git+https://git@github.com/SKTBrain/KoBERT.git@master

Collecting git+https://****@github.com/SKTBrain/KoBERT.git@master
  Cloning https://****@github.com/SKTBrain/KoBERT.git (to revision master) to /tmp/pip-req-build-ls8ya0tu
  Running command git clone -q 'https://****@github.com/SKTBrain/KoBERT.git' /tmp/pip-req-build-ls8ya0tu


In [None]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm, tqdm_notebook

In [None]:
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

In [None]:
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

In [None]:
# Use the first CUDA GPU when one is present; otherwise fall back to the CPU so the
# notebook does not fail at the first `.to(device)` call on a GPU-less machine.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# One KoBERT encoder for contexts and one for responses, loaded with two separate calls
# (presumably independent weights — confirm). The second vocabulary is discarded.
context_bert, vocab = get_pytorch_kobert_model()
response_bert, _ = get_pytorch_kobert_model()

# Wrap the KoBERT tokenizer resource in a gluonnlp SentencePiece tokenizer;
# lower=False keeps the original casing.
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

using cached model
using cached model
using cached model
using cached model
using cached model


# DataLoader

In [None]:
from torch.utils.data import Dataset, DataLoader
import time
import random
import numpy as np

class BERTDataset(Dataset):
    """Context/response pair dataset with in-group negative sampling.

    The rows of ``dataset`` are shuffled and cut into groups of
    ``sampling_size`` rows. Inside a group every context (column ``Q``) is
    paired with every response (column ``A``); the pair built from the same
    row is labelled 1, every other pair 0.

    ``dataset_index`` has shape
    ``[len(dataset) // sampling_size, sampling_size ** 2, 3]`` and each entry
    is ``[context_position, response_position, label]``. Rows that do not fill
    a complete group are left out for that shuffle.

    Args:
        dataset: frame exposing ``Q`` and ``A`` columns.
        sampling_size: number of rows per negative-sampling group.
        bert_tokenizer: tokenizer handed to ``BERTSentenceTransform``.
        max_len: forwarded as ``max_seq_length``.
        pad: forwarded as ``pad``.
        pair: forwarded as ``pair``.
    """

    def __init__(self, dataset, sampling_size, bert_tokenizer, max_len, pad, pair):
        self.transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)
        self.sampling_size = sampling_size
        self.dataset = dataset
        self.dataset_index = self.negative_sampling_indexing(self.sampling_size, self.dataset)

        # Every sentence is transformed once; re-sampling only re-indexes these lists.
        self.transformed_sentences = self.transformation(self.dataset.Q)
        self.transformed_responses = self.transformation(self.dataset.A)
        self.sentences = self.get_utter(self.transformed_sentences, self.dataset_index[:, :, 0])
        self.responses = self.get_utter(self.transformed_responses, self.dataset_index[:, :, 1])
        self.labels = self.dataset_index[:, :, 2].flatten()

    def get_utter(self, transformed_data, dataset_index):
        """Return ``transformed_data`` items in the flattened order of ``dataset_index``."""
        print(f"get utterance...")
        return [transformed_data[ins] for row in dataset_index for ins in row]

    def negative_sampling_indexing(self, num, df):
        """Shuffle the row positions of ``df`` and build all in-group pairs.

        Args:
            num: group size; each group yields ``num ** 2`` pairs.
            df: any sized object; only ``len(df)`` is used.

        Returns:
            int32 array of shape ``[len(df) // num, num ** 2, 3]`` holding
            ``[context_position, response_position, label]``.
        """
        print(f"indexing...")
        # Sample 0-based positions rather than index labels: get_utter uses these
        # values to index plain Python lists, so labels such as 1000..1999 (from a
        # slice like iloc[1000:2000]) would point at the wrong item or out of range.
        positions = list(range(len(df)))
        random.shuffle(positions)
        start = time.time()

        n_groups = len(positions) // num
        groups = np.array(positions[:n_groups * num], dtype='int32').reshape(n_groups, num)
        # Context varies slowest (each one repeated num times), response cycles
        # through the group, and the label is the flattened identity matrix.
        contexts = np.repeat(groups, num, axis=1)
        responses = np.tile(groups, (1, num))
        labels = np.tile(np.eye(num, dtype='int32').reshape(1, -1), (n_groups, 1))

        print(f"shuffle and batch : {round(time.time()-start,3)}s")
        return np.stack([contexts, responses, labels], axis=2)

    def transformation(self, data_to_transform):
        """Apply the BERT sentence transform to every item, one sentence at a time."""
        print(f"transforming...")
        return [self.transform([d]) for d in data_to_transform]

    def re_initializing(self):
        """Draw a new shuffle and rebuild the pair lists without re-tokenising."""
        start = time.time()
        self.dataset_index = self.negative_sampling_indexing(self.sampling_size, self.dataset)
        self.sentences = self.get_utter(self.transformed_sentences, self.dataset_index[:, :, 0])
        self.responses = self.get_utter(self.transformed_responses, self.dataset_index[:, :, 1])
        # Refresh the labels from the same index so the three lists cannot drift apart.
        self.labels = self.dataset_index[:, :, 2].flatten()
        print(f"reinitialized! {round(time.time()-start,3)}s")

    def __getitem__(self, i):
        return (self.sentences[i], self.responses[i], (self.labels[i]))

    def __len__(self):
        return (len(self.labels))


In [None]:
# Maximum tokenised sequence length and mini-batch size.
max_len = 64
batch_size = 10

# First 1000 rows of the corpus, negative-sampled in groups of 10 rows
# (10 * 10 pairs per group); sentences are padded and encoded one at a time.
train_dataset = BERTDataset(
    dataset=df_train.iloc[:1000],
    sampling_size=10,
    bert_tokenizer=tok,
    max_len=max_len,
    pad=True,
    pair=False,
)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size)

indexing...
shuffle and batch : 0.005s
transforming...
transforming...
get utterance...
get utterance...


# Encoder

In [None]:
class BiEncoder(nn.Module):
    """Scores a (context, response) pair with two separate BERT encoders.

    The pooled output of each encoder is concatenated, optionally passed
    through dropout, projected by a linear layer to ``num_classes`` values and
    squashed with a sigmoid.

    Args:
        context_bert: encoder for the context; must expose ``pooler.dense.out_features``
            and return ``(sequence_output, pooled_output)`` when called.
        response_bert: encoder for the response, same requirements.
        num_classes: width of the output layer.
        dr_rate: dropout probability; dropout is skipped when falsy.
        params: accepted but not used.
    """

    def __init__(self, context_bert, response_bert, num_classes=1, dr_rate=None, params=None):
        super().__init__()
        self.context_bert = context_bert
        self.response_bert = response_bert
        self.dr_rate = dr_rate

        context_dim = self.context_bert.pooler.dense.out_features
        response_dim = self.response_bert.pooler.dense.out_features
        self.classifier = nn.Linear(context_dim + response_dim, num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)
        self.sigmoid = nn.Sigmoid()

    def gen_attention_mask(self, token_ids, valid_length):
        """Return a float mask shaped like ``token_ids`` with ones on the first ``valid_length[i]`` slots of row ``i``."""
        mask = torch.zeros_like(token_ids)
        for row_idx, length in enumerate(valid_length):
            mask[row_idx, :length] = 1
        return mask.float()

    def _pooled(self, bert, token_ids, segment_ids, valid_length):
        """Run one encoder and return only its pooled output."""
        mask = self.gen_attention_mask(token_ids, valid_length)
        _, pooled = bert(input_ids=token_ids,
                         token_type_ids=segment_ids.long(),
                         attention_mask=mask.to(token_ids.device))
        return pooled

    def forward(self, token_ids_cnt, segment_ids_cnt, valid_length_cnt,
                token_ids_rsp, segment_ids_rsp, valid_length_rsp):
        """Return sigmoid scores of shape ``(batch, num_classes)`` for the given pairs."""
        pooled_cnt = self._pooled(self.context_bert, token_ids_cnt, segment_ids_cnt, valid_length_cnt)
        pooled_rsp = self._pooled(self.response_bert, token_ids_rsp, segment_ids_rsp, valid_length_rsp)

        features = torch.cat((pooled_cnt, pooled_rsp), 1)
        if self.dr_rate:
            features = self.dropout(features)
        return self.sigmoid(self.classifier(features))

# Build the bi-encoder from the two loaded encoders with dropout p=0.5
# ahead of the classifier, then move it to the selected device.
model = BiEncoder(context_bert=context_bert, response_bert=response_bert, dr_rate=0.5)
model = model.to(device)
model

BiEncoder(
  (context_bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(8002, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=T

# Parameters


In [None]:
# Training hyper-parameters
max_len = 64          # maximum tokenised sequence length
warmup_ratio = 0.1    # share of all optimizer steps spent on warm-up
num_epochs = 5        # passes over the training set
max_grad_norm = 1     # gradient-norm clipping threshold
log_interval = 100    # print a progress line every this many batches
# 0.01 is far above the range used for fine-tuning BERT (2e-5 .. 5e-5); with it the
# logged loss stayed around 0.33, i.e. roughly the entropy of the 10% positive prior.
learning_rate = 5e-5

In [None]:
# Configure the optimizer and the learning-rate schedule.
no_decay = ['bias', 'LayerNorm.weight']


def _split_params_by_decay(named_parameters, skip_markers):
    """Split parameters into (decayed, undecayed) by substring match on their names."""
    decayed, undecayed = [], []
    for name, param in named_parameters:
        bucket = undecayed if any(marker in name for marker in skip_markers) else decayed
        bucket.append(param)
    return decayed, undecayed


_decayed, _undecayed = _split_params_by_decay(model.named_parameters(), no_decay)
# Parameters whose name contains a `no_decay` marker are exempt from weight decay.
optimizer_grouped_parameters = [
    {'params': _decayed, 'weight_decay': 0.01},
    {'params': _undecayed, 'weight_decay': 0.0},
]

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.BCELoss()

# One scheduler step per batch over all epochs; the first `warmup_ratio` share warms up.
t_total = num_epochs * len(train_dataloader)
warmup_step = int(t_total * warmup_ratio)

scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_step,
    num_training_steps=t_total,
)

# 정확도 측정을 위한 함수 정의

In [None]:
def calc_accuracy(X, Y):
    """Return the fraction of predictions in ``X`` that match the labels ``Y``.

    Args:
        X: model outputs. With more than one column the prediction is the
            arg-max over dim 1. With a single column (or a 1-D tensor) the
            values are treated as probabilities and thresholded at 0.5.
        Y: target labels, one per row of ``X``.

    Returns:
        Accuracy as a float in [0, 1]; 0.0 when there are no labels.
    """
    if X.dim() > 1 and X.size(-1) > 1:
        predictions = X.argmax(dim=1)
    else:
        # Arg-max over a single column is always 0, which would make the metric
        # equal to the share of negative labels regardless of the scores.
        predictions = (X.reshape(-1) >= 0.5).long()
    targets = Y.reshape(-1).long()
    if targets.numel() == 0:
        return 0.0
    return (predictions == targets).sum().item() / targets.numel()

# Train

In [None]:
for e in range(num_epochs):
    # Draw fresh negative samples for this epoch and rebuild the loader over them.
    train_dataset.re_initializing()
    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size)
    train_acc = 0.0
    num_batches = 0
    model.train()
    for batch_id, (context_input, response_input, label) in enumerate(tqdm_notebook(train_dataloader)):
        # Each transformed sentence unpacks as (token ids, valid length, segment ids).
        token_ids_cnt, valid_length_cnt, segment_ids_cnt = context_input
        token_ids_rsp, valid_length_rsp, segment_ids_rsp = response_input
        optimizer.zero_grad()

        # Valid lengths are passed through as loaded; the model only iterates over them.
        token_ids_cnt = token_ids_cnt.long().to(device)
        segment_ids_cnt = segment_ids_cnt.long().to(device)
        token_ids_rsp = token_ids_rsp.long().to(device)
        segment_ids_rsp = segment_ids_rsp.long().to(device)

        label = label.float().to(device)

        out = model(token_ids_cnt, segment_ids_cnt, valid_length_cnt,
                    token_ids_rsp, segment_ids_rsp, valid_length_rsp)

        # The classifier emits (batch, 1) while labels are (batch,). BCELoss requires
        # identical shapes (older PyTorch only warned, newer raises), so flatten both.
        loss = loss_fn(out.reshape(-1), label.reshape(-1))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        num_batches = batch_id + 1
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))

    # max(..., 1) keeps the summary line from failing when the loader yields no batches.
    print("epoch {} train acc {}".format(e+1, train_acc / max(num_batches, 1)))

    # TODO: add a per-epoch evaluation pass once a held-out dataloader exists;
    # no `test_dataloader` is defined in the visible notebook.

indexing...
shuffle and batch : 0.004s
get utterance...
get utterance...
reinitialized! 0.018s


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  import sys


  0%|          | 0/1000 [00:00<?, ?it/s]

  return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)


epoch 1 batch id 1 loss 0.716304361820221 train acc 0.9
epoch 1 batch id 101 loss 0.30035343766212463 train acc 0.9000000000000009
epoch 1 batch id 201 loss 0.43312349915504456 train acc 0.9000000000000032
epoch 1 batch id 301 loss 0.37687134742736816 train acc 0.9000000000000026
epoch 1 batch id 401 loss 0.3880786895751953 train acc 0.8999999999999962
epoch 1 batch id 501 loss 0.33543866872787476 train acc 0.8999999999999925
epoch 1 batch id 601 loss 0.3242604732513428 train acc 0.8999999999999899
epoch 1 batch id 701 loss 0.4002767503261566 train acc 0.8999999999999881
epoch 1 batch id 801 loss 0.365792840719223 train acc 0.8999999999999868
epoch 1 batch id 901 loss 0.3120831549167633 train acc 0.8999999999999857
epoch 1 train acc 0.8999999999999849
indexing...
shuffle and batch : 0.004s
get utterance...
get utterance...
reinitialized! 0.017s


  0%|          | 0/1000 [00:00<?, ?it/s]

epoch 2 batch id 1 loss 0.44076815247535706 train acc 0.9
epoch 2 batch id 101 loss 0.24057359993457794 train acc 0.9000000000000009
epoch 2 batch id 201 loss 0.2871244549751282 train acc 0.9000000000000032
epoch 2 batch id 301 loss 0.45161694288253784 train acc 0.9000000000000026
epoch 2 batch id 401 loss 0.3355695307254791 train acc 0.8999999999999962
epoch 2 batch id 501 loss 0.2913588583469391 train acc 0.8999999999999925
epoch 2 batch id 601 loss 0.28698015213012695 train acc 0.8999999999999899
epoch 2 batch id 701 loss 0.35846084356307983 train acc 0.8999999999999881
epoch 2 batch id 801 loss 0.30579033493995667 train acc 0.8999999999999868
epoch 2 batch id 901 loss 0.3271413743495941 train acc 0.8999999999999857
epoch 2 train acc 0.8999999999999849
indexing...
shuffle and batch : 0.003s
get utterance...
get utterance...
reinitialized! 0.017s


  0%|          | 0/1000 [00:00<?, ?it/s]

epoch 3 batch id 1 loss 0.2915363907814026 train acc 0.9
epoch 3 batch id 101 loss 0.3742982745170593 train acc 0.9000000000000009
epoch 3 batch id 201 loss 0.41195282340049744 train acc 0.9000000000000032
epoch 3 batch id 301 loss 0.3531269431114197 train acc 0.9000000000000026
epoch 3 batch id 401 loss 0.2616497874259949 train acc 0.8999999999999962
epoch 3 batch id 501 loss 0.3172745704650879 train acc 0.8999999999999925
epoch 3 batch id 601 loss 0.3290972113609314 train acc 0.8999999999999899
epoch 3 batch id 701 loss 0.35321205854415894 train acc 0.8999999999999881
epoch 3 batch id 801 loss 0.39585983753204346 train acc 0.8999999999999868
epoch 3 batch id 901 loss 0.2751792073249817 train acc 0.8999999999999857
epoch 3 train acc 0.8999999999999849
indexing...
shuffle and batch : 0.003s
get utterance...
get utterance...
reinitialized! 0.017s


  0%|          | 0/1000 [00:00<?, ?it/s]

epoch 4 batch id 1 loss 0.35044437646865845 train acc 0.9
epoch 4 batch id 101 loss 0.38359689712524414 train acc 0.9000000000000009
epoch 4 batch id 201 loss 0.3252977132797241 train acc 0.9000000000000032
epoch 4 batch id 301 loss 0.3655390739440918 train acc 0.9000000000000026
epoch 4 batch id 401 loss 0.28963255882263184 train acc 0.8999999999999962
epoch 4 batch id 501 loss 0.3408462703227997 train acc 0.8999999999999925
epoch 4 batch id 601 loss 0.3620067834854126 train acc 0.8999999999999899
epoch 4 batch id 701 loss 0.315118670463562 train acc 0.8999999999999881
epoch 4 batch id 801 loss 0.27874261140823364 train acc 0.8999999999999868
epoch 4 batch id 901 loss 0.29494887590408325 train acc 0.8999999999999857
epoch 4 train acc 0.8999999999999849
indexing...
shuffle and batch : 0.003s
get utterance...
get utterance...
reinitialized! 0.019s


  0%|          | 0/1000 [00:00<?, ?it/s]

epoch 5 batch id 1 loss 0.3859056234359741 train acc 0.9
epoch 5 batch id 101 loss 0.35576027631759644 train acc 0.9000000000000009
epoch 5 batch id 201 loss 0.386268675327301 train acc 0.9000000000000032
epoch 5 batch id 301 loss 0.42352741956710815 train acc 0.9000000000000026
epoch 5 batch id 401 loss 0.3448570966720581 train acc 0.8999999999999962
epoch 5 batch id 501 loss 0.28170740604400635 train acc 0.8999999999999925
epoch 5 batch id 601 loss 0.39323538541793823 train acc 0.8999999999999899
epoch 5 batch id 701 loss 0.3356925845146179 train acc 0.8999999999999881
epoch 5 batch id 801 loss 0.2650824785232544 train acc 0.8999999999999868
epoch 5 batch id 901 loss 0.45069870352745056 train acc 0.8999999999999857
epoch 5 train acc 0.8999999999999849


# Inference

In [None]:
# Keep the Q utterances that pandas flags as duplicates (every occurrence after the
# first of a repeated text), renumbered from 0.
# NOTE(review): this keeps only the repeats, not a de-duplicated list — confirm intent.
is_repeat = df_train["Q"].duplicated()
reference_data = df_train.loc[is_repeat, "Q"].reset_index(drop=True)
reference_data

0                             아내분이 출산을 하시는군요. 정말 축하드려요.
1                                         아 지금 정말 신이 나.
2                                             잘 된 일이네요.
3                    아기가 점점 클게 벌써 기대가 되네. 내가 많이 놀아줘야지. 
4                                    약 종류가 많아 번거로우시겠어요.
                              ...                      
157878               운동으로 뭉친 근육을 풀어주는 것 같아. 그럼 덜 피로하겠지.
157879       많이 서운하셨겠어요. 친구들에게 서운함을 느꼈을 때 해결하는 방법이 있나요?
157880               카페에 가서 대화하거나 같이 술 마시면서 이야기했던 것 같아.
157881    대화로 문제를 해결해오셨던 거군요? 그렇다면 이번 일에는 어떻게 하실 예정인가요?
157882                         직접 서운한 감정을 친구에게 얘기하려고 해.
Name: Q, Length: 157883, dtype: object

In [None]:
from torch.utils.data import Dataset, DataLoader
import time
import random
import numpy as np

class InferenceSet(Dataset):
    """Pairs one transformed input sentence with every transformed candidate sentence.

    Item ``i`` is ``(transformed_input, transformed_sentences[i])``, where
    ``transformed_input`` is a one-element list holding the transformed query.
    The length equals the number of candidates.
    """

    def __init__(self, input_sentence, dataset, bert_tokenizer, max_len, pad, pair):
        self.transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer,
            max_seq_length=max_len,
            pad=pad,
            pair=pair,
        )
        self.dataset = dataset
        self.input_sentence = input_sentence
        # Candidates first, then the query wrapped in a single-item list.
        self.transformed_sentences = self.transformation(self.dataset)
        self.transformed_input = self.transformation([self.input_sentence])

    def transformation(self, data_to_transform):
        """Apply the sentence transform to each item and return the results as a list."""
        print(f"transforming...")
        transformed = []
        for sentence in data_to_transform:
            transformed.append(self.transform([sentence]))
        return transformed

    def __getitem__(self, i):
        candidate = self.transformed_sentences[i]
        return (self.transformed_input, candidate)

    def __len__(self):
        return len(self.transformed_sentences)

# Sample Korean input utterance for the inference section.
input_text = "내 얼굴에 뭐가 묻어서 보는거니?"


transforming...
transforming...


In [None]:
def inference(input_sentence, transform, model, dataset=None):
    """Score ``input_sentence`` against every candidate sentence with ``model``.

    Args:
        input_sentence: query text, used as the context side of each pair.
        transform: accepted for interface compatibility; not used, because
            ``InferenceSet`` builds its own sentence transform.
        model: scorer called with the context and response tensors.
        dataset: iterable of candidate sentences. Defaults to the first 1000
            entries of the notebook-level ``reference_data``.

    Returns:
        dict with ``"index"`` (batch ids) and ``"score"`` (one CPU tensor of
        model outputs per batch).
    """
    if dataset is None:
        # The original referenced `df_train_du`, which is never defined in the notebook;
        # `reference_data` is the candidate series built in the Inference section.
        dataset = reference_data.iloc[:1000]
    scoring = {"index": [], "score": []}

    print("constructing....")
    inference_set = InferenceSet(input_sentence, dataset, tok, max_len, True, False)
    inference_dataloader = torch.utils.data.DataLoader(inference_set, batch_size=batch_size)
    print("complete!")

    print("inference...")
    model.eval()

    # No gradients are needed for scoring; without no_grad every stored output would
    # keep its autograd graph (and the GPU memory behind it) alive.
    with torch.no_grad():
        for batch_id, (context_input, response_input) in enumerate(tqdm_notebook(inference_dataloader)):
            # InferenceSet wraps the query in a one-element list, hence the [0].
            token_ids_cnt, valid_length_cnt, segment_ids_cnt = context_input[0]
            token_ids_rsp, valid_length_rsp, segment_ids_rsp = response_input
            token_ids_rsp = token_ids_rsp.long().to(device)
            segment_ids_rsp = segment_ids_rsp.long().to(device)
            token_ids_cnt = token_ids_cnt.long().to(device)
            segment_ids_cnt = segment_ids_cnt.long().to(device)

            out = model(token_ids_cnt, segment_ids_cnt, valid_length_cnt,
                        token_ids_rsp, segment_ids_rsp, valid_length_rsp)
            scoring['index'].append(batch_id)
            # Keep the scores on the CPU so the result does not pin GPU memory.
            scoring['score'].append(out.cpu())
    return scoring


sentence_transform = nlp.data.BERTSentenceTransform(tok, max_seq_length=max_len, pad=True, pair=False)
# `input_text` is the query defined above; the original call passed the undefined
# name `input_sentence`.
result = inference(input_text, sentence_transform, model)
result