In [None]:
!pip install gluonnlp pandas tqdm   
!pip install mxnet
!pip install sentencepiece
!pip install transformers
!pip install torch
!pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gluonnlp
  Downloading gluonnlp-0.10.0.tar.gz (344 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m344.5/344.5 KB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: gluonnlp
  Building wheel for gluonnlp (setup.py) ... [?25l[?25hdone
  Created wheel for gluonnlp: filename=gluonnlp-0.10.0-cp38-cp38-linux_x86_64.whl size=688994 sha256=e3d8983d68fb371c69ef332e21a4e16ed2766ef4ca1752d7cfc52a3bfced8067
  Stored in directory: /root/.cache/pip/wheels/b6/93/9d/2237550c409eb3ed725d6302b7897ddd9a037b40cef66dcd9c
Successfully built gluonnlp
Installing collected packages: gluonnlp
Successfully installed gluonnlp-0.10.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting mxnet
  Downloading mxnet-1.9.1-py3-none-manylinux2014

In [None]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm, tqdm_notebook
from tqdm.auto import tqdm as auto_tqdm
import pandas as pd
from sklearn.model_selection import train_test_split
from kobert_tokenizer import KoBERTTokenizer
from transformers import BertModel
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup
from google.colab import drive

In [None]:
# Mount Google Drive so files stored there are reachable under /content/drive.
drive.mount('/content/drive')
# Use the first CUDA GPU when one is present; otherwise fall back to the CPU so
# the notebook still runs (slowly) on a runtime without a GPU instead of
# failing on the first `.to(device)` call.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

Mounted at /content/drive


In [None]:
# --- Hyperparameters -------------------------------------------------------
max_len = 64            # max_seq_length handed to the BERT sentence transform
batch_size = 64         # batch size used by every DataLoader in this notebook
num_epochs = 5          # number of passes over the training split
learning_rate = 5e-5    # learning rate passed to the optimizer
warmup_ratio = 0.1      # fraction of all optimizer steps spent warming up
max_grad_norm = 1       # threshold for gradient-norm clipping
log_interval = 200      # print training progress every this many batches

# --- Pretrained KoBERT assets ------------------------------------------------
PRETRAINED_NAME = 'skt/kobert-base-v1'
tokenizer = KoBERTTokenizer.from_pretrained(PRETRAINED_NAME)
# return_dict=False makes the encoder return a plain tuple, which the
# classifier's forward pass unpacks positionally.
bertmodel = BertModel.from_pretrained(PRETRAINED_NAME, return_dict=False)
# Build the gluonnlp vocabulary from the tokenizer's sentencepiece model file.
vocab = nlp.vocab.BERTVocab.from_sentencepiece(tokenizer.vocab_file, padding_token='[PAD]')
# Bare tokenize callable, passed around as the "bert_tokenizer" argument.
tok = tokenizer.tokenize

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/371k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/244 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/432 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.


Downloading (…)lve/main/config.json:   0%|          | 0.00/535 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/369M [00:00<?, ?B/s]

In [None]:
class BERTDataset(Dataset):
    """Dataset that eagerly converts every row into BERT inputs plus a label.

    Each row of ``dataset`` is indexable; ``row[sent_idx]`` is the text and
    ``row[label_idx]`` is the label (anything ``float()`` accepts, e.g. the
    string ``"5.0"``). Indexing the dataset returns the sentence transform's
    output tuple with the ``np.int32`` label appended as the last element.
    """

    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer,vocab, max_len,
                 pad, pair):
        to_bert_input = nlp.data.BERTSentenceTransform(
            bert_tokenizer,
            max_seq_length=max_len,
            vocab=vocab,
            pad=pad,
            pair=pair,
        )

        # First pass over the rows: transform the text column.
        encoded = []
        for row in dataset:
            encoded.append(to_bert_input([row[sent_idx]]))
        self.sentences = encoded

        # Second pass: parse labels through float first so "5.0"-style strings work.
        targets = []
        for row in dataset:
            targets.append(np.int32(float(row[label_idx])))
        self.labels = targets

    def __getitem__(self, i):
        features = self.sentences[i]
        target = self.labels[i]
        return features + (target,)

    def __len__(self):
        return len(self.labels)

In [None]:
class BERTClassifier(nn.Module):
    """BERT encoder followed by optional dropout and a linear classification head.

    Args:
        bert: Encoder module. It is called with the keywords ``input_ids``,
            ``token_type_ids`` and ``attention_mask`` and must return a 2-tuple
            whose second element is the pooled sentence representation.
        hidden_size: Width of the pooled representation fed to the linear layer.
        num_classes: Number of output classes (number of major_emotion labels).
        dr_rate: Dropout probability applied to the pooled output. ``None`` or
            ``0`` disables dropout.
        params: Unused; kept so existing callers that pass it keep working.
    """

    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=6, # number of major_emotion labels
                 dr_rate=None,
                 params=None):
        super().__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Build a float mask with ones on the first ``valid_length[i]`` positions of row ``i``.

        Args:
            token_ids: 2-D tensor of token ids; the mask has the same shape and device.
            valid_length: Per-row count of real (non-padding) tokens.

        Returns:
            Float tensor shaped like ``token_ids`` holding 1.0 / 0.0.
        """
        # Compare each column index against the row's valid length in one
        # broadcasted operation instead of looping over rows in Python.
        positions = torch.arange(token_ids.size(1), device=token_ids.device)
        lengths = torch.as_tensor(valid_length, device=token_ids.device).reshape(-1, 1)
        return (positions.unsqueeze(0) < lengths).float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return unnormalised class scores for a batch.

        Args:
            token_ids: 2-D tensor of token ids.
            valid_length: Per-row count of real tokens, used to build the attention mask.
            segment_ids: Token-type ids shaped like ``token_ids``.

        Returns:
            Output of the linear head applied to the (optionally dropped-out)
            pooled encoder output.
        """
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask)
        # Start from the pooled output so `out` is always defined; previously a
        # model built without dropout crashed here with an unbound local.
        out = pooler
        if self.dr_rate:
            out = self.dropout(out)
        return self.classifier(out)

In [None]:
def calc_accuracy(X,Y):
    """Return the fraction of rows of ``X`` whose arg-max along dim 1 equals ``Y``.

    Args:
        X: 2-D tensor of per-class scores, one row per sample.
        Y: 1-D tensor of target class indices.

    Returns:
        Number of matching rows divided by the number of rows (a NumPy scalar).
    """
    _, predicted = X.max(dim=1)
    n_correct = (predicted == Y).sum().data.cpu().numpy()
    n_total = predicted.size(0)
    return n_correct / n_total
    
def predict(sentence):
    """Classify one sentence with the notebook-level ``model`` and return its label index.

    Relies on the notebook globals ``model``, ``tok``, ``vocab``, ``max_len``,
    ``batch_size`` and ``device``. Puts ``model`` into eval mode as a side effect.

    Args:
        sentence: Raw input text.

    Returns:
        The arg-max class index (as produced by ``np.argmax``) for the sentence.
    """
    # BERTDataset requires a label column; '0' is a placeholder that is never read back.
    dataset = [[sentence, '0']]
    test = BERTDataset(dataset, 0, 1, tok, vocab, max_len, True, False)
    # A single sentence does not justify spawning worker processes.
    test_dataloader = torch.utils.data.DataLoader(test, batch_size=batch_size, num_workers=0)
    model.eval()
    answer = 0
    # Inference only: disable autograd so no graph is built or kept in memory.
    with torch.no_grad():
        for token_ids, valid_length, segment_ids, _ in test_dataloader:
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            # As before, the answer is the arg-max of the last row produced.
            answer = np.argmax(out[-1].detach().cpu().numpy())
    return answer

In [None]:
%cd './drive/MyDrive/Colab Notebooks/ufriends-chatbot/'

/content/drive/MyDrive/Colab Notebooks/ufriends-chatbot


In [None]:
!pwd

/content/drive/MyDrive/Colab Notebooks/ufriends-chatbot


In [None]:
# Read the preprocessed training and validation splits; the paths are relative
# to the current working directory.
train_csv = 'Dataset/train_df_preprocess.csv'
validation_csv = 'Dataset/validation_df_preprocess.csv'
train_df = pd.read_csv(train_csv)
validation_df = pd.read_csv(validation_csv)

In [None]:
# Inspect the loaded training frame (rendered as this cell's output).
train_df

Unnamed: 0,keywords,major_emotions,user1,chatbot1
0,2.0,5.0,아내가 드디어 출산하게 되어서 정말 신이 나,아내분이 출산을 하시는군요 정말 축하드려요
1,3.0,2.0,당뇨랑 합병증 때문에 먹어야 할 약이 열 가지가 넘어가니까 스트레스야,약 종류가 많아 번거로우시겠어요
2,5.0,4.0,고등학교에 올라오니 중학교 때보다 수업이 갑자기 어려워져서 당황스러워,고등학교 수업이 중학교 때와는 다르게 갑자기 어려워서 당황스러우시군요
3,4.0,5.0,재취업이 돼서 받게 된 첫 월급으로 온 가족이 외식을 할 예정이야 너무 행복해,재취업 후 첫 월급이라 정말 기쁘시겠어요
4,4.0,5.0,빚을 드디어 다 갚게 되어서 이제야 안도감이 들어,기분 좋으시겠어요 앞으로는 어떻게 하실 생각이신가요
...,...,...,...,...
38962,3.0,5.0,지난주에 건강검진 결과가 나왔는데 정상이라고 결과가 나왔어,검진 결과가 좋게 나와서 안도하겠네요
38963,7.0,1.0,엄마는 내 꿈인 작가를 응원해 주고는 했는데 지금은 안 그래 너무 슬퍼,어머니의 응원이 사라져서 슬프시겠어요 조금 더 자세히 말해주시겠어요
38964,4.0,5.0,이렇게 좋은 운동 시설에서 경로 우대로 운동할 수 있다니 참 행운이야,운동 시설을 쓸 수 있어서 기분이 좋으시겠어요
38965,0.0,2.0,친구 관계가 너무 힘들어 베푸는 만큼 돌아오지 않는 것 같아,많이 서운하셨겠어요 친구들에게 서운함을 느꼈을 때 해결하는 방법이 있나요


In [None]:
# Keep only the label column and the `user1` text column used for classification.
model_columns = ['major_emotions', 'user1']
train_set = train_df.loc[:, model_columns]
validation_set = validation_df.loc[:, model_columns]

In [None]:
# Discard rows with a missing label or missing text in either split.
train_set = train_set.dropna()
validation_set = validation_set.dropna()

In [None]:
# Pair each utterance with its label as [text, label-string] rows, the layout
# BERTDataset is indexed with below (sent_idx=0, label_idx=1).
train_set_data = [[i, str(j)] for i, j in zip(train_set['user1'], train_set['major_emotions'])]
validation_set_data = [[i, str(j)] for i, j in zip(validation_set['user1'], validation_set['major_emotions'])]
# NOTE(review): validation_set_data is built here but the loaders below do not
# use it; the held-out split is carved out of the training rows instead.

# Hold out 20% of the training rows; random_state pins the split.
train_set_data, test_set_data = train_test_split(train_set_data, test_size = 0.2, random_state=4)

train_set_data = BERTDataset(train_set_data, 0, 1, tok, vocab, max_len, pad=True, pair=False)
test_set_data = BERTDataset(test_set_data, 0, 1, tok, vocab, max_len, pad=True, pair=False)
# Reshuffle the training batches every epoch so the model does not see the
# samples in the same fixed order each time; evaluation order stays fixed.
train_dataloader = torch.utils.data.DataLoader(train_set_data, batch_size=batch_size, num_workers=2, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(test_set_data, batch_size=batch_size, num_workers=2)

In [None]:
model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)

# Prepare optimizer and schedule (warmup followed by cosine decay).
# Parameters whose name contains 'bias' or 'LayerNorm.weight' get no weight decay.
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()
# The scheduler is stepped once per batch, so the total is batches * epochs.
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

# Train for `num_epochs` epochs, evaluating on the held-out split after each one.
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0

    # train set
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(auto_tqdm(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        # Clip before stepping so one bad batch cannot blow up the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        # Accumulate per-batch accuracy; it is averaged over batches when reported.
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.item(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))

    # test set
    model.eval()
    # Evaluation needs no gradients: skip autograd to save memory and time.
    with torch.no_grad():
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(auto_tqdm(test_dataloader)):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            label = label.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            test_acc += calc_accuracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/488 [00:00<?, ?it/s]

epoch 1 batch id 1 loss 1.870856761932373 train acc 0.140625
epoch 1 batch id 201 loss 1.3948308229446411 train acc 0.2814832089552239
epoch 1 batch id 401 loss 1.0509400367736816 train acc 0.42565461346633415
epoch 1 train acc 0.4551421618852459


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/122 [00:00<?, ?it/s]

epoch 1 test acc 0.6155379098360656


  0%|          | 0/488 [00:00<?, ?it/s]

epoch 2 batch id 1 loss 1.1282193660736084 train acc 0.59375
epoch 2 batch id 201 loss 1.1133124828338623 train acc 0.5982587064676617
epoch 2 batch id 401 loss 0.9728301763534546 train acc 0.6235193266832918
epoch 2 train acc 0.6291944159836066


  0%|          | 0/122 [00:00<?, ?it/s]

epoch 2 test acc 0.6296823770491804


  0%|          | 0/488 [00:00<?, ?it/s]

epoch 3 batch id 1 loss 0.9584364891052246 train acc 0.640625
epoch 3 batch id 201 loss 0.8757079243659973 train acc 0.6580379353233831
epoch 3 batch id 401 loss 0.835349440574646 train acc 0.6811486907730673
epoch 3 train acc 0.6857389856557377


  0%|          | 0/122 [00:00<?, ?it/s]

epoch 3 test acc 0.6335040983606557


  0%|          | 0/488 [00:00<?, ?it/s]

epoch 4 batch id 1 loss 0.8602699637413025 train acc 0.734375
epoch 4 batch id 201 loss 0.7212072610855103 train acc 0.716806592039801
epoch 4 batch id 401 loss 0.7257534265518188 train acc 0.7415056109725686
epoch 4 train acc 0.7449731045081968


  0%|          | 0/122 [00:00<?, ?it/s]

epoch 4 test acc 0.6399077868852459


  0%|          | 0/488 [00:00<?, ?it/s]

epoch 5 batch id 1 loss 0.7776737809181213 train acc 0.78125
epoch 5 batch id 201 loss 0.6665778756141663 train acc 0.7623600746268657
epoch 5 batch id 401 loss 0.6635522246360779 train acc 0.7801979426433915
epoch 5 train acc 0.7814741290983607


  0%|          | 0/122 [00:00<?, ?it/s]

epoch 5 test acc 0.6352049180327869


In [None]:
!pwd

/content/drive/MyDrive/Colab Notebooks/ufriends-chatbot


In [None]:
# Option 1: serialize the entire model object.
whole_model_path = 'Model/SentimentAnalysisKOBert.pt'
torch.save(model, whole_model_path)

# Option 2: store only the state dict (the parameter values).
state_dict_path = 'Model/SentimentAnalysisKOBert_StateDict.pt'
torch.save(model.state_dict(), state_dict_path)

In [None]:
# Label legend: 0 = anger (분노) / 1 = sadness (슬픔) / 2 = anxiety (불안) / 3 = hurt (상처) / 4 = embarrassment (당황) / 5 = joy (기쁨)

In [None]:
# Spot-check: the recorded run returned label 0 for this sentence.
sentence = '저런 사람은 친하게 지내고 싶지 않아'
predicted_label = predict(sentence)
predicted_label

0

In [None]:
# Spot-check: the recorded run returned label 1 for this sentence.
sentence = '오늘 너무 우울하다'
predicted_label = predict(sentence)
predicted_label

1

In [None]:
# Spot-check: the recorded run returned label 2 for this sentence.
sentence = '시험에서 떨어지면 어떡하지?'
predicted_label = predict(sentence)
predicted_label

2

In [None]:
# Spot-check: the recorded run returned label 3 for this sentence.
sentence = '친구랑 싸워서 속상해'
predicted_label = predict(sentence)
predicted_label

3

In [None]:
# Spot-check: the recorded run returned label 4 for this sentence.
sentence = '지갑을 잃어버렸어'
predicted_label = predict(sentence)
predicted_label

4

In [None]:
# Spot-check: the recorded run returned label 5 for this sentence.
sentence = '오늘 날씨 좋다'
predicted_label = predict(sentence)
predicted_label

5