In [1]:
!pip install gluonnlp pandas tqdm   
!pip install mxnet
!pip install sentencepiece
!pip install transformers
!pip install torch
!pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gluonnlp
  Downloading gluonnlp-0.10.0.tar.gz (344 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m344.5/344.5 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: gluonnlp
  Building wheel for gluonnlp (setup.py) ... [?25l[?25hdone
  Created wheel for gluonnlp: filename=gluonnlp-0.10.0-cp310-cp310-linux_x86_64.whl size=681108 sha256=d78402edda356caa9ed31f04ed53138d7d4acf1045d98d10e7788e3899db3aa8
  Stored in directory: /root/.cache/pip/wheels/1a/1e/0d/99f55911d90f2b95b9f7c176d5813ef3622894a4b30fde6bd3
Successfully built gluonnlp
Installing collected packages: gluonnlp
Successfully installed gluonnlp-0.10.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting mxnet
  Downloading mxnet-1.9.1-py3-none-manylinux20

In [2]:
!pip uninstall -y gluonnlp
!pip install gluonnlp==0.9.1

Found existing installation: gluonnlp 0.10.0
Uninstalling gluonnlp-0.10.0:
  Successfully uninstalled gluonnlp-0.10.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gluonnlp==0.9.1
  Downloading gluonnlp-0.9.1.tar.gz (252 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m252.8/252.8 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: gluonnlp
  Building wheel for gluonnlp (setup.py) ... [?25l[?25hdone
  Created wheel for gluonnlp: filename=gluonnlp-0.9.1-cp310-cp310-linux_x86_64.whl size=557742 sha256=ff95b3ad30114a26a569f02107fb53b84195af0ad3e3283804c60fda91ce3d6e
  Stored in directory: /root/.cache/pip/wheels/fc/5b/9c/3295bb07f7c5544a96303a48988707816f44a536e8e1413922
Successfully built gluonnlp
Installing collected packages: gluonnlp
Successfully installed gluonnlp-0.9.1


In [3]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from tqdm import tqdm, tqdm_notebook
import pandas as pd
from sklearn.model_selection import train_test_split
from kobert_tokenizer import KoBERTTokenizer
from transformers import BertModel
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup
from google.colab import drive
import gluonnlp as nlp

In [4]:
drive.mount('/content/drive')
device = torch.device("cuda:0")

Mounted at /content/drive


In [5]:
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
bertmodel = BertModel.from_pretrained('skt/kobert-base-v1', return_dict=False)
vocab = nlp.vocab.BERTVocab.from_sentencepiece(tokenizer.vocab_file, padding_token='[PAD]')
tok = tokenizer.tokenize

# Setting parameters
max_len = 64
batch_size = 64
warmup_ratio = 0.1
num_epochs = 5
max_grad_norm = 1
log_interval = 200
learning_rate =  5e-5

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/371k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/244 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/432 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.


Downloading (…)lve/main/config.json:   0%|          | 0.00/535 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/369M [00:00<?, ?B/s]

In [6]:
class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer,vocab, max_len,
                 pad, pair):
   
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len,vocab=vocab, pad=pad, pair=pair)
        
        self.sentences = [transform([i[sent_idx]]) for i in dataset]
        self.labels = [np.int32(float(i[label_idx])) for i in dataset]

    def __getitem__(self, i):
        return self.sentences[i] + (self.labels[i],)
         
    def __len__(self):
        return len(self.labels)

In [7]:
class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=8, # keywords 라벨 개수
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        
        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        if self.dr_rate:
            out = self.dropout(pooler)
        return self.classifier(out)

In [8]:
def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    return train_acc
    
def predict(sentence):
    dataset = [[sentence, '0']]
    test = BERTDataset(dataset, 0, 1, tok, vocab, max_len, True, False)
    test_dataloader = torch.utils.data.DataLoader(test, batch_size=batch_size, num_workers=2)
    model.eval()
    answer = 0
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(test_dataloader):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        for logits in out:
            logits = logits.detach().cpu().numpy()
            answer = np.argmax(logits)
    return answer

In [9]:
%cd './drive/MyDrive/Colab Notebooks/Github/ufriends-chatbot/'

/content/drive/MyDrive/Colab Notebooks/Github/ufriends-chatbot


In [10]:
!pwd

/content/drive/MyDrive/Colab Notebooks/Github/ufriends-chatbot


In [11]:
train_df = pd.read_csv('Dataset/Preprocessing/train_df_preprocess.csv')
validation_df = pd.read_csv('Dataset/Preprocessing/validation_df_preprocess.csv')

In [12]:
train_df

Unnamed: 0,keywords,major_emotions,Q,A
0,2.0,5.0,아내가 드디어 출산하게 되어서 정말 신이 나,아내분이 출산을 하시는군요 정말 축하드려요
1,3.0,2.0,당뇨랑 합병증 때문에 먹어야 할 약이 열 가지가 넘어가니까 스트레스야,약 종류가 많아 번거로우시겠어요
2,5.0,4.0,고등학교에 올라오니 중학교 때보다 수업이 갑자기 어려워져서 당황스러워,고등학교 수업이 중학교 때와는 다르게 갑자기 어려워서 당황스러우시군요
3,4.0,5.0,재취업이 돼서 받게 된 첫 월급으로 온 가족이 외식을 할 예정이야 너무 행복해,재취업 후 첫 월급이라 정말 기쁘시겠어요
4,4.0,5.0,빚을 드디어 다 갚게 되어서 이제야 안도감이 들어,기분 좋으시겠어요 앞으로는 어떻게 하실 생각이신가요
...,...,...,...,...
175139,3.0,2.0,미리 미리 건강 챙기고 모두 안 아팠으면 좋겠어,친구들과의 전화로 기분이 나아졌으면 좋겠어요
175140,4.0,4.0,주변에 믿음직한 사람들에게서 정보도 많이 얻고 달콤한 말은 항상 의심하고 볼래,신뢰가 있는 사람들에게서 정보도 얻고 하고자 하는 일도 잘되길 바라요
175141,3.0,2.0,친구들에게 내 마음을 터놓고 얘기하면 좀 나아질 것 같아,친구들과 술 한잔하시고 기분이 좋아지셨으면 좋겠어요
175142,0.0,4.0,남편에게 이런 내 마음을 솔직하게 얘기해 봐야겠어,솔직하게 얘기하셔서 외롭고 우울한 마음이 나아지셨으면 좋겠어요


In [13]:
train_set = train_df.loc[:, ['keywords', 'Q']]
validation_set = validation_df.loc[:, ['keywords', 'Q']]

In [14]:
train_set.dropna(inplace=True)
validation_set.dropna(inplace=True)

In [15]:
train_set_data = [[i, str(j)] for i, j in zip(train_set['Q'], train_set['keywords'])]
validation_set_data = [[i, str(j)] for i, j in zip(validation_set['Q'], validation_set['keywords'])]

train_set_data, test_set_data = train_test_split(train_set_data, test_size = 0.2, random_state=4)

train_set_data = BERTDataset(train_set_data, 0, 1, tok, vocab, max_len, True, False)
test_set_data = BERTDataset(test_set_data, 0, 1, tok, vocab, max_len, True, False)
train_dataloader = torch.utils.data.DataLoader(train_set_data, batch_size=batch_size, num_workers=2)
test_dataloader = torch.utils.data.DataLoader(test_set_data, batch_size=batch_size, num_workers=2)

In [16]:
model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)

# Prepare optimizer and schedule (linear warmup and decay)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

# epoch을 5로 설정
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0

    # train set
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))
    
    # test set
    model.eval()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        test_acc += calc_accuracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/2190 [00:00<?, ?it/s]

epoch 1 batch id 1 loss 2.120027542114258 train acc 0.0625
epoch 1 batch id 201 loss 1.9307321310043335 train acc 0.20942164179104478
epoch 1 batch id 401 loss 1.2753227949142456 train acc 0.3341256234413965
epoch 1 batch id 601 loss 1.3561736345291138 train acc 0.4122816139767055
epoch 1 batch id 801 loss 1.2442179918289185 train acc 0.4564021535580524
epoch 1 batch id 1001 loss 1.21511971950531 train acc 0.48284527972027974
epoch 1 batch id 1201 loss 1.3567055463790894 train acc 0.5005334096586178
epoch 1 batch id 1401 loss 0.8288720846176147 train acc 0.5145989471805853
epoch 1 batch id 1601 loss 1.0675550699234009 train acc 0.5250722204871955
epoch 1 batch id 1801 loss 1.0726985931396484 train acc 0.534139019988895
epoch 1 batch id 2001 loss 0.9659137725830078 train acc 0.5420180534732634
epoch 1 train acc 0.5472645547945206


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/548 [00:00<?, ?it/s]

epoch 1 test acc 0.6183907390510949


  0%|          | 0/2190 [00:00<?, ?it/s]

epoch 2 batch id 1 loss 0.7697555422782898 train acc 0.734375
epoch 2 batch id 201 loss 0.8571300506591797 train acc 0.6169154228855721
epoch 2 batch id 401 loss 1.0040977001190186 train acc 0.6182200748129676
epoch 2 batch id 601 loss 1.271807074546814 train acc 0.617928452579035
epoch 2 batch id 801 loss 1.1075308322906494 train acc 0.6223665730337079
epoch 2 batch id 1001 loss 1.1644971370697021 train acc 0.6251404845154845
epoch 2 batch id 1201 loss 1.2181308269500732 train acc 0.6286558076602831
epoch 2 batch id 1401 loss 0.7507872581481934 train acc 0.6315689685938616
epoch 2 batch id 1601 loss 0.8412795066833496 train acc 0.6335688632104934
epoch 2 batch id 1801 loss 0.9367702603340149 train acc 0.636148320377568
epoch 2 batch id 2001 loss 0.8375950455665588 train acc 0.6386728510744628
epoch 2 train acc 0.6398496955859969


  0%|          | 0/548 [00:00<?, ?it/s]

epoch 2 test acc 0.630457344890511


  0%|          | 0/2190 [00:00<?, ?it/s]

epoch 3 batch id 1 loss 0.7131105065345764 train acc 0.78125
epoch 3 batch id 201 loss 0.7842753529548645 train acc 0.6666666666666666
epoch 3 batch id 401 loss 0.8847904205322266 train acc 0.6661860972568578
epoch 3 batch id 601 loss 1.127920150756836 train acc 0.6675332778702163
epoch 3 batch id 801 loss 0.8864397406578064 train acc 0.6731039325842697
epoch 3 batch id 1001 loss 0.9314820170402527 train acc 0.6770417082917083
epoch 3 batch id 1201 loss 1.0059678554534912 train acc 0.6812161740216486
epoch 3 batch id 1401 loss 0.6102193593978882 train acc 0.6843772305496074
epoch 3 batch id 1601 loss 0.6924132108688354 train acc 0.6875097595252967
epoch 3 batch id 1801 loss 0.7581818699836731 train acc 0.690545183231538
epoch 3 batch id 2001 loss 0.7371646761894226 train acc 0.6931456146926537
epoch 3 train acc 0.6945999809741248


  0%|          | 0/548 [00:00<?, ?it/s]

epoch 3 test acc 0.635036496350365


  0%|          | 0/2190 [00:00<?, ?it/s]

epoch 4 batch id 1 loss 0.6225563287734985 train acc 0.765625
epoch 4 batch id 201 loss 0.5908764600753784 train acc 0.7240360696517413
epoch 4 batch id 401 loss 0.7010707855224609 train acc 0.7238154613466334
epoch 4 batch id 601 loss 0.9551364779472351 train acc 0.7262635191347754
epoch 4 batch id 801 loss 0.7827761173248291 train acc 0.730727215980025
epoch 4 batch id 1001 loss 0.767422080039978 train acc 0.733735014985015
epoch 4 batch id 1201 loss 0.7898653745651245 train acc 0.7378226477935054
epoch 4 batch id 1401 loss 0.4631989300251007 train acc 0.7405201641684511
epoch 4 batch id 1601 loss 0.552342414855957 train acc 0.7429438632104934
epoch 4 batch id 1801 loss 0.6362242698669434 train acc 0.7446904497501388
epoch 4 batch id 2001 loss 0.5215290784835815 train acc 0.7463065342328835
epoch 4 train acc 0.7474200913242008


  0%|          | 0/548 [00:00<?, ?it/s]

epoch 4 test acc 0.6357835310218979


  0%|          | 0/2190 [00:00<?, ?it/s]

epoch 5 batch id 1 loss 0.5118855237960815 train acc 0.84375
epoch 5 batch id 201 loss 0.4731064438819885 train acc 0.7690453980099502
epoch 5 batch id 401 loss 0.6250147223472595 train acc 0.7676122194513716
epoch 5 batch id 601 loss 0.8610291481018066 train acc 0.7671589018302829
epoch 5 batch id 801 loss 0.6813979744911194 train acc 0.7715941011235955
epoch 5 batch id 1001 loss 0.5448724031448364 train acc 0.772977022977023
epoch 5 batch id 1201 loss 0.6338707208633423 train acc 0.7757207535387177
epoch 5 batch id 1401 loss 0.4146609604358673 train acc 0.7772350107066381
epoch 5 batch id 1601 loss 0.48305246233940125 train acc 0.778829637726421
epoch 5 batch id 1801 loss 0.6427874565124512 train acc 0.7798011521377013
epoch 5 batch id 2001 loss 0.45119863748550415 train acc 0.780250499750125
epoch 5 train acc 0.7805270167427703


  0%|          | 0/548 [00:00<?, ?it/s]

epoch 5 test acc 0.6345346715328467


In [17]:
!pwd

/content/drive/MyDrive/Colab Notebooks/Github/ufriends-chatbot


In [18]:
torch.save(model, f'Model/keywords-train-SentimentAnalysisKOBert.pt')
torch.save(model.state_dict(), f'Model/keywords-train-SentimentAnalysisKOBert_StateDict.pt')

In [None]:
# 대인관계, 대인관계(부부, 자녀) - 0 
# 진로, 취업, 직장, 업무 스트레스 - 1
# 연애, 결혼, 출산 - 2
# 건강, 죽음 - 3
# 재정, 은퇴, 노후준비 - 4
# 학업 및 진로 - 5
# 학교폭력, 따돌림 - 6
# 가족관계 - 7

In [19]:
sentence = '친구랑 싸워서 속상해'
predict(sentence)

0

In [20]:
sentence = '회사에서 해고 당했어'
predict(sentence)

1

In [21]:
sentence = '아이를 키우려면 돈이 많이 들어'
predict(sentence)

2

In [22]:
sentence = '비타민 챙겨 먹어야지'
predict(sentence)

3

In [23]:
sentence = '자식들과 여행하며 살고싶어'
predict(sentence)

4

In [24]:
sentence = '시험에서 떨어지면 어떡하지?'
predict(sentence)

5

In [25]:
sentence = '친구들을 괴롭히는 것은 나빠'
predict(sentence)

6

In [26]:
sentence = '부모님과 싸웠어'
predict(sentence)

7