In [None]:
# Install the third-party packages for this notebook via shell `pip` calls.
# NOTE(review): only transformers is version-pinned (3.0.2); consider pinning the
# remaining packages as well so a fresh run resolves to the same versions.
!pip install mxnet
!pip install gluonnlp pandas tqdm
!pip install sentencepiece
!pip install transformers==3.0.2
!pip install torch

In [None]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm, tqdm_notebook
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
import pandas as pd
import traceback

In [None]:
# Quietly (-q) upgrade pyproj to the latest available version.
# NOTE(review): pyproj is not referenced by any cell visible here — confirm it is still needed.
!pip install --upgrade -q pyproj

In [None]:
def mount_drive():
    """Mount Google Drive at /content/gdrive and switch into the MyDrive/NLP folder.

    Relies on the `google.colab` package and on the IPython `%cd` line magic, so it
    only works when executed by an IPython kernel that provides both.
    """
    # Imported inside the function, so the google.colab dependency is only needed
    # when the function is actually called.
    from google.colab import drive
    drive.mount('/content/gdrive')
    # Change the working directory so that relative paths such as
    # 'data/chat_data.xlsx' resolve inside the NLP folder.
    %cd /content/gdrive/MyDrive/NLP

In [None]:
# Mount the drive only when this notebook is executed directly (not when imported).
if __name__ == '__main__':
  mount_drive()

In [None]:
# Load the raw chat dataset and preview its first rows; runs only when the
# notebook is executed directly (not when imported).
if __name__ == '__main__':
  # Relative path: resolved against the current working directory.
  whole_dataset = pd.read_excel('data/chat_data.xlsx')
  # An expression nested inside `if` is not the cell's last top-level expression,
  # so Jupyter would silently drop it; display() renders the preview explicitly.
  # `display` is made available by the IPython kernel without an import.
  display(whole_dataset.head())

In [None]:
def data_processing(raw_data):
    """Turn the raw chat data into a two-column (sentence, label) table.

    This is an exercise skeleton: the two steps marked TODO below are left blank.
    NOTE(review): until they are filled in, `processed_data` is never assigned
    inside this function, so calling it fails with NameError (unless a global of
    that name happens to exist).

    Args:
        raw_data: the raw dataset. The instructions below imply it has an
            'Emotion' column holding emotion names — TODO confirm against the data.

    Returns:
        `processed_data` with its columns renamed to ['sentence', 'label'].
    """

    # Label encoding:
    #   0: '공포' (fear), 1: '놀람' (surprise), 2: '분노' (anger), 3: '슬픔' (sadness),
    #   4: '중립' (neutral), 5: '행복' (happiness), 6: '혐오' (disgust)
    # Convert each string in the 'Emotion' column to its corresponding integer.
    # Hint: use `.loc` to perform the replacement.
    ## TODO: write code here

    # Use pandas `concat` to join the 'document' data with the 'label' data.
    # Name the joined result `processed_data`.
    ## TODO: write code here

    # Normalise the column names expected by the downstream tokenization step.
    processed_data.columns = ['sentence', 'label']

    return processed_data

In [None]:
def data_to_token_ids(tokenizer, single_sentence, max_len=128):
    """Encode one sentence as a fixed-length array of token ids.

    The sentence is wrapped as "[CLS] <sentence> [SEP]", tokenized, converted to
    ids, and then right-padded with zeros (or shortened) to exactly `max_len`.

    Args:
        tokenizer: object providing `tokenize(text)` and
            `convert_tokens_to_ids(tokens)`.
        single_sentence: the text to encode; it is passed through `str()`, so
            non-string values are accepted.
        max_len: length of the returned array. Defaults to 128, the value that
            was previously hard-coded.

    Returns:
        A flattened 1-D array of `max_len` token ids.
    """
    # Mark the start and end of the sentence with the special tokens.
    special_token_added = "[CLS] " + str(single_sentence) + " [SEP]"

    tokenized_text = tokenizer.tokenize(special_token_added)

    # Plain post-truncation would cut off the closing [SEP] of an over-long
    # sentence; shorten the middle instead and keep the final token.
    # Assumes the tokenizer emits "[SEP]" as a single token — TODO confirm.
    if len(tokenized_text) > max_len:
        tokenized_text = tokenized_text[:max_len - 1] + tokenized_text[-1:]

    # pad_sequences expects a batch, hence the one-element outer list.
    token_ids = [tokenizer.convert_tokens_to_ids(tokenized_text)]

    # Pad on the right up to max_len.
    token_ids_padded = pad_sequences(token_ids, maxlen=max_len, dtype="long", truncating="post", padding="post")

    # Drop the batch dimension again.
    return token_ids_padded.flatten()

In [None]:
def token_ids_to_mask(token_ids):
    """Build an attention mask for a sequence of token ids.

    Every id greater than 0 is marked 1.0; everything else is marked 0.0.

    Returns:
        A list of floats with one entry per element of `token_ids`.
    """
    attention = []
    for token_id in token_ids:
        attention.append(1.0 if token_id > 0 else 0.0)
    return attention

In [None]:
def tokenize_processed_data(tokenizer, processed_dataset):
    """Tokenize every sentence and collect labels and attention masks.

    Args:
        tokenizer: tokenizer forwarded to `data_to_token_ids`.
        processed_dataset: table with a 'sentence' column and a 'label' column.

    Returns:
        A tuple `(tokenized_data, labels, attention_masks)`:
        `tokenized_data` is a list with one token-id array per sentence,
        `labels` is an int64 NumPy array, and `attention_masks` is a list with
        one mask per sentence.
    """
    # `np.int` (an alias of the builtin int) was removed in NumPy 1.24 and raises
    # AttributeError there; use an explicit fixed-width integer dtype instead.
    labels = processed_dataset['label'].to_numpy().astype(np.int64)

    # Convert each sentence into its token-id array.
    tokenized_data = [data_to_token_ids(tokenizer, sentence) for sentence in processed_dataset['sentence']]

    # Derive one mask per token-id array.
    attention_masks = [token_ids_to_mask(token_ids) for token_ids in tokenized_data]

    return tokenized_data, labels, attention_masks

In [None]:
def split_into_train_validation(whole_data, whole_label, whole_masks):
    """Split inputs, labels and masks into training and validation parts.

    10% of the samples are held out for validation, using a fixed
    `random_state` of 2022.

    Args:
        whole_data: token-id sequences, one per sample.
        whole_label: labels, one per sample.
        whole_masks: attention masks, one per sample.

    Returns:
        A tuple `(train_inputs, validation_inputs, train_labels,
        validation_labels, train_masks, validation_masks)`.
    """
    print("length of whole_data : " + str(len(whole_data)))

    # Splitting all three collections in a single call applies the same sample
    # selection to each of them, so inputs, labels and masks stay aligned.
    (train_inputs, validation_inputs,
     train_labels, validation_labels,
     train_masks, validation_masks) = train_test_split(whole_data,
                                                       whole_label,
                                                       whole_masks,
                                                       random_state=2022,
                                                       test_size=0.1)

    # Previously referenced the undefined name `train_data` (NameError).
    print("length of train_data : " + str(len(train_inputs)))

    return train_inputs, validation_inputs, train_labels, validation_labels, train_masks, validation_masks

In [None]:
def data_to_tensor(inputs, labels, masks):
    """Convert inputs, labels and masks into torch tensors.

    Returns:
        A tuple `(inputs_tensor, labels_tensor, masks_tensor)`, in the same
        order as the arguments.
    """
    return tuple(torch.tensor(part) for part in (inputs, labels, masks))

In [None]:
def tensor_to_dataloader(inputs, labels, masks, mode, batch_size=32):
    """Wrap the tensors in a DataLoader.

    Args:
        inputs: tensor of token ids.
        labels: tensor of labels.
        masks: tensor of attention masks.
        mode: "train" selects random sampling; any other value selects
            sequential sampling.
        batch_size: number of samples per batch. Defaults to 32, the value that
            was previously hard-coded.

    Returns:
        A DataLoader whose batches are ordered `(inputs, masks, labels)`.
    """
    from torch.utils.data import RandomSampler, SequentialSampler

    # Note the dataset order: masks come before labels.
    data = TensorDataset(inputs, masks, labels)

    # Shuffle while training; keep the original order otherwise.
    sampler = RandomSampler(data) if mode == "train" else SequentialSampler(data)

    return DataLoader(data, sampler=sampler, batch_size=batch_size)

In [None]:
def preproc(tokenizer, whole_dataset):
    """Run the whole preprocessing pipeline on the raw dataset.

    Steps: clean the raw table, tokenize it, split it into training and
    validation parts, convert each part to tensors, and wrap each in a
    DataLoader.

    Returns:
        A tuple `(train_dataloader, validation_dataloader)`.
    """
    # Clean the raw table.
    cleaned = data_processing(whole_dataset)

    # Token ids, labels and attention masks for every sentence.
    token_ids, label_array, mask_list = tokenize_processed_data(tokenizer, cleaned)

    # Hold out a validation portion.
    parts = split_into_train_validation(token_ids, label_array, mask_list)
    tr_inputs, va_inputs, tr_labels, va_labels, tr_masks, va_masks = parts

    # Tensors for each portion, each in (inputs, labels, masks) order.
    train_tensors = data_to_tensor(tr_inputs, tr_labels, tr_masks)
    valid_tensors = data_to_tensor(va_inputs, va_labels, va_masks)

    # One DataLoader per portion.
    train_dataloader = tensor_to_dataloader(*train_tensors, "train")
    validation_dataloader = tensor_to_dataloader(*valid_tensors, "validation")

    return train_dataloader, validation_dataloader

In [None]:
def main():
    """Load the chat dataset, create the tokenizer and build the data loaders.

    Returns:
        A tuple `(train, valid)` holding the training and validation
        DataLoaders produced by `preproc`. Earlier versions built them and then
        discarded them; callers that ignore the return value are unaffected.
    """
    # Not installed by the pip cells of this notebook — presumably a module that
    # lives in the working directory; verify before running elsewhere.
    from nlp_tokenization import KoBertTokenizer

    # Load the full dataset.
    whole_dataset = pd.read_excel('/content/gdrive/MyDrive/NLP/data/chat_data.xlsx')

    # Load the KoBERT tokenizer.
    tokenizer = KoBertTokenizer.from_pretrained("monologg/kobert")

    train, valid = preproc(tokenizer, whole_dataset)

    return train, valid

In [None]:
# Run the pipeline only when this notebook is executed directly (not when imported).
if __name__ == '__main__':
    main()