In [1]:
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)
from tokenizers import Tokenizer
from typing import Dict, List, Optional
from torch.utils.data import Dataset
import torch

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random


from IPython.display import display
from typing import Dict

import datetime

In [2]:
Style_Data = pd.read_csv("./Style_Dataset/style_dataset.csv", sep="\t")
# display(df.head())
# display(df.isna().mean())
# display(df.describe())
# print(df.shape)

In [3]:
row_notna_count = Style_Data.notna().sum(axis=1)
# row_notna_count.plot.hist(bins=row_notna_count.max())
plt.show()

Style_Data = Style_Data[row_notna_count >= 2]
print(len(Style_Data))

3470


In [4]:
BOS = "</s>"
EOS = "</s>"
PAD = "<pad>"
MASK = "<unused0>"
Q_TKN = "<unused1>"
S_TKN = {"formal": "<unused10>", 
         "informal":"<unused11>",
         "android": "<unused12>",
         "azae": "<unused13>",
         "chat": "<unused14>",
         "choding": "<unused15>",
         "emoticon": "<unused16>",
         "enfp": "<unused17>",
         "gentle": "<unused18>", 
         "halbae": "<unused19>",
         "halmae": 	"<unused20>",
         "joongding": "<unused21>",
         "king": "<unused22>",
         "naruto": "<unused23>",
         "seonbi": "<unused24>",
         "sosim": "<unused25>",
         "translator": "<unused26>"
}
A_TKN = "<unused3>"
SENT = "<unused4>"

styles = ["formal",
          "informal",
          "android",
          "choding",
          "emoticon",
          "king",
          "naruto",
          "seonbi"
]

all_styles = ["formal",
            "informal",
            "android",
            "azae",
            "chat",
            "choding",
            "emoticon",
            "enfp",
            "gentle",
            "halbae",
            "halmae",
            "joongding",
            "king",
            "naruto",
            "seonbi",
            "sosim",
            "translator"
]

style_map = {
    'formal': '문어체',
    'informal': '구어체',
    'android': '안드로이드',
    'azae': '아재',
    'chat': '채팅',
    'choding': '초등학생',
    'emoticon': '이모티콘',
    'enfp': 'enfp',
    'gentle': '신사',
    'halbae': '할아버지',
    'halmae': '할머니',
    'joongding': '중학생',
    'king': '왕',
    'naruto': '나루토',
    'seonbi': '선비',
    'sosim': '소심한',
    'translator': '번역기'
}

model_name = "gogamza/kobart-base-v2"

tokenizer = AutoTokenizer.from_pretrained(model_name, 
   bos_token=BOS, 
   eos_token=EOS, 
   unk_token="<unk>", 
   pad_token=PAD, 
   mask_token=MASK,)

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


In [5]:
def generate_text1(df: pd.DataFrame, index):
    text = ""
    while (text == ""):
        idx = 1
        randnum = random.random()
        if(randnum > 0.8):
            idx = 0
        text = df.iloc[index, :][idx]
    return text

def generate_text2(df: pd.DataFrame, index):
    text = ""
    while (text == ""):
        randnum = random.randint(2, len(styles) - 1)
        target_style = styles[randnum]
        text = df.iloc[index, :][target_style]
    return text, target_style

In [6]:
class TextStyleTransferDataset(Dataset):
  def __init__(self, 
               df: pd.DataFrame, 
               tokenizer: Tokenizer
               ):
    self.df = df
    self.tokenizer = tokenizer
    
  def __len__(self):
    return len(self.df)

  def __getitem__(self, index):
    row = self.df.iloc[index, :].dropna().sample(2)
    text1 = row[0]
    text2 = row[1]
    target_style = row.index[1]
    # text1 = generate_text1(self.df, index)
    # text2, target_style = generate_text2(self.df, index)

    encoder_text = f"{S_TKN[target_style]} translate: {text1}"
    decoder_text = f"{text2}{self.tokenizer.eos_token}"
    model_inputs = self.tokenizer(encoder_text, max_length=64, truncation=True)
    with self.tokenizer.as_target_tokenizer():
      labels = self.tokenizer(decoder_text, max_length=64, truncation=True)
    model_inputs['labels'] = labels['input_ids']
    del model_inputs['token_type_ids']

    return model_inputs
  
  

In [7]:
dataset = TextStyleTransferDataset(Style_Data, tokenizer)
out = dataset[0]
print(out['input_ids'])
print(out['labels'])
print(tokenizer.decode(out['input_ids']))
print(tokenizer.decode(out['labels']))

out = dataset[1]
print(out['input_ids'])
print(out['labels'])
print(tokenizer.decode(out['input_ids']))
print(tokenizer.decode(out['labels']))

[27, 18090, 313, 15195, 314, 307, 22409, 257, 22465, 11699, 9592, 325, 232, 14054, 17849, 12034, 14195, 10496, 24665, 14947, 11914, 9754, 262, 18579, 25442, 12034, 12348, 325]
[14042, 11986, 14044, 10834, 9042, 1700, 9147, 12034, 9615, 14195, 26832, 22679, 14304, 26472, 14674, 9059, 9567, 1]
<unused20> translate: 안녕안녕~! 나 고양이 6마리나 키운다? 완전 대박이징~
하유 시벌것 괭이놈 6마리 키우는데 힘들어 죽겟네</s>
[32, 18090, 313, 15195, 314, 307, 22409, 257, 17849, 15188, 14195, 10496, 24665, 262, 25144, 9031, 15994, 12332, 14449, 8981, 262]
[17849, 12034, 14176, 253, 10496, 24665, 262, 1700, 1275, 25144, 9034, 20604, 14105, 13848, 17714, 14176, 262, 1]
<unused25> translate: 고양이를 6마리나? 키우는거 힘들지 않는가?
고양이..6마리나? ᅲ 키우는건 혹시 안힘들어..?</s>




In [8]:
from sklearn.model_selection import train_test_split

# 학습을 위해 train, test set으로 나눈다.

df_train, df_test = train_test_split(Style_Data, test_size=0.1, random_state=42)
df_train = pd.concat([df_train, df_train, df_train])
print(len(df_train), len(df_test))

9369 347


In [11]:
train_dataset = TextStyleTransferDataset(
    df_train,
    tokenizer
)
test_dataset = TextStyleTransferDataset(
    df_test,
    tokenizer
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print('Device:', device)
# if(device != "cpu"):
#     print('Current cuda device:', torch.cuda.current_device())
#     print('Count of using GPUs:', torch.cuda.device_count())

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

model = model.to(device)

data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer, model=model
)

Device: cpu


You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


In [None]:
model_path = "./text-transfer_1128" + datetime.datetime.now().strftime("%m월 %d일 %H시 %M분")

training_args = Seq2SeqTrainingArguments(
    output_dir=model_path, #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=24, # number of training epochs
    per_device_train_batch_size=16, # batch size for training
    per_device_eval_batch_size=16,  # batch size for evaluation
    eval_steps=500, # Number of update steps between two evaluations.
    save_steps=1000, # after # steps model is saved 
    warmup_steps=300,# number of warmup steps for learning rate scheduler
    prediction_loss_only=True,
    evaluation_strategy="steps",
    save_total_limit=3
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
trainer.train()

***** Running training *****
  Num examples = 9369
  Num Epochs = 24
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 14064
  Number of trainable parameters = 123859968



[A[A[AYou're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.



[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A[E thread_pool.cpp:113] Exception in thread pool task: mutex lock failed: Invalid argument
[E thread_pool.cpp:113] Exception in thread pool task: mutex lock failed: Invalid argument
[E thread_pool.cpp:113] Exception in thread pool task: mutex lock failed: Invalid argument


KeyboardInterrupt: 

In [None]:
trainer.save_model()

Saving model checkpoint to ./text-transfer_112812월 04일 16시 04분
Configuration saved in ./text-transfer_112812월 04일 16시 04분/config.json
Model weights saved in ./text-transfer_112812월 04일 16시 04분/pytorch_model.bin


In [None]:
from transformers import pipeline

selected_model_path = "text-transfer_112812월 02일 22시 10분"

nlg_pipeline = pipeline('text2text-generation',model=model_path, tokenizer=model_name)

loading configuration file ./text-transfer_112812월 04일 16시 04분/config.json
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
Model config BartConfig {
  "_name_or_path": "./text-transfer_112812\uc6d4 04\uc77c 16\uc2dc 04\ubd84",
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "author": "Heewon Jeon(madjakarta@gmail.com)",
  "bos_token_id": 1,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.1,
  "d_model": 768,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 3072,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 1,
  "do_blenderbot_90_layernorm": false,
  "dropout": 0.1,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 3072,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "e

In [None]:
def generate_text(pipe, text, target_style, num_return_sequences=5, max_length=60):
  text = f"{S_TKN[target_style]} translate: {text}"
  out = pipe(text, num_return_sequences=num_return_sequences, max_length=max_length)
  return [x['generated_text'] for x in out]

In [None]:
src_text = "말투가 바뀌긴 하는데 성능이 그렇게 좋은지는 잘 모르겠다 이거 어쩌지"

# print(generate_text(nlg_pipeline, src_text, "chat", num_return_sequences=1, max_length=1000))
print("input : ", src_text)
for style in styles[2:]:
  print(style, generate_text(nlg_pipeline, src_text, style, num_return_sequences=1, max_length=1000)[0])

input :  말투가 바뀌긴 하는데 성능이 그렇게 좋은지는 잘 모르겠다 이거 어쩌지
android 말투. 변화. 그러나. 성능. 좋음. 이유. 모름.
choding 말투 바뀌긴 하는데 성능이 그렇게 좋은지는 잘 모름
emoticon 말투 바뀌긴 하는데 성능이 그렇게 좋은지는 모르겠어...(; ́д`)ゞ
king 말투가 바뀌었는디, 성능이 그렇게 좋은지는 잘 모르겠소.
naruto 말투가 바뀌긴 하는데 성능이 그렇게 좋은지는 모르겠다니깐!
seonbi 말투가 바뀌긴 하는데 성능이 그렇게 좋은지는 잘 모르겠소!


In [None]:
style = "choding"

while(1):
    src_text = input()
    print(style, generate_text(nlg_pipeline, src_text, style, num_return_sequences=1, max_length=1000)[0])


choding 회사 출근 8시?
choding 회사 출근은 언제?
choding 회사 출근은 8시임
choding 회사 출근 8시임
choding 회사 출근 7번임
choding 회사 출근 7시임
choding 회사 출근은 8시임
choding ᄋᄋ 그래서 일찍 일어나
choding 8시 30분쯤 일어나야 쌉가능
choding 나 약속 8시
choding ᄋᄋ 재택 근무 중
choding ᄅᄋ? 그럼 내가 먼저 일어나서 밥 먹고 출발해야겠다
choding 재택 근무하면 뭐가 좋음?
choding 아빠는 잘 모르는데
choding 아빠는 누나랑 잘 싸움
choding ᄋᄋ 오빠는 얼마 안 벌고 있음
choding 한달에 쯤은 셤
choding ᄋᄋᄋᄋᄋᄋᄋᄋᄋᄋᄋᄋᄋᄋᄋᄋᄋᄋᄋᄋᄋᄋᄋ
choding 공부원 중임
choding ᄋᄋ
choding 난 공무원임
choding 담주 화욜에 셤
choding 커닝시티?
choding 난 그거 안해봄
choding 그거 게임으로 돈 벌 수 있음
choding 그거 넷플에서 하는 영화임
choding 디즈니 스프린터에도 있음?
choding 그래야겠다
choding ᄋᄋ 밥 먹고 영화보면서 팝콘 먹자


KeyboardInterrupt: Interrupted by user