In [None]:
import transformers
import torch
import pandas as pd
import numpy as np
import os
from transformers import AutoModelForCausalLM, AutoTokenizer, AdamW, BitsAndBytesConfig
from tqdm.notebook import tqdm

# Select the compute device: the GPU when CUDA is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [None]:
!pip install torch torchvision torchaudio
!pip install datasets



# 1. 데이터 수집 및 준비

In [None]:
# Read the training and validation CSVs, discarding the first column of each
# by position, then preview the first training rows.
train, val = (
    pd.read_csv(csv_path).iloc[:, 1:]
    for csv_path in ('./df.csv', './df_val.csv')
)
train.head()

Unnamed: 0,speaker_id,form,standard_form,dialect_form,isDialect
0,2.0,난 진짜 벌초 할때 뱀나오잖아 뱀도,난 진짜 벌초 할때 뱀나오잖아 뱀도,난 진짜 벌초 할때 뱀나오잖아 뱀도,False
1,1.0,어어 그니까,어어 그니까,어어 그니까,False
2,2.0,아 잘도,아 잘도,아 잘도,False
3,1.0,예초기 뱀도 짤려,예초기 뱀도 짤려,예초기 뱀도 짤려,False
4,2.0,어 그니까 완전 끔찍해,어 그니까 완전 끔찍해,어 그니까 완전 끔찍해,False


In [None]:
# Summarize column dtypes and non-null counts to spot missing values.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159636 entries, 0 to 159635
Data columns (total 5 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   speaker_id     159633 non-null  float64
 1   form           159636 non-null  object 
 2   standard_form  157798 non-null  object 
 3   dialect_form   157800 non-null  object 
 4   isDialect      159636 non-null  bool   
dtypes: bool(1), float64(1), object(3)
memory usage: 5.0+ MB


In [None]:
# Drop rows where either text column is missing: 'dialect_form' and 'standard_form'
# are both handed to the tokenizer later, so neither may be NaN.
# The validation split goes through the same dataset class, so it is cleaned the
# same way instead of being left unfiltered.
text_columns = ['dialect_form', 'standard_form']
train = train.dropna(subset=text_columns)
val = val.dropna(subset=text_columns)

In [None]:
# speaker_id is not used for model training, so its missing values are left as they are.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 157798 entries, 0 to 159635
Data columns (total 5 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   speaker_id     157795 non-null  float64
 1   form           157798 non-null  object 
 2   standard_form  157798 non-null  object 
 3   dialect_form   157798 non-null  object 
 4   isDialect      157798 non-null  bool   
dtypes: bool(1), float64(1), object(3)
memory usage: 6.2+ MB


# 2. 모델 선택 및 수정
- 모델은 KoBART 모델을 사용하고 싶음
- 한국어로 학습되어 있고 무게가 그렇게 무겁지 않아 사용하기에 적절하다고 판단
- 모델 ID (Hugging Face Hub): gogamza/kobart-base-v2 (https://huggingface.co/gogamza/kobart-base-v2)

In [None]:
from transformers import PreTrainedTokenizerFast, BartForConditionalGeneration

# Load the tokenizer and the model from the same pretrained checkpoint.
KOBART_CHECKPOINT = "gogamza/kobart-base-v2"
tokenizer = PreTrainedTokenizerFast.from_pretrained(KOBART_CHECKPOINT)
model = BartForConditionalGeneration.from_pretrained(KOBART_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


# 3. 모델 학습 및 fine-tuning

In [None]:
# Uncomment the two lines below to upgrade accelerate / transformers in this environment.
#! pip install -U accelerate
#! pip install -U transformers

import accelerate
import transformers

# Display the installed versions of both libraries as the cell output.
transformers.__version__, accelerate.__version__

('4.38.2', '0.28.0')

In [None]:
from transformers import Trainer, TrainingArguments

# Configuration for the fine-tuning run.
training_args = TrainingArguments(
    # Where model checkpoints and logs are written, and how often to log.
    output_dir='../chatbot/',
    logging_dir='./logs',
    logging_steps=10,
    # Three epochs, one example per device per step for both training and evaluation.
    num_train_epochs=3,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    # Learning-rate warmup length and weight-decay strength.
    warmup_steps=500,
    weight_decay=0.01,
)


In [None]:
# Implement a subclass of `Dataset` to use with `Trainer`
from torch.utils.data import Dataset

class TranslationDataset(Dataset):
    """Map-style dataset pairing dialect sentences with their standard form.

    Each item tokenizes the row's 'dialect_form' as the model input and its
    'standard_form' as the target, both padded/truncated to ``max_length``.

    Args:
        tokenizer: Callable tokenizer that returns "input_ids" and
            "attention_mask" tensors and exposes ``pad_token_id``.
        data: pandas DataFrame with 'dialect_form' and 'standard_form' columns.
            Values are passed to the tokenizer unchanged, so rows with missing
            text should be removed beforehand.
        max_length: Fixed sequence length used for padding and truncation.
    """

    # Label value skipped by the cross-entropy loss used in Hugging Face seq2seq models.
    IGNORE_INDEX = -100

    def __init__(self, tokenizer, data, max_length=128):
        self.tokenizer = tokenizer
        self.data = data
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows (sentence pairs) in the dataset."""
        return len(self.data)

    def _encode(self, text):
        """Tokenize one sentence into fixed-length tensors with a leading batch axis of 1."""
        return self.tokenizer(
            text,
            return_tensors="pt",
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
        )

    def __getitem__(self, idx):
        """Return the ``input_ids``, ``attention_mask`` and ``labels`` tensors for row ``idx``."""
        row = self.data.iloc[idx]  # fetch the row once instead of once per column
        source = self._encode(row['dialect_form'])
        target = self._encode(row['standard_form'])

        # squeeze(0) removes only the batch axis added by return_tensors="pt";
        # a bare squeeze() would also collapse the sequence axis when max_length == 1.
        labels = target["input_ids"].squeeze(0).clone()

        # Padding positions must not contribute to the loss, so mark them as ignored.
        pad_id = getattr(self.tokenizer, "pad_token_id", None)
        if pad_id is not None:
            labels[labels == pad_id] = self.IGNORE_INDEX

        return {
            "input_ids": source["input_ids"].squeeze(0),
            # Without the mask the encoder would attend to the padding tokens.
            "attention_mask": source["attention_mask"].squeeze(0),
            "labels": labels,
        }

# Wrap the training and validation frames so the Trainer can index into them.
train_dataset, val_dataset = (
    TranslationDataset(tokenizer, frame) for frame in (train, val)
)

# Hand the model, the training arguments and both datasets to the Trainer.
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=val_dataset)

# Run fine-tuning.
trainer.train()


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
10,12.3587
20,9.8127
30,7.8248
40,6.2861
50,5.2808
60,4.5275
70,4.3151
80,3.9711
90,3.5588
100,3.2998


Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}
