In [1]:
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from datasets import Dataset

In [2]:
from sklearn.model_selection import train_test_split
# Load the question/answer spreadsheet (later cells read its 'Questions' and
# 'Answers' columns).
data = pd.read_excel('./dataset/alzhemer/translated_alzhemer.xlsx')

# 85% of the rows go to training; the remaining 15% is split in half into
# validation and test. The intermediate frame gets a real name because `_` is
# the variable IPython uses for the previous cell's output.
train, holdout = train_test_split(data, test_size=0.15, random_state=42, shuffle=True)
validation, test = train_test_split(holdout, test_size=0.5, random_state=42, shuffle=True)

In [None]:
# Write each split to its own CSV file (the DataFrame index is written too,
# since `to_csv` is called with its defaults).
for split_frame, csv_path in (
    (train, './model/save_data/data_train.csv'),
    (validation, './model/save_data/data_validation.csv'),
    (test, './model/save_data/data_test.csv'),
):
    split_frame.to_csv(csv_path)

In [None]:
import re

def clean_text(text):
    """Normalize one question/answer value before it is formatted for training.

    Keeps ASCII letters, digits, whitespace and the punctuation
    ``. , ! ? / : ; ( ) { } [ ]``; every other character is deleted. Runs of
    whitespace are then collapsed to a single space and both ends are trimmed.

    Args:
        text: The raw cell value. ``None`` and float NaN are treated as
            missing and returned unchanged so that a later ``dropna`` can
            still remove the row. Any other non-string value is converted
            with ``str`` before cleaning.

    Returns:
        The cleaned string, or the original missing value.
    """
    # re.sub raises TypeError on non-strings, so let missing values through
    # untouched (NaN is the only float that is not equal to itself).
    if text is None or (isinstance(text, float) and text != text):
        return text
    if not isinstance(text, str):
        text = str(text)

    # Remove characters outside the allowed set
    text = re.sub(r"[^a-zA-Z0-9.,!?/:;(){}\[\]\s]", "", text)
    # Collapse whitespace runs and trim the ends
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Clean the data: apply the same normalization to the question and answer
# columns of every split, so the test texts are formatted from the same kind
# of input as the training texts. Missing values are skipped
# (na_action='ignore') and stay NaN instead of being passed to clean_text.
for split_frame in (train, validation, test):
    for column in ('Questions', 'Answers'):
        split_frame[column] = split_frame[column].map(clean_text, na_action='ignore')

In [None]:
# Serialize every Q/A pair into a single training string of the form
#   "<bos> question <bot> answer <eos>"
train['text'] = "<bos> " + train['Questions'] + " <bot> " + train['Answers'] + " <eos>"
validation['text'] = "<bos> " + validation['Questions'] + " <bot> " + validation['Answers'] + " <eos>"
test['text'] = "<bos> " + test['Questions'] + " <bot> " + test['Answers'] + " <eos>"

# A missing question or answer makes the concatenated text NaN; drop those
# rows. Each split is filtered from itself so the test rows stay distinct
# from the validation rows.
train = train.dropna(subset=['text'])
validation = validation.dropna(subset=['text'])
test = test.dropna(subset=['text'])

# Only the formatted text column is handed to the `datasets` library.
train_dataset = Dataset.from_pandas(train[['text']])
validation_dataset = Dataset.from_pandas(validation[['text']])

In [None]:
# Hugging Face checkpoint used as the starting point for fine-tuning.
model_name = 'cahya/gpt2-small-indonesian-522M'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Register the markers used in the `text` template. The pad token is defined
# here as its own "<pad>" token; assigning pad_token_id = eos_token_id before
# this call would simply be overwritten by it, so no such assignment is made.
tokenizer.add_special_tokens({"pad_token": "<pad>",
                              "bos_token": "<bos>",
                              "eos_token": "<eos>"})
# "<bot>" separates the question from the answer; it is added as a regular
# (non-special) token.
tokenizer.add_tokens(["<bot>"])

# Resize the embedding matrix to the enlarged vocabulary so the new token ids
# have rows to look up.
model.resize_token_embeddings(len(tokenizer))

In [None]:
def add_labels(example):
    """Tokenize the ``text`` field of a batch of examples.

    Every sequence is truncated or padded to exactly 400 tokens. Despite the
    function's name, no ``labels`` entry is produced here: only the token ids
    and the attention mask are returned.

    Args:
        example: Mapping with a ``text`` entry (a list of strings when called
            through ``Dataset.map(..., batched=True)``).

    Returns:
        dict with the tokenizer's ``input_ids`` and ``attention_mask``.
    """
    encoded = tokenizer(
        example['text'],
        max_length=400,
        padding='max_length',
        truncation=True,
    )
    return {field: encoded[field] for field in ('input_ids', 'attention_mask')}

# Tokenize both splits in batched mode; training is mapped first, then validation.
tokenized_datasets_train, tokenized_datasets_val = (
    train_dataset.map(add_labels, batched=True),
    validation_dataset.map(add_labels, batched=True),
)

In [10]:
from torch.utils.data import DataLoader

def collate_fn(batch):
    """Stack tokenized examples into tensors and attach language-model labels.

    Args:
        batch: List of examples, each a mapping with ``input_ids`` and
            ``attention_mask`` lists; all examples must have the same length.

    Returns:
        dict with three integer tensors of shape ``(len(batch), seq_len)``:
        ``input_ids``, ``attention_mask`` and ``labels``. ``labels`` is a copy
        of ``input_ids`` in which every position whose attention mask is 0
        (padding) is set to -100, the index the loss ignores. Without
        ``labels`` the model returns no loss at all.
    """
    # Local import: this function is defined in a cell that runs before the
    # notebook's own `import torch`.
    import torch

    input_ids = torch.tensor([item['input_ids'] for item in batch])
    attention_mask = torch.tensor([item['attention_mask'] for item in batch])

    # The model shifts labels internally, so they are the inputs themselves,
    # minus the padded positions.
    labels = input_ids.clone()
    labels[attention_mask == 0] = -100

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels,
    }

# Both loaders yield mini-batches of two examples assembled by collate_fn;
# only the training loader shuffles.
loader_options = dict(batch_size=2, collate_fn=collate_fn)
train_dataloader = DataLoader(tokenized_datasets_train, shuffle=True, **loader_options)
val_dataloader = DataLoader(tokenized_datasets_val, **loader_options)


## Fine-Tuning with PyTorch

In [11]:
from torch.optim import AdamW
from transformers import get_scheduler

# The schedule spans five passes over the training loader: the learning rate
# starts at 2e-4 and decays linearly, with no warm-up steps.
num_training_steps = 5 * len(train_dataloader)

optimizer = AdamW(params=model.parameters(), lr=2e-4)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
import torch
from tqdm import tqdm

# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Run exactly as many epochs as the LR schedule and the progress bar were
# sized for. The linear schedule reaches a learning rate of zero after
# num_training_steps optimizer steps, so any further epoch would not update
# the weights.
steps_per_epoch = len(train_dataloader)
num_epochs = num_training_steps // steps_per_epoch if steps_per_epoch else 0

progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    for batch in train_dataloader:
        # The collated tensors are fresh per batch; moving them is enough.
        batch = {key: val.to(device) for key, val in batch.items()}

        outputs = model(**batch)
        loss = outputs.loss
        if loss is None:
            # The model only computes a loss when the batch contains `labels`.
            raise ValueError("model returned no loss; training batches must include 'labels'")

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        progress_bar.update(1)

    # ---- validation pass (no gradients needed) ----
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            batch = {key: val.to(device) for key, val in batch.items()}
            outputs = model(**batch)
            val_loss += outputs.loss.item()

    # Mean of the per-batch validation losses.
    avg_val_loss = val_loss / len(val_dataloader)
    print(f"Epoch {epoch + 1}: Validation Loss = {avg_val_loss}")

progress_bar.close()


In [None]:
# Store the fine-tuned weights and the extended tokenizer in the same
# directory so they can be reloaded together.
output_dir = './model/chatbot_gpt_2'
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)