In [1]:
import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

In [5]:
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from datasets import Dataset

In [6]:
from sklearn.model_selection import train_test_split
data = pd.read_excel('./dataset/alzhemer/translated_alzhemer.xlsx')

train, _ = train_test_split(data, test_size=0.15,random_state=42,shuffle=True)
validation, test = train_test_split(_, test_size=0.5,random_state=42,shuffle=True)

In [7]:
print(len(train))
print(len(validation))
print(len(test))

15300
1350
1350


In [4]:
train.to_csv('./model/save_data/data_train.csv')
validation.to_csv('./model/save_data/data_validation.csv')
test.to_csv('./model/save_data/data_test.csv')

In [8]:
import re

def clean_text(text):
    # Menghapus karakter yang tidak diperlukan
    text = re.sub(r"[^a-zA-Z0-9.,!?/:;(){}\[\]\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

train['Questions'] = train['Questions'].apply(clean_text)
train['Answers'] = train['Answers'].apply(clean_text)
validation['Questions'] = validation['Questions'].apply(clean_text)
validation['Answers'] = validation['Answers'].apply(clean_text)

In [9]:
train['text'] = "<bos> " + train['Questions'] + " <bot> " + train['Answers'] + " <eos>"
validation['text'] = "<bos> " + validation['Questions'] + " <bot> " + validation['Answers'] + " <eos>"
test['text'] = "<bos> " + test['Questions'] + " <bot> " + test['Answers'] + " <eos>"

train = train.dropna(subset=['text'])
validation = validation.dropna(subset=['text'])
test = validation.dropna(subset=['text'])

train_dataset = Dataset.from_pandas(train[['text']])
validation_dataset = Dataset.from_pandas(validation[['text']])

In [11]:
train['text_length'] = train['text'].str.len()
validation['text_length'] = validation['text'].str.len()

print("Panjang maksimum pada dataset train:", train['text_length'].mean())
print("Panjang maksimum pada dataset validation:", validation['text_length'].mean())

Panjang maksimum pada dataset train: 401.817908496732
Panjang maksimum pada dataset validation: 399.8103703703704


In [12]:
model_name = 'cahya/gpt2-small-indonesian-522M'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

tokenizer.pad_token_id = tokenizer.eos_token_id

tokenizer.add_special_tokens({"pad_token": "<pad>", 
                                "bos_token": "<bos>",
                                "eos_token": "<eos>"})
tokenizer.add_tokens(["<bot>"])

model.resize_token_embeddings(len(tokenizer))

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Embedding(50261, 768)

In [13]:
def add_labels(example):
    tokens = tokenizer(
        example['text'],
        truncation=True,
        padding='max_length',
        max_length=400
    )
    return {
        'input_ids': tokens['input_ids'],
        'attention_mask': tokens['attention_mask'],
    }

tokenized_datasets_train = train_dataset.map(add_labels, batched = True)
tokenized_datasets_val = validation_dataset.map(add_labels, batched = True)

Map:   0%|          | 0/15300 [00:00<?, ? examples/s]

Map:   0%|          | 0/1350 [00:00<?, ? examples/s]

In [14]:
from transformers import DataCollatorForLanguageModeling

def load_data_collator(tokenizer, mlm = False):
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, 
        mlm=mlm,
    )
    return data_collator

data_collator = load_data_collator(tokenizer)

In [10]:
training_args = TrainingArguments(
        output_dir='/kaggle/working/mental_health_gpt_1_f',
        overwrite_output_dir=True,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=10,
        eval_strategy="epoch",
        eval_steps=500,
        save_total_limit=2,
        fp16=True,
        logging_dir="./logs",
        logging_steps=100,
        save_steps=500,
        learning_rate=2e-4,
        weight_decay=0.001,
        report_to=[],
    )

In [None]:
trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=tokenized_datasets_train,
        eval_dataset=tokenized_datasets_val
    )

In [None]:
train = trainer.train()

In [None]:
model.save_pretrained('./model/chatbot_gpt_2')
tokenizer.save_pretrained('./model/chatbot_gpt_2')

In [15]:
# Function to generate a response
def generate_response(user_input):
    # Format the input with special tokens
    input_text = f"<bos> {user_input} <bot>"
    input_ids = tokenizer.encode(input_text, return_tensors="pt")

    input_ids = input_ids.to(model.device)
    
    output_ids = model.generate(
        input_ids=input_ids,
        do_sample=True,
        max_length=512,
        top_k=10,
        top_p=0.95,
        num_return_sequences=1,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    
    response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    
    if "<bot>" in response:
        response = response.split("<bot>")[-1].strip()
    return response

user_input = "Bagaimana pola makan memengaruhi kesehatan pembuluh darah pada pasien Alzheimer?"
print(user_input)
response = generate_response(user_input)
print(f"Chatbot: {response}")

Bagaimana pola makan memengaruhi kesehatan pembuluh darah pada pasien Alzheimer?
Chatbot: Pola makan yang sehat dapat mengurangi risiko pembuluh darah, yang terkait dengan meningkatnya risiko penyakit jantung dan penyakit Alzheimer.


# Try

In [3]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

model_path = './model/chatbot_gpt_2'
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

tokenizer.pad_token = "<pad>"
tokenizer.bos_token = "<bos>"
tokenizer.eos_token = "<eos>"

model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50261, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50261, bias=False)
)

In [4]:
def generate_response(user_input):
    input_text = f"<bos> {user_input} <bot>"
    input_ids = tokenizer.encode(input_text, return_tensors="pt")

    input_ids = input_ids.to(model.device)
    
    output_ids = model.generate(
        input_ids=input_ids,
        do_sample=True,
        max_length=512,
        top_k=10,
        top_p=0.95,
        num_return_sequences=1,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    
    response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    
    if "<bot>" in response:
        response = response.split("<bot>")[-1].strip()
    return response

user_input = "Apa hubungan antara penyakit Alzheimer dan hilangnya minat dalam beraktivitas?"
print(user_input)
response = generate_response(user_input)
print(f"Chatbot: {response}")

Apa hubungan antara penyakit Alzheimer dan hilangnya minat dalam beraktivitas?
Chatbot: Penyakit Alzheimer dapat memengaruhi jalur kognisi, jalur saraf yang penting untuk mengirimkan energi dan aktivitas di otak. Disregulasi dalam kognisi ini, yang umum terjadi pada tahap awal penyakit, termasuk kesulitan dalam memperhatikan dan memahami hubungan spasial antara neuron dan jalur saraf.


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [11]:
while True:
    user_input = input("You: ")
    if user_input.lower() in ["exit", "quit"]:
        print("Chatbot: Sampai jumpa!")
        break
    response = generate_response(user_input)
    print(f"You: {user_input}")
    print(f"Bot: {response}")

You: Bisakah yoga dan meditasi membantu mengurangi risiko Alzheimer setelah cedera kepala?
Bot: Ya, yoga dan praktik meditasi dapat membantu mengurangi stres, meningkatkan kesejahteraan emosional, dan meningkatkan fungsi kognitif, sehingga berpotensi mengurangi risiko Alzheimer setelah cedera kepala.
You: Bagaimana tekanan darah tinggi memengaruhi produksi faktor pertumbuhan di otak, dan dapatkah perubahan kadar faktor pertumbuhan memengaruhi risiko Alzheimer?
Bot: Tekanan darah tinggi dapat memengaruhi produksi faktor pertumbuhan di otak, yang berpotensi memengaruhi risiko Alzheimer. Kontrol tekanan darah sangat penting untuk mempertahankan kadar faktor pertumbuhan yang optimal dan mendukung kesehatan kognitif.
Chatbot: Sampai jumpa!


# Evaluate

In [16]:
# Dataset diubah menjadi format Excel secara manual karena terdapat masalah saat pembacaan format CSV
test = pd.read_excel('./model/save_data/dup_test.xlsx')
print(test.head())
print(len(test))

                                           Questions  \
0  Bagaimana perkembangan penyakit Alzheimer meme...   
1  Dapatkah penelitian terapi gen menjelaskan dam...   
2  Tradisi atau ritual apa yang menurut Anda berm...   
3  Apakah ada cara untuk mengurangi risiko penyak...   
4  Bagaimana penyakit Alzheimer diobati, dan apak...   

                                             Answers  
0  Seiring perkembangan penyakit Alzheimer, pende...  
1  Penelitian terapi gen bertujuan untuk menjelas...  
2  Berjalan-jalan santai di taman terdekat setiap...  
3  Meskipun risikonya tidak dapat sepenuhnya dihi...  
4  Meskipun tidak ada obatnya, latihan fisik, akt...  
1350


In [68]:
test['generate'] = test['Questions'].apply(generate_response)

In [69]:
# Simpan hasil generate response ke dalam file Excel agar dapat dievaluasi dengan mudah sewaktu-waktu
output_path = './model/save_data/response/response.xlsx'
test.to_excel(output_path, index=False)

In [17]:
import torch
import numpy as np
import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from nltk.translate.meteor_score import single_meteor_score

# Fungsi untuk menghitung perplexity
def calculate_perplexity(model, tokenizer, text):
    encodings = tokenizer(text, return_tensors="pt")
    input_ids = encodings['input_ids'].to(model.device)

    with torch.no_grad():
        outputs = model(input_ids, labels=input_ids)
        loss = outputs.loss.item()

    perplexity = np.exp(loss)
    return perplexity

# Fungsi untuk menghitung metrik evaluasi
def evaluate_metrics(reference, hypothesis):
    smooth_fn = SmoothingFunction().method2
    bleu_score = sentence_bleu([reference], hypothesis, smoothing_function=smooth_fn)

    rouge = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
    rouge_scores = rouge.score(reference, hypothesis)

    meteor_score = single_meteor_score(reference.split(), hypothesis.split())

    return bleu_score, rouge_scores, meteor_score

data_file = './model/save_data/response/response.xlsx'
data = pd.read_excel(data_file)

if 'Answers' in data.columns and 'generate' in data.columns:
    results = []

    for _, row in data.iterrows():
        question = row['Questions']
        reference = row['Answers']
        hypothesis = row['generate']

        # Hitung metrik evaluasi
        bleu, rouge, meteor = evaluate_metrics(reference, hypothesis)

        # Hitung perplexity
        perplexity = calculate_perplexity(model, tokenizer, hypothesis)

        results.append({
            'Questions': question,
            'Reference': reference,
            'Hypothesis': hypothesis,
            'BLEU': bleu,
            'ROUGE-1': rouge['rouge1'].fmeasure,
            'ROUGE-L': rouge['rougeL'].fmeasure,
            'METEOR': meteor,
            'Perplexity': perplexity,
        })

    results_df = pd.DataFrame(results)

    print("Rata-rata Metrik Evaluasi:")
    print(f"BLEU: {results_df['BLEU'].mean():.4f}")
    print(f"ROUGE-1: {results_df['ROUGE-1'].mean():.4f}")
    print(f"ROUGE-L: {results_df['ROUGE-L'].mean():.4f}")
    print(f"METEOR: {results_df['METEOR'].mean():.4f}")
    print(f"Perplexity: {results_df['Perplexity'].mean():.4f}")
else:
    print("Kolom 'Answers' atau 'generate' tidak ditemukan dalam data.")


Rata-rata Metrik Evaluasi:
BLEU: 0.6315
ROUGE-1: 0.5919
ROUGE-L: 0.5410
METEOR: 0.5157
Perplexity: 90.0284
