<a href="https://colab.research.google.com/github/nuratika19-beep/Tugas-api-Atika/blob/main/TUGAS_API_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install dependencies
!pip install datasets transformers accelerate

from datasets import DatasetDict, load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
import os

# Step 1: Fetch the mental-health chatbot corpus
dataset = load_dataset("heliosbrahma/mental_health_chatbot_dataset")

# Debug: show the first raw training record under a heading
print("Raw dataset sample:", dataset["train"][0], sep="\n")

# Step 2: Split 'text' menjadi input (HUMAN) dan output (ASSISTANT)
def split_text(example):
    """Split one raw record into a question/answer pair.

    The ``"text"`` field is expected to have the form
    ``"<HUMAN>: question <ASSISTANT>: answer"``.

    Args:
        example: Mapping with a ``"text"`` string.

    Returns:
        ``{"input": question, "output": answer}`` with surrounding whitespace
        stripped. When the record is malformed -- a tag is missing, the
        assistant tag comes before the human tag, or either part is empty
        after stripping -- the record is printed and
        ``{"input": None, "output": None}`` is returned so that a later
        filter step can drop it.
    """
    human_tag = "<HUMAN>:"
    assistant_tag = "<ASSISTANT>:"
    text = example["text"]

    # Everything before the first assistant tag belongs to the human side;
    # everything after it is the assistant's answer.
    before_assistant, assistant_found, assistant_text = text.partition(assistant_tag)
    # Looking for the human tag only in the leading part rejects records
    # whose tags appear in the wrong order.
    _, human_found, human_text = before_assistant.partition(human_tag)

    human_text = human_text.strip()
    assistant_text = assistant_text.strip()

    if not (assistant_found and human_found and human_text and assistant_text):
        # Debug output for records that do not match the expected format
        print(f"Skipped example: {text}")
        return {"input": None, "output": None}

    return {"input": human_text, "output": assistant_text}

# Apply split_text to every record of the training split
processed_dataset = dataset["train"].map(split_text)

# Step 3: Drop records that could not be split into input/output
processed_dataset = processed_dataset.filter(lambda example: example["input"] is not None and example["output"] is not None)

# Debug: inspect a processed sample
print("Processed dataset sample after split:")
if len(processed_dataset) > 0:
    print(processed_dataset[0])
else:
    print("No valid examples found.")
    # NOTE(review): splitting an empty dataset is expected to fail inside
    # train_test_split before the size check further down is reached, so
    # raise the descriptive error here instead.
    raise ValueError("Dataset is empty. Please check the preprocessing and filtering steps.")

# Step 4: Split the dataset into train/test parts.
# A fixed seed keeps the split (and therefore the reported losses)
# reproducible between runs.
train_test_split = processed_dataset.train_test_split(test_size=0.2, seed=42)
dataset = DatasetDict({
    "train": train_test_split["train"],
    "test": train_test_split["test"],
})

# Debug: report the size of each split
print(f"Number of training samples: {len(dataset['train'])}")
print(f"Number of test samples: {len(dataset['test'])}")

if len(dataset["train"]) == 0 or len(dataset["test"]) == 0:
    raise ValueError("Dataset is empty. Please check the preprocessing and filtering steps.")

# Step 5: Initialize the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

# Step 6: Preprocess Data
def preprocess_data(batch):
    """Tokenize a batch of question/answer pairs for seq2seq training.

    Args:
        batch: Batched mapping with an ``"input"`` list of questions and an
            ``"output"`` list of answers.

    Returns:
        The tokenizer output for the prefixed questions, with an added
        ``"labels"`` entry holding the token ids of the answers. Padding
        positions in the labels are set to ``-100``.
    """
    inputs = ["question: " + question for question in batch["input"]]
    targets = batch["output"]

    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=512, truncation=True, padding="max_length")

    # Targets are padded to a fixed length, so most label positions are
    # padding. Replace them with -100, the index the loss ignores, so the
    # model is trained on the real answer tokens only.
    pad_token_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_token_id else -100 for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs

tokenized_datasets = dataset.map(preprocess_data, batched=True, remove_columns=dataset["train"].column_names)

# Debug: inspect one tokenized training example
print("Sample tokenized data:", tokenized_datasets["train"][0])

# Step 7: Training arguments
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit=2,
    save_strategy="epoch",
    logging_dir='./logs',
    # Log once per epoch, matching the evaluation schedule. A fixed interval
    # of 100 steps is longer than this whole run, which left the training
    # loss column as "No log".
    logging_strategy="epoch",
    remove_unused_columns=False,
)

# Step 8: Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
)

# Step 9: Train the model
trainer.train()

# Step 10: Persist the fine-tuned model together with its tokenizer
model_dir = "./mental_health_chatbot_model"
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)

# Step 11: Model inference
# Reload the trained model and tokenizer from the directory written above
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained(model_dir)

# Fungsi untuk menghasilkan respons
def generate_response(input_text):
    """Generate the model's reply to a single question.

    The question is prefixed with ``"question: "``, tokenized (truncated to
    512 tokens), passed to ``model.generate`` and the first returned
    sequence is decoded without special tokens.

    Args:
        input_text: The question as plain text.

    Returns:
        The decoded response string.
    """
    prompt = "question: " + input_text
    encoded = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)

    generation_options = {
        "max_length": 150,          # maximum response length
        "num_beams": 5,             # beam search width
        "early_stopping": True,     # stop once the beams are finished
        "no_repeat_ngram_size": 2,  # avoid repeating phrases
        "temperature": 0.7,         # randomness
        "top_p": 0.9,               # nucleus sampling
        "top_k": 50,                # top-k sampling
        "do_sample": True,          # enable sampling
    }
    generated = model.generate(encoded.input_ids, **generation_options)

    # Only the first generated sequence is turned back into text
    return tokenizer.decode(generated[0], skip_special_tokens=True)

# Example question: run it through the model and show both sides
example_question = "What is a panic attack?"
response = generate_response(example_question)
print(f"Question: {example_question}", f"Response: {response}", sep="\n")

Raw dataset sample:
{'text': '<HUMAN>: What is a panic attack?\n<ASSISTANT>: Panic attacks come on suddenly and involve intense and often overwhelming fear. They’re accompanied by very challenging physical symptoms, like a racing heartbeat, shortness of breath, or nausea. Unexpected panic attacks occur without an obvious cause. Expected panic attacks are cued by external stressors, like phobias. Panic attacks can happen to anyone, but having more than one may be a sign of panic disorder, a mental health condition characterized by sudden and repeated panic attacks.'}
Processed dataset sample after split:
{'text': '<HUMAN>: What is a panic attack?\n<ASSISTANT>: Panic attacks come on suddenly and involve intense and often overwhelming fear. They’re accompanied by very challenging physical symptoms, like a racing heartbeat, shortness of breath, or nausea. Unexpected panic attacks occur without an obvious cause. Expected panic attacks are cued by external stressors, like phobias. Panic atta

Map:   0%|          | 0/35 [00:00<?, ? examples/s]

Sample tokenized data: {'input_ids': [822, 10, 2645, 225, 27, 1350, 12, 81, 2550, 533, 58, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,3.169034
2,No log,1.618814
3,No log,1.438371
4,No log,1.403935
5,No log,1.394644


Question: What is a panic attack?
Response: Sollte es irgendetwas zu tun haben, so wenden Sie sich bitte an uns.
