# Import Library

In [4]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
import pandas as pd
from datasets import Dataset
from utils.preprocessing import cleaningText
from utils.preprocessing import casefoldingText
from utils.preprocessing import stemmingText
from utils.preprocessing import fix_slangwords
from utils.preprocessing import tokenize
from utils.preprocessing import filteringText
from utils.preprocessing import tokenizingText
from utils.preprocessing import toSentence
from sklearn.preprocessing import LabelEncoder
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import Trainer
from transformers import AutoTokenizer
from sklearn.metrics import accuracy_score
os.environ["WANDB_DISABLED"] = "true"
tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p1")

# Loading Dataset

In [5]:
# Ubah URL menjadi raw URL GitHub
chatbot_url = "https://raw.githubusercontent.com/novaandini/arkanesia/refs/heads/main/data/chatbot.csv"

# Baca CSV (ganti 'nama_file.csv' dengan file yang diunggah)
chatbot_df = pd.read_csv(chatbot_url)

# Cek kolom yang ada
print(chatbot_df.head())

                                          Pertanyaan         Intent  \
0                     Bro, ini kenapa wifi ku lelet?  uncategorized   
1   Apa tempat wisata alam terbaik di [nama_daerah]?    wisata_alam   
2  Di [nama_daerah], di mana saya bisa menikmati ...    wisata_alam   
3  Tempat wisata alam apa yang wajib dikunjungi d...    wisata_alam   
4  Ada apa saja di [nama_daerah] yang cocok untuk...    wisata_alam   

                                             Jawaban  
0  brooo 😵 itu mah urusan teknisi, aku mah jagony...  
1  Di [nama_daerah], kamu bisa nemuin banyak temp...  
2  Kalau kamu lagi di [nama_daerah] dan pengen me...  
3  Beberapa destinasi wisata alam yang wajib kamu...  
4  Buat pecinta alam, [nama_daerah] punya banyak ...  


## Explore Chatbot Data

In [6]:
chatbot_df.duplicated().sum()

np.int64(0)

In [7]:
chatbot_df.isnull().sum()

Pertanyaan    0
Intent        0
Jawaban       0
dtype: int64

In [8]:
chatbot_df.describe()

Unnamed: 0,Pertanyaan,Intent,Jawaban
count,723,723,723
unique,723,15,723
top,Bisa bantuin skripsi gk?,wisata_bahari,"hihihi gpp luh 😆 aku bkn ai skripsian, ai libu..."
freq,1,107,1


In [9]:
chatbot_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 723 entries, 0 to 722
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Pertanyaan  723 non-null    object
 1   Intent      723 non-null    object
 2   Jawaban     723 non-null    object
dtypes: object(3)
memory usage: 17.1+ KB


# Preprocessing Text

## Cleaning Text

In [10]:
chatbot_cleaned_df = pd.DataFrame()
chatbot_cleaned_df['question'] = chatbot_df['Pertanyaan']
chatbot_cleaned_df['intent'] = chatbot_df['Intent']
chatbot_cleaned_df['answer'] = chatbot_df['Jawaban']
chatbot_cleaned_df.head()

Unnamed: 0,question,intent,answer
0,"Bro, ini kenapa wifi ku lelet?",uncategorized,"brooo 😵 itu mah urusan teknisi, aku mah jagony..."
1,Apa tempat wisata alam terbaik di [nama_daerah]?,wisata_alam,"Di [nama_daerah], kamu bisa nemuin banyak temp..."
2,"Di [nama_daerah], di mana saya bisa menikmati ...",wisata_alam,Kalau kamu lagi di [nama_daerah] dan pengen me...
3,Tempat wisata alam apa yang wajib dikunjungi d...,wisata_alam,Beberapa destinasi wisata alam yang wajib kamu...
4,Ada apa saja di [nama_daerah] yang cocok untuk...,wisata_alam,"Buat pecinta alam, [nama_daerah] punya banyak ..."


In [11]:
chatbot_cleaned_df['question'] = chatbot_cleaned_df['question'].apply(cleaningText)
chatbot_cleaned_df['answer'] = chatbot_cleaned_df['answer'].apply(cleaningText)

chatbot_cleaned_df.head()

Unnamed: 0,question,intent,answer
0,Bro ini kenapa wifi ku lelet,uncategorized,brooo itu mah urusan teknisi aku mah jagonya ...
1,Apa tempat wisata alam terbaik di namadaerah,wisata_alam,Di namadaerah kamu bisa nemuin banyak tempat w...
2,Di namadaerah di mana saya bisa menikmati alam,wisata_alam,Kalau kamu lagi di namadaerah dan pengen menik...
3,Tempat wisata alam apa yang wajib dikunjungi d...,wisata_alam,Beberapa destinasi wisata alam yang wajib kamu...
4,Ada apa saja di namadaerah yang cocok untuk pe...,wisata_alam,Buat pecinta alam namadaerah punya banyak spot...


## Case Folding text

In [12]:
chatbot_cleaned_df['question'] = chatbot_cleaned_df['question'].apply(casefoldingText)
chatbot_cleaned_df['answer'] = chatbot_cleaned_df['answer'].apply(casefoldingText)
chatbot_cleaned_df.head()

Unnamed: 0,question,intent,answer
0,bro ini kenapa wifi ku lelet,uncategorized,brooo itu mah urusan teknisi aku mah jagonya ...
1,apa tempat wisata alam terbaik di namadaerah,wisata_alam,di namadaerah kamu bisa nemuin banyak tempat w...
2,di namadaerah di mana saya bisa menikmati alam,wisata_alam,kalau kamu lagi di namadaerah dan pengen menik...
3,tempat wisata alam apa yang wajib dikunjungi d...,wisata_alam,beberapa destinasi wisata alam yang wajib kamu...
4,ada apa saja di namadaerah yang cocok untuk pe...,wisata_alam,buat pecinta alam namadaerah punya banyak spot...


## Fix Slang Words

In [13]:
chatbot_cleaned_df['question'] = chatbot_cleaned_df['question'].apply(fix_slangwords)
chatbot_cleaned_df.head()

Unnamed: 0,question,intent,answer
0,saudara laki-laki ini kenapa wifi ku lambat,uncategorized,brooo itu mah urusan teknisi aku mah jagonya ...
1,apa tempat wisata alam terbaik di namadaerah,wisata_alam,di namadaerah kamu bisa nemuin banyak tempat w...
2,di namadaerah di mana saya bisa menikmati alam,wisata_alam,kalau kamu lagi di namadaerah dan pengen menik...
3,tempat wisata alam apa yang wajib dikunjungi d...,wisata_alam,beberapa destinasi wisata alam yang wajib kamu...
4,ada apa saja di namadaerah yang cocok untuk pe...,wisata_alam,buat pecinta alam namadaerah punya banyak spot...


## Stemming Text

In [14]:
chatbot_cleaned_df['question'] = chatbot_cleaned_df['question'].apply(stemmingText)
# chatbot_cleaned_df['answer'] = chatbot_cleaned_df['answer'].apply(stemmingText)
chatbot_cleaned_df.head()

Unnamed: 0,question,intent,answer
0,saudara laki ini kenapa wifi ku lambat,uncategorized,brooo itu mah urusan teknisi aku mah jagonya ...
1,apa tempat wisata alam baik di namadaerah,wisata_alam,di namadaerah kamu bisa nemuin banyak tempat w...
2,di namadaerah di mana saya bisa nikmat alam,wisata_alam,kalau kamu lagi di namadaerah dan pengen menik...
3,tempat wisata alam apa yang wajib kunjung di n...,wisata_alam,beberapa destinasi wisata alam yang wajib kamu...
4,ada apa saja di namadaerah yang cocok untuk ci...,wisata_alam,buat pecinta alam namadaerah punya banyak spot...


In [15]:
def restore_placeholders(text):
    text = text.replace('namadaerah', '[nama_daerah]')
    for i in range(1, 5):
        text = text.replace(f'rek{i}', f'[rekomendasi_{i}]')
    return text

# chatbot_cleaned_df['question'] = chatbot_cleaned_df['question'].apply(restore_placeholders)
# chatbot_cleaned_df['answer'] = chatbot_cleaned_df['answer'].apply(restore_placeholders)
# chatbot_cleaned_df.head()

## Encode Label Intent

In [16]:
label_encoder = LabelEncoder()
chatbot_cleaned_df['label'] = label_encoder.fit_transform(chatbot_cleaned_df['intent'])

# Simpan untuk nanti decoding hasil prediksi
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
for label, encoded in label_mapping.items():
    print(f"{label} → {encoded}")

aktivitas → 0
budget → 1
cuaca → 2
detail_wisata → 3
kuliner → 4
lokasi_wisata → 5
penginapan → 6
transportasi → 7
uncategorized → 8
wisata_alam → 9
wisata_bahari → 10
wisata_budaya → 11
wisata_edukasi → 12
wisata_rekreasi → 13
wisata_sejarah → 14


In [17]:
chatbot_cleaned_df.head()

Unnamed: 0,question,intent,answer,label
0,saudara laki ini kenapa wifi ku lambat,uncategorized,brooo itu mah urusan teknisi aku mah jagonya ...,8
1,apa tempat wisata alam baik di namadaerah,wisata_alam,di namadaerah kamu bisa nemuin banyak tempat w...,9
2,di namadaerah di mana saya bisa nikmat alam,wisata_alam,kalau kamu lagi di namadaerah dan pengen menik...,9
3,tempat wisata alam apa yang wajib kunjung di n...,wisata_alam,beberapa destinasi wisata alam yang wajib kamu...,9
4,ada apa saja di namadaerah yang cocok untuk ci...,wisata_alam,buat pecinta alam namadaerah punya banyak spot...,9


## Filtering Text

In [18]:
chatbot_cleaned_df['question_filtered'] = chatbot_cleaned_df['question'].apply(tokenizingText)
chatbot_cleaned_df['question_filtered'] = chatbot_cleaned_df['question_filtered'].apply(filteringText)
# chatbot_cleaned_df['answer'] = chatbot_cleaned_df['answer'].apply(filteringText)
chatbot_cleaned_df.head()

Unnamed: 0,question,intent,answer,label,question_filtered
0,saudara laki ini kenapa wifi ku lambat,uncategorized,brooo itu mah urusan teknisi aku mah jagonya ...,8,"[saudara, laki, wifi, lambat]"
1,apa tempat wisata alam baik di namadaerah,wisata_alam,di namadaerah kamu bisa nemuin banyak tempat w...,9,"[wisata, alam, namadaerah]"
2,di namadaerah di mana saya bisa nikmat alam,wisata_alam,kalau kamu lagi di namadaerah dan pengen menik...,9,"[namadaerah, nikmat, alam]"
3,tempat wisata alam apa yang wajib kunjung di n...,wisata_alam,beberapa destinasi wisata alam yang wajib kamu...,9,"[wisata, alam, wajib, kunjung, namadaerah]"
4,ada apa saja di namadaerah yang cocok untuk ci...,wisata_alam,buat pecinta alam namadaerah punya banyak spot...,9,"[namadaerah, cocok, cinta, alam]"


## To Sentence

In [19]:
chatbot_cleaned_df['question'] = chatbot_cleaned_df['question_filtered'].apply(toSentence)
chatbot_cleaned_df.head()

Unnamed: 0,question,intent,answer,label,question_filtered
0,saudara laki wifi lambat,uncategorized,brooo itu mah urusan teknisi aku mah jagonya ...,8,"[saudara, laki, wifi, lambat]"
1,wisata alam namadaerah,wisata_alam,di namadaerah kamu bisa nemuin banyak tempat w...,9,"[wisata, alam, namadaerah]"
2,namadaerah nikmat alam,wisata_alam,kalau kamu lagi di namadaerah dan pengen menik...,9,"[namadaerah, nikmat, alam]"
3,wisata alam wajib kunjung namadaerah,wisata_alam,beberapa destinasi wisata alam yang wajib kamu...,9,"[wisata, alam, wajib, kunjung, namadaerah]"
4,namadaerah cocok cinta alam,wisata_alam,buat pecinta alam namadaerah punya banyak spot...,9,"[namadaerah, cocok, cinta, alam]"


## Tokenizing Text

In [20]:
# chatbot_cleaned_df['question'] = chatbot_cleaned_df['question'].apply(tokenizingText)
# chatbot_cleaned_df['answer'] = chatbot_cleaned_df['answer'].apply(tokenizingText)

dataset = Dataset.from_pandas(chatbot_cleaned_df)
tokenized_dataset = dataset.map(tokenize, batched=True)

tokenized_dataset[0]

Map:   0%|          | 0/723 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|██████████| 723/723 [00:00<00:00, 4714.77 examples/s]


{'question': 'saudara laki wifi lambat',
 'intent': 'uncategorized',
 'answer': 'brooo  itu mah urusan teknisi aku mah jagonya cari tmpt sinyal baguss buat ngonten wkwk',
 'label': 8,
 'question_filtered': ['saudara', 'laki', 'wifi', 'lambat'],
 'input_ids': [2, 3650, 7041, 6007, 5163, 3, 0, 0, 0, 0, 0, 0, 0, 0],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
 'labels': 8}

# Modeling

## Split Dataset

In [21]:
split_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=42)

## Load Pretrained Model

In [22]:
num_labels = len(chatbot_cleaned_df['label'].unique())

model = AutoModelForSequenceClassification.from_pretrained(
    "indobenchmark/indobert-base-p1",
    num_labels=num_labels
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Training Argument

In [23]:
training_args = TrainingArguments(
    output_dir="../models/results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    weight_decay=0.01
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


## Training Setup

In [24]:
def compute_metrics(p):
    preds = p.predictions.argmax(axis=1)
    return {"accuracy": accuracy_score(p.label_ids, preds)}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_dataset['train'],
    eval_dataset=split_dataset['test'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


## Train The Model

In [25]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.111138,0.737931
2,No log,0.695673,0.827586
3,No log,0.63704,0.82069
4,No log,0.579998,0.855172


TrainOutput(global_step=292, training_loss=0.8849016999545163, metrics={'train_runtime': 676.335, 'train_samples_per_second': 3.418, 'train_steps_per_second': 0.432, 'total_flos': 16635493528416.0, 'train_loss': 0.8849016999545163, 'epoch': 4.0})

In [28]:
eval_result = trainer.evaluate()
print(eval_result)

{'eval_loss': 0.5799984931945801, 'eval_accuracy': 0.8551724137931035, 'eval_runtime': 6.0788, 'eval_samples_per_second': 23.853, 'eval_steps_per_second': 3.126, 'epoch': 4.0}


# Simpan Model

In [26]:
trainer.save_model("../models/chatbot_bert_model")

In [50]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

model = AutoModelForSequenceClassification.from_pretrained("../models/chatbot_bert_model")
tokenizer = AutoTokenizer.from_pretrained("../models/chatbot_bert_model")


label_mapping = {
    0: "aktivitas", 1: "budget", 2: "cuaca", 3: "detail_wisata",
    4: "kuliner", 5: "lokasi_wisata", 6: "penginapan", 7: "transportasi",
    8: "uncategorized", 9: "wisata_alam", 10: "wisata_bahari", 11: "wisata_budaya",
    12: "wisata_edukasi", 13: "wisata_rekreasi", 14: "wisata_sejarah"
}

def preprocess(text):
    cleaned_text = cleaningText(text)
    cleaned_text = casefoldingText(cleaned_text)
    cleaned_text = fix_slangwords(cleaned_text)
    cleaned_text = stemmingText(cleaned_text)
    return cleaned_text

def predict(text):
    cleaned_text = preprocess(text)
    inputs = tokenizer(cleaned_text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=1)
    label = torch.argmax(probs, dim=1).item()
    confidence = probs[0][label].item()
    return label_mapping[label], confidence

# Tes input
predict("kamu pernah jaln jalan ke singapore?")


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


('wisata_bahari', 0.8435835838317871)