# Import Library

In [4]:
import sys
import os
import json
import joblib
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
import pandas as pd
from datasets import Dataset
from utils.preprocessing import cleaningText
from utils.preprocessing import casefoldingText
from utils.preprocessing import stemmingText
from utils.preprocessing import fix_slangwords
from utils.preprocessing import tokenize
from utils.preprocessing import filteringText
from utils.preprocessing import tokenizingText
from utils.preprocessing import toSentence
from sklearn.preprocessing import LabelEncoder
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import Trainer
from transformers import AutoTokenizer
from sklearn.metrics import accuracy_score
from sentence_transformers import SentenceTransformer, util
from sqlalchemy import create_engine
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=5000)

os.environ["WANDB_DISABLED"] = "true"
tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p1")
# Load model
model = SentenceTransformer('distiluse-base-multilingual-cased')

# Model Klasifikasi Intent

## Loading Dataset

In [5]:
# Ubah URL menjadi raw URL GitHub
chatbot_url = "https://raw.githubusercontent.com/novaandini/arkanesia/refs/heads/main/data/chatbot.csv"

chatbot_df = pd.read_csv(chatbot_url)

# Cek kolom yang ada
print(chatbot_df.head())

                                          Pertanyaan       Intent  \
0   Apa tempat wisata alam terbaik di [nama_daerah]?  wisata_alam   
1  Di [nama_daerah], di mana saya bisa menikmati ...  wisata_alam   
2  Tempat wisata alam apa yang wajib dikunjungi d...  wisata_alam   
3  Ada apa saja di [nama_daerah] yang cocok untuk...  wisata_alam   
4  Wisata alam apa yang paling terkenal di [nama_...  wisata_alam   

                                             Jawaban  
0  Di [nama_daerah], kamu bisa nemuin banyak temp...  
1  Kalau kamu lagi di [nama_daerah] dan pengen me...  
2  Beberapa destinasi wisata alam yang wajib kamu...  
3  Buat pecinta alam, [nama_daerah] punya banyak ...  
4  [rekomendasi_1] dan [rekomendasi_2] adalah wis...  


## Explore Chatbot Data

In [6]:
chatbot_df.duplicated().sum()

np.int64(0)

In [7]:
chatbot_df.isnull().sum()

Pertanyaan    0
Intent        0
Jawaban       0
dtype: int64

In [8]:
chatbot_df.describe()

Unnamed: 0,Pertanyaan,Intent,Jawaban
count,723,723,723
unique,723,15,723
top,"Bro, ini kenapa wifi ku lelet?",wisata_bahari,"brooo 😵 itu mah urusan teknisi, aku mah jagony..."
freq,1,107,1


In [9]:
chatbot_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 723 entries, 0 to 722
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Pertanyaan  723 non-null    object
 1   Intent      723 non-null    object
 2   Jawaban     723 non-null    object
dtypes: object(3)
memory usage: 17.1+ KB


## Preprocessing Text

### Cleaning Text

In [10]:
chatbot_cleaned_df = pd.DataFrame()
chatbot_cleaned_df['question'] = chatbot_df['Pertanyaan']
chatbot_cleaned_df['intent'] = chatbot_df['Intent']
chatbot_cleaned_df['answer'] = chatbot_df['Jawaban']
chatbot_cleaned_df.head()

Unnamed: 0,question,intent,answer
0,Apa tempat wisata alam terbaik di [nama_daerah]?,wisata_alam,"Di [nama_daerah], kamu bisa nemuin banyak temp..."
1,"Di [nama_daerah], di mana saya bisa menikmati ...",wisata_alam,Kalau kamu lagi di [nama_daerah] dan pengen me...
2,Tempat wisata alam apa yang wajib dikunjungi d...,wisata_alam,Beberapa destinasi wisata alam yang wajib kamu...
3,Ada apa saja di [nama_daerah] yang cocok untuk...,wisata_alam,"Buat pecinta alam, [nama_daerah] punya banyak ..."
4,Wisata alam apa yang paling terkenal di [nama_...,wisata_alam,[rekomendasi_1] dan [rekomendasi_2] adalah wis...


In [11]:
chatbot_cleaned_df['question'] = chatbot_cleaned_df['question'].apply(cleaningText)
chatbot_cleaned_df['answer'] = chatbot_cleaned_df['answer'].apply(cleaningText)

chatbot_cleaned_df.head()

Unnamed: 0,question,intent,answer
0,Apa tempat wisata alam terbaik di namadaerah,wisata_alam,Di namadaerah kamu bisa nemuin banyak tempat w...
1,Di namadaerah di mana saya bisa menikmati alam,wisata_alam,Kalau kamu lagi di namadaerah dan pengen menik...
2,Tempat wisata alam apa yang wajib dikunjungi d...,wisata_alam,Beberapa destinasi wisata alam yang wajib kamu...
3,Ada apa saja di namadaerah yang cocok untuk pe...,wisata_alam,Buat pecinta alam namadaerah punya banyak spot...
4,Wisata alam apa yang paling terkenal di namada...,wisata_alam,rekomendasi1 dan rekomendasi2 adalah wisata al...


### Case Folding text

In [12]:
chatbot_cleaned_df['question'] = chatbot_cleaned_df['question'].apply(casefoldingText)
chatbot_cleaned_df['answer'] = chatbot_cleaned_df['answer'].apply(casefoldingText)
chatbot_cleaned_df.head()

Unnamed: 0,question,intent,answer
0,apa tempat wisata alam terbaik di namadaerah,wisata_alam,di namadaerah kamu bisa nemuin banyak tempat w...
1,di namadaerah di mana saya bisa menikmati alam,wisata_alam,kalau kamu lagi di namadaerah dan pengen menik...
2,tempat wisata alam apa yang wajib dikunjungi d...,wisata_alam,beberapa destinasi wisata alam yang wajib kamu...
3,ada apa saja di namadaerah yang cocok untuk pe...,wisata_alam,buat pecinta alam namadaerah punya banyak spot...
4,wisata alam apa yang paling terkenal di namada...,wisata_alam,rekomendasi1 dan rekomendasi2 adalah wisata al...


### Fix Slang Words

In [13]:
chatbot_cleaned_df['question'] = chatbot_cleaned_df['question'].apply(fix_slangwords)
chatbot_cleaned_df.head()

Unnamed: 0,question,intent,answer
0,apa tempat wisata alam terbaik di namadaerah,wisata_alam,di namadaerah kamu bisa nemuin banyak tempat w...
1,di namadaerah di mana saya bisa menikmati alam,wisata_alam,kalau kamu lagi di namadaerah dan pengen menik...
2,tempat wisata alam apa yang wajib dikunjungi d...,wisata_alam,beberapa destinasi wisata alam yang wajib kamu...
3,ada apa saja di namadaerah yang cocok untuk pe...,wisata_alam,buat pecinta alam namadaerah punya banyak spot...
4,wisata alam apa yang paling terkenal di namada...,wisata_alam,rekomendasi1 dan rekomendasi2 adalah wisata al...


### Stemming Text

In [14]:
chatbot_cleaned_df['question'] = chatbot_cleaned_df['question'].apply(stemmingText)
# chatbot_cleaned_df['answer'] = chatbot_cleaned_df['answer'].apply(stemmingText)
chatbot_cleaned_df.head()

Unnamed: 0,question,intent,answer
0,apa tempat wisata alam baik di namadaerah,wisata_alam,di namadaerah kamu bisa nemuin banyak tempat w...
1,di namadaerah di mana saya bisa nikmat alam,wisata_alam,kalau kamu lagi di namadaerah dan pengen menik...
2,tempat wisata alam apa yang wajib kunjung di n...,wisata_alam,beberapa destinasi wisata alam yang wajib kamu...
3,ada apa saja di namadaerah yang cocok untuk ci...,wisata_alam,buat pecinta alam namadaerah punya banyak spot...
4,wisata alam apa yang paling kenal di namadaerah,wisata_alam,rekomendasi1 dan rekomendasi2 adalah wisata al...


In [15]:
def restore_placeholders(text):
    text = text.replace('namadaerah', '[nama_daerah]')
    for i in range(1, 5):
        text = text.replace(f'rek{i}', f'[rekomendasi_{i}]')
    return text

# chatbot_cleaned_df['question'] = chatbot_cleaned_df['question'].apply(restore_placeholders)
# chatbot_cleaned_df['answer'] = chatbot_cleaned_df['answer'].apply(restore_placeholders)
# chatbot_cleaned_df.head()

### Encode Label Intent

In [16]:
label_encoder = LabelEncoder()
chatbot_cleaned_df['label'] = label_encoder.fit_transform(chatbot_cleaned_df['intent'])

# Simpan untuk nanti decoding hasil prediksi
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
for label, encoded in label_mapping.items():
    print(f"{label} → {encoded}")

aktivitas → 0
budget → 1
cuaca → 2
detail_wisata → 3
kuliner → 4
lokasi_wisata → 5
penginapan → 6
transportasi → 7
uncategorized → 8
wisata_alam → 9
wisata_bahari → 10
wisata_budaya → 11
wisata_edukasi → 12
wisata_rekreasi → 13
wisata_sejarah → 14


In [17]:
chatbot_cleaned_df.head()

Unnamed: 0,question,intent,answer,label
0,apa tempat wisata alam baik di namadaerah,wisata_alam,di namadaerah kamu bisa nemuin banyak tempat w...,9
1,di namadaerah di mana saya bisa nikmat alam,wisata_alam,kalau kamu lagi di namadaerah dan pengen menik...,9
2,tempat wisata alam apa yang wajib kunjung di n...,wisata_alam,beberapa destinasi wisata alam yang wajib kamu...,9
3,ada apa saja di namadaerah yang cocok untuk ci...,wisata_alam,buat pecinta alam namadaerah punya banyak spot...,9
4,wisata alam apa yang paling kenal di namadaerah,wisata_alam,rekomendasi1 dan rekomendasi2 adalah wisata al...,9


### Filtering Text

In [18]:
chatbot_cleaned_df['question_filtered'] = chatbot_cleaned_df['question'].apply(tokenizingText)
chatbot_cleaned_df['question_filtered'] = chatbot_cleaned_df['question_filtered'].apply(filteringText)
# chatbot_cleaned_df['answer'] = chatbot_cleaned_df['answer'].apply(filteringText)
chatbot_cleaned_df.head()

Unnamed: 0,question,intent,answer,label,question_filtered
0,apa tempat wisata alam baik di namadaerah,wisata_alam,di namadaerah kamu bisa nemuin banyak tempat w...,9,"[wisata, alam, namadaerah]"
1,di namadaerah di mana saya bisa nikmat alam,wisata_alam,kalau kamu lagi di namadaerah dan pengen menik...,9,"[namadaerah, nikmat, alam]"
2,tempat wisata alam apa yang wajib kunjung di n...,wisata_alam,beberapa destinasi wisata alam yang wajib kamu...,9,"[wisata, alam, wajib, kunjung, namadaerah]"
3,ada apa saja di namadaerah yang cocok untuk ci...,wisata_alam,buat pecinta alam namadaerah punya banyak spot...,9,"[namadaerah, cocok, cinta, alam]"
4,wisata alam apa yang paling kenal di namadaerah,wisata_alam,rekomendasi1 dan rekomendasi2 adalah wisata al...,9,"[wisata, alam, kenal, namadaerah]"


### To Sentence

In [19]:
chatbot_cleaned_df['question'] = chatbot_cleaned_df['question_filtered'].apply(toSentence)
chatbot_cleaned_df.head()

Unnamed: 0,question,intent,answer,label,question_filtered
0,wisata alam namadaerah,wisata_alam,di namadaerah kamu bisa nemuin banyak tempat w...,9,"[wisata, alam, namadaerah]"
1,namadaerah nikmat alam,wisata_alam,kalau kamu lagi di namadaerah dan pengen menik...,9,"[namadaerah, nikmat, alam]"
2,wisata alam wajib kunjung namadaerah,wisata_alam,beberapa destinasi wisata alam yang wajib kamu...,9,"[wisata, alam, wajib, kunjung, namadaerah]"
3,namadaerah cocok cinta alam,wisata_alam,buat pecinta alam namadaerah punya banyak spot...,9,"[namadaerah, cocok, cinta, alam]"
4,wisata alam kenal namadaerah,wisata_alam,rekomendasi1 dan rekomendasi2 adalah wisata al...,9,"[wisata, alam, kenal, namadaerah]"


### Tokenizing Text

In [17]:
# chatbot_cleaned_df['question'] = chatbot_cleaned_df['question'].apply(tokenizingText)
# chatbot_cleaned_df['answer'] = chatbot_cleaned_df['answer'].apply(tokenizingText)

dataset = Dataset.from_pandas(chatbot_cleaned_df)
tokenized_dataset = dataset.map(tokenize, batched=True)

tokenized_dataset[0]

Map:   0%|          | 0/723 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map: 100%|██████████| 723/723 [00:00<00:00, 5105.15 examples/s]


{'question': 'wisata alam namadaerah',
 'intent': 'wisata_alam',
 'answer': 'di namadaerah kamu bisa nemuin banyak tempat wisata alam keren kayak rekomendasi1 rekomendasi2 dan rekomendasi3 semuanya punya daya tarik alam yang bikin betah',
 'label': 9,
 'question_filtered': ['wisata', 'alam', 'namadaerah'],
 'input_ids': [2, 1223, 668, 712, 10224, 3, 0, 0, 0, 0, 0, 0, 0, 0],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
 'labels': 9}

## Modeling

### Transformer

#### Split Dataset

In [21]:
split_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=42)

#### Load Pretrained Model

In [22]:
num_labels = len(chatbot_cleaned_df['label'].unique())

model = AutoModelForSequenceClassification.from_pretrained(
    "indobenchmark/indobert-base-p1",
    num_labels=num_labels
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### Training Argument

In [23]:
training_args = TrainingArguments(
    output_dir="../models/results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    weight_decay=0.01
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


#### Training Setup

In [24]:
def compute_metrics(p):
    preds = p.predictions.argmax(axis=1)
    return {"accuracy": accuracy_score(p.label_ids, preds)}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_dataset['train'],
    eval_dataset=split_dataset['test'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


#### Train The Model

In [25]:
trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
eval_result = trainer.evaluate()
print(eval_result)

{'eval_loss': 0.5799984931945801, 'eval_accuracy': 0.8551724137931035, 'eval_runtime': 6.0788, 'eval_samples_per_second': 23.853, 'eval_steps_per_second': 3.126, 'epoch': 4.0}


#### Simpan Model

In [26]:
trainer.save_model("../models/chatbot_bert_model")

### Model Scikit-Learn

In [21]:
cols_to_drop = ["intent", "answer"]
data = chatbot_cleaned_df.drop(columns=cols_to_drop)  # Fitur tanpa cluster
X = vectorizer.fit_transform(data["question"])
y = chatbot_cleaned_df["intent"]  # Cluster sebagai target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


In [22]:
# from sklearn.naive_bayes import MultinomialNB

# model = MultinomialNB()
# model.fit(X_train, y_train)

from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(X_train, y_train)



In [23]:
train_pred = model.predict(X_train)
test_pred = model.predict(X_test)

train_accuracy = model.score(X_train, y_train)
test_accuracy = model.score(X_test, y_test)

print(f"Akurasi Training: {train_accuracy:.2f}")
print(f"Akurasi Test: {test_accuracy:.2f}")

Akurasi Training: 1.00
Akurasi Test: 0.72


In [24]:
print(classification_report(y_test, test_pred))

                 precision    recall  f1-score   support

      aktivitas       0.00      0.00      0.00         1
         budget       0.00      0.00      0.00         1
          cuaca       1.00      1.00      1.00         1
  detail_wisata       0.00      0.00      0.00         2
        kuliner       0.50      1.00      0.67         1
  lokasi_wisata       0.50      0.50      0.50         2
     penginapan       1.00      1.00      1.00         1
   transportasi       0.00      0.00      0.00         1
  uncategorized       0.38      1.00      0.55        12
    wisata_alam       0.86      0.57      0.69        21
  wisata_bahari       0.85      0.77      0.81        22
  wisata_budaya       1.00      0.95      0.97        20
 wisata_edukasi       0.71      0.75      0.73        20
wisata_rekreasi       0.74      0.70      0.72        20
 wisata_sejarah       0.92      0.60      0.73        20

       accuracy                           0.72       145
      macro avg       0.56   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [25]:
train_f1 = f1_score(y_train, train_pred, average="weighted")
test_f1 = f1_score(y_test, test_pred, average="weighted")

print(f"F1-Score Training: {train_f1:.2f}")
print(f"F1-Score Test: {test_f1:.2f}")

F1-Score Training: 1.00
F1-Score Test: 0.73


In [50]:
from sklearn.model_selection import train_test_split, cross_val_score
import numpy as np

# Split your data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# For each training size, train the model and evaluate
train_sizes = np.linspace(0.1, 1.0, 10)
train_scores = []
test_scores = []

for train_size in train_sizes:
    X_train_subset = X_train[:int(train_size * len(X_train))]
    y_train_subset = y_train[:int(train_size * len(y_train))]
    
    model.fit(X_train_subset, y_train_subset)
    
    train_score = model.score(X_train_subset, y_train_subset)
    test_score = model.score(X_test, y_test)
    
    train_scores.append(train_score)
    test_scores.append(test_score)

# Convert lists to arrays for easy manipulation
train_scores = np.array(train_scores)
test_scores = np.array(test_scores)

TypeError: sparse array length is ambiguous; use getnnz() or shape[0]

In [27]:
import pickle

os.makedirs(os.path.dirname("../models/chatbot_sklearn_model/"), exist_ok=True)
# Serialize the model and vectorizer
with open("../models/chatbot_sklearn_model/text_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)
with open("../models/chatbot_sklearn_model/vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)


In [42]:
with open('../models/chatbot_sklearn_model/text_model.pkl', 'rb') as model_file:
    sklearn_model = pickle.load(model_file)  # Ini model Sklearn

with open('../models/chatbot_sklearn_model/vectorizer.pkl', 'rb') as f:
    vectorizer = pickle.load(f)

# print(vectorizer.vocabulary_)

def predict(text):
    cleaned = stemmingText(fix_slangwords(casefoldingText(cleaningText(text))))
    vector = vectorizer.transform([cleaned])
    prediction = sklearn_model.predict(vector)
    print(prediction[0])

predict("Apa saja museum budaya yang bisa saya kunjungi di")


wisata_budaya


# Model Rekomendasi Tempat Wisata

## Load Dataset

In [3]:
# Koneksi ke MySQL
engine = create_engine(os.getenv("DATABASE_URL"))

# Query SQL
query = "SELECT * FROM tour"

# Baca data ke DataFrame
wisata_df = pd.read_sql(query, engine)

wisata_df.head()

Unnamed: 0,id,name,date,location,image,description,link,prices,district,province,latitude,longitude,createdAt,updatedAt
0,1,Pantai Kuta,1936-01-01 00:00:00,"Kuta, Badung",https://labirutour.com/wp-content/uploads/2020...,Pantai Kuta adalah salah satu destinasi wisata...,https://www.google.com/url?sa=i&url=https%3A%2...,0.0,Kuta,Bali,-8.7177,115.1682,2025-04-14 11:12:31.346,2025-04-14 11:12:31.346
1,2,Pura Ulun Danu Beratan,1633-01-01 00:00:00,"Bedugul, Tabanan",https://awsimages.detik.net.id/community/media...,Pura Ulun Danu Beratan adalah pura air yang te...,https://www.google.com/url?sa=i&url=https%3A%2...,50000.0,Baturiti,Bali,-8.275,115.1675,2025-04-14 11:12:31.355,2025-04-14 11:12:31.355
2,3,Tegallalang Rice Terrace,1800-01-01 00:00:00,"Tegallalang, Gianyar",https://media.cntraveler.com/photos/5e35bdf00e...,Tegallalang Rice Terrace terkenal dengan peman...,https://www.google.com/url?sa=i&url=https%3A%2...,25000.0,Tegallalang,Bali,-8.4366,115.279,2025-04-14 11:12:31.360,2025-04-14 11:12:31.360
3,4,Gunung Batur,1804-01-01 00:00:00,"Kintamani, Bangli",https://upload.wikimedia.org/wikipedia/commons...,Gunung Batur adalah gunung berapi aktif yang p...,https://www.google.com/url?sa=i&url=https%3A%2...,30000.0,Kintamani,Bali,-8.239,115.375,2025-04-14 11:12:31.366,2025-04-14 11:12:31.366
4,5,Pantai Pandawa,2011-01-01 00:00:00,"Kutuh, Badung",https://cozzy.id/uploads/0000/630/2024/09/04/c...,Pantai Pandawa dikenal dengan pasir putihnya y...,https://www.google.com/url?sa=i&url=https%3A%2...,15000.0,Kutuh,Bali,-8.8486,115.1889,2025-04-14 11:12:31.371,2025-04-14 11:12:31.371


## Explore Data

In [30]:
wisata_df.duplicated().sum()

np.int64(0)

In [31]:
wisata_df.isnull().sum()

id             0
name           0
date           0
location       0
image          0
description    0
link           1
prices         0
district       0
province       0
latitude       0
longitude      0
createdAt      0
updatedAt      0
dtype: int64

In [32]:
wisata_df.describe()

Unnamed: 0,id,prices,latitude,longitude,createdAt,updatedAt
count,100.0,100.0,100.0,100.0,100,100
mean,50.5,25160.0,-3.726152,112.452892,2025-04-14 11:12:32.564790016,2025-04-14 11:12:32.564790016
min,1.0,0.0,-10.8794,95.3171,2025-04-14 11:12:31.346000,2025-04-14 11:12:31.346000
25%,25.75,5000.0,-7.712775,105.583175,2025-04-14 11:12:31.760499968,2025-04-14 11:12:31.760499968
50%,50.5,15000.0,-5.4656,110.3907,2025-04-14 11:12:32.408500224,2025-04-14 11:12:32.408500224
75%,75.25,26250.0,0.649775,118.747425,2025-04-14 11:12:33.154749952,2025-04-14 11:12:33.154749952
max,100.0,150000.0,5.8401,140.7056,2025-04-14 11:12:34.325000,2025-04-14 11:12:34.325000
std,29.011492,32759.830046,4.413499,10.479746,,


In [33]:
wisata_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   id           100 non-null    int64         
 1   name         100 non-null    object        
 2   date         100 non-null    object        
 3   location     100 non-null    object        
 4   image        100 non-null    object        
 5   description  100 non-null    object        
 6   link         99 non-null     object        
 7   prices       100 non-null    float64       
 8   district     100 non-null    object        
 9   province     100 non-null    object        
 10  latitude     100 non-null    float64       
 11  longitude    100 non-null    float64       
 12  createdAt    100 non-null    datetime64[ns]
 13  updatedAt    100 non-null    datetime64[ns]
dtypes: datetime64[ns](2), float64(3), int64(1), object(8)
memory usage: 11.1+ KB


## Preprocessing Data

### Cleaning Data

In [10]:
wisata_cleaned_df = pd.DataFrame(wisata_df)
wisata_cleaned_df['name'] = wisata_cleaned_df['name'].apply(cleaningText)
wisata_cleaned_df['location'] = wisata_cleaned_df['location'].apply(cleaningText)
wisata_cleaned_df['description'] = wisata_cleaned_df['description'].apply(cleaningText)
wisata_cleaned_df['district'] = wisata_cleaned_df['district'].apply(cleaningText)
wisata_cleaned_df['province'] = wisata_cleaned_df['province'].apply(cleaningText)

### Case Folding

In [11]:
wisata_cleaned_df['name'] = wisata_cleaned_df['name'].apply(casefoldingText)
wisata_cleaned_df['location'] = wisata_cleaned_df['location'].apply(casefoldingText)
wisata_cleaned_df['description'] = wisata_cleaned_df['description'].apply(casefoldingText)
wisata_cleaned_df['district'] = wisata_cleaned_df['district'].apply(casefoldingText)
wisata_cleaned_df['province'] = wisata_cleaned_df['province'].apply(casefoldingText)
wisata_cleaned_df.head()

Unnamed: 0,id,name,date,location,image,description,link,prices,district,province,latitude,longitude,createdAt,updatedAt
0,1,pantai kuta,1936-01-01 00:00:00,kuta badung,https://labirutour.com/wp-content/uploads/2020...,pantai kuta adalah salah satu destinasi wisata...,https://www.google.com/url?sa=i&url=https%3A%2...,0.0,kuta,bali,-8.7177,115.1682,2025-04-14 11:12:31.346,2025-04-14 11:12:31.346
1,2,pura ulun danu beratan,1633-01-01 00:00:00,bedugul tabanan,https://awsimages.detik.net.id/community/media...,pura ulun danu beratan adalah pura air yang te...,https://www.google.com/url?sa=i&url=https%3A%2...,50000.0,baturiti,bali,-8.275,115.1675,2025-04-14 11:12:31.355,2025-04-14 11:12:31.355
2,3,tegallalang rice terrace,1800-01-01 00:00:00,tegallalang gianyar,https://media.cntraveler.com/photos/5e35bdf00e...,tegallalang rice terrace terkenal dengan peman...,https://www.google.com/url?sa=i&url=https%3A%2...,25000.0,tegallalang,bali,-8.4366,115.279,2025-04-14 11:12:31.360,2025-04-14 11:12:31.360
3,4,gunung batur,1804-01-01 00:00:00,kintamani bangli,https://upload.wikimedia.org/wikipedia/commons...,gunung batur adalah gunung berapi aktif yang p...,https://www.google.com/url?sa=i&url=https%3A%2...,30000.0,kintamani,bali,-8.239,115.375,2025-04-14 11:12:31.366,2025-04-14 11:12:31.366
4,5,pantai pandawa,2011-01-01 00:00:00,kutuh badung,https://cozzy.id/uploads/0000/630/2024/09/04/c...,pantai pandawa dikenal dengan pasir putihnya y...,https://www.google.com/url?sa=i&url=https%3A%2...,15000.0,kutuh,bali,-8.8486,115.1889,2025-04-14 11:12:31.371,2025-04-14 11:12:31.371


### Stemming Text

In [36]:
wisata_cleaned_df['description'] = wisata_cleaned_df['description'].apply(stemmingText)
wisata_cleaned_df.head()

Unnamed: 0,id,name,date,location,image,description,link,prices,district,province,latitude,longitude,createdAt,updatedAt
0,1,pantai kuta,1936-01-01 00:00:00,kuta badung,https://labirutour.com/wp-content/uploads/2020...,pantai kuta adalah salah satu destinasi wisata...,https://www.google.com/url?sa=i&url=https%3A%2...,0.0,kuta,bali,-8.7177,115.1682,2025-04-14 11:12:31.346,2025-04-14 11:12:31.346
1,2,pura ulun danu beratan,1633-01-01 00:00:00,bedugul tabanan,https://awsimages.detik.net.id/community/media...,pura ulun danu rat adalah pura air yang letak ...,https://www.google.com/url?sa=i&url=https%3A%2...,50000.0,baturiti,bali,-8.275,115.1675,2025-04-14 11:12:31.355,2025-04-14 11:12:31.355
2,3,tegallalang rice terrace,1800-01-01 00:00:00,tegallalang gianyar,https://media.cntraveler.com/photos/5e35bdf00e...,tegallalang rice terrace kenal dengan pandang ...,https://www.google.com/url?sa=i&url=https%3A%2...,25000.0,tegallalang,bali,-8.4366,115.279,2025-04-14 11:12:31.360,2025-04-14 11:12:31.360
3,4,gunung batur,1804-01-01 00:00:00,kintamani bangli,https://upload.wikimedia.org/wikipedia/commons...,gunung batur adalah gunung rap aktif yang popu...,https://www.google.com/url?sa=i&url=https%3A%2...,30000.0,kintamani,bali,-8.239,115.375,2025-04-14 11:12:31.366,2025-04-14 11:12:31.366
4,5,pantai pandawa,2011-01-01 00:00:00,kutuh badung,https://cozzy.id/uploads/0000/630/2024/09/04/c...,pantai pandawa kenal dengan pasir putih yang b...,https://www.google.com/url?sa=i&url=https%3A%2...,15000.0,kutuh,bali,-8.8486,115.1889,2025-04-14 11:12:31.371,2025-04-14 11:12:31.371


## Buat Embeddings

In [12]:
wisata_cleaned_df['combined'] = (
    wisata_cleaned_df['name'] + ' ' +
    wisata_cleaned_df['district'] + ' ' +
    wisata_cleaned_df['province'] + ' ' +
    wisata_cleaned_df['location'] + ' ' +
    wisata_cleaned_df['description']
)

In [13]:
embeddings = model.encode(wisata_cleaned_df['combined'].tolist(), convert_to_tensor=True)

In [123]:
def rekomendasi_wisata(pertanyaan_user, top_k=3):
    original_question = pertanyaan_user.lower()

    all_provinces = wisata_cleaned_df['province'].str.lower().unique()

    lokasi_filter = None
    for prov in all_provinces:
        if prov in original_question:
            lokasi_filter = prov
            break

    pertanyaan_user = cleaningText(pertanyaan_user)
    pertanyaan_user = casefoldingText(pertanyaan_user)
    pertanyaan_user = stemmingText(pertanyaan_user)
    query_embedding = model.encode(pertanyaan_user, convert_to_tensor=True)
    
    # Saat ada filter lokasi
    if lokasi_filter:
        mask = wisata_cleaned_df['province'].str.lower().str.contains(lokasi_filter)
        df_filtered = wisata_cleaned_df[mask]
        embeddings_filtered = embeddings[mask.values]  # <- filter embedding juga
    else:
        df_filtered = wisata_cleaned_df
        embeddings_filtered = embeddings

    # Reset index
    df_filtered = df_filtered.reset_index(drop=True)
    embeddings_filtered = embeddings_filtered.cpu()  # kalau tensor di GPU

    # Cosine similarity terhadap embedding hasil filter
    cos_scores = util.pytorch_cos_sim(query_embedding, embeddings_filtered)[0]
    top_results = cos_scores.topk(k=top_k)

    # Ambil hasil
    recommendations = []
    for score, idx in zip(top_results[0], top_results[1]):
        idx = idx.item()
        row = df_filtered.iloc[idx]
        recommendations.append({
            'lokasi': lokasi_filter,
            'result': row['name'],
            'score': score.item(),
        })

    return recommendations


# Testing

In [128]:
recommendation = rekomendasi_wisata("pantai bali")
print(recommendation)

[{'lokasi': 'bali', 'result': 'pantai kuta', 'score': 0.30511873960494995}, {'lokasi': 'bali', 'result': 'pantai pandawa', 'score': 0.2494456171989441}, {'lokasi': 'bali', 'result': 'pura ulun danu beratan', 'score': 0.05591677129268646}]


In [108]:
chatbot_model = AutoModelForSequenceClassification.from_pretrained("../models/chatbot_bert_model")
chatbot_tokenizer = AutoTokenizer.from_pretrained("../models/chatbot_bert_model")
model = SentenceTransformer('distiluse-base-multilingual-cased')
doc_embeddings = model.encode(chatbot_df['Pertanyaan'].tolist(), convert_to_tensor=True)

label_mapping = {
    0: "aktivitas", 1: "budget", 2: "cuaca", 3: "detail_wisata",
    4: "kuliner", 5: "lokasi_wisata", 6: "penginapan", 7: "transportasi",
    8: "uncategorized", 9: "wisata_alam", 10: "wisata_bahari", 11: "wisata_budaya",
    12: "wisata_edukasi", 13: "wisata_rekreasi", 14: "wisata_sejarah"
}

In [125]:
def preprocess(text):
    cleaned_text = cleaningText(text)
    cleaned_text = casefoldingText(cleaned_text)
    cleaned_text = fix_slangwords(cleaned_text)
    cleaned_text = stemmingText(cleaned_text)
    return cleaned_text

import re

def predict(text, top_k=3):
    cleaned_text = preprocess(text)

    # Text classification
    inputs = chatbot_tokenizer(cleaned_text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = chatbot_model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=1)
    label = torch.argmax(probs, dim=1).item()
    confidence = probs[0][label].item()

    # Ambil rekomendasi wisata
    recommendation = rekomendasi_wisata(text)
    # Pilih jawaban template sesuai intent
    answer = chatbot_df[chatbot_df['Intent'] == label_mapping[label]].sample(n=1).iloc[0]['Jawaban']

    placeholders = re.findall(r"\[rekomendasi_(\d+)\]", answer)
    final_answer = answer

    # Cek dan ganti [nama_daerah]
    if "[nama_daerah]" in final_answer:
        if recommendation[0]['lokasi'] == None:
            final_answer = final_answer.replace("[nama_daerah]", "indonesia")
        else:
            final_answer = final_answer.replace("[nama_daerah]", recommendation[0]['lokasi'])

    # Ganti placeholder rekomendasi
    for ph in placeholders:
        idx = int(ph) - 1  # karena rekomendasi_1 = recommendation[0]
        if idx < len(recommendation):
            final_answer = final_answer.replace(f"[rekomendasi_{ph}]", recommendation[idx]['result'])
        else:
            final_answer = final_answer.replace(f"[rekomendasi_{ph}]", "-")

    result = {
        'answer': final_answer,
        'intent': label_mapping[label],
        'score': confidence
    }
    return result


In [126]:
predict("ada rekomendasi tempat wisata yang bisa sambil belajar sejarah di bali")

{'answer': 'Lu wajib banget ke pantai kuta, kayak ngestep ke zaman dulu tapi fun!',
 'intent': 'wisata_sejarah',
 'score': 0.9301119446754456}