## Import Libraries

In [2]:
import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

## Load Dataset

In [3]:
# Render every column of wide frames instead of truncating the middle ones.
pd.set_option('display.max_columns', None)

# Read the scraped Traveloka listings and discard the 'Unnamed: 0' column
# (presumably a row index saved along with the CSV -- it carries no information).
tl = pd.read_csv('../dataset/TravelokaScrape.csv').drop(columns='Unnamed: 0')
tl.head()

Unnamed: 0,VillaName,Features,Rating,Total_Review,Rating_Info,Location,URL,Currency,PriceValue,Area
0,Hanging Gardens Of Bali,"['Cycling', 'Outdoor facilities', 'Massage', '...",8.5,30.0,Impressive,"Payangan, Ubud",www.traveloka.com/en-en/hotel/indonesia/hangin...,USD,722.65,Ubud
1,The Bali Dream Villa Resort Echo Beach Canggu,"['Sun loungers', 'Massage', 'Kitchenette', 'Po...",8.3,26.0,Impressive,"Canggu, Kuta Utara",www.traveloka.com/en-en/hotel/indonesia/the-ba...,USD,50.13,Canggu
2,Menjangan Dynasty Resort,"['Snorkeling', 'Canoeing', 'Horse riding', 'Di...",9.1,135.0,Superb,"Menjangan, Buleleng",www.traveloka.com/en-en/hotel/indonesia/menjan...,USD,156.45,Others
3,AYANA Villas Bali,"['Show cooking restaurant', 'Beach Bar', 'Busi...",8.7,23.0,Impressive,"Jimbaran, Kuta Selatan",www.traveloka.com/en-en/hotel/indonesia/ayana-...,USD,1156.73,Jimbaran
4,Elevate Bali,"['Canoeing', 'Diving', 'Massage', 'Airport tra...",9.8,6.0,Superb,"Munduk, Buleleng",www.traveloka.com/en-en/hotel/indonesia/elevat...,USD,313.7,Others


## Feature Selection and Engineering

- Only the area and price value will be used to predict the rating, since Traveloka provides facilities while BukitVista provides features, which can't really be used to predict the rating.
- Since the currency is always USD, it can also be omitted.
- Total reviews can represent the number of bookings made, which is also not provided by BukitVista, so it is omitted as well.
- `Rating_Info`, `Location`, and `URL` are irrelevant.

In [4]:
# Keep only the target (Rating) and the two predictors retained for modelling.
selected_columns = ['Rating', 'PriceValue', 'Area']
df = tl[selected_columns]
df.head()

Unnamed: 0,Rating,PriceValue,Area
0,8.5,722.65,Ubud
1,8.3,50.13,Canggu
2,9.1,156.45,Others
3,8.7,1156.73,Jimbaran
4,9.8,313.7,Others


## Train-Test Split

In [5]:
# Separate predictors from the target column.
feature_columns = ['PriceValue', 'Area']
target_column = 'Rating'

X = df[feature_columns]
y = df[target_column]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# encode location
# Map each Area string to an integer code. The encoder is fitted on the training
# split only, then reused unchanged on the test split so both share one mapping.
encoder = LabelEncoder()

# X_train / X_test are slices produced by train_test_split; adding a column to them
# in place (X_train['Area_encoded'] = ...) triggers pandas' SettingWithCopyWarning.
# DataFrame.assign returns a new frame instead, which also keeps this cell safe to
# re-run (the column is simply replaced).
X_train = X_train.assign(Area_encoded=encoder.fit_transform(X_train['Area']))

# NOTE(review): LabelEncoder.transform raises ValueError for an Area value that was
# not present in the training split -- confirm every area appears in X_train.
X_test = X_test.assign(Area_encoded=encoder.transform(X_test['Area']))

In [7]:
# train model
# Only the numeric predictors are passed to the forest: the price and the
# integer-coded area (the raw 'Area' string column is left out).
train_features = X_train[['PriceValue', 'Area_encoded']]

# Fixed random_state keeps the fitted forest reproducible between runs.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(train_features, y_train)


In [None]:
# check model's performance
# (mean_squared_error is imported in the notebook's import cell at the top.)

# Score the model on the held-out test split.
y_pred = model.predict(X_test[['PriceValue', 'Area_encoded']])
mse = mean_squared_error(y_test, y_pred)

# Reference point: the MSE obtained by always predicting the mean training rating.
# An MSE is only interpretable relative to such a baseline -- the model is useful
# only if its error is clearly below this value.
baseline_mse = ((y_test - y_train.mean()) ** 2).mean()

print(f"Mean Squared Error: {mse}")
print(f"Baseline (mean-rating) MSE: {baseline_mse}")

Mean Squared Error: 0.07688960407327618


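The model has a small MSE (about 0.077, i.e. an RMSE of roughly 0.28 rating points), which suggests good performance. Note that an MSE is only meaningful relative to the spread of the ratings, so it should be compared against a simple baseline such as always predicting the mean rating.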

In [10]:
# save the model and encoder
# Both artifacts are written to the current working directory; the fitted encoder
# is stored next to the model so the same Area-to-integer mapping can be reapplied.
model_path = 'rating_predictor_model.pkl'
encoder_path = 'location_encoder.pkl'

joblib.dump(model, model_path)
joblib.dump(encoder, encoder_path)

['location_encoder.pkl']