In [7]:
import pandas as pd
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, make_scorer

In [8]:
# Location of the preprocessed dataset: two directories above the current
# working directory, inside the `data` folder.
data_path = os.path.join(
    os.getcwd(),
    os.pardir,
    os.pardir,
    'data',
    'preprocessed_data.csv',
)

In [9]:
# Read the preprocessed CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=data_path)
df.head(5)

Unnamed: 0,bedroom,bathroom,beds,guests,wifi,tv,cable_tv,ac,workspace,hot_water,...,lng,distance_to_coastline,room_name,booking_window,stay_duration_in_days,review_sentiment_score,rating,booking_earned,average_daily_rate,rating_rounded
0,1,1,1,2,1,0,0,1,1,1,...,115.113378,15.6035,Bingin Ombak Apartment - 1 Lantai 1,81,4,0.458603,4.0,5911197.97,1477799.0,4.0
1,1,1,1,2,1,0,0,1,1,0,...,115.113378,15.6035,Bingin Ombak Apartment - 2 Lantai 2,81,4,0.458603,4.0,5911197.97,1477799.0,4.0
2,1,2,1,2,1,0,0,0,1,0,...,115.113378,15.6035,Standard (PLEASE IGNORE),81,4,0.45545,4.0,5911197.97,1477799.0,4.0
3,1,2,1,2,1,0,0,0,1,0,...,115.113378,15.6035,Standard (PLEASE IGNORE),81,4,0.45545,4.0,5911197.97,1477799.0,4.0
4,1,3,1,2,1,0,0,0,1,0,...,115.113378,15.6035,Standard (PLEASE IGNORE),81,4,0.452298,4.0,5911197.97,1477799.0,4.0


In [10]:
# Print the frame summary: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50728 entries, 0 to 50727
Data columns (total 40 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   bedroom                 50728 non-null  int64  
 1   bathroom                50728 non-null  int64  
 2   beds                    50728 non-null  int64  
 3   guests                  50728 non-null  int64  
 4   wifi                    50728 non-null  int64  
 5   tv                      50728 non-null  int64  
 6   cable_tv                50728 non-null  int64  
 7   ac                      50728 non-null  int64  
 8   workspace               50728 non-null  int64  
 9   hot_water               50728 non-null  int64  
 10  parking                 50728 non-null  int64  
 11  pool                    50728 non-null  int64  
 12  gym                     50728 non-null  int64  
 13  private_entrance        50728 non-null  int64  
 14  luggage_drop_off        50728 non-null

### Scale the features and create the train/test split

In [11]:
# Separate the features from the target variable.
# The target itself and the columns not used as model inputs are dropped.
# NOTE(review): in the preview rows, average_daily_rate * stay_duration_in_days
# is almost exactly booking_earned, so keeping average_daily_rate as a feature
# may leak the target -- confirm it is really known at prediction time.
X = df.drop(columns=['booking_earned', 'property_name', 'room_name',
                     'review_sentiment_score', 'rating_rounded'])  # Features
y = df['booking_earned']  # Target variable

# Split BEFORE scaling so the held-out rows never influence the scaler's
# mean/variance. The row assignment depends only on the number of samples and
# random_state, so the same rows land in train/test as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize the features using statistics learned from the training rows only,
# then apply that same transformation to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept so that any code relying on `X_scaled` still finds it: the full feature
# matrix transformed with the train-only statistics.
X_scaled = scaler.transform(X)

In [12]:
# Baseline model: scikit-learn's GradientBoostingRegressor (not XGBoost).
# `fit` returns the fitted estimator, so construction and training are chained.
model = GradientBoostingRegressor(
    n_estimators=2000,
    max_depth=7,
    random_state=42,
).fit(X_train, y_train)

# Predict on both splits so training and test error can be compared.
y_pred_train, y_pred_test = model.predict(X_train), model.predict(X_test)

# Mean absolute error on each split.
mae_train, mae_test = (
    mean_absolute_error(y_train, y_pred_train),
    mean_absolute_error(y_test, y_pred_test),
)
print(f'Mean Absolute Error on Training: {mae_train}')
print(f'Mean Absolute Error on Test: {mae_test}')

Mean Absolute Error on Training: 103.33589006806999
Mean Absolute Error on Test: 4397.529141130648


### Tune the hyperparameters

In [14]:
# Initialize the Gradient Boosting Regressor whose hyperparameters are tuned below.
gb_regressor = GradientBoostingRegressor(random_state=42)

# Parameter grid to explore: 1 * 3 * 3 * 3 * 3 * 3 = 243 candidate combinations.
param_grid = {
    'n_estimators': [100],
    'learning_rate': [0.3, 0.5, 0.7],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 0.9, 1.0],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create the GridSearchCV object.
# 243 candidates x 5 folds = 1215 fits (plus the final refit). n_jobs=-1 runs the
# fits on all available CPU cores; each fit is still seeded by random_state=42,
# so the selected parameters and scores are the same as in a sequential run.
grid_search = GridSearchCV(estimator=gb_regressor, param_grid=param_grid,
                           scoring='neg_mean_absolute_error', cv=5, verbose=2,
                           n_jobs=-1)

# Run the search on the training data.
grid_search.fit(X_train, y_train)

# Best model found, refit on the whole training set by GridSearchCV.
best_gb_regressor = grid_search.best_estimator_

# Predict on the test set.
y_pred = best_gb_regressor.predict(X_test)

# Compute the MAE on the test set.
mae = mean_absolute_error(y_test, y_pred)
print("MAE:", mae)

# Retrieve the best parameter combination.
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

In [13]:
# Initialize the Gradient Boosting Regressor whose hyperparameters are tuned below.
gb_regressor = GradientBoostingRegressor(random_state=42)

# Parameter grid to explore: only n_estimators varies (4 candidates); every
# other hyperparameter is pinned to a single value.
# NOTE(review): the pinned values are presumably the winners of the wider grid
# search -- confirm, since this cell's execution count precedes that search.
param_grid = {
    'n_estimators': [500, 1000, 2000, 3000],
    'learning_rate': [0.7],
    'max_depth': [7],
    'subsample': [1.0],
    'min_samples_split': [2],
    'min_samples_leaf': [1]
}

# Create the GridSearchCV object.
# 4 candidates x 5 folds = 20 fits of up to 3000 trees each (plus the final
# refit). n_jobs=-1 runs the fits on all available CPU cores; each fit is still
# seeded by random_state=42, so results match a sequential run.
grid_search = GridSearchCV(estimator=gb_regressor, param_grid=param_grid,
                           scoring='neg_mean_absolute_error', cv=5, verbose=2,
                           n_jobs=-1)

# Run the search on the training data.
grid_search.fit(X_train, y_train)

# Best model found, refit on the whole training set by GridSearchCV.
best_gb_regressor = grid_search.best_estimator_

# Predict on the test set.
y_pred = best_gb_regressor.predict(X_test)

# Compute the MAE on the test set.
mae = mean_absolute_error(y_test, y_pred)
print("MAE:", mae)

# Retrieve the best parameter combination.
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

### Model with the best parameters

In [15]:
# NOTE(review): every line of this cell is commented out, so running it does nothing.
# NOTE(review): unlike the active cells, the disabled model below passes no
# random_state -- add one if this cell is re-enabled and reproducibility matters.
# # Initialize the GradientBoostingRegressor model
# model = GradientBoostingRegressor(n_estimators=2000, learning_rate=0.7, 
#                                   max_depth=7, subsample=1.0, 
#                                   min_samples_split=2, min_samples_leaf=1)

# # Fit the model on the training data
# model.fit(X_train, y_train)

# # Make predictions on the test data
# y_pred = model.predict(X_test)

# # Evaluate the model
# mae = mean_absolute_error(y_test, y_pred)
# print(f'Mean Absolute Error: {mae}')

### Cross Validation

In [38]:
# NOTE(review): every line of this cell is commented out, so running it does nothing.
# # Define the k-fold cross-validation scheme
# kf = KFold(n_splits=5, shuffle=True, random_state=42)

# # Define the scoring function
# scorer = make_scorer(mean_absolute_error, greater_is_better=False)

# # Run cross-validation on the best model
# scores = cross_val_score(best_gb_regressor, X_train, y_train, scoring=scorer, cv=kf)

# # The scorer is built with greater_is_better=False, so the scores come back negated; flip the sign to get the MAE
# mae_scores = -scores

# # Print the MAE of each fold
# print("MAE scores for each fold:", mae_scores)

# # Mean and standard deviation of the MAE values
# print("Mean MAE on Cross Validation:", mae_scores.mean())
# print("Standard Deviation of MAE:", mae_scores.std())

# # Fit the model on the training data
# best_gb_regressor.fit(X_train, y_train)

# # Predict on the test set
# y_pred_cv = best_gb_regressor.predict(X_test)

# # Compute the MAE
# mae = mean_absolute_error(y_test, y_pred_cv)
# print("MAE on testing:", mae)

In [None]:
# NOTE(review): every line of this cell is commented out, so running it does nothing.
# # Get feature importances from the best model
# feature_importances = best_gb_regressor.feature_importances_

# # Create a DataFrame to store feature importances with corresponding column names
# fi_df = pd.DataFrame({'Feature': X.columns, 
#                       'Importance': feature_importances})

# # Sort the DataFrame by feature importance values
# fi_df = fi_df.sort_values(by='Importance', ascending=False)

# # Plot the feature importances
# plt.figure(figsize=(10, 6))
# plt.title("Feature Importances")
# plt.bar(range(len(fi_df)), fi_df['Importance'], align='center')
# plt.xticks(range(len(fi_df)), fi_df['Feature'], rotation=90)
# plt.xlim([-1, len(fi_df)])
# plt.show()