In [1]:
import pandas as pd 
df = pd.read_csv("prep_data/hotelid0_daily_merged.csv")

In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46084 entries, 0 to 46083
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Unnamed: 0           46084 non-null  int64  
 1   datum_dolaska        46084 non-null  object 
 2   datum_odjave         46084 non-null  object 
 3   kanal_prodaje_id     46084 non-null  int64  
 4   tip_sobe_id          46084 non-null  int64  
 5   cijena_nocenja       46084 non-null  float64
 6   rezervacija_id       46084 non-null  int64  
 7   gost_id              46084 non-null  int64  
 8   zemlja_gosta_mapped  46084 non-null  int64  
 9   total_occupancy      46084 non-null  float64
dtypes: float64(2), int64(6), object(2)
memory usage: 3.5+ MB


In [3]:
df = df.drop(columns=["Unnamed: 0"], axis=1)

Kako bi random forest algoritam mogao da razume vremenske serije moramo iz datuma izvuci odredjene podatke, kao sto je to u narednom cell-u

In [4]:
df['datum_dolaska'] = pd.to_datetime(df['datum_dolaska'])
df['datum_odjave'] = pd.to_datetime(df['datum_odjave'])

df['day_of_week'] = df['datum_dolaska'].dt.day_of_week
df['month'] = df['datum_dolaska'].dt.month
df['year'] = df['datum_dolaska'].dt.year
df['week_of_year'] = df['datum_dolaska'].dt.isocalendar().week
df['week_of_month'] = df['datum_dolaska'].dt.day // 7 + 1

In [5]:
from sklearn.model_selection import train_test_split
X = df.drop(columns=["total_occupancy", "datum_dolaska", "datum_odjave"], axis=1)
y = df["total_occupancy"]
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.8, random_state=27)

In [12]:
from sklearn.ensemble import RandomForestRegressor
import numpy as np 
from sklearn.metrics import mean_squared_error,  r2_score, mean_absolute_error

rf_model = RandomForestRegressor(random_state=27, n_jobs=-1, min_samples_split =3, max_samples=800, n_estimators=73)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

mae_rf = mean_absolute_error(y_test, y_pred)
mse_rf = mean_squared_error(y_test, y_pred)
rmse_rf = np.sqrt(mse_rf)
r2_rf = r2_score(y_test, y_pred)

print(f'Mean Absolute Error (RF): {mae_rf}')
print(f'Mean Squared Error (RF): {mse_rf}')
print(f'Root Mean Squared Error (RF): {rmse_rf}')
print(f'R2 (RF): {r2_rf}')

Mean Absolute Error (RF): 25.74575506447041
Mean Squared Error (RF): 1656.4430708414397
Root Mean Squared Error (RF): 40.699423470627195
R2 (RF): 0.9687722750591333


In [None]:
import warnings
from sklearn.model_selection import cross_val_score
warnings.filterwarnings('ignore')


for i in range(200,500):
    for j in range(2000,2001):
        model = RandomForestRegressor(n_estimators = i, max_samples =j,  random_state = 27, n_jobs=-1)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(" n_estimators={}, max_samples={}, r2={}".format(i, j,
                                                                     r2_score(y_test, y_pred)
                                                                    ))

In [8]:
from sklearn.model_selection import RandomizedSearchCV

param_dist = {'n_estimators': list(range(1,100)), 'max_samples': list(range(1,1000)),'min_samples_split':list(range(1,11))}
random_search = RandomizedSearchCV(RandomForestRegressor(random_state=27,n_jobs=-1), param_dist, n_iter=10, cv=5, scoring='neg_mean_absolute_error', random_state=27)
random_search.fit(X_train, y_train)


best_hyperparams = random_search.best_params_

best_mae_score = -random_search.best_score_

best_model = random_search.best_estimator_

y_pred = best_model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Best Hyperparameters: {best_hyperparams}')
print(f'Best Mean Absolute Error: {best_mae_score}')
print(f'Mean Absolute Error on Test Data: {mae}')
print(f'R-squared on Test Data: {r2}')

Best Hyperparameters: {'n_estimators': 73, 'min_samples_split': 3, 'max_samples': 800}
Best Mean Absolute Error: 25.731571263987075
Mean Absolute Error on Test Data: 25.74575506447041
R-squared on Test Data: 0.9687722750591333


In [11]:
X = df.drop(columns=["total_occupancy", "datum_dolaska", "datum_odjave"], axis=1)
y = df["total_occupancy"]

rf_model = RandomForestRegressor(random_state=27, n_estimators= 73, min_samples_split= 3, max_samples= 800, n_jobs=-1)  # Set n_estimators accordingly

cv_scores_mae = cross_val_score(rf_model, X, y, cv=5, scoring='neg_mean_absolute_error')
cv_scores_r2 = cross_val_score(rf_model, X, y, cv=5, scoring='r2')

cv_scores_mae = -cv_scores_mae

print(f'Cross-Validated Mean Absolute Error (RF): {cv_scores_mae.mean()}')
print(f'Cross-Validated R2 (RF): {cv_scores_r2.mean()}')


Cross-Validated Mean Absolute Error (RF): 69.99570126041124
Cross-Validated R2 (RF): 0.6121913922099151
