In [2]:
import pandas as pd 
df = pd.read_csv("prep_data/hotelid0_daily_merged.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46084 entries, 0 to 46083
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Unnamed: 0           46084 non-null  int64  
 1   datum_dolaska        46084 non-null  object 
 2   datum_odjave         46084 non-null  object 
 3   kanal_prodaje_id     46084 non-null  int64  
 4   tip_sobe_id          46084 non-null  int64  
 5   cijena_nocenja       46084 non-null  float64
 6   rezervacija_id       46084 non-null  int64  
 7   gost_id              46084 non-null  int64  
 8   zemlja_gosta_mapped  46084 non-null  int64  
 9   total_occupancy      46084 non-null  float64
dtypes: float64(2), int64(6), object(2)
memory usage: 3.5+ MB


In [4]:
df = df.drop(columns=["Unnamed: 0"], axis=1)

Kako bi random forest algoritam mogao da razume vremenske serije moramo iz datuma izvuci odredjene podatke, kao sto je to u narednom cell-u

In [5]:
df['datum_dolaska'] = pd.to_datetime(df['datum_dolaska'])
df['datum_odjave'] = pd.to_datetime(df['datum_odjave'])

df['day_of_week'] = df['datum_dolaska'].dt.day_of_week
df['month'] = df['datum_dolaska'].dt.month
df['year'] = df['datum_dolaska'].dt.year
df['week_of_year'] = df['datum_dolaska'].dt.isocalendar().week
df['week_of_month'] = df['datum_dolaska'].dt.day // 7 + 1

In [7]:
df.tail()

Unnamed: 0,datum_dolaska,datum_odjave,kanal_prodaje_id,tip_sobe_id,cijena_nocenja,rezervacija_id,gost_id,zemlja_gosta_mapped,total_occupancy,day_of_week,month,year,week_of_year,week_of_month
46079,2017-08-30,2017-09-06,2,1,83.444595,1432428,1063376,13,359.0,2,8,2017,35,5
46080,2017-08-31,2017-09-07,2,3,201.924145,1432429,1088912,5,358.0,3,8,2017,35,5
46081,2017-08-31,2017-09-07,2,2,148.455148,1432430,1101990,12,358.0,3,8,2017,35,5
46082,2017-08-31,2017-09-07,2,1,101.498939,1432431,1053323,3,358.0,3,8,2017,35,5
46083,2017-10-29,2017-11-07,2,1,193.709372,1432432,1013647,13,360.0,6,10,2017,43,5


In [8]:
from sklearn.model_selection import train_test_split
X = df.drop(columns=["total_occupancy", "datum_dolaska", "datum_odjave"], axis=1)
y = df["total_occupancy"]
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.8, random_state=27)

In [14]:
from sklearn.ensemble import RandomForestRegressor
import numpy as np 
from sklearn.metrics import mean_squared_error,  r2_score, mean_absolute_error

rf_model = RandomForestRegressor(random_state=27, n_jobs=-1, min_samples_split =3, max_samples=800, n_estimators=73)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

mae_rf = mean_absolute_error(y_test, y_pred)
mse_rf = mean_squared_error(y_test, y_pred)
rmse_rf = np.sqrt(mse_rf)
r2_rf = r2_score(y_test, y_pred)

print(f'Mean Absolute Error (RF): {mae_rf}')
print(f'Mean Squared Error (RF): {mse_rf}')
print(f'Root Mean Squared Error (RF): {rmse_rf}')
print(f'R2 (RF): {r2_rf}')

Mean Absolute Error (RF): 35.037949932220315
Mean Squared Error (RF): 2564.5812868533008
Root Mean Squared Error (RF): 50.64169514198059
R2 (RF): 0.9587561367829432


In [None]:
import warnings
from sklearn.model_selection import cross_val_score
warnings.filterwarnings('ignore')


for i in range(200,500):
    for j in range(2000,2001):
        model = RandomForestRegressor(n_estimators = i, max_samples =j,  random_state = 27, n_jobs=-1)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(" n_estimators={}, max_samples={}, r2={}".format(i, j,
                                                                     r2_score(y_test, y_pred)
                                                                    ))

In [8]:
from sklearn.model_selection import RandomizedSearchCV

param_dist = {'n_estimators': list(range(1,100)), 'max_samples': list(range(1,1000)),'min_samples_split':list(range(1,11))}
random_search = RandomizedSearchCV(RandomForestRegressor(random_state=27,n_jobs=-1), param_dist, n_iter=10, cv=5, scoring='neg_mean_absolute_error', random_state=27)
random_search.fit(X_train, y_train)


best_hyperparams = random_search.best_params_

best_mae_score = -random_search.best_score_

best_model = random_search.best_estimator_

y_pred = best_model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Best Hyperparameters: {best_hyperparams}')
print(f'Best Mean Absolute Error: {best_mae_score}')
print(f'Mean Absolute Error on Test Data: {mae}')
print(f'R-squared on Test Data: {r2}')

Best Hyperparameters: {'n_estimators': 73, 'min_samples_split': 3, 'max_samples': 800}
Best Mean Absolute Error: 25.731571263987075
Mean Absolute Error on Test Data: 25.74575506447041
R-squared on Test Data: 0.9687722750591333


In [11]:
X = df.drop(columns=["total_occupancy", "datum_dolaska", "datum_odjave"], axis=1)
y = df["total_occupancy"]

rf_model = RandomForestRegressor(random_state=27, n_estimators= 73, min_samples_split= 3, max_samples= 800, n_jobs=-1)

cv_scores_mae = cross_val_score(rf_model, X, y, cv=5, scoring='neg_mean_absolute_error')
cv_scores_r2 = cross_val_score(rf_model, X, y, cv=5, scoring='r2')

cv_scores_mae = -cv_scores_mae

print(f'Cross-Validated Mean Absolute Error (RF): {cv_scores_mae.mean()}')
print(f'Cross-Validated R2 (RF): {cv_scores_r2.mean()}')


Cross-Validated Mean Absolute Error (RF): 69.99570126041124
Cross-Validated R2 (RF): 0.6121913922099151


In [28]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

tscv = TimeSeriesSplit(n_splits=5)

model = RandomForestRegressor(random_state=27, n_jobs=-1, min_samples_split =3, max_samples=800, n_estimators=73)

rmse_scores = []
r2_scores = []

for train_index, test_index in tscv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    r2_scores.append(r2)
    rmse_scores.append(rmse)

print("Mean RMSE:", np.mean(rmse_scores))
print("Mean R2:", np.mean(r2_scores))

Mean RMSE: 83.58908360440043
Mean R2: 0.6636792280186012


Sad testiramo 

In [24]:
df1 = pd.read_csv("prep_data/hotelid1_daily_merged.csv")
df1 = df1.drop(columns=["Unnamed: 0"], axis=1)
df1.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46084 entries, 0 to 46083
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   datum_dolaska        46084 non-null  object 
 1   datum_odjave         46084 non-null  object 
 2   kanal_prodaje_id     46084 non-null  int64  
 3   tip_sobe_id          46084 non-null  int64  
 4   cijena_nocenja       46084 non-null  float64
 5   rezervacija_id       46084 non-null  int64  
 6   gost_id              46084 non-null  int64  
 7   zemlja_gosta_mapped  46084 non-null  int64  
 8   total_occupancy      46084 non-null  float64
dtypes: float64(2), int64(5), object(2)
memory usage: 3.2+ MB


In [25]:
df1['datum_dolaska'] = pd.to_datetime(df1['datum_dolaska'])
df1['datum_odjave'] = pd.to_datetime(df1['datum_odjave'])

df1['day_of_week'] = df1['datum_dolaska'].dt.day_of_week
df1['month'] = df1['datum_dolaska'].dt.month
df1['year'] = df1['datum_dolaska'].dt.year
df1['week_of_year'] = df1['datum_dolaska'].dt.isocalendar().week
df1['week_of_month'] = df1['datum_dolaska'].dt.day // 7 + 1

In [26]:
X = df.drop(columns=["total_occupancy", "datum_dolaska", "datum_odjave"], axis=1)
y = df["total_occupancy"]

In [27]:
y_pred = rf_model.predict(X)

mae_rf = mean_absolute_error(y, y_pred)
mse_rf = mean_squared_error(y, y_pred)
rmse_rf = np.sqrt(mse_rf)
r2_rf = r2_score(y, y_pred)

print(f'Mean Absolute Error (RF): {mae_rf}')
print(f'Mean Squared Error (RF): {mse_rf}')
print(f'Root Mean Squared Error (RF): {rmse_rf}')
print(f'R2 (RF): {r2_rf}')

Mean Absolute Error (RF): 62.988344269289485
Mean Squared Error (RF): 12802.941705360901
Root Mean Squared Error (RF): 113.15008486678612
R2 (RF): 0.7590229409389635


Ovo zapravo nije uopste lose, brinuo sam se hoce li model overfittovati jer ako je r2 vrednost kao u prvom slucaju preko 90% to je mozda znak da model zbog svoje kompleksnosti moze bukvalno da upamti slucajeve, ali izgleda da je okej i nad podacima koje nikako nije mogao videti, a to su ovi za drugi dataset, sto znaci da smo solidni wooohoo.