# Advanced Modeling
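This notebook trains and evaluates two tree-ensemble regressors (random forest and gradient boosting) that predict a destination's average occupancy from its average rate and average star rating, using destination-level aggregates of the cleaned rates, accommodation, and market datasets.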

In [None]:
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

In [None]:
# Directory holding the cleaned CSV exports, relative to the working directory.
clean_dir = Path('Data/clean')

# Accommodation facts are read as-is; no date columns are parsed for this file.
acc = pd.read_csv(clean_dir.joinpath('accommodation_facts_clean.csv'))

# Market and rate files have their date columns parsed to datetimes on load.
market = pd.read_csv(
    clean_dir.joinpath('market_otb_clean.csv'),
    parse_dates=['stay_date'],
)
rates = pd.read_csv(
    clean_dir.joinpath('rates_combined.csv'),
    parse_dates=['stay_date', 'extract_date'],
)

In [None]:
# Attach each rate record's destination and star rating. The left join keeps
# every rate row, including those with no matching accommodation.
rates_dest = pd.merge(
    rates,
    acc.loc[:, ['bookingdotcom_id', 'travel_destination_name', 'stars']],
    how='left',
    on='bookingdotcom_id',
)

# One row per destination: mean price and mean star rating over its rate rows.
agg_rates = rates_dest.groupby('travel_destination_name').agg(
    avg_rate=pd.NamedAgg(column='price_value', aggfunc='mean'),
    stars=pd.NamedAgg(column='stars', aggfunc='mean'),
)

# One row per destination: mean occupancy over the market records.
agg_occ = market.groupby('travel_destination_name').agg(
    avg_occ=pd.NamedAgg(column='average_occupancy', aggfunc='mean'),
)

# Align the two aggregates on the destination index and drop any destination
# that is missing a feature or the target.
data = agg_rates.join(agg_occ, how='left').dropna(axis=0, how='any')

# Feature matrix and regression target.
X = data.loc[:, ['avg_rate', 'stars']]
y = data.loc[:, 'avg_occ']

In [None]:
# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Candidate ensemble regressors; both are seeded so results are repeatable.
gb = GradientBoostingRegressor(random_state=42)
rf = RandomForestRegressor(random_state=42, n_estimators=200)

# 5-fold cross-validation on the training split only. The scorer returns the
# negated RMSE (so that higher is better), hence the sign flip when printing.
for model in (rf, gb):
    cv_scores = cross_val_score(
        model,
        X_train,
        y_train,
        scoring='neg_root_mean_squared_error',
        cv=5,
    )
    print(type(model).__name__, 'CV RMSE:', -cv_scores.mean())

In [None]:
# Fit both ensembles on the full training split, then score them on the
# held-out test split.
rf.fit(X_train, y_train)
gb.fit(X_train, y_train)

pred_rf = rf.predict(X_test)
pred_gb = gb.predict(X_test)

# RMSE is computed explicitly as sqrt(MSE). The `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so passing it raises a TypeError on current releases; taking the square
# root ourselves works on every version.
rmse_rf = mean_squared_error(y_test, pred_rf) ** 0.5
rmse_gb = mean_squared_error(y_test, pred_gb) ** 0.5

print('RF RMSE:', rmse_rf)
print('GB RMSE:', rmse_gb)

# Importances come from the random forest only, paired with the feature names
# in column order.
print('Feature importances:', dict(zip(X.columns, rf.feature_importances_)))