In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import joblib
import os

# --- Load All Pre-computed Assets ---
print('Loading pre-computed features ...')
# GCS bucket name plus object prefix (no scheme) where the processed artifacts live.
folderGCS = 'wanderlust-recommender-system/processed/'
# Local directory the feature files are copied into.
localDir = 'Features'
# exist_ok=True creates the directory only when it is missing and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(localDir, exist_ok=True)

# Map each local file name to the GCS object it is copied from.
# NOTE: the local 'hotelEmbeddings.npy' is sourced from the object 'newEmbedding.npy'.
featuresDict = {
    'userFactor.npy': f'gs://{folderGCS}userFactor.npy',
    'hotelFactor.npy': f'gs://{folderGCS}hotelFactor.npy',
    'hotelEmbeddings.npy': f'gs://{folderGCS}newEmbedding.npy',
}


# Download all feature files from GCS
# A `!gsutil ...` shell escape never raises on failure, so the except branch
# could not fire; subprocess.run(check=True) turns a non-zero exit status into
# CalledProcessError so failed copies are actually reported.
import subprocess

try:
    print('Downloading data from GCS bucket ...')
    for fileName, address in featuresDict.items():
        subprocess.run(
            ['gsutil', 'cp', address, os.path.join(localDir, fileName)],
            check=True,
            capture_output=True,
            text=True,
        )
    print('Downloaded.')
except subprocess.CalledProcessError as e:
    # Include gsutil's own stderr; the exception text only carries the exit code.
    print(f'Files cannot be downloaded: {e}: {(e.stderr or "").strip()}')
except OSError as e:
    # Raised when the gsutil executable itself cannot be started.
    print(f'Files cannot be downloaded: {e}')

# Load the features and cleaned data into memory from their local paths
try:
    print('Loadding data and features ...')
    # The review table is read straight from GCS; the arrays come from localDir.
    df = pd.read_parquet(f'gs://{folderGCS}combined_hotel_reviews.parquet')
    userFactors = np.load(os.path.join(localDir, 'userFactor.npy'))
    hotelFactors = np.load(os.path.join(localDir, 'hotelFactor.npy'))
    # SECURITY NOTE: allow_pickle=True unpickles the file's contents, which can
    # execute arbitrary code. Only load this file from a trusted source.
    # .item() unwraps the 0-d object array into the Python object stored in it.
    ContentBasedFeatures = np.load(
        os.path.join(localDir, 'hotelEmbeddings.npy'), allow_pickle=True
    ).item()
    print('Loaded.')
except Exception as e:
    print(f'problem with loading: {e}')
    # Every later step needs these objects; re-raise so the run stops here
    # instead of failing further down with an unrelated NameError.
    raise


# --- Create the Final Training Dataset ---
print('Creating training dataset ...')
# One id per review row, in DataFrame row order.
userIds = df['user_id'].to_numpy()
hotelIds = df['hotel_id'].to_numpy()

# For each rating, get the corresponding user and hotel factors.
# Integer-array indexing gathers all rows in one NumPy operation instead of a
# Python-level loop; row i of the result is the factor table's row ids[i].
# Non-integer ids raise IndexError here rather than being silently coerced.
userFeatures = userFactors[userIds]
hotelFeatures = hotelFactors[hotelIds]

# For each rating, get the corresponding hotel content embedding.
# ContentBasedFeatures is looked up by key, so this stays a per-row lookup.
hotelContent = [ContentBasedFeatures[hotel_id] for hotel_id in df['hotel_id']]


# Combine all features column-wise into a single feature matrix `X`
# (one row per review) and define the target variable.
X = np.hstack([userFeatures, hotelFeatures, hotelContent])
y = df['reviews.rating']
print('Training dataset created.')

# --- Split Data and Train XGBoost Model ---
print('Split training data to 70-10-20 ...')
# First carve out the 20% test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Validation must be 10% of the full data, i.e. 0.125 of the remaining 80%
# (0.1 here would give 72/8/20). random_state keeps this split reproducible
# like the first one.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.125, random_state=42
)
print('Splitted.')

rankerModel = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=300,
    max_depth=10,
    learning_rate=0.1,
    random_state=42,
)
print('Training started ...')
# The validation split is passed as eval_set for progress monitoring only; no
# early stopping is configured, so all 300 trees are built.
# verbose=10 sets how often the evaluation metric is logged.
rankerModel.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    verbose=10,
)
print('Training finished successfully.')

# --- Evaluate the Model ---
# Score the held-out test split, which was not passed to fit().
print('Evaluating the trained model ...')
y_pred = rankerModel.predict(X_test)
# Mean squared error between the true ratings and the predictions (lower is better).
metric = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f'Evaluation completed. Resulting MSE is: {metric:.4f}')

# --- Save the Trained Ranker Model ---
import subprocess

print('Saving the trained model on GCS ...')
modelName = 'xgbScorer.joblib'
# Serialize the fitted model to a local file first; gsutil copies from disk.
joblib.dump(rankerModel, modelName)
# check=True raises CalledProcessError if the upload fails. The `!gsutil`
# shell escape ignored failures, so the local copy was deleted and success
# was printed even when nothing had been uploaded.
subprocess.run(
    ['gsutil', 'cp', modelName, f'gs://{folderGCS}{modelName}'],
    check=True,
)
# Reached only after a successful upload, so a failed upload keeps the local
# model file for a retry.
os.remove(modelName)
print('Saved successfully.')