In [1]:
import os
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, count, round, stddev, min as spark_min, max as spark_max
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import json

In [2]:
class Config:
    TRAIN_PATH = '../data/train.parquet'
    TEST_PATH = '../data/test.parquet'
    LISTINGS_PATH = '../data/listings.parquet'
    MODEL_PATH = '../data/xgboost_model_baseline'

    # Optimized XGBoost Parameters
    N_ESTIMATORS = 200  # Increased for better learning
    MAX_DEPTH = 5  # Reduced to prevent overfitting
    LEARNING_RATE = 0.05  # Reduced for better convergence
    SUBSAMPLE = 0.85
    COL_SAMPLE_BY_TREE = 0.85
    MIN_CHILD_WEIGHT = 3  # Regularization
    GAMMA = 0.1  # Minimum loss reduction
    REG_ALPHA = 0.1  # L1 regularization
    REG_LAMBDA = 1.0  # L2 regularization
    VALIDATION_SIZE = 0.2  # Validation set size for monitoring
    RANDOM_STATE = 42


config = Config()

In [3]:
# Initialize Spark
spark = SparkSession.builder \
    .appName("AirbnbXGBoost_Baseline") \
    .master("local[*]") \
    .config("spark.driver.memory", "4g") \
    .config("spark.sql.execution.arrow.pyspark.enabled", "true") \
    .getOrCreate()

print(f"Spark Session created. Version: {spark.version}")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/11/27 14:57:08 WARN Utils: Your hostname, nnnnnn.local, resolves to a loopback address: 127.0.0.1; using 192.168.70.243 instead (on interface en0)
25/11/27 14:57:08 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/27 14:57:08 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/11/27 14:57:09 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/11/27 14:57:09 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
25/11/27 14:57:09 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


Spark Session created. Version: 4.0.1


In [4]:
print("Loading data...")

if not os.path.exists(config.TRAIN_PATH) or not os.path.exists(config.TEST_PATH):
    raise FileNotFoundError("Train/Test data not found. Please run the previous data prep step first.")

if not os.path.exists(config.LISTINGS_PATH):
    raise FileNotFoundError("Listings data not found. Please run the previous data prep step first.")

# Load Spark DataFrames
train_spark = spark.read.parquet(config.TRAIN_PATH)
test_spark = spark.read.parquet(config.TEST_PATH)
listings_spark = spark.read.parquet(config.LISTINGS_PATH)

# Cache for faster iteration
train_spark.cache()
test_spark.cache()
listings_spark.cache()

print(f"Train count: {train_spark.count():,}")
print(f"Test count:  {test_spark.count():,}")
print(f"Listings count: {listings_spark.count():,}")

Loading data...


                                                                                

Train count: 50,410
Test count:  12,603
Listings count: 12,004


In [5]:
# Enhanced Feature Engineering
print("\nCreating enhanced features for XGBoost...")

# 1. Enhanced User features from training data
print("  - Computing enhanced user statistics...")
user_stats = train_spark.groupBy("user_id").agg(
    avg("rating").alias("user_avg_rating"),
    stddev("rating").alias("user_rating_std"),
    count("item_id").alias("user_review_count"),
    spark_min("rating").alias("user_min_rating"),
    spark_max("rating").alias("user_max_rating")
).withColumnRenamed("user_id", "user_id_stats")

# 2. Enhanced Item features from training data
print("  - Computing enhanced item statistics...")
item_stats = train_spark.groupBy("item_id").agg(
    avg("rating").alias("item_avg_rating"),
    stddev("rating").alias("item_rating_std"),
    count("user_id").alias("item_review_count"),
    spark_min("rating").alias("item_min_rating"),
    spark_max("rating").alias("item_max_rating")
).withColumnRenamed("item_id", "item_id_stats")

# 3. Join enhanced user and item stats to train data
print("  - Joining enhanced features to train data...")
train_with_features = train_spark \
    .join(user_stats, train_spark.user_id == user_stats.user_id_stats, "left") \
    .join(item_stats, train_spark.item_id == item_stats.item_id_stats, "left") \
    .drop("user_id_stats", "item_id_stats")

# 4. Join listings features
print("  - Joining listings metadata...")
train_with_features = train_with_features \
    .join(listings_spark, train_with_features.listing_id == listings_spark.listing_id, "left")

# 5. Apply same feature engineering to test data
print("  - Applying enhanced features to test data...")
test_with_features = test_spark \
    .join(user_stats, test_spark.user_id == user_stats.user_id_stats, "left") \
    .join(item_stats, test_spark.item_id == item_stats.item_id_stats, "left") \
    .drop("user_id_stats", "item_id_stats") \
    .join(listings_spark, test_spark.listing_id == listings_spark.listing_id, "left")

print("✓ Enhanced feature engineering complete")


Creating enhanced features for XGBoost...
  - Computing enhanced user statistics...
  - Computing enhanced item statistics...
  - Joining enhanced features to train data...
  - Joining listings metadata...
  - Applying enhanced features to test data...
✓ Enhanced feature engineering complete


In [6]:
# Prepare data for XGBoost
print("\nPreparing data for XGBoost...")

# Convert to Pandas for XGBoost
print("  - Converting to Pandas DataFrames...")
train_df = train_with_features.toPandas()
test_df = test_with_features.toPandas()

print(f"  - Train shape: {train_df.shape}")
print(f"  - Test shape: {test_df.shape}")

# Base feature columns
feature_cols = [
    'user_id', 'item_id',
    'user_avg_rating', 'user_rating_std', 'user_review_count',
    'user_min_rating', 'user_max_rating',
    'item_avg_rating', 'item_rating_std', 'item_review_count',
    'item_min_rating', 'item_max_rating',
    'price', 'accommodates', 'bedrooms', 'beds',
    'minimum_nights', 'number_of_reviews',
    'review_scores_rating', 'review_scores_location', 'review_scores_value',
    'latitude', 'longitude'
]

# Handle categorical columns
categorical_cols = ['property_type', 'room_type', 'neighbourhood_cleansed']
label_encoders = {}

for col_name in categorical_cols:
    if col_name in train_df.columns:
        le = LabelEncoder()
        # Combine train and test for encoding
        combined = pd.concat([train_df[col_name].fillna('unknown'), 
                              test_df[col_name].fillna('unknown')])
        le.fit(combined)
        train_df[f'{col_name}_encoded'] = le.transform(train_df[col_name].fillna('unknown'))
        test_df[f'{col_name}_encoded'] = le.transform(test_df[col_name].fillna('unknown'))
        label_encoders[col_name] = le
        feature_cols.append(f'{col_name}_encoded')

# Handle boolean columns
bool_cols = ['host_is_superhost', 'instant_bookable']
for col_name in bool_cols:
    if col_name in train_df.columns:
        train_df[col_name] = train_df[col_name].astype(float).fillna(0)
        test_df[col_name] = test_df[col_name].astype(float).fillna(0)
        feature_cols.append(col_name)

# Create derived features
print("  - Creating derived features...")

# Price per person
train_df['price_per_person'] = train_df['price'] / (train_df['accommodates'] + 1e-6)
test_df['price_per_person'] = test_df['price'] / (test_df['accommodates'] + 1e-6)
feature_cols.append('price_per_person')

# Bedroom ratio
train_df['bedroom_ratio'] = train_df['bedrooms'] / (train_df['accommodates'] + 1e-6)
test_df['bedroom_ratio'] = test_df['bedrooms'] / (test_df['accommodates'] + 1e-6)
feature_cols.append('bedroom_ratio')

# Bed ratio
train_df['bed_ratio'] = train_df['beds'] / (train_df['accommodates'] + 1e-6)
test_df['bed_ratio'] = test_df['beds'] / (test_df['accommodates'] + 1e-6)
feature_cols.append('bed_ratio')

# Review score composite
train_df['review_score_composite'] = (
    train_df['review_scores_rating'].fillna(0) * 0.5 +
    train_df['review_scores_location'].fillna(0) * 0.3 +
    train_df['review_scores_value'].fillna(0) * 0.2
)
test_df['review_score_composite'] = (
    test_df['review_scores_rating'].fillna(0) * 0.5 +
    test_df['review_scores_location'].fillna(0) * 0.3 +
    test_df['review_scores_value'].fillna(0) * 0.2
)
feature_cols.append('review_score_composite')

# Interaction features
print("  - Creating interaction features...")
train_df['user_avg_x_item_avg'] = train_df['user_avg_rating'] * train_df['item_avg_rating']
test_df['user_avg_x_item_avg'] = test_df['user_avg_rating'] * test_df['item_avg_rating']
feature_cols.append('user_avg_x_item_avg')

train_df['user_avg_x_price_norm'] = train_df['user_avg_rating'] * (train_df['price'] / 100)
test_df['user_avg_x_price_norm'] = test_df['user_avg_rating'] * (test_df['price'] / 100)
feature_cols.append('user_avg_x_price_norm')

train_df['item_avg_x_review_score'] = train_df['item_avg_rating'] * train_df['review_score_composite']
test_df['item_avg_x_review_score'] = test_df['item_avg_rating'] * test_df['review_score_composite']
feature_cols.append('item_avg_x_review_score')

# Select only available features
available_features = [f for f in feature_cols if f in train_df.columns]
print(f"  - Using {len(available_features)} features")

# Better missing value handling using median
print("  - Handling missing values with median imputation...")
numeric_features = [f for f in available_features if f not in ['user_id', 'item_id']]
for col in numeric_features:
    if col in train_df.columns:
        median_val = train_df[col].median()
        if pd.notna(median_val):
            train_df[col] = train_df[col].fillna(median_val)
            test_df[col] = test_df[col].fillna(median_val)
        else:
            train_df[col] = train_df[col].fillna(0)
            test_df[col] = test_df[col].fillna(0)

# Prepare X and y
X_train = train_df[available_features]
y_train = train_df['rating'].values
X_test = test_df[available_features]
y_test = test_df['rating'].values

print("✓ Data prepared for XGBoost")


Preparing data for XGBoost...
  - Converting to Pandas DataFrames...
  - Train shape: (50410, 35)
  - Test shape: (12603, 35)
  - Creating derived features...
  - Creating interaction features...
  - Using 35 features
  - Handling missing values with median imputation...
✓ Data prepared for XGBoost


In [7]:
# Train XGBoost Model
print("\nTraining XGBoost model...")

# Create validation set for monitoring
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train, y_train, 
    test_size=config.VALIDATION_SIZE, 
    random_state=config.RANDOM_STATE
)

print(f"  - Training set: {X_train_split.shape[0]:,} samples")
print(f"  - Validation set: {X_val_split.shape[0]:,} samples")

# XGBoost 3.x sklearn API
model = xgb.XGBRegressor(
    n_estimators=config.N_ESTIMATORS,
    max_depth=config.MAX_DEPTH,
    learning_rate=config.LEARNING_RATE,
    subsample=config.SUBSAMPLE,
    colsample_bytree=config.COL_SAMPLE_BY_TREE,
    min_child_weight=config.MIN_CHILD_WEIGHT,
    gamma=config.GAMMA,
    reg_alpha=config.REG_ALPHA,
    reg_lambda=config.REG_LAMBDA,
    random_state=config.RANDOM_STATE,
    objective='reg:squarederror',
    n_jobs=-1
)

# Train model
model.fit(X_train_split, y_train_split)

# Evaluate on validation set
val_pred = model.predict(X_val_split)
val_rmse = np.sqrt(mean_squared_error(y_val_split, val_pred))
print(f"  - Validation RMSE: {val_rmse:.4f}")

print(f"✓ Model trained successfully ({config.N_ESTIMATORS} iterations)")


Training XGBoost model...
  - Training set: 40,328 samples
  - Validation set: 10,082 samples
  - Validation RMSE: 0.4144
✓ Model trained successfully (200 iterations)


In [8]:
# Generate Predictions
print("\nGenerating predictions on test set...")
y_pred = model.predict(X_test)

# Clip predictions to valid rating range [1, 5]
y_pred = np.clip(y_pred, 1.0, 5.0)

# Create predictions DataFrame for display
predictions_df = pd.DataFrame({
    'user_id': test_df['user_id'].values,
    'item_id': test_df['item_id'].values,
    'rating': y_test,
    'prediction': y_pred
})

print("Sample Predictions:")
print(predictions_df.head(10).to_string(index=False))


Generating predictions on test set...
Sample Predictions:
 user_id  item_id  rating  prediction
   13497       22    4.49    4.141732
    7614     6588    4.62    4.353892
      29     2317    3.68    4.216444
   13131     1166    1.36    3.310230
   14631        9    4.40    2.964554
    4731     5110    3.90    3.928891
     825     5346    3.29    3.747145
   14301      618    4.39    2.588744
    2732     4448    3.05    3.779743
    3539      840    3.65    3.394812


In [9]:
# Calculate RMSE
print("\nCalculating RMSE...")

rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("------------------------------------------------")
print(f"Root Mean Square Error (RMSE): {rmse:.4f}")
print("------------------------------------------------")

# Contextual Interpretation
print(f"\nInterpretation:")
print(f"On average, the model's prediction is off by {rmse:.2f} stars.")
if rmse < 1.0:
    print(f"✓ Excellent! RMSE is below 1.0, which is considered good for a 5-star scale.")
else:
    print(f"For a 5-star scale, an RMSE below 1.0 is generally considered acceptable for a baseline.")

# Feature Importance
print("\nTop 15 Most Important Features:")
feature_importance = pd.DataFrame({
    'feature': available_features,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

print(feature_importance.head(15).to_string(index=False))

# Calculate additional metrics
print("\nAdditional Metrics:")
mae = np.mean(np.abs(y_test - y_pred))
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Rating: {np.mean(y_test):.4f}")
print(f"Std Rating: {np.std(y_test):.4f}")



Calculating RMSE...
------------------------------------------------
Root Mean Square Error (RMSE): 0.9051
------------------------------------------------

Interpretation:
On average, the model's prediction is off by 0.91 stars.
✓ Excellent! RMSE is below 1.0, which is considered good for a 5-star scale.

Top 15 Most Important Features:
                feature  importance
    user_avg_x_item_avg    0.722688
        user_avg_rating    0.096293
        user_max_rating    0.025276
        item_max_rating    0.024909
        item_avg_rating    0.019960
        user_min_rating    0.019134
        item_min_rating    0.009989
item_avg_x_review_score    0.009695
        user_rating_std    0.007250
        item_rating_std    0.005606
                item_id    0.005529
      item_review_count    0.005109
      host_is_superhost    0.004259
         minimum_nights    0.003212
      room_type_encoded    0.003024

Additional Metrics:
Mean Absolute Error (MAE): 0.6854
Mean Rating: 2.9995
Std Rati

In [10]:
# Save the model for future use
print(f"\nSaving model to {config.MODEL_PATH}...")
os.makedirs(config.MODEL_PATH, exist_ok=True)
model.save_model(f"{config.MODEL_PATH}/xgboost_model.json")

# Also save feature names and model info
model_info = {
    'feature_names': available_features,
    'categorical_columns': list(label_encoders.keys()),
    'rmse': float(rmse),
    'mae': float(mae),
    'n_estimators_used': config.N_ESTIMATORS,
    'n_features': len(available_features),
    'hyperparameters': {
        'n_estimators': config.N_ESTIMATORS,
        'max_depth': config.MAX_DEPTH,
        'learning_rate': config.LEARNING_RATE,
        'subsample': config.SUBSAMPLE,
        'colsample_bytree': config.COL_SAMPLE_BY_TREE,
        'min_child_weight': config.MIN_CHILD_WEIGHT,
        'gamma': config.GAMMA,
        'reg_alpha': config.REG_ALPHA,
        'reg_lambda': config.REG_LAMBDA
    }
}

with open(f"{config.MODEL_PATH}/model_info.json", 'w') as f:
    json.dump(model_info, f, indent=2)

# Save feature importance
feature_importance.to_csv(f"{config.MODEL_PATH}/feature_importance.csv", index=False)

print("✓ Model saved successfully")
print(f"  - Model: {config.MODEL_PATH}/xgboost_model.json")
print(f"  - Info: {config.MODEL_PATH}/model_info.json")
print(f"  - Feature importance: {config.MODEL_PATH}/feature_importance.csv")



Saving model to ../data/xgboost_model_baseline...
✓ Model saved successfully
  - Model: ../data/xgboost_model_baseline/xgboost_model.json
  - Info: ../data/xgboost_model_baseline/model_info.json
  - Feature importance: ../data/xgboost_model_baseline/feature_importance.csv
