In [1]:
import os
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession
from pyspark.sql.functions import round

In [2]:
class Config:
    TRAIN_PATH = '../data/train.parquet'
    TEST_PATH = '../data/test.parquet'
    MODEL_PATH = '../data/als_model_baseline'

    # Baseline ALS Parameters
    RANK = 20  # Number of latent factors
    MAX_ITER = 10  # Maximum iterations
    REG_PARAM = 0.1  # Regularization parameter
    COLD_START = 'drop'  # Drop rows with NaN predictions during eval


config = Config()

In [3]:
# Initialize Spark
spark = SparkSession.builder \
    .appName("AirbnbALS_Baseline") \
    .master("local[*]") \
    .config("spark.driver.memory", "4g") \
    .getOrCreate()

print(f"Spark Session created. Version: {spark.version}")

Spark Session created. Version: 4.0.1


In [4]:
print("Loading data...")

if not os.path.exists(config.TRAIN_PATH) or not os.path.exists(config.TEST_PATH):
    raise FileNotFoundError("Train/Test data not found. Please run the previous data prep step first.")

train_data = spark.read.parquet(config.TRAIN_PATH)
test_data = spark.read.parquet(config.TEST_PATH)

# Cache data for faster iteration
train_data.cache()
test_data.cache()

print(f"Train count: {train_data.count():,}")
print(f"Test count:  {test_data.count():,}")

Loading data...
Train count: 50,410
Test count:  12,603


In [5]:
# Implement ALS Model
print("\nInitializing ALS model...")

als = ALS(
    userCol="user_id",
    itemCol="item_id",
    ratingCol="rating",
    rank=config.RANK,
    maxIter=config.MAX_ITER,
    regParam=config.REG_PARAM,
    coldStartStrategy=config.COLD_START,  # Critical for evaluation
    nonnegative=True  # Ratings are 1-5, so forces positive factors
)

# Train the model
print("Training model (this may take a moment)...")
model = als.fit(train_data)

print("✓ Model trained successfully")


Initializing ALS model...
Training model (this may take a moment)...
✓ Model trained successfully


In [6]:
# Generate Predictions
print("\nGenerating predictions on test set...")

# Transform test data to get predictions
predictions = model.transform(test_data)

# Show sample predictions
print("Sample Predictions:")
predictions.select(
    "user_id",
    "item_id",
    "rating",
    round("prediction", 2).alias("prediction")
).show(10)


Generating predictions on test set...
Sample Predictions:
+-------+-------+---------+----------+
|user_id|item_id|   rating|prediction|
+-------+-------+---------+----------+
|   3918|    204| 3.348124|      3.22|
|  13289|    652|4.6463885|      4.37|
|   1025|    410| 3.348124|      3.15|
|   1025|     68|2.2889438|      2.23|
|  15967|   5798| 3.222622|      2.88|
|    540|   4841|3.8549635|      3.72|
|  13009|   3638|2.4328773|      2.11|
|   2721|   2819| 3.348124|      3.15|
|   9946|     34|3.1089873|      3.01|
|   7530|    907| 4.007063|      3.75|
+-------+-------+---------+----------+
only showing top 10 rows


In [7]:
print("Calculating RMSE...")

evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction"
)

rmse = evaluator.evaluate(predictions)

print("------------------------------------------------")
print(f"Root Mean Square Error (RMSE): {rmse:.4f}")
print("------------------------------------------------")

# Contextual Interpretation
print(f"\nInterpretation:")
print(f"On average, the model's prediction is off by {rmse:.2f} stars.")
print(f"For a 5-star scale, an RMSE below 1.0 is generally considered acceptable for a baseline.")

Calculating RMSE...
------------------------------------------------
Root Mean Square Error (RMSE): 0.4616
------------------------------------------------

Interpretation:
On average, the model's prediction is off by 0.46 stars.
For a 5-star scale, an RMSE below 1.0 is generally considered acceptable for a baseline.


In [8]:
print("\nLearned User Factors (First 5):")
model.userFactors.show(5, truncate=False)

# Save the model for future use
print(f"Saving model to {config.MODEL_PATH}...")
model.write().overwrite().save(config.MODEL_PATH)


Learned User Factors (First 5):
+---+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|id |features                                                                                                                                                                                                                                     |
+---+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|0  |[0.34221596, 0.2795293, 0.28437147, 0.29964733, 0.31039083, 0.32725284, 0.30090427, 0.28663862, 0.35389683, 0.27607363, 0.3090803, 0.3133531, 0.28163907, 0.30895388, 0.29674083, 0.28178635, 0.29127085, 0.31363103, 0.32438582, 0.31