In [None]:
import os
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession
from pyspark.sql.functions import round

In [None]:
class Config:
    """Paths and ALS hyperparameters shared by the cells of this baseline notebook."""

    # Parquet inputs and the directory the fitted model is written to.
    TRAIN_PATH = '../data/train.parquet'
    TEST_PATH = '../data/test.parquet'
    MODEL_PATH = '../data/als_model_baseline'

    # --- Baseline ALS parameters ---
    # Number of latent factors.
    RANK = 20
    # Maximum number of ALS iterations.
    MAX_ITER = 10
    # Regularization parameter.
    REG_PARAM = 0.1
    # Cold-start strategy: 'drop' removes rows with NaN predictions during eval.
    COLD_START = 'drop'


# Single shared instance read by the cells below.
config = Config()

In [None]:
# Create (or reuse) the Spark session for this notebook: local master using
# all available cores, with 4g requested for the driver.
spark = (
    SparkSession.builder
    .appName("AirbnbALS_Baseline")
    .master("local[*]")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

print(f"Spark Session created. Version: {spark.version}")

In [None]:
print("Loading data...")

# Fail fast with a pointer to the data-prep step if either split is missing.
required_paths = (config.TRAIN_PATH, config.TEST_PATH)
if not all(os.path.exists(path) for path in required_paths):
    raise FileNotFoundError("Train/Test data not found. Please run the previous data prep step first.")

# Read both splits and mark them for caching so later cells can reuse them
# without re-reading the parquet files.
train_data = spark.read.parquet(config.TRAIN_PATH).cache()
test_data = spark.read.parquet(config.TEST_PATH).cache()

# Report the size of each split.
n_train = train_data.count()
print(f"Train count: {n_train:,}")
n_test = test_data.count()
print(f"Test count:  {n_test:,}")

In [None]:
# Build and fit the baseline ALS recommender on the training split.
print("\nInitializing ALS model...")

# Fixed seed so the factor initialisation -- and therefore the metrics reported
# further down -- is repeatable across kernel restarts. Without an explicit
# seed, ALS falls back to a default that is not pinned by this notebook.
ALS_SEED = 42

als = ALS(
    userCol="user_id",
    itemCol="item_id",
    ratingCol="rating",
    rank=config.RANK,
    maxIter=config.MAX_ITER,
    regParam=config.REG_PARAM,
    # Critical for evaluation: with 'drop', rows whose prediction is NaN are
    # removed instead of poisoning the metric.
    coldStartStrategy=config.COLD_START,
    # Ratings are 1-5, so constrain the learned factors to be non-negative.
    nonnegative=True,
    seed=ALS_SEED,
)

# Train the model
print("Training model (this may take a moment)...")
model = als.fit(train_data)

# The original message contained a mis-decoded check mark ("âœ“").
print("✓ Model trained successfully")

In [None]:
# Score the held-out split with the fitted model.
print("\nGenerating predictions on test set...")
predictions = model.transform(test_data)

# Preview a few rows. `round` here is pyspark.sql.functions.round (imported at
# the top of the notebook), applied to the column for display only --
# `predictions` itself keeps the unrounded values.
print("Sample Predictions:")
rounded_prediction = round("prediction", 2).alias("prediction")
predictions.select("user_id", "item_id", "rating", rounded_prediction).show(10)

In [None]:
# Evaluate the predictions against the true ratings with RMSE.
# Note: the model is configured with the cold-start strategy from Config
# ('drop'), so the metric only covers the rows that survived model.transform.
print("Calculating RMSE...")

evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction"
)

rmse = evaluator.evaluate(predictions)

print("------------------------------------------------")
print(f"Root Mean Square Error (RMSE): {rmse:.4f}")
print("------------------------------------------------")

# Contextual Interpretation
# RMSE is the square root of the mean squared error, not the mean absolute
# error, so it must not be described as the "average" miss; it weights large
# errors more heavily.
print("\nInterpretation:")
print(f"The model's predictions typically deviate from the true rating by about {rmse:.2f} stars")
print("(RMSE weights large errors more heavily than a plain average error would).")
print("For a 5-star scale, an RMSE below 1.0 is generally considered acceptable for a baseline.")

In [None]:
# Peek at the first few learned user latent-factor rows, untruncated.
print("\nLearned User Factors (First 5):")
model.userFactors.show(n=5, truncate=False)

# Persist the fitted model for future use, replacing any previous save at
# the same location.
print(f"Saving model to {config.MODEL_PATH}...")
model_writer = model.write().overwrite()
model_writer.save(config.MODEL_PATH)