# Popularity Prediction for UCSD Courses

In [1]:
from popularity import myRatingModel
from baseline import global_average_mse
import pandas as pd
import random
from collections import defaultdict

## helper functions

In [2]:
def clip(x, min_val, max_val):
    """Clamp ``x`` to the closed interval ``[min_val, max_val]``.

    The upper bound is applied first and the lower bound second, so when the
    bounds are inverted (``min_val > max_val``) the result is ``min_val``.
    """
    # Cap from above: keep x unless max_val is strictly smaller.
    capped = max_val if max_val < x else x
    # Floor from below: keep the capped value only if it exceeds min_val.
    return capped if capped > min_val else min_val

## read in data

In [3]:
# Location of the input CSV, relative to this notebook.
data_path = "../data/CAPEs_with_features.csv"

# Load the raw table.
df = pd.read_csv(data_path)

# Keep only the columns used below and drop rows missing any of them.
# target: rcmd_class (percentage 0-100)
df = df[['instructor', 'sub_course', 'rcmd_class']].dropna()


def _department_of(sub_course):
    """Return the first whitespace-separated token of ``sub_course``.

    Non-string values and strings with no tokens (empty or whitespace-only)
    map to 'UNKNOWN', so a single malformed row cannot abort the load with
    an IndexError.
    """
    tokens = sub_course.split() if isinstance(sub_course, str) else []
    return tokens[0] if tokens else 'UNKNOWN'


# Extract department
df['department'] = df['sub_course'].apply(_department_of)

# Create item -> department mapping
itemToDept = df.set_index('sub_course')['department'].to_dict()

# Create allRatings list: (user, item, rating).
# itertuples(name=None) yields plain tuples in row order without building a
# Series per row the way iterrows() does.
allRatings = list(
    df[['instructor', 'sub_course', 'rcmd_class']].itertuples(index=False, name=None)
)

# Split into train/validation; the fixed seed makes the shuffle reproducible.
validationSplit = 0.1
random.seed(42)
random.shuffle(allRatings)
split_idx = int(len(allRatings) * (1 - validationSplit))
ratingsTrain = allRatings[:split_idx]
ratingsValid = allRatings[split_idx:]

# Build per-user and per-item dictionaries from the training split only.
ratingsPerUser = defaultdict(list)
ratingsPerItem = defaultdict(list)

for user, item, rating in ratingsTrain:
    ratingsPerUser[user].append((item, rating))
    ratingsPerItem[item].append((user, rating))

print(f"Loaded {len(allRatings)} ratings.")
print(f"Training: {len(ratingsTrain)}, Validation: {len(ratingsValid)}")
print(f"Unique users: {len(ratingsPerUser)}, Unique items: {len(ratingsPerItem)}")

Loaded 51764 ratings.
Training: 46587, Validation: 5177
Unique users: 4977, Unique items: 3243


## baseline runner

In [4]:
"""
Baseline: score the global-average rating predictor.
"""

# global_average_mse returns the validation MSE first and the average it
# predicted second; both are reused by the comparison at the end of the notebook.
baseline_MSE, global_avg = global_average_mse(
    ratingsTrain,
    ratingsValid,
)

print(f"global average prediction: {global_avg:.4f}")
print(f"baseline mse on validation set: {baseline_MSE:.4f}")

global average prediction: 89.4381
baseline mse on validation set: 145.0333


## pipeline for popularity prediction (bias-only model)

### hyperparameter grid search

In [6]:
# Candidate values for the two lambda hyperparameters passed to myRatingModel.
lambdaU_grid = [0.5, 1.0, 2.0, 3.5, 5.0]
lambdaI_grid = [4, 8, 12, 16, 20]

search_results = []
best_result = {"lambdaU": None, "lambdaI": None, "mse": float("inf")}

# Every (lambdaU, lambdaI) combination, with the user grid varying slowest.
candidate_pairs = [(u, i) for u in lambdaU_grid for i in lambdaI_grid]

for lambU, lambI in candidate_pairs:
    fit_outputs = myRatingModel(
        ratingsTrain, ratingsValid,
        ratingsPerUser, ratingsPerItem,
        lambU, lambI,
        itemToDept,
    )
    # Only the fourth returned value (the validation MSE) is needed here.
    _, _, _, mse, _ = fit_outputs
    search_results.append((lambU, lambI, mse))

    # Keep the incumbent unless this run is strictly better.
    if not mse < best_result["mse"]:
        continue
    best_result = {"lambdaU": lambU, "lambdaI": lambI, "mse": mse}

# Rank all runs by validation MSE, best first.
search_results = sorted(search_results, key=lambda entry: entry[2])

print("top grid-search results (lambdaU, lambdaI, MSE):")
for lambU, lambI, mse in search_results[:5]:
    print(f"  ({lambU}, {lambI}) -> {mse:.4f}")

# Expose the winner under flat names for the following cell.
best_lambdaU, best_lambdaI, best_grid_mse = (
    best_result[field] for field in ("lambdaU", "lambdaI", "mse")
)

print(f"\nbest combination -> lambdaU: {best_lambdaU}, lambdaI: {best_lambdaI}, MSE: {best_grid_mse:.4f}")

top grid-search results (lambdaU, lambdaI, MSE):
  (3.5, 8) -> 97.6604
  (5.0, 8) -> 97.6806
  (3.5, 4) -> 97.7190
  (5.0, 4) -> 97.7817
  (3.5, 12) -> 98.0576

best combination -> lambdaU: 3.5, lambdaI: 8, MSE: 97.6604


### train and evaluate with the selected hyperparameters (bias-only model)

In [7]:
"""
train and evaluate rating prediction model
"""

# Defaults used when the grid-search cell has not been run in this kernel
# (or produced no winner). They are the best pair recorded in this notebook's
# saved grid-search output.
DEFAULT_LAMBDA_U = 3.5
DEFAULT_LAMBDA_I = 8

# hyperparameters chosen via grid search (fallback to defaults if not run yet).
# globals().get() alone would yield None here, which would then be passed to
# the model as a hyperparameter, so the fallback is applied explicitly.
lambdaU = globals().get("best_lambdaU")
if lambdaU is None:
    lambdaU = DEFAULT_LAMBDA_U

lambdaI = globals().get("best_lambdaI")
if lambdaI is None:
    lambdaI = DEFAULT_LAMBDA_I

print(f"using lambdaU={lambdaU}, lambdaI={lambdaI}")
if "best_grid_mse" in globals():
    print(f"(grid-search validation mse: {best_grid_mse:.4f})")

# train model with validation
print("\n" + "=" * 60)
print("MY MODEL (Bias-Only Model)")
print(f"lambdaU: {lambdaU}, lambdaI: {lambdaI}")
print("=" * 60)

# verbose=True makes the model print its per-iteration progress.
alpha, betaU, betaI, my_MSE, globalAlpha = myRatingModel(
    ratingsTrain,
    ratingsValid,
    ratingsPerUser,
    ratingsPerItem,
    lambdaU,
    lambdaI,
    itemToDept,
    verbose=True
)

print(f"\nfinal validation MSE: {my_MSE:.4f}")

# Compare against the global-average baseline computed earlier in the notebook.
print("\n" + "=" * 60)
print("PERFORMANCE COMPARISON")
print("=" * 60)
print(f"baseline MSE: {baseline_MSE:.4f}")
print(f"my MSE:       {my_MSE:.4f}")
# Positive values mean the model's MSE is lower than the baseline's.
improvement = baseline_MSE - my_MSE
improvement_pct = (improvement / baseline_MSE) * 100
print(f"improvement:  {improvement:+.4f} ({improvement_pct:+.2f}%)")

using lambdaU=3.5, lambdaI=8
(grid-search validation mse: 97.6604)

MY MODEL (Bias-Only Model)
lambdaU: 3.5, lambdaI: 8
Iteration 1: Training MSE = 85.2075, MSE+Reg = 479088.4673, Valid MSE = 99.6798
Iteration 2: Training MSE = 83.5149, MSE+Reg = 456600.7283, Valid MSE = 98.1274
Iteration 3: Training MSE = 83.1693, MSE+Reg = 457393.2662, Valid MSE = 97.8498
Iteration 4: Training MSE = 83.0413, MSE+Reg = 458767.9392, Valid MSE = 97.7567
Iteration 5: Training MSE = 82.9795, MSE+Reg = 459669.4548, Valid MSE = 97.7154
Iteration 6: Training MSE = 82.9448, MSE+Reg = 460229.4937, Valid MSE = 97.6937
Iteration 7: Training MSE = 82.9234, MSE+Reg = 460589.6576, Valid MSE = 97.6810
Iteration 8: Training MSE = 82.9092, MSE+Reg = 460832.6774, Valid MSE = 97.6731
Iteration 9: Training MSE = 82.8993, MSE+Reg = 461003.7532, Valid MSE = 97.6680
Iteration 10: Training MSE = 82.8923, MSE+Reg = 461128.1484, Valid MSE = 97.6647
Iteration 11: Training MSE = 82.8871, MSE+Reg = 461220.7381, Valid MSE = 97.662