# Model Comparison

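This notebook compares all of the implemented recommender models on the MovieLens dataset. It uses the 100K version by default; set `DATASET_VERSION` to `'1m'` in the data-loading cell to run on MovieLens 1M instead.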

**Important:** Before running, make sure you have set your `GOOGLE_API_KEY` as an environment variable.

In [None]:
import os
import pandas as pd

from osiro_llm.data.movielens import load_movielens_data
from osiro_llm.recommenders.baseline_random import RandomRecommender
from osiro_llm.recommenders.baseline_popularity import PopularityRecommender
from osiro_llm.recommenders.baseline_user_based_cf import UserCFRecommender
from osiro_llm.llm.google import GoogleLLMWrapper
from osiro_llm.recommenders.llm_zero_shot import LLMZeroShotRecommender
from osiro_llm.recommenders.llm_reranker import LLMReranker
from osiro_llm.evaluation.evaluator import Evaluator

from sklearn.model_selection import train_test_split

# Read the API key from the environment and warn early if it is unusable.
# An empty or whitespace-only value (e.g. `export GOOGLE_API_KEY=`) is treated
# the same as an unset variable, since neither is a usable key.
# The check only prints a message; it deliberately does not stop execution.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
if GOOGLE_API_KEY is None or not GOOGLE_API_KEY.strip():
    print("ERROR: Please set the GOOGLE_API_KEY environment variable before proceeding.")


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load data
# Choose dataset version: '100k' or '1m'
DATASET_VERSION = '100k'
TEST_SIZE = 0.2     # fraction of the ratings held out as the test set
RANDOM_SEED = 42    # fixed seed so the train/test split is reproducible

# The first returned element is not used in this notebook. It is bound to a
# named throwaway rather than `_`, because assigning `_` in IPython stops the
# shell from updating its "last output" variable.
_unused, ratings_df, movies_df = load_movielens_data(version=DATASET_VERSION)

# Split data into train and test sets.
# Stratifying on UserID keeps each user's ratings split in roughly the
# TEST_SIZE proportion, so every user appears on both sides of the split.
# NOTE(review): scikit-learn raises ValueError here if any user has fewer than
# two ratings; presumably the MovieLens data never does -- confirm.
train_df, test_df = train_test_split(
    ratings_df,
    test_size=TEST_SIZE,
    random_state=RANDOM_SEED,
    stratify=ratings_df['UserID'],
)

# Sanity check: the two parts together account for every rating.
assert len(train_df) + len(test_df) == len(ratings_df)


In [3]:
# Build the recommenders under comparison.
# One LLM wrapper instance is created and shared by both LLM-backed models.
llm_wrapper = GoogleLLMWrapper()

# Entries are added one at a time; the dict keeps this insertion order.
models = {}
models["Random"] = RandomRecommender()
models["Popularity"] = PopularityRecommender()
models["User-CF"] = UserCFRecommender(k=20)
models["LLM Zero-Shot"] = LLMZeroShotRecommender(llm_wrapper)
# The re-ranker wraps its own User-CF instance (same k as the standalone
# entry) rather than reusing models["User-CF"].
models["LLM Re-Ranker (User-CF)"] = LLMReranker(
    base_recommender=UserCFRecommender(k=20),
    llm_wrapper=llm_wrapper,
)


In [4]:
# Run evaluation
TOP_K = 10      # passed to the evaluator as `k`; the reported metrics are named "...@k"
LLM_DELAY = 4   # delay between LLM requests; presumably in seconds -- confirm against Evaluator

# `models` maps a display name to a recommender instance. The result is kept
# in `results` for the display cell below.
evaluator = Evaluator(models, k=TOP_K, llm_delay=LLM_DELAY)
results = evaluator.evaluate(train_df, test_df, movies_df)


Evaluating Random...


Predicting for Random: 100%|██████████| 10/10 [00:00<00:00, 11848.32it/s]


Results for Random: {'precision@k': 0.0, 'recall@k': 0.0, 'ndcg@k': np.float64(0.0)}
Evaluating Popularity...


Predicting for Popularity: 100%|██████████| 10/10 [00:00<00:00, 3181.36it/s]
  ret = a @ b
  ret = a @ b
  ret = a @ b


Results for Popularity: {'precision@k': 0.18, 'recall@k': 0.13026931880380155, 'ndcg@k': np.float64(0.21979548853805694)}
Evaluating User-CF...


Predicting for User-CF: 100%|██████████| 10/10 [00:00<00:00, 382.38it/s]


Results for User-CF: {'precision@k': 0.02, 'recall@k': 0.015357142857142857, 'ndcg@k': np.float64(0.015953123058914498)}
Evaluating LLM Zero-Shot...


Predicting for LLM Zero-Shot: 100%|██████████| 10/10 [00:52<00:00,  5.21s/it]
  ret = a @ b
  ret = a @ b
  ret = a @ b


Results for LLM Zero-Shot: {'precision@k': 0.16999999999999998, 'recall@k': 0.05162934766383043, 'ndcg@k': np.float64(0.1890416854005464)}
Evaluating LLM Re-Ranker (User-CF)...


Predicting for LLM Re-Ranker (User-CF): 100%|██████████| 10/10 [00:50<00:00,  5.02s/it]

Results for LLM Re-Ranker (User-CF): {'precision@k': 0.2, 'recall@k': 0.12900283624421555, 'ndcg@k': np.float64(0.2168459021729464)}





In [5]:
# Display results
# After the transpose each model is a row and each metric is a column.
results_df = pd.DataFrame(results).T

# A trailing expression lets the notebook render the frame with its rich
# (HTML) table display instead of the plain-text output of print().
results_df


                         precision@k  recall@k    ndcg@k
Random                          0.00  0.000000  0.000000
Popularity                      0.18  0.130269  0.219795
User-CF                         0.02  0.015357  0.015953
LLM Zero-Shot                   0.17  0.051629  0.189042
LLM Re-Ranker (User-CF)         0.20  0.129003  0.216846
