# LLM Recommender Debugging

This notebook is for debugging the LLM-based recommenders and their dependencies, like the `UserCFRecommender`.


In [1]:
import os
import pandas as pd
import numpy as np

from osiro_llm.data.movielens import load_movielens_data
from osiro_llm.recommenders.baselines import UserCFRecommender
from osiro_llm.llm.google import GoogleLLMWrapper
from osiro_llm.recommenders.llm_zero_shot import LLMZeroShotRecommender
from osiro_llm.recommenders.llm_reranker import LLMReranker
from osiro_llm.evaluation.metrics import precision_at_k, recall_at_k, ndcg_at_k

# Check for API Key
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
if GOOGLE_API_KEY is None:
    print(
        "ERROR: Please set the GOOGLE_API_KEY environment variable before proceeding."
    )

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load data (using 100k for speed)
_, ratings_df, movies_df = load_movielens_data(version="100k")

# Create a movie ID to title mapping for easier inspection
movie_id_to_title = movies_df.set_index("MovieID")["Title"].to_dict()

In [3]:
from sklearn.model_selection import train_test_split

# 1. Create a proper train/test split of the ratings
# This is the correct way to prepare for evaluation.
train_df, test_df = train_test_split(
    ratings_df, test_size=0.2, random_state=42, stratify=ratings_df["UserID"]
)

print(f"Total Ratings: {len(ratings_df)}")
print(f"Training Ratings: {len(train_df)}")
print(f"Test Ratings: {len(test_df)}")

Total Ratings: 100000
Training Ratings: 80000
Test Ratings: 20000


In [4]:
# --- Configuration ---
# Pick a user to debug
TEST_USER_ID = 1
N_RECOMMENDATIONS = 10

# 2. Define user history from the TRAINING set (for building prompts)
train_user_ratings = train_df[train_df["UserID"] == TEST_USER_ID]
user_liked_movies_history = train_user_ratings[train_user_ratings["Rating"] >= 4]

# 3. Define the ground truth from the TEST set (for evaluation)
test_user_ratings = test_df[test_df["UserID"] == TEST_USER_ID]
relevant_items = test_user_ratings[test_user_ratings["Rating"] >= 4]["MovieID"].tolist()


print(f"--- Debugging for User ID: {TEST_USER_ID} ---")
print(
    f"\nUser has {len(user_liked_movies_history)} liked movies in their training history."
)
print(
    f"User has {len(relevant_items)} liked movies in the test set (this is our ground truth)."
)

--- Debugging for User ID: 1 ---

User has 128 liked movies in their training history.
User has 35 liked movies in the test set (this is our ground truth).


## 1. Debugging User-CF Recommender

Let's first debug the `UserCFRecommender` since the re-ranker depends on it and it's currently returning zero results.


In [5]:
# Instantiate and fit the model
user_cf = UserCFRecommender(k=20)
user_cf.fit(train_df, movies_df)

# --- Step-by-step debugging of the recommend method ---

# 1. Find similar users
similar_users = (
    user_cf.user_similarity[TEST_USER_ID].sort_values(ascending=False).head(user_cf.k)
)
print("--- Top 5 Similar Users ---")
print(similar_users.head())

# 2. Get ratings from these neighbors
neighbor_ratings = user_cf.user_item_matrix.loc[similar_users.index]
print("\n--- Shape of Neighbor Ratings Matrix ---")
print(neighbor_ratings.shape)

# 3. Calculate predicted scores
numerator = neighbor_ratings.mul(similar_users, axis=0).sum(axis=0)
rated_by_neighbors = neighbor_ratings.copy()
rated_by_neighbors[rated_by_neighbors > 0] = 1
denominator = rated_by_neighbors.mul(similar_users, axis=0).sum(axis=0)
denominator[denominator == 0] = 1e-10
predicted_scores = numerator / denominator

print("\n--- Top 10 Predicted Scores ---")
print(predicted_scores.sort_values(ascending=False).head(10))

# 4. Filter out already-seen movies
user_rated_movies = user_cf.user_item_matrix.loc[TEST_USER_ID]
user_rated_movies = user_rated_movies[user_rated_movies > 0].index
final_scores = predicted_scores.drop(user_rated_movies, errors="ignore")

# 5. Get final recommendations
recommendations = final_scores.sort_values(ascending=False).head(N_RECOMMENDATIONS)
recommended_ids = recommendations.index.tolist()

print("\n--- Final Recommended Movie IDs ---")
print(recommended_ids)

print("\n--- Final Recommended Movie Titles ---")
for movie_id in recommended_ids:
    print(f"- {movie_id_to_title.get(movie_id, 'Unknown Title')}")

--- Top 5 Similar Users ---
UserID
92     0.448998
933    0.440859
276    0.439639
268    0.435091
682    0.432603
Name: 1, dtype: float64

--- Shape of Neighbor Ratings Matrix ---
(20, 1656)

--- Top 10 Predicted Scores ---
MovieID
1019    5.0
515     5.0
644     5.0
1404    5.0
525     5.0
522     5.0
661     5.0
285     5.0
1168    5.0
853     5.0
dtype: float64

--- Final Recommended Movie IDs ---
[661, 525, 1019, 644, 515, 1404, 522, 114, 1007, 853]

--- Final Recommended Movie Titles ---
- High Noon (1952)
- Big Sleep, The (1946)
- Die xue shuang xiong (Killer, The) (1989)
- Thin Blue Line, The (1988)
- Boot, Das (1981)
- Withnail and I (1987)
- Down by Law (1986)
- Wallace & Gromit: The Best of Aardman Animation (1996)
- Waiting for Guffman (1996)
- Braindead (1992)


  ret = a @ b
  ret = a @ b
  ret = a @ b


In [6]:
# Get the metrics for the User-CF recommender
base_precision = precision_at_k(recommended_ids[:10], relevant_items, 10)
base_recall = recall_at_k(recommended_ids[:10], relevant_items, 10)
base_ndcg = ndcg_at_k(recommended_ids[:10], relevant_items, 10)

print(
    f"Base Precision: {base_precision:.4f}, Recall: {base_recall:.4f}, NDCG: {base_ndcg:.4f}"
)

Base Precision: 0.1000, Recall: 0.0286, NDCG: 0.0694


## 2. Debugging LLM Zero-Shot Recommender


In [7]:
from osiro_llm.llm.prompts import ZERO_SHOT_PROMPT

# Instantiate and fit
llm_wrapper = GoogleLLMWrapper()
zero_shot_rec = LLMZeroShotRecommender(llm_wrapper)
zero_shot_rec.fit(train_df, movies_df)

# --- Step-by-step debugging ---

# 1. Get user's liked movies and candidates
user_ratings = zero_shot_rec.ratings_df[
    zero_shot_rec.ratings_df["UserID"] == TEST_USER_ID
]
liked_movies = user_ratings[user_ratings["Rating"] >= zero_shot_rec.rating_threshold]
liked_movie_titles = [
    zero_shot_rec.movie_id_to_title[mid] for mid in liked_movies["MovieID"]
]

all_movie_ids = ratings_df["MovieID"].unique().tolist()
user_rated_movies = set(user_ratings["MovieID"])
candidate_movie_ids = [mid for mid in all_movie_ids if mid not in user_rated_movies]
candidate_movie_titles = [
    zero_shot_rec.movie_id_to_title[mid] for mid in candidate_movie_ids
][:100]

# 2. Construct the prompt
prompt = ZERO_SHOT_PROMPT.format(
    liked_movies="|".join(liked_movie_titles),
    candidate_movies="|".join(candidate_movie_titles),
    n=N_RECOMMENDATIONS,
)

print("--- Generated Prompt for Zero-Shot LLM ---")
# Print first 500 characters of the prompt for brevity
print(prompt)

# 3. Get response from LLM
print("\n--- Calling LLM API... ---")
response = llm_wrapper.generate_content(prompt)
print("\n--- Raw LLM Response ---")
print(response)

# 4. Parse response and get final recommendations
recommended_titles = [title.strip() for title in response.split("|")]
title_to_movie_id = {v: k for k, v in zero_shot_rec.movie_id_to_title.items()}
recommended_ids = [
    title_to_movie_id[title]
    for title in recommended_titles
    if title in title_to_movie_id
]

print("\n--- Final Recommended Movie Titles (Zero-Shot) ---")
for movie_id in recommended_ids[:N_RECOMMENDATIONS]:
    print(f"- {movie_id_to_title.get(movie_id, 'Unknown Title')}")

--- Generated Prompt for Zero-Shot LLM ---

You are a movie recommendation expert.
A user has watched and liked the following movies:
Contact (1997)|Raging Bull (1980)|Taxi Driver (1976)|Monty Python's Life of Brian (1979)|Horseman on the Roof, The (Hussard sur le toit, Le) (1995)|Return of the Pink Panther, The (1974)|Fargo (1996)|Haunted World of Edward D. Wood Jr., The (1995)|Ed Wood (1994)|Star Trek VI: The Undiscovered Country (1991)|On Golden Pond (1981)|Terminator 2: Judgment Day (1991)|Wrong Trousers, The (1993)|Last of the Mohicans, The (1992)|Reservoir Dogs (1992)|Madness of King George, The (1994)|Alien (1979)|Strange Days (1995)|Shawshank Redemption, The (1994)|Aladdin (1992)|Star Wars (1977)|Platoon (1986)|Big Night (1996)|Postino, Il (1994)|Die Hard (1988)|Three Colors: Red (1994)|Chasing Amy (1997)|Truth About Cats & Dogs, The (1996)|Godfather, The (1972)|Godfather: Part II, The (1974)|2001: A Space Odyssey (1968)|Monty Python and the Holy Grail (1974)|Nightmare Before C

In [8]:
recommended_titles

['Jackie Brown (1997)',
 'L.A. Confidential (1997)',
 'Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963)',
 'The Remains of the Day (1993)',
 "Miller's Crossing (1990)",
 'Heat (1995)',
 'The Right Stuff (1983)',
 'Annie Hall (1977)',
 'Ghost in the Shell (Kokaku kidotai) (1995)',
 'Blade Runner (1982)']

In [9]:
recommended_ids

[346, 302, 474, 518, 273, 514, 1240, 89]

In [10]:
precision = precision_at_k(recommended_ids, relevant_items, 10)
recall = recall_at_k(recommended_ids, relevant_items, 10)
ndcg = ndcg_at_k(recommended_ids, relevant_items, 10)

print(f"Re-Ranked Precision: {precision:.4f}, Recall: {recall:.4f}, NDCG: {ndcg:.4f}")

Re-Ranked Precision: 0.1000, Recall: 0.0286, NDCG: 0.0694


## 3. Debugging LLM Re-Ranker


In [15]:
from osiro_llm.llm.prompts import RERANKING_PROMPT

# Instantiate and fit
# We use the User-CF model we already created and fitted
reranker_rec = LLMReranker(base_recommender=user_cf, llm_wrapper=llm_wrapper)
reranker_rec.fit(train_df, movies_df)

# --- Step-by-step debugging ---

# 1. Get initial candidates from the base recommender
# We'll get more than we need, e.g., 50
candidate_ids = reranker_rec.base_recommender.recommend(TEST_USER_ID, 50, all_movie_ids)
candidate_titles = [
    reranker_rec.movie_id_to_title[mid]
    for mid in candidate_ids
    if mid in reranker_rec.movie_id_to_title
]

print("--- Top 10 Initial Candidates from User-CF ---")
for title in candidate_titles[:10]:
    print(f"- {title}")

# 2. Get user's liked movies
user_ratings = reranker_rec.ratings_df[
    reranker_rec.ratings_df["UserID"] == TEST_USER_ID
]
liked_movies = user_ratings[user_ratings["Rating"] >= reranker_rec.rating_threshold]
liked_movie_titles = [
    reranker_rec.movie_id_to_title[mid] for mid in liked_movies["MovieID"]
]

# 3. Construct the prompt
prompt = RERANKING_PROMPT.format(
    liked_movies="|".join(liked_movie_titles),
    candidate_movies="|".join(candidate_titles),
    n=N_RECOMMENDATIONS,
)

print("\n--- Generated Prompt for Re-Ranker LLM ---")
print(prompt)

# 4. Get response from LLM
print("\n--- Calling LLM API... ---")
response = llm_wrapper.generate_content(prompt)
print("\n--- Raw LLM Response ---")
print(response)

# 5. Parse response and get final recommendations
reranked_titles = [title.strip() for title in response.split("|")]
title_to_movie_id = {v: k for k, v in reranker_rec.movie_id_to_title.items()}
reranked_ids = [
    title_to_movie_id[title] for title in reranked_titles if title in title_to_movie_id
]

print("\n--- Final Re-Ranked Movie Titles ---")
for movie_id in reranked_ids[:N_RECOMMENDATIONS]:
    print(f"- {movie_id_to_title.get(movie_id, 'Unknown Title')}")

  ret = a @ b
  ret = a @ b
  ret = a @ b


--- Top 10 Initial Candidates from User-CF ---
- High Noon (1952)
- Big Sleep, The (1946)
- Die xue shuang xiong (Killer, The) (1989)
- Thin Blue Line, The (1988)
- Boot, Das (1981)
- Withnail and I (1987)
- Down by Law (1986)
- Wallace & Gromit: The Best of Aardman Animation (1996)
- Waiting for Guffman (1996)
- Braindead (1992)

--- Generated Prompt for Re-Ranker LLM ---

You are a movie re-ranking expert.
A user has watched and liked the following movies:
Contact (1997)|Raging Bull (1980)|Taxi Driver (1976)|Monty Python's Life of Brian (1979)|Horseman on the Roof, The (Hussard sur le toit, Le) (1995)|Return of the Pink Panther, The (1974)|Fargo (1996)|Haunted World of Edward D. Wood Jr., The (1995)|Ed Wood (1994)|Star Trek VI: The Undiscovered Country (1991)|On Golden Pond (1981)|Terminator 2: Judgment Day (1991)|Wrong Trousers, The (1993)|Last of the Mohicans, The (1992)|Reservoir Dogs (1992)|Madness of King George, The (1994)|Alien (1979)|Strange Days (1995)|Shawshank Redemption, 

In [16]:
reranked_titles

['L.A. Confidential (1997)',
 'Chinatown (1974)',
 'Usual Suspects, The (1995)',
 'Boogie Nights (1997)',
 "Miller's Crossing (1990)",
 'Secrets & Lies (1996)',
 'Touch of Evil (1958)',
 'Maltese Falcon, The (1941)',
 'Game, The (1997)',
 'Big Sleep, The (1946)']

In [17]:
reranked_ids

[302, 654, 12, 340, 518, 285, 653, 484, 333, 525]

In [18]:
precision = precision_at_k(reranked_ids, relevant_items, 10)
recall = recall_at_k(reranked_ids, relevant_items, 10)
ndcg = ndcg_at_k(reranked_ids, relevant_items, 10)

print(f"Re-Ranked Precision: {precision:.4f}, Recall: {recall:.4f}, NDCG: {ndcg:.4f}")

Re-Ranked Precision: 0.1000, Recall: 0.0286, NDCG: 0.1100
