In [1]:
import pandas as pd
import numpy as np
import lenskit as lk
import lenskit.algorithms as lk_algo
import lenskit.crossfold as xf
import lenskit.metrics.predict as lk_metrics
import lenskit.util as lk_util

In [2]:
from pathlib import Path
import pandas as pd
import numpy as np

# --- Paths tailored to your project ---
PROJ = Path.cwd().parents[0] if Path.cwd().name == 'notebooks' else Path.cwd()
DATA = PROJ / 'data' / 'external'
PRED_DIR = PROJ / 'predictions' / 'processed'
PRED_DIR.mkdir(parents=True, exist_ok=True)

RATINGS_PATH = DATA / 'ratings.csv'
MOVIES_PATH  = DATA / 'movies.csv'

# Read as latin-1 encoding
RATINGS_PATH, MOVIES_PATH


(PosixPath('/workspace/data/external/ratings.csv'),
 PosixPath('/workspace/data/external/movies.csv'))

In [3]:
import pandas as pd

# Read ratings and movies data with specified encoding
ratings = pd.read_csv(RATINGS_PATH, encoding='latin-1')
movies  = pd.read_csv(MOVIES_PATH, encoding='latin-1')

# Normalize column names to standard names
colmap = {}
if 'user' in ratings.columns: colmap['user'] = 'userId'
if 'item' in ratings.columns: colmap['item'] = 'movieId'
if 'UserID' in ratings.columns: colmap['UserID'] = 'userId'
if 'MovieID' in ratings.columns: colmap['MovieID'] = 'movieId'
if 'Rating' in ratings.columns: colmap['Rating'] = 'rating'

ratings = ratings.rename(columns=colmap)
ratings = ratings[['userId','movieId','rating']]

# Display the ratings table as output
from IPython.display import display
display(ratings.head())

Unnamed: 0,userId,movieId,rating
0,12882,1,4.0
1,12882,32,3.5
2,12882,47,5.0
3,12882,50,5.0
4,12882,110,4.5


## ranks items by their scores to get the top-N results (topn_series) and then attaches movie titles to those top-scoring IDs if the movies DataFrame contains them (with_titles).

In [4]:
def topn_series(scores: pd.Series, n=10) -> pd.Series:
    if n is None or n >= len(scores):
        return scores.sort_values(ascending=False)
    idx = np.argpartition(-scores.values, range(min(n, len(scores))))[:n]
    return scores.iloc[idx].sort_values(ascending=False)

def with_titles(series: pd.Series, movies: pd.DataFrame) -> pd.DataFrame:
    df = series.rename('score').to_frame().reset_index().rename(columns={'index':'movieId'})
    has_titles = {'movieId','title'}.issubset(movies.columns)
    return df.merge(movies[['movieId','title']], on='movieId', how='left') if has_titles else df


## Raw item means (equivalent of pandas groupby)

In [5]:
item_means = ratings.groupby('movieId')['rating'].mean().rename('mean')
top_raw = with_titles(topn_series(item_means, 10), movies)
top_raw

Unnamed: 0,movieId,score,title
0,318,4.364362,"Shawshank Redemption, The (1994)"
1,858,4.315848,"Godfather, The (1972)"
2,1248,4.259259,Touch of Evil (1958)
3,2959,4.258503,Fight Club (1999)
4,7502,4.247423,Band of Brothers (2001)
5,1203,4.246032,12 Angry Men (1957)
6,2859,4.22,Stop Making Sense (1984)
7,1221,4.218462,"Godfather: Part II, The (1974)"
8,296,4.217781,Pulp Fiction (1994)
9,2571,4.195359,"Matrix, The (1999)"


In [6]:
item_means.to_csv(PRED_DIR / 'item_means_raw.csv', index=True)


## 5) Damped item means (Bayesian mean)

> Formula: $\displaystyle \hat{\mu}_i = \frac{\sum r_{ui} + \alpha \mu}{n_i + \alpha}$, where $\mu$ is global mean, $n_i$ is item count, $\alpha$ is damping (e.g., 5).

In [7]:
ALPHA = 5.0  # matches the assignment

mu = ratings['rating'].mean()
grp = ratings.groupby('movieId')['rating'].agg(['sum','count'])
item_means_damped = ((grp['sum'] + ALPHA*mu) / (grp['count'] + ALPHA)).rename('damped_mean')

top_damped = with_titles(topn_series(item_means_damped, 10), movies)
top_damped


Unnamed: 0,movieId,score,title
0,318,4.356802,"Shawshank Redemption, The (1994)"
1,858,4.306888,"Godfather, The (1972)"
2,2959,4.252142,Fight Club (1999)
3,1203,4.226909,12 Angry Men (1957)
4,296,4.212007,Pulp Fiction (1994)
5,7502,4.210983,Band of Brothers (2001)
6,1221,4.207637,"Godfather: Part II, The (1974)"
7,1248,4.19526,Touch of Evil (1958)
8,2571,4.190223,"Matrix, The (1999)"
9,4226,4.187542,Memento (2000)


In [8]:
item_means_damped.to_csv(PRED_DIR / 'item_means_damped.csv', index=True)

# Cross-check with LensKit’s Bias model

In [9]:
# Try both import paths; different LKPY versions organize modules slightly differently.
try:
    from lenskit.algorithms.basic import Bias
except Exception:
    from lenskit.basic import Bias  # fallback for some builds

# lenskit 0.14.4 expects columns named 'user', 'item', 'rating'
ratings_lk = ratings.rename(columns={'userId': 'user', 'movieId': 'item'})

# item-only bias → item means (global + item offset)
bias = Bias(items=True, users=False, damping=ALPHA)
bias.fit(ratings_lk)  # expects user, item, rating

# In recent LKPY versions, use bias.mean_ instead of bias.global_mean
lkpy_item_means = pd.Series(
    bias.mean_ + bias.item_offsets_,
    index=bias.item_offsets_.index, name='mean_lkpy'
)
# Convert index back to movieId for comparison
lkpy_item_means.index.name = 'movieId'
lkpy_item_means.index = lkpy_item_means.index.astype(ratings['movieId'].dtype)

# Check they match our manual damped means
chk = pd.concat([item_means_damped, lkpy_item_means], axis=1).dropna()
float(np.max(np.abs(chk['damped_mean'] - chk['mean_lkpy']))), chk.head()


(8.881784197001252e-16,
          damped_mean  mean_lkpy
 movieId                        
 1           3.790460   3.790460
 2           3.077536   3.077536
 3           2.958076   2.958076
 4           2.834462   2.834462
 5           2.889140   2.889140)



## 7) Basic Association Rules — $P(i\mid j)$

> $P(i\mid j) = \frac{|U_i \cap U_j|}{|U_j|}$. Treat each user’s rated items as a basket.


In [10]:
from collections import Counter, defaultdict

def baskets_from_ratings(df: pd.DataFrame):
    return df.groupby('userId')['movieId'].apply(lambda s: set(s.values))

def basic_assoc_scores(df: pd.DataFrame, reference: int) -> pd.Series:
    baskets = baskets_from_ratings(df)
    uj = sum(1 for items in baskets if reference in items)
    if uj == 0:
        return pd.Series(dtype=float)

    co = Counter()
    for items in baskets:
        if reference in items:
            co.update(i for i in items if i != reference)
    scores = {i: c/uj for i, c in co.items()}
    return pd.Series(scores, name='assoc_basic').sort_values(ascending=False)

REFERENCE = 260  # Star Wars in ML-1M/100k; adjust to your data if needed
basic_scores = basic_assoc_scores(ratings, REFERENCE)
with_titles(topn_series(basic_scores, 10), movies)


Unnamed: 0,movieId,score,title
0,2571,0.915888,"Matrix, The (1999)"
1,1196,0.899065,Star Wars: Episode V - The Empire Strikes Back...
2,4993,0.891589,"Lord of the Rings: The Fellowship of the Ring,..."
3,1210,0.846729,Star Wars: Episode VI - Return of the Jedi (1983)
4,356,0.842991,Forrest Gump (1994)
5,5952,0.841121,"Lord of the Rings: The Two Towers, The (2002)"
6,7153,0.829907,"Lord of the Rings: The Return of the King, The..."
7,296,0.828037,Pulp Fiction (1994)
8,1198,0.790654,Raiders of the Lost Ark (Indiana Jones and the...
9,480,0.788785,Jurassic Park (1993)


In [11]:
basic_scores.to_csv(PRED_DIR / f'basic_assoc_ref_{REFERENCE}.csv', index=True)

## 8) Lift Association Rules — $\text{lift}(i\mid j)=\frac{P(i\land j)}{P(i)P(j)}$

> Equivalent with counts: $\text{lift} = \frac{|U_i\cap U_j|\cdot |U|}{|U_i|\cdot |U_j|}$.


In [12]:
def lift_assoc_scores(df: pd.DataFrame, reference: int) -> pd.Series:
    baskets = baskets_from_ratings(df)
    U = len(baskets)
    if U == 0:
        return pd.Series(dtype=float)

    item_users = Counter()
    for items in baskets:
        item_users.update(items)

    uj = item_users.get(reference, 0)
    if uj == 0:
        return pd.Series(dtype=float)

    co = Counter()
    for items in baskets:
        if reference in items:
            co.update(i for i in items if i != reference)

    out = {}
    for i, cij in co.items():
        ui = item_users.get(i, 0)
        if ui > 0:
            out[i] = (cij * U) / (ui * uj)
    return pd.Series(out, name='assoc_lift').sort_values(ascending=False)

REFERENCE_LIFT = 2761  # Iron Giant in example; change if not in your data
lift_scores = lift_assoc_scores(ratings, REFERENCE_LIFT)
with_titles(topn_series(lift_scores, 10), movies)

Unnamed: 0,movieId,score,title
0,631,4.897727,All Dogs Go to Heaven 2 (1996)
1,2532,4.810268,Conquest of the Planet of the Apes (1972)
2,3615,4.545703,Dinosaur (2000)
3,340,4.489583,"War, The (1994)"
4,1016,4.489583,"Shaggy Dog, The (1959)"
5,2439,4.489583,Affliction (1997)
6,1649,4.489583,"Fast, Cheap & Out of Control (1997)"
7,332,4.377344,Village of the Damned (1995)
8,2736,4.329241,Brighton Beach Memoirs (1986)
9,3213,4.316907,Batman: Mask of the Phantasm (1993)


In [13]:
lift_scores.to_csv(PRED_DIR / f'lift_assoc_ref_{REFERENCE_LIFT}.csv', index=True)

# Quick sanity checks (match the handout)

In [14]:
# Adjust IDs if your dataset differs from the handout
check_ids = [2959, 1203]  # Fight Club, 12 Angry Men (if present)
disp = pd.DataFrame({
    'movieId': check_ids,
    'mean': [item_means.get(i, np.nan) for i in check_ids],
    'damped_mean': [item_means_damped.get(i, np.nan) for i in check_ids],
})
with_titles(disp.set_index('movieId')['mean'], movies).merge(
    with_titles(disp.set_index('movieId')['damped_mean'], movies),
    on=['movieId','title'], suffixes=('_raw','_damped')
)


Unnamed: 0,movieId,score_raw,title,score_damped
0,2959,4.258503,Fight Club (1999),4.252142
1,1203,4.246032,12 Angry Men (1957),4.226909


# Save pretty Top-N tables

In [15]:
top_raw.to_csv(PRED_DIR / 'top10_raw_means.csv', index=False)
top_damped.to_csv(PRED_DIR / 'top10_damped_means.csv', index=False)
with_titles(topn_series(basic_scores, 50), movies).to_csv(PRED_DIR / f'top_basic_assoc_{REFERENCE}.csv', index=False)
with_titles(topn_series(lift_scores, 50), movies).to_csv(PRED_DIR / f'top_lift_assoc_{REFERENCE_LIFT}.csv', index=False)

PRED_DIR

PosixPath('/workspace/predictions/processed')

In [None]:
# Jupyter-friendly version of recsys_eval.py
# All code is now importable and callable from notebook cells.
# To run an evaluation, call `evaluate_pipeline` with your DataFrames and parameters.

import json
from collections import defaultdict, Counter
from dataclasses import dataclass
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import NearestNeighbors

# ------------------------------ utils & metrics ------------------------------

def set_seed(seed: int = 42):
    np.random.seed(seed)

def rmse(y_true, y_pred) -> float:
    return float(mean_squared_error(y_true, y_pred, squared=False))

def precision_at_k(recs: List[int], relevant: set, k: int) -> float:
    if k == 0 or not recs: return 0.0
    hits = sum(1 for x in recs[:k] if x in relevant)
    return hits / k

def recall_at_k(recs: List[int], relevant: set, k: int) -> float:
    if not relevant: return 0.0
    hits = sum(1 for x in recs[:k] if x in relevant)
    return hits / len(relevant)

def ap_at_k(recs: List[int], relevant: set, k: int) -> float:
    if not recs: return 0.0
    ap = 0.0
    hits = 0
    for i, m in enumerate(recs[:k], start=1):
        if m in relevant:
            hits += 1
            ap += hits / i
    return ap / hits if hits > 0 else 0.0

def mrr_at_k(recs: List[int], relevant: set, k: int) -> float:
    if not recs: return 0.0
    for i, m in enumerate(recs[:k], start=1):
        if m in relevant:
            return 1.0 / i
    return 0.0

def ndcg_at_k_binary(recs: List[int], relevant: set, k: int) -> float:
    if not recs or k == 0: return 0.0
    gains = [1.0 if x in relevant else 0.0 for x in recs[:k]]
    dcg = sum(g / np.log2(i + 2) for i, g in enumerate(gains))
    ideal = int(min(k, len(relevant)))
    idcg = sum(1.0 / np.log2(i + 2) for i in range(ideal))
    return float(dcg / idcg) if idcg > 0 else 0.0

def entropy(tokens: List[str]) -> float:
    if not tokens:
        return 0.0
    cnt = Counter(tokens)
    total = sum(cnt.values())
    ps = [c / total for c in cnt.values() if c > 0]
    return float(-sum(p * np.log2(p) for p in ps))

# ------------------------------ data split -----------------------------------

@dataclass
class FoldData:
    train_df: pd.DataFrame   # observed ratings available to train (incl. query parts of test users)
    test_pairs: pd.DataFrame # held-out (userId, movieId, rating) to predict/evaluate

def chronological_user_holdout(df: pd.DataFrame, holdout_per_user: int = 1) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Split each user's ratings into query (train) and test (last-N by timestamp)."""
    parts = []
    tests = []
    for uid, g in df.sort_values("timestamp").groupby("userId"):
        if len(g) <= holdout_per_user:
            # if too few, keep all in train; empty test for that user
            parts.append(g)
            continue
        q = g.iloc[:-holdout_per_user]
        t = g.iloc[-holdout_per_user:]
        parts.append(q)
        tests.append(t)
    train_part = pd.concat(parts, ignore_index=True)
    test_part = pd.concat(tests, ignore_index=True) if tests else df.iloc[0:0].copy()
    return train_part, test_part

def make_folds_user_cv(ratings: pd.DataFrame, n_folds: int, holdout_per_user: int, seed: int = 42) -> List[FoldData]:
    users = ratings["userId"].unique()
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=seed)
    folds: List[FoldData] = []

    for train_users_idx, test_users_idx in kf.split(users):
        train_users = set(users[train_users_idx])
        test_users = set(users[test_users_idx])

        ratings_train_users = ratings[ratings["userId"].isin(train_users)]
        ratings_test_users  = ratings[ratings["userId"].isin(test_users)]

        # For test users: split into query (goes into training) and held-out test
        query_part, heldout = chronological_user_holdout(ratings_test_users, holdout_per_user)

        train_df = pd.concat([ratings_train_users, query_part], ignore_index=True)
        test_pairs = heldout[["userId", "movieId", "rating"]].copy()

        folds.append(FoldData(train_df=train_df, test_pairs=test_pairs))
    return folds

# ------------------------------ baselines ------------------------------------

class Baselines:
    def __init__(self):
        self.global_mean = None
        self.item_mean: Dict[int, float] = {}
        self.user_mean: Dict[int, float] = {}

    def fit(self, train_df: pd.DataFrame):
        self.global_mean = float(train_df["rating"].mean())
        self.item_mean = train_df.groupby("movieId")["rating"].mean().to_dict()
        self.user_mean = train_df.groupby("userId")["rating"].mean().to_dict()

    def predict_global(self, pairs: pd.DataFrame) -> np.ndarray:
        return np.full(len(pairs), self.global_mean, dtype=float)

    def predict_item(self, pairs: pd.DataFrame) -> np.ndarray:
        return pairs["movieId"].map(self.item_mean).fillna(self.global_mean).to_numpy(float)

    def predict_user(self, pairs: pd.DataFrame) -> np.ndarray:
        return pairs["userId"].map(self.user_mean).fillna(self.global_mean).to_numpy(float)

# ------------------------------ item-kNN CF ----------------------------------

class ItemKNN:
    def __init__(self, k_neighbors: int = 50, min_item_support: int = 2):
        self.k = k_neighbors
        self.min_item_support = min_item_support
        self.nn = None  # NearestNeighbors
        self.train_matrix: csr_matrix = None
        self.user_ids: np.ndarray = None
        self.item_ids: np.ndarray = None
        self.user2idx: Dict[int, int] = {}
        self.item2idx: Dict[int, int] = {}
        self.idx2item: Dict[int, int] = {}
        self.global_mean = None
        self.item_degrees: Dict[int, int] = {}

    def _encode(self, df: pd.DataFrame):
        # keep only supported items (>= min_item_support unique users)
        supported_items = df.groupby("movieId")["userId"].nunique()
        supported_items = supported_items[supported_items >= self.min_item_support].index.values

        users = df["userId"].unique()
        items = np.array(sorted(supported_items))

        self.user_ids = np.array(sorted(users))
        self.item_ids = items
        self.user2idx = {u: i for i, u in enumerate(self.user_ids)}
        self.item2idx = {m: j for j, m in enumerate(self.item_ids)}
        self.idx2item = {j: m for m, j in self.item2idx.items()}

    def fit(self, train_df: pd.DataFrame):
        self.global_mean = float(train_df["rating"].mean())
        self._encode(train_df)

        # Build CSR matrix (users x items) with only supported items
        rows, cols, vals = [], [], []
        for r in train_df.itertuples(index=False):
            u, m, rt = int(r.userId), int(r.movieId), float(r.rating)
            if m not in self.item2idx:  # unsupported
                continue
            rows.append(self.user2idx[u])
            cols.append(self.item2idx[m])
            vals.append(rt)
        if not rows:
            # degenerate (shouldn't happen on MovieLens)
            self.train_matrix = csr_matrix((0, 0), dtype=float)
            return

        n_users = len(self.user_ids)
        n_items = len(self.item_ids)
        self.train_matrix = csr_matrix((vals, (rows, cols)), shape=(n_users, n_items), dtype=float)

        # degrees per item (for info/coverage)
        self.item_degrees = dict(zip(self.item_ids, np.asarray(self.train_matrix.sum(axis=0)).ravel().astype(int)))

        # Fit item neighbors on item vectors (columns)
        self.nn = NearestNeighbors(metric="cosine", algorithm="brute", n_neighbors=min(self.k + 1, n_items))
        self.nn.fit(self.train_matrix.T)

    def predict_pair(self, user_id: int, movie_id: int) -> Tuple[float, bool]:
        """Predict rating for (user, movie). Returns (prediction, covered?)"""
        if movie_id not in self.item2idx or user_id not in self.user2idx:
            # unknown item or user
            return self.global_mean, False

        u = self.user2idx[user_id]
        i = self.item2idx[movie_id]

        # if item has no ratings in train (shouldn't happen due to support), fallback
        if self.train_matrix[:, i].nnz == 0:
            return self.global_mean, False

        dists, idxs = self.nn.kneighbors(self.train_matrix.T[i], n_neighbors=min(self.k + 1, self.train_matrix.shape[1]))
        sims = 1.0 - dists.flatten()
        neigh_items = idxs.flatten()

        # drop self neighbor if present
        mask = (neigh_items != i)
        neigh_items = neigh_items[mask]
        sims = sims[mask]

        # ratings of the user for neighbor items
        user_row = self.train_matrix[u, neigh_items].toarray().ravel()
        rated_mask = user_row > 0
        if not np.any(rated_mask):
            return self.global_mean, False

        num = float(np.dot(sims[rated_mask], user_row[rated_mask]))
        den = float(np.sum(np.abs(sims[rated_mask])) + 1e-8)
        return num / den, True

    def recommend(self, user_id: int, topk: int = 10) -> List[int]:
        """Return a list of movieIds (not indices) recommended to the user."""
        if user_id not in self.user2idx:
            return []
        u = self.user2idx[user_id]
        seen = set(int(self.idx2item[j]) for j in self.train_matrix[u].indices)

        scores = defaultdict(float)

        # Use up to S seed items from user's profile for speed
        seeds = list(self.train_matrix[u].indices)
        if not seeds:
            return []

        # Accumulate neighbor sims
        for it in seeds:
            dists, idxs = self.nn.kneighbors(self.train_matrix.T[it], n_neighbors=min(self.k + 1, self.train_matrix.shape[1]))
            sims = 1.0 - dists.flatten()
            neigh_items = idxs.flatten()
            for j, s in zip(neigh_items, sims):
                if j == it:
                    continue
                m_id = int(self.idx2item[j])
                if m_id in seen:
                    continue
                scores[m_id] += float(s)

        if not scores:
            return []

        ranked = sorted(scores.items(), key=lambda kv: -kv[1])
        return [m for m, _ in ranked[:topk]]

# ------------------------------ diversity helpers ----------------------------

def build_movie_tokens(movies: pd.DataFrame, tags: pd.DataFrame) -> Dict[int, List[str]]:
    """Map movieId -> list of tokens (tags if present, else genres as pseudo-tags)."""
    tags = tags.copy()
    tags["tag"] = tags["tag"].fillna("").astype(str).str.lower().str.strip()
    tag_map = tags.groupby("movieId")["tag"].apply(list).to_dict()

    genres = movies.set_index("movieId")["genres"].fillna("").astype(str).to_dict()

    out = {}
    for mid in set(list(genres.keys()) + list(tag_map.keys())):
        toks = []
        if mid in tag_map and len(tag_map[mid]) > 0:
            toks = [t for t in tag_map[mid] if t]
        if not toks:
            g = genres.get(mid, "")
            if g:
                toks = [x.strip().lower() for x in g.split("|") if x]
        out[mid] = toks
    return out

# ------------------------------ main eval loop -------------------------------

def evaluate_pipeline(
    movies: pd.DataFrame,
    ratings: pd.DataFrame,
    tags: pd.DataFrame,
    folds: int = 5,
    holdout_per_user: int = 1,
    neighbors: int = 50,
    topk: int = 10,
    min_item_support: int = 2,
    seed: int = 42,
    verbose: bool = True,
):
    set_seed(seed)

    # Ensure proper dtypes
    for c in ["userId", "movieId", "rating", "timestamp"]:
        assert c in ratings.columns, f"ratings.csv missing '{c}'"

    ratings = ratings.copy()
    ratings["userId"] = ratings["userId"].astype(int)
    ratings["movieId"] = ratings["movieId"].astype(int)
    ratings["rating"] = ratings["rating"].astype(float)
    ratings["timestamp"] = ratings["timestamp"].astype(int)

    folds_data = make_folds_user_cv(ratings, n_folds=folds, holdout_per_user=holdout_per_user, seed=seed)

    # Precompute tokens for diversity (once)
    movie_tokens = build_movie_tokens(movies, tags)

    # Aggregators
    rmse_global_all, rmse_item_all, rmse_user_all, rmse_itemknn_all = [], [], [], []
    pred_cov_itemknn_all = []
    # Top-N metrics for itemKNN and popularity
    topn_item_metrics = []
    topn_pop_metrics = []
    user_cov_itemknn_all = []
    catalog_cov_itemknn_all = []
    diversity_itemknn_all = []

    for fold_idx, fold in enumerate(folds_data, start=1):
        train_df = fold.train_df.copy()
        test_pairs = fold.test_pairs.copy()

        # ---------------- baselines ----------------
        bl = Baselines()
        bl.fit(train_df)

        y_true = test_pairs["rating"].to_numpy(float)
        y_pred_global = bl.predict_global(test_pairs)
        y_pred_item = bl.predict_item(test_pairs)
        y_pred_user = bl.predict_user(test_pairs)

        rmse_global_all.append(rmse(y_true, y_pred_global))
        rmse_item_all.append(rmse(y_true, y_pred_item))
        rmse_user_all.append(rmse(y_true, y_pred_user))

        # ---------------- item-kNN CF ----------------
        iknn = ItemKNN(k_neighbors=neighbors, min_item_support=min_item_support)
        iknn.fit(train_df)

        preds = []
        trues = []
        covered_flags = []
        for r in test_pairs.itertuples(index=False):
            p, covered = iknn.predict_pair(int(r.userId), int(r.movieId))
            preds.append(p)
            trues.append(float(r.rating))
            covered_flags.append(covered)

        if preds:
            rmse_itemknn_all.append(rmse(trues, preds))
            pred_cov_itemknn_all.append(np.mean(covered_flags))
        else:
            rmse_itemknn_all.append(np.nan)
            pred_cov_itemknn_all.append(0.0)

        # ---------------- Top-N: item-kNN & popularity ----------------
        # Relevant set is each test user's held-out items
        test_by_user = test_pairs.groupby("userId")["movieId"].apply(set).to_dict()

        # Popularity baseline from train
        pop_items = train_df.groupby("movieId").size().sort_values(ascending=False).index.tolist()

        # Gather recommendations & metrics
        recs_item_all = {}
        recs_pop_all = {}
        for uid in test_by_user.keys():
            recs_item_all[uid] = iknn.recommend(uid, topk=topk)
            # popularity excluding seen items
            seen = set(train_df.loc[train_df["userId"] == uid, "movieId"].values)
            pop_recs = [m for m in pop_items if m not in seen][:topk]
            recs_pop_all[uid] = pop_recs

        def aggregate_topn(recs_dict):
            precs, recs, maps, mrrs, ndcgs = [], [], [], [], []
            nonempty_users = 0
            catalog_items = set()
            for uid, rel in test_by_user.items():
                recs_list = recs_dict.get(uid, [])
                if recs_list:
                    nonempty_users += 1
                    catalog_items.update(recs_list)
                precs.append(precision_at_k(recs_list, rel, topk))
                recs.append(recall_at_k(recs_list, rel, topk))
                maps.append(ap_at_k(recs_list, rel, topk))
                mrrs.append(mrr_at_k(recs_list, rel, topk))
                ndcgs.append(ndcg_at_k_binary(recs_list, rel, topk))
            user_coverage = nonempty_users / max(1, len(test_by_user))
            catalog_coverage = len(catalog_items) / max(1, len(iknn.item_ids))
            return {
                "precision": float(np.mean(precs)),
                "recall": float(np.mean(recs)),
                "map": float(np.mean(maps)),
                "mrr": float(np.mean(mrrs)),
                "ndcg": float(np.mean(ndcgs)),
                "user_coverage": user_coverage,
                "catalog_coverage": catalog_coverage,
            }

        item_topn = aggregate_topn(recs_item_all)
        pop_topn = aggregate_topn(recs_pop_all)
        topn_item_metrics.append(item_topn)
        topn_pop_metrics.append(pop_topn)
        user_cov_itemknn_all.append(item_topn["user_coverage"])
        catalog_cov_itemknn_all.append(item_topn["catalog_coverage"])

        # ---------------- Diversity (tag entropy) for item-kNN ----------------
        entrs = []
        for uid, recs_list in recs_item_all.items():
            toks = []
            for m in recs_list:
                toks.extend(movie_tokens.get(m, []))
            entrs.append(entropy(toks))
        diversity_itemknn_all.append(float(np.mean(entrs) if entrs else 0.0))

        if verbose:
            print(f"[fold {fold_idx}/{len(folds_data)}] "
                  f"RMSE: GM={rmse_global_all[-1]:.4f} IM={rmse_item_all[-1]:.4f} UM={rmse_user_all[-1]:.4f} iKNN={rmse_itemknn_all[-1]:.4f} "
                  f"| TopN(iKNN) P@{topk}={item_topn['precision']:.3f} nDCG={item_topn['ndcg']:.3f} "
                  f"| PredCov(iKNN)={pred_cov_itemknn_all[-1]:.3f}")

    # Average over folds
    def avg_or_nan(xs): 
        xs = [x for x in xs if np.isfinite(x)]
        return float(np.mean(xs)) if xs else float("nan")

    summary = {
        "params": {
            "folds": folds,
            "holdout_per_user": holdout_per_user,
            "neighbors": neighbors,
            "topk": topk,
            "min_item_support": min_item_support,
            "seed": seed,
        },
        "RMSE": {
            "GlobalMean": round(avg_or_nan(rmse_global_all), 4),
            "ItemMean": round(avg_or_nan(rmse_item_all), 4),
            "UserMean": round(avg_or_nan(rmse_user_all), 4),
            "ItemKNN": round(avg_or_nan(rmse_itemknn_all), 4),
            "ItemKNN_PredictionCoverage": round(avg_or_nan(pred_cov_itemknn_all), 4),
        },
        "TopN_ItemKNN": {
            "Precision": round(np.mean([m["precision"] for m in topn_item_metrics]), 4),
            "Recall": round(np.mean([m["recall"] for m in topn_item_metrics]), 4),
            "MAP": round(np.mean([m["map"] for m in topn_item_metrics]), 4),
            "MRR": round(np.mean([m["mrr"] for m in topn_item_metrics]), 4),
            "nDCG": round(np.mean([m["ndcg"] for m in topn_item_metrics]), 4),
            "UserCoverage": round(avg_or_nan(user_cov_itemknn_all), 4),
            "CatalogCoverage": round(avg_or_nan(catalog_cov_itemknn_all), 4),
            "Diversity_TagEntropy": round(avg_or_nan(diversity_itemknn_all), 4),
        },
        "TopN_Popularity": {
            "Precision": round(np.mean([m["precision"] for m in topn_pop_metrics]), 4),
            "Recall": round(np.mean([m["recall"] for m in topn_pop_metrics]), 4),
            "MAP": round(np.mean([m["map"] for m in topn_pop_metrics]), 4),
            "MRR": round(np.mean([m["mrr"] for m in topn_pop_metrics]), 4),
            "nDCG": round(np.mean([m["ndcg"] for m in topn_pop_metrics]), 4),
            "UserCoverage": round(np.mean([m["user_coverage"] for m in topn_pop_metrics]), 4),
            "CatalogCoverage": round(np.mean([m["catalog_coverage"] for m in topn_pop_metrics]), 4),
        }
    }
    return summary




In [7]:
from pathlib import Path
import pandas as pd

# Load the movies data from the external data directory
movies_path = Path("/workspace/data/external/movies.csv")
movies = pd.read_csv(movies_path)
# Load the ratings and tags data from the external data directory
ratings_path = Path("/workspace/data/external/ratings.csv")
tags_path = Path("/workspace/data/external/tags.csv")

# Specify encoding='latin1' to handle UnicodeDecodeError
ratings = pd.read_csv(ratings_path, encoding='latin1')
tags = pd.read_csv(tags_path, encoding='latin1')


In [11]:
# Example usage in a notebook:
# If you encounter "TypeError: got an unexpected keyword argument 'squared'",
# remove the 'squared' argument from your function calls.
summary = evaluate_pipeline(movies, ratings, tags, folds=5, holdout_per_user=1, neighbors=50, topk=10)
print(json.dumps(summary, indent=2))

TypeError: got an unexpected keyword argument 'squared'