In [1]:
import pandas as pd
import numpy as np
import lenskit as lk
import lenskit.algorithms as lk_algo
import lenskit.crossfold as xf
import lenskit.metrics.predict as lk_metrics
import lenskit.util as lk_util

In [2]:
from pathlib import Path
import pandas as pd
import numpy as np

# --- Project layout ---
# When the kernel was started inside `notebooks/`, the project root is one
# level up; otherwise the current working directory is used as the root.
PROJ = Path.cwd() if Path.cwd().name != 'notebooks' else Path.cwd().parents[0]
DATA = PROJ.joinpath('data', 'external')
PRED_DIR = PROJ.joinpath('predictions', 'processed')
# Create the output folder (and any missing parents); a no-op if it already exists.
PRED_DIR.mkdir(parents=True, exist_ok=True)

RATINGS_PATH = DATA.joinpath('ratings.csv')
MOVIES_PATH = DATA.joinpath('movies.csv')

# Display the resolved input paths (nothing is read from disk in this cell).
RATINGS_PATH, MOVIES_PATH


(PosixPath('/workspace/data/external/ratings.csv'),
 PosixPath('/workspace/data/external/movies.csv'))

In [3]:
import pandas as pd

# Load the ratings and movies tables; both files are decoded as latin-1.
ratings = pd.read_csv(RATINGS_PATH, encoding='latin-1')
movies = pd.read_csv(MOVIES_PATH, encoding='latin-1')

# Map whichever alternative column spellings are present in `ratings` onto
# the names used by the rest of the notebook (userId / movieId / rating).
colmap = {
    old: new
    for old, new in [
        ('user', 'userId'),
        ('item', 'movieId'),
        ('UserID', 'userId'),
        ('MovieID', 'movieId'),
        ('Rating', 'rating'),
    ]
    if old in ratings.columns
}

# Rename, then keep only the three columns the analysis uses.
ratings = ratings.rename(columns=colmap)[['userId', 'movieId', 'rating']]

# Preview the first rows of the normalized ratings table.
from IPython.display import display
display(ratings.head())

Unnamed: 0,userId,movieId,rating
0,12882,1,4.0
1,12882,32,3.5
2,12882,47,5.0
3,12882,50,5.0
4,12882,110,4.5


## Helpers: `topn_series` ranks items by score to get the top-N results; `with_titles` attaches movie titles to those top-scoring IDs when the movies DataFrame contains them.

In [4]:
def topn_series(scores: pd.Series, n=10) -> pd.Series:
    """Return the ``n`` highest-scoring entries of ``scores``, best first.

    Parameters
    ----------
    scores : pd.Series
        Numeric scores keyed by item.
    n : int or None, default 10
        Number of entries to keep. ``None`` (or any ``n`` at least as large
        as ``len(scores)``) returns the whole series sorted descending.
        ``n <= 0`` returns an empty series.

    Returns
    -------
    pd.Series
        The selected entries sorted by score in descending order.
    """
    if n is None:
        return scores.sort_values(ascending=False)
    if n <= 0:
        # Without this guard a negative n reaches the `[:n]` slice below,
        # which drops rows from the *end* instead of selecting none.
        return scores.iloc[:0]
    if n >= len(scores):
        return scores.sort_values(ascending=False)
    # A single kth is enough: argpartition puts the n largest scores (smallest
    # negated values) in the first n slots; the final sort orders just those.
    top_positions = np.argpartition(-scores.to_numpy(), n - 1)[:n]
    return scores.iloc[top_positions].sort_values(ascending=False)

def with_titles(series: pd.Series, movies: pd.DataFrame) -> pd.DataFrame:
    """Turn a score series into a table and attach movie titles when available.

    Parameters
    ----------
    series : pd.Series
        Scores whose index holds movie IDs. The index may be unnamed or carry
        any name; it is always exposed as the ``movieId`` column.
    movies : pd.DataFrame
        Movie metadata. Titles are joined only if it has both ``movieId`` and
        ``title`` columns.

    Returns
    -------
    pd.DataFrame
        Columns ``movieId`` and ``score``, plus ``title`` (left-joined, so
        unmatched IDs get a missing title) when ``movies`` provides it.
    """
    # rename_axis sets the index name regardless of its previous name, so the
    # column produced by reset_index is always 'movieId' (previously an index
    # named anything other than 'movieId' broke the merge below).
    scored = series.rename('score').rename_axis('movieId').reset_index()
    if not {'movieId', 'title'}.issubset(movies.columns):
        return scored
    return scored.merge(movies[['movieId', 'title']], on='movieId', how='left')


## Raw item means (equivalent of pandas groupby)

In [5]:
# Plain per-movie average rating, stored as a Series named 'mean'.
item_means = (
    ratings
    .groupby('movieId')['rating']
    .mean()
    .rename('mean')
)
# Ten best raw means, with titles attached for display.
top_raw = with_titles(topn_series(item_means, n=10), movies)
top_raw

Unnamed: 0,movieId,score,title
0,318,4.364362,"Shawshank Redemption, The (1994)"
1,858,4.315848,"Godfather, The (1972)"
2,1248,4.259259,Touch of Evil (1958)
3,2959,4.258503,Fight Club (1999)
4,7502,4.247423,Band of Brothers (2001)
5,1203,4.246032,12 Angry Men (1957)
6,2859,4.22,Stop Making Sense (1984)
7,1221,4.218462,"Godfather: Part II, The (1974)"
8,296,4.217781,Pulp Fiction (1994)
9,2571,4.195359,"Matrix, The (1999)"


In [6]:
# Persist the raw per-movie means; the movieId index is written as a CSV column.
item_means.to_csv(
    PRED_DIR.joinpath('item_means_raw.csv'),
    index=True,
)


## 5) Damped item means (Bayesian mean)

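> Formula: $\displaystyle \hat{\mu}_i = \frac{\sum_{u} r_{ui} + \alpha \mu}{n_i + \alpha}$, where the sum runs over the users who rated item $i$, $\mu$ is the global mean rating, $n_i$ is the number of ratings of item $i$, and $\alpha$ is the damping factor (e.g., 5).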

In [7]:
# Damping strength: acts like ALPHA pseudo-ratings at the global mean per item.
ALPHA = 5.0  # matches the assignment

# Global mean rating and per-movie rating sum / count.
mu = ratings['rating'].mean()
grp = (
    ratings
    .groupby('movieId')['rating']
    .agg(['sum', 'count'])
)
# (sum of ratings + ALPHA * mu) / (number of ratings + ALPHA)
item_means_damped = (
    (grp['sum'] + ALPHA * mu) / (grp['count'] + ALPHA)
).rename('damped_mean')

# Ten best damped means, with titles attached for display.
top_damped = with_titles(topn_series(item_means_damped, n=10), movies)
top_damped


Unnamed: 0,movieId,score,title
0,318,4.356802,"Shawshank Redemption, The (1994)"
1,858,4.306888,"Godfather, The (1972)"
2,2959,4.252142,Fight Club (1999)
3,1203,4.226909,12 Angry Men (1957)
4,296,4.212007,Pulp Fiction (1994)
5,7502,4.210983,Band of Brothers (2001)
6,1221,4.207637,"Godfather: Part II, The (1974)"
7,1248,4.19526,Touch of Evil (1958)
8,2571,4.190223,"Matrix, The (1999)"
9,4226,4.187542,Memento (2000)


In [8]:
# Persist the damped per-movie means; the movieId index is written as a CSV column.
item_means_damped.to_csv(
    PRED_DIR.joinpath('item_means_damped.csv'),
    index=True,
)

# Cross-check with LensKit’s Bias model

In [9]:
# Try both import paths; different LKPY versions organize modules slightly differently.
try:
    from lenskit.algorithms.basic import Bias
except ImportError:
    # Catch only import failures (ModuleNotFoundError is a subclass) so that an
    # unrelated error raised while loading LensKit is not silently masked.
    from lenskit.basic import Bias  # fallback for some builds

# lenskit 0.14.4 expects columns named 'user', 'item', 'rating'
ratings_lk = ratings.rename(columns={'userId': 'user', 'movieId': 'item'})

# item-only bias → item means (global + item offset), using the same ALPHA
# damping as the manual computation so the two results are comparable.
bias = Bias(items=True, users=False, damping=ALPHA)
bias.fit(ratings_lk)  # expects user, item, rating

# In recent LKPY versions, use bias.mean_ instead of bias.global_mean
lkpy_item_means = pd.Series(
    bias.mean_ + bias.item_offsets_,
    index=bias.item_offsets_.index, name='mean_lkpy'
)
# Convert index back to movieId for comparison
lkpy_item_means.index.name = 'movieId'
lkpy_item_means.index = lkpy_item_means.index.astype(ratings['movieId'].dtype)

# Check they match our manual damped means: align both series on movieId,
# drop items missing from either side, then show the largest absolute
# difference together with a preview of the aligned table.
chk = pd.concat([item_means_damped, lkpy_item_means], axis=1).dropna()
float(np.max(np.abs(chk['damped_mean'] - chk['mean_lkpy']))), chk.head()


(8.881784197001252e-16,
          damped_mean  mean_lkpy
 movieId                        
 1           3.790460   3.790460
 2           3.077536   3.077536
 3           2.958076   2.958076
 4           2.834462   2.834462
 5           2.889140   2.889140)



## 7) Basic Association Rules — $P(i\mid j)$

> $P(i\mid j) = \frac{|U_i \cap U_j|}{|U_j|}$, where $U_i$ is the set of users who rated item $i$. Each user’s set of rated items is treated as a basket.


In [10]:
from collections import Counter, defaultdict

def baskets_from_ratings(df: pd.DataFrame):
    """Collect, for every user, the set of movie IDs that user has rated.

    Returns a Series indexed by ``userId`` whose values are Python sets built
    from each user's ``movieId`` values.
    """
    def _as_set(movie_ids):
        return set(movie_ids.values)

    per_user = df.groupby('userId')['movieId']
    return per_user.apply(_as_set)

def basic_assoc_scores(df: pd.DataFrame, reference: int) -> pd.Series:
    """Score every item i by P(i | reference) over user baskets.

    The score is the fraction of users who rated ``reference`` that also
    rated i. Returns a Series named ``assoc_basic`` sorted descending, or an
    empty float Series when no user rated ``reference``.
    """
    # Keep only the baskets that contain the reference item.
    ref_baskets = [basket for basket in baskets_from_ratings(df) if reference in basket]
    n_ref_users = len(ref_baskets)
    if not n_ref_users:
        return pd.Series(dtype=float)

    # Count, per other item, how many of those baskets also contain it.
    co_counts = Counter()
    for basket in ref_baskets:
        co_counts.update(item for item in basket if item != reference)

    ratios = pd.Series(
        {item: hits / n_ref_users for item, hits in co_counts.items()},
        name='assoc_basic',
    )
    return ratios.sort_values(ascending=False)

# Reference item for the basic association scores.
# Star Wars in ML-1M/100k; adjust to your data if needed
REFERENCE = 260
basic_scores = basic_assoc_scores(df=ratings, reference=REFERENCE)
# Show the ten strongest associations with titles attached.
with_titles(topn_series(basic_scores, n=10), movies=movies)


Unnamed: 0,movieId,score,title
0,2571,0.915888,"Matrix, The (1999)"
1,1196,0.899065,Star Wars: Episode V - The Empire Strikes Back...
2,4993,0.891589,"Lord of the Rings: The Fellowship of the Ring,..."
3,1210,0.846729,Star Wars: Episode VI - Return of the Jedi (1983)
4,356,0.842991,Forrest Gump (1994)
5,5952,0.841121,"Lord of the Rings: The Two Towers, The (2002)"
6,7153,0.829907,"Lord of the Rings: The Return of the King, The..."
7,296,0.828037,Pulp Fiction (1994)
8,1198,0.790654,Raiders of the Lost Ark (Indiana Jones and the...
9,480,0.788785,Jurassic Park (1993)


In [11]:
# Persist all basic association scores for the chosen reference item.
basic_scores.to_csv(
    PRED_DIR.joinpath(f'basic_assoc_ref_{REFERENCE}.csv'),
    index=True,
)

## 8) Lift Association Rules — $\text{lift}(i\mid j)=\frac{P(i\land j)}{P(i)P(j)}$

> Equivalent with counts: $\text{lift} = \frac{|U_i\cap U_j|\cdot |U|}{|U_i|\cdot |U_j|}$.


In [12]:
def lift_assoc_scores(df: pd.DataFrame, reference: int) -> pd.Series:
    """Score every item i by lift(i | reference) over user baskets.

    lift = (|U_i ∩ U_ref| * |U|) / (|U_i| * |U_ref|), where |U| is the number
    of baskets. Returns a Series named ``assoc_lift`` sorted descending, or an
    empty float Series when there are no baskets or no user rated
    ``reference``.
    """
    baskets = baskets_from_ratings(df)
    n_users = len(baskets)
    if n_users == 0:
        return pd.Series(dtype=float)

    # Single pass: per-item user counts over all baskets, and co-occurrence
    # counts over the baskets that contain the reference item.
    item_users = Counter()
    co_counts = Counter()
    for basket in baskets:
        item_users.update(basket)
        if reference in basket:
            co_counts.update(item for item in basket if item != reference)

    n_ref_users = item_users.get(reference, 0)
    if n_ref_users == 0:
        return pd.Series(dtype=float)

    lift = {
        item: (joint * n_users) / (item_users[item] * n_ref_users)
        for item, joint in co_counts.items()
        if item_users.get(item, 0) > 0
    }
    return pd.Series(lift, name='assoc_lift').sort_values(ascending=False)

# Reference item for the lift association scores.
# Iron Giant in example; change if not in your data
REFERENCE_LIFT = 2761
lift_scores = lift_assoc_scores(df=ratings, reference=REFERENCE_LIFT)
# Show the ten highest-lift items with titles attached.
with_titles(topn_series(lift_scores, n=10), movies=movies)

Unnamed: 0,movieId,score,title
0,631,4.897727,All Dogs Go to Heaven 2 (1996)
1,2532,4.810268,Conquest of the Planet of the Apes (1972)
2,3615,4.545703,Dinosaur (2000)
3,340,4.489583,"War, The (1994)"
4,1016,4.489583,"Shaggy Dog, The (1959)"
5,2439,4.489583,Affliction (1997)
6,1649,4.489583,"Fast, Cheap & Out of Control (1997)"
7,332,4.377344,Village of the Damned (1995)
8,2736,4.329241,Brighton Beach Memoirs (1986)
9,3213,4.316907,Batman: Mask of the Phantasm (1993)


In [13]:
# Persist all lift association scores for the chosen reference item.
lift_scores.to_csv(
    PRED_DIR.joinpath(f'lift_assoc_ref_{REFERENCE_LIFT}.csv'),
    index=True,
)

# Quick sanity checks (match the handout)

In [14]:
# Spot-check two movies; adjust the IDs if your dataset differs from the handout.
check_ids = [2959, 1203]  # Fight Club, 12 Angry Men (if present)
# Look up both kinds of mean for each ID; IDs absent from the data yield NaN.
disp = pd.DataFrame(
    {
        'movieId': check_ids,
        'mean': [item_means.get(movie_id, np.nan) for movie_id in check_ids],
        'damped_mean': [item_means_damped.get(movie_id, np.nan) for movie_id in check_ids],
    }
)
# Title each column separately, then join the two tables side by side.
with_titles(disp.set_index('movieId')['mean'], movies).merge(
    right=with_titles(disp.set_index('movieId')['damped_mean'], movies),
    on=['movieId', 'title'],
    suffixes=('_raw', '_damped'),
)


Unnamed: 0,movieId,score_raw,title,score_damped
0,2959,4.258503,Fight Club (1999),4.252142
1,1203,4.246032,12 Angry Men (1957),4.226909


# Save pretty Top-N tables

In [15]:
# Top-10 tables for the two kinds of item mean (row index omitted from the CSV).
top_raw.to_csv(PRED_DIR.joinpath('top10_raw_means.csv'), index=False)
top_damped.to_csv(PRED_DIR.joinpath('top10_damped_means.csv'), index=False)

# Top-50 association tables, with titles, for each reference item.
with_titles(topn_series(basic_scores, n=50), movies).to_csv(
    PRED_DIR.joinpath(f'top_basic_assoc_{REFERENCE}.csv'),
    index=False,
)
with_titles(topn_series(lift_scores, n=50), movies).to_csv(
    PRED_DIR.joinpath(f'top_lift_assoc_{REFERENCE_LIFT}.csv'),
    index=False,
)

# Show where the files were written.
PRED_DIR

PosixPath('/workspace/predictions/processed')