In [None]:
import os, sys, subprocess, glob, random
import numpy as np
import pandas as pd
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
import warnings
# Suppress every warning for the rest of the session (this also hides
# deprecation / numerical warnings raised by the libraries imported above).
warnings.filterwarnings("ignore")
# One seed shared by Python's `random` module and NumPy's global RNG so that
# sampling done later in the notebook is repeatable.
RND = 42
random.seed(RND)
np.random.seed(RND)

In [None]:
# Colab-specific upload helper; opens a browser file chooser.
from google.colab import files
print("Step 1: Please upload your kaggle.json (Colab will prompt a file chooser)...")
# `uploaded` is indexed by file name and holds the file bytes (it is written
# with mode "wb" below). The stored name may differ from "kaggle.json": the
# recorded run shows Colab saving it as "kaggle (5).json".
uploaded = files.upload()

Step 1: Please upload your kaggle.json (Colab will prompt a file chooser)...


Saving kaggle.json to kaggle (5).json


In [None]:
# Install the uploaded Kaggle API token at ~/.kaggle/kaggle.json with
# owner-only permissions (0o600).
kaggle_token_path = os.path.expanduser("~/.kaggle/kaggle.json")
os.makedirs(os.path.dirname(kaggle_token_path), exist_ok=True)
if "kaggle.json" in uploaded:
    src = "kaggle.json"
else:
    # Colab may rename the upload (e.g. "kaggle (5).json"), so fall back to
    # the first uploaded file whose name ends in .json.
    json_files = [f for f in uploaded.keys() if f.lower().endswith(".json")]
    if not json_files:
        raise FileNotFoundError("No kaggle.json uploaded. Please upload your Kaggle API token file.")
    # Take the first *JSON* match; the first uploaded key overall is not
    # guaranteed to be a JSON file when several files were uploaded.
    src = json_files[0]
# Context manager guarantees the token is flushed and the handle closed
# before permissions are tightened.
with open(kaggle_token_path, "wb") as token_file:
    token_file.write(uploaded[src])
os.chmod(kaggle_token_path, 0o600)
print(f"{src} saved to ~/.kaggle/kaggle.json")


kaggle (5).json saved to ~/.kaggle/kaggle.json


In [None]:
# Fetch the MovieLens 100K archive into ./data via the Kaggle CLI, trying each
# dataset slug in order and stopping at the first one that succeeds.
os.makedirs("./data", exist_ok=True)
datasets_to_try = [
    "grouplens/movielens-100k",           # official source; Kaggle may require accepting its terms first
    "prajitdatta/movielens-100k-dataset"  # mirror, tried when the official slug fails
]

downloaded = False
for ds in datasets_to_try:
    print(f"\nAttempting to download dataset: {ds} ...")
    kaggle_cmd = ["kaggle", "datasets", "download", "-d", ds, "-p", "./data"]
    ret = subprocess.run(kaggle_cmd, capture_output=True, text=True)
    print(ret.stdout)
    if ret.returncode != 0:
        # Report the failure and move on to the next slug.
        print("Download failed or returned non-zero. stderr:")
        print(ret.stderr)
        continue
    print("Download command returned success.")
    downloaded = True
    break

# Every slug failed: stop here with instructions instead of failing later.
if not downloaded:
    raise RuntimeError(
        "Failed to download MovieLens from Kaggle. Common reasons:\n"
        "- You need to ACCEPT the dataset license on Kaggle (open the dataset page and click Accept).\n"
        "- Kaggle rate limits or network issues.\n\n"
        "Open https://www.kaggle.com/datasets/grouplens/movielens-100k in your browser, accept the terms, then re-run this cell."
    )


Attempting to download dataset: grouplens/movielens-100k ...
403 Client Error: Forbidden for url: https://www.kaggle.com/api/v1/datasets/metadata/grouplens/movielens-100k

Download failed or returned non-zero. stderr:


Attempting to download dataset: prajitdatta/movielens-100k-dataset ...
Dataset URL: https://www.kaggle.com/datasets/prajitdatta/movielens-100k-dataset
License(s): CC0-1.0
movielens-100k-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)

Download command returned success.


In [None]:
# Choose one downloaded archive and extract it into ./data.
zip_candidates = glob.glob("./data/*.zip")
if not zip_candidates:
    raise FileNotFoundError("No .zip files found in ./data after download. Check the download step or dataset filename.")
# Prefer an archive whose file name mentions MovieLens; otherwise take the
# first archive found.
for z in zip_candidates:
    archive_name = os.path.basename(z).lower()
    if any(tag in archive_name for tag in ("movielens", "ml-100k")):
        zip_file = z
        break
else:
    zip_file = zip_candidates[0]
print(f"Unzipping {zip_file} ...")
unzip_cmd = ["unzip", "-o", zip_file, "-d", "./data"]  # -o: overwrite without prompting
ret = subprocess.run(unzip_cmd, capture_output=True, text=True)
print(ret.stdout)
# A non-zero exit status is only reported, not raised.
if ret.returncode != 0:
    print("unzip stderr:", ret.stderr)

Unzipping ./data/movielens-100k-dataset.zip ...
Archive:  ./data/movielens-100k-dataset.zip
  inflating: ./data/ml-100k/README   
  inflating: ./data/ml-100k/allbut.pl  
  inflating: ./data/ml-100k/mku.sh   
  inflating: ./data/ml-100k/u.data   
  inflating: ./data/ml-100k/u.genre  
  inflating: ./data/ml-100k/u.info   
  inflating: ./data/ml-100k/u.item   
  inflating: ./data/ml-100k/u.occupation  
  inflating: ./data/ml-100k/u.user   
  inflating: ./data/ml-100k/u1.base  
  inflating: ./data/ml-100k/u1.test  
  inflating: ./data/ml-100k/u2.base  
  inflating: ./data/ml-100k/u2.test  
  inflating: ./data/ml-100k/u3.base  
  inflating: ./data/ml-100k/u3.test  
  inflating: ./data/ml-100k/u4.base  
  inflating: ./data/ml-100k/u4.test  
  inflating: ./data/ml-100k/u5.base  
  inflating: ./data/ml-100k/u5.test  
  inflating: ./data/ml-100k/ua.base  
  inflating: ./data/ml-100k/ua.test  
  inflating: ./data/ml-100k/ub.base  
  inflating: ./data/ml-100k/ub.test  



In [None]:
def find_ml100k_paths(base_dir="./data"):
    """Locate the MovieLens 100K ``u.data`` and ``u.item`` files under ``base_dir``.

    Search order: ``<base_dir>/ml-100k``, then ``base_dir`` itself, then every
    directory below ``base_dir`` (walked in sorted order so the result is
    deterministic). File names found by the walk are matched case-insensitively.

    The first directory that contains *both* files wins, so the returned pair
    always comes from the same extraction. If no single directory holds both,
    the first existing ``u.data`` and the first existing ``u.item`` are
    returned independently.

    Args:
        base_dir: Root directory to search.

    Returns:
        ``(u_data_path, u_item_path)``; either element is ``None`` when the
        corresponding file was not found.
    """
    # Typical locations first, so they take priority over anything the walk finds.
    candidates = [
        os.path.join(base_dir, "ml-100k", "u.data"),
        os.path.join(base_dir, "ml-100k", "u.item"),
        os.path.join(base_dir, "u.data"),
        os.path.join(base_dir, "u.item"),
    ]
    for root, dirnames, filenames in os.walk(base_dir):
        dirnames.sort()  # in-place sort makes os.walk descend in a stable order
        for fname in sorted(filenames):
            if fname.lower() in ("u.data", "u.item"):
                candidates.append(os.path.join(root, fname))

    first_seen = {}  # "u.data" / "u.item" -> first existing path of that kind
    by_dir = {}      # directory -> {kind: path}; dicts keep insertion (= priority) order
    for path in candidates:
        if not os.path.isfile(path):
            continue
        kind = os.path.basename(path).lower()
        first_seen.setdefault(kind, path)
        by_dir.setdefault(os.path.dirname(path), {}).setdefault(kind, path)

    # Prefer a directory that holds the complete pair.
    for found in by_dir.values():
        if "u.data" in found and "u.item" in found:
            return found["u.data"], found["u.item"]
    return first_seen.get("u.data"), first_seen.get("u.item")

# Resolve the two files needed below; abort early with a clear message if
# either one is missing (find_ml100k_paths returns None for a missing file).
u_data_path, u_item_path = find_ml100k_paths("./data")
if not u_data_path or not u_item_path:
    raise FileNotFoundError("Could not locate u.data and u.item after unzipping. Check that the MovieLens 100K files exist under ./data or ./data/ml-100k.")

print("Found u.data at:", u_data_path)
print("Found u.item at:", u_item_path)

Found u.data at: ./data/ml-100k/u.data
Found u.item at: ./data/ml-100k/u.item


In [None]:
print("\nLoading dataset into pandas...")
# u.data is read as tab-separated with no header row; column names are supplied here.
ratings = pd.read_csv(u_data_path, sep='\t', names=['user_id','movie_id','rating','timestamp'], encoding='latin-1')
# u.item is read as pipe-separated with no header row.
movies = pd.read_csv(u_item_path, sep='|', header=None, encoding='latin-1', low_memory=False)
# Keep only the first two columns (movie id and title); the remaining columns
# are not used by this notebook and may differ between mirrors.
movies = movies.iloc[:, :2]
movies.columns = ['movie_id', 'title']
print("Ratings:", ratings.shape, "Movies:", movies.shape)



Loading dataset into pandas...
Ratings: (100000, 4) Movies: (1682, 2)


In [None]:
def leave_one_out(df, seed=RND):
    """Split ratings into train/test by holding out one random rating per user.

    Args:
        df: DataFrame with a ``user_id`` column. Its index labels are used to
            select and drop the held-out row, so they should be unique.
        seed: Seed for the ``np.random.RandomState`` that picks held-out rows.

    Returns:
        ``(train_df, test_df)``, each re-indexed 0..n-1. Users with a single
        rating stay entirely in the training set. When nothing can be held out
        (or ``df`` is empty) the corresponding frame is empty but keeps
        ``df``'s columns instead of raising.
    """
    rng = np.random.RandomState(seed)
    train_parts = []
    test_parts = []
    # One pass over the groups instead of re-filtering the whole frame per
    # user. sort=False keeps users in first-appearance order, so the sequence
    # of RNG draws (and therefore the split) is the same as scanning
    # df['user_id'].unique() in order.
    for _, sub in df.groupby('user_id', sort=False):
        if len(sub) <= 1:
            train_parts.append(sub)
            continue
        idx = rng.choice(sub.index, 1, replace=False)
        test_parts.append(sub.loc[idx])
        train_parts.append(sub.drop(idx))
    # pd.concat raises on an empty list; fall back to an empty frame with the
    # same columns.
    empty = df.iloc[0:0]
    train_df = pd.concat(train_parts if train_parts else [empty]).reset_index(drop=True)
    test_df = pd.concat(test_parts if test_parts else [empty]).reset_index(drop=True)
    return train_df, test_df

# Hold out one rating per user for evaluation; everything else is training data.
train_df, test_df = leave_one_out(ratings)
print("Train ratings:", len(train_df), "Test (held-out) ratings:", len(test_df))


Train ratings: 99057 Test (held-out) ratings: 943


In [None]:
# Users x movies rating matrix built from the training split (NaN = not rated).
# Columns are re-indexed to the full movie catalogue so the matrix shape does
# not depend on which ratings were held out.
all_movie_ids = movies['movie_id'].unique().tolist()
train_ui = (
    train_df
    .pivot_table(index='user_id', columns='movie_id', values='rating')
    .reindex(columns=all_movie_ids)
)
print("Train user-item matrix shape:", train_ui.shape)

# Lookup tables between ids and matrix positions, plus movie id -> title.
user_to_idx = {user: pos for pos, user in enumerate(train_ui.index)}
idx_to_user = {pos: user for user, pos in user_to_idx.items()}
movie_to_idx = {movie: pos for pos, movie in enumerate(train_ui.columns)}
idx_to_movie = {pos: movie for movie, pos in movie_to_idx.items()}
id2title = {movie: title for movie, title in zip(movies['movie_id'], movies['title'])}

Train user-item matrix shape: (943, 1682)


In [None]:
def compute_user_sim(train_ui, normalize=True, fillna=0.0):
    """Return the user-user cosine similarity matrix with a zeroed diagonal.

    Args:
        train_ui: Users x items DataFrame of ratings; NaN marks a missing rating.
        normalize: When true, subtract each user's mean rating from that
            user's row before computing similarities.
        fillna: Value substituted for missing entries before the cosine step.

    Returns:
        Square array with one row/column per row of ``train_ui``; the diagonal
        is set to 0.0 so a user is never their own neighbour.
    """
    ratings_matrix = train_ui
    if normalize:
        row_means = train_ui.mean(axis=1)
        ratings_matrix = train_ui.sub(row_means, axis=0)
    dense = ratings_matrix.fillna(fillna).to_numpy()
    sim = cosine_similarity(dense)
    np.fill_diagonal(sim, 0.0)
    return sim

print("Computing user-user similarity (cosine on demeaned ratings)...")
# Row/column order of user_sim follows train_ui.index (see user_to_idx).
user_sim = compute_user_sim(train_ui, normalize=True)


Computing user-user similarity (cosine on demeaned ratings)...


In [None]:
def recommend_user_based(user_id, top_k=10, n_neighbors=20):
    """Recommend unseen movies from the ratings of the user's most similar users.

    Reads the notebook globals ``user_to_idx``, ``user_sim``, ``train_ui`` and
    ``idx_to_movie``.

    Args:
        user_id: Id of the target user (a label of ``train_ui.index``).
        top_k: Maximum number of recommendations to return.
        n_neighbors: Maximum number of most-similar users to average over.

    Returns:
        List of ``(movie_id, predicted_score)`` tuples, best first. Empty for
        an unknown user; may hold fewer than ``top_k`` entries when few movies
        can be scored.
    """
    if user_id not in user_to_idx:
        return []
    uidx = user_to_idx[user_id]
    sims = user_sim[uidx].copy()
    sims[uidx] = 0.0
    neigh_idx = np.argsort(sims)[::-1][:n_neighbors]
    # Keep only positively similar neighbours. A user with fewer than
    # n_neighbors positive similarities would otherwise pull in negative
    # weights, which can cancel the denominator and produce +/-inf scores.
    # Zero-weight neighbours contribute nothing either way.
    neigh_idx = neigh_idx[sims[neigh_idx] > 0]
    neigh_sims = sims[neigh_idx]
    R = train_ui.values  # users x items (movie_id columns)
    neigh_ratings = R[neigh_idx, :]
    rated = ~np.isnan(neigh_ratings)
    # Similarity-weighted mean over the neighbours that actually rated each movie.
    numer = (neigh_sims[:, None] * np.nan_to_num(neigh_ratings, nan=0.0)).sum(axis=0)
    denom = (neigh_sims[:, None] * rated).sum(axis=0)
    with np.errstate(divide='ignore', invalid='ignore'):
        preds = numer / denom
    # Movies no neighbour rated give 0/0 = NaN; mask every non-finite score so
    # it can never rank.
    preds[~np.isfinite(preds)] = -np.inf
    # Never recommend something the user already rated.
    preds[~np.isnan(R[uidx])] = -np.inf
    top_idx = np.argsort(preds)[::-1][:top_k]
    return [(idx_to_movie[i], float(preds[i])) for i in top_idx if preds[i] != -np.inf]

In [None]:
print("Computing item-item similarity (cosine)...")
# Raw ratings with missing entries set to 0 (no mean-centring here),
# transposed so each row is one movie.
item_mat = train_ui.fillna(0.0).values.T  # items x users
item_sim = cosine_similarity(item_mat)
# Zero the diagonal so a movie is never its own neighbour.
np.fill_diagonal(item_sim, 0.0)


Computing item-item similarity (cosine)...


In [None]:
# Full Colab-ready pipeline: upload kaggle.json -> download MovieLens100k -> run recommenders + eval
# Run this whole cell in Google Colab.

# 0) Imports
import os, sys, subprocess, glob, random
import numpy as np
import pandas as pd
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
import warnings
# Suppress every warning for the rest of the session (this also hides
# deprecation / numerical warnings raised by the libraries imported above).
warnings.filterwarnings("ignore")
# One seed shared by Python's `random` module and NumPy's global RNG so that
# sampling done later in the notebook is repeatable.
RND = 42
random.seed(RND)
np.random.seed(RND)

# 1) Upload kaggle.json (Colab UI)
from google.colab import files
print("Step 1: Please upload your kaggle.json (Colab will prompt a file chooser)...")
uploaded = files.upload()  # choose kaggle.json

# 2) Setup ~/.kaggle: install the token with owner-only permissions (0o600)
kaggle_token_path = os.path.expanduser("~/.kaggle/kaggle.json")
os.makedirs(os.path.dirname(kaggle_token_path), exist_ok=True)
if "kaggle.json" in uploaded:
    src = "kaggle.json"
else:
    # Colab may rename the upload (e.g. "kaggle (6).json"), so fall back to
    # the first uploaded file whose name ends in .json.
    json_files = [f for f in uploaded.keys() if f.lower().endswith(".json")]
    if not json_files:
        raise FileNotFoundError("No kaggle.json uploaded. Please upload your Kaggle API token file.")
    # Take the first *JSON* match; the first uploaded key overall is not
    # guaranteed to be a JSON file when several files were uploaded.
    src = json_files[0]
# Context manager guarantees the token is flushed and the handle closed
# before permissions are tightened.
with open(kaggle_token_path, "wb") as token_file:
    token_file.write(uploaded[src])
os.chmod(kaggle_token_path, 0o600)
print(f"{src} saved to ~/.kaggle/kaggle.json")

# 3) Download MovieLens 100K via the Kaggle CLI, trying each dataset slug in
#    order and stopping at the first one that succeeds.
os.makedirs("./data", exist_ok=True)
datasets_to_try = [
    "grouplens/movielens-100k",           # official source; Kaggle may require accepting its terms first
    "prajitdatta/movielens-100k-dataset"  # mirror, tried when the official slug fails
]

downloaded = False
for ds in datasets_to_try:
    print(f"\nAttempting to download dataset: {ds} ...")
    kaggle_cmd = ["kaggle", "datasets", "download", "-d", ds, "-p", "./data"]
    ret = subprocess.run(kaggle_cmd, capture_output=True, text=True)
    print(ret.stdout)
    if ret.returncode != 0:
        # Report the failure and move on to the next slug.
        print("Download failed or returned non-zero. stderr:")
        print(ret.stderr)
        continue
    print("Download command returned success.")
    downloaded = True
    break

# Every slug failed: stop here with instructions instead of failing later.
if not downloaded:
    raise RuntimeError(
        "Failed to download MovieLens from Kaggle. Common reasons:\n"
        "- You need to ACCEPT the dataset license on Kaggle (open the dataset page and click Accept).\n"
        "- Kaggle rate limits or network issues.\n\n"
        "Open https://www.kaggle.com/datasets/grouplens/movielens-100k in your browser, accept the terms, then re-run this cell."
    )

# 4) Unzip one archive from ./data
zip_candidates = glob.glob("./data/*.zip")
if not zip_candidates:
    raise FileNotFoundError("No .zip files found in ./data after download. Check the download step or dataset filename.")
# Prefer an archive whose file name mentions MovieLens; otherwise take the
# first archive found.
for z in zip_candidates:
    archive_name = os.path.basename(z).lower()
    if any(tag in archive_name for tag in ("movielens", "ml-100k")):
        zip_file = z
        break
else:
    zip_file = zip_candidates[0]
print(f"Unzipping {zip_file} ...")
unzip_cmd = ["unzip", "-o", zip_file, "-d", "./data"]  # -o: overwrite without prompting
ret = subprocess.run(unzip_cmd, capture_output=True, text=True)
print(ret.stdout)
# A non-zero exit status is only reported, not raised.
if ret.returncode != 0:
    print("unzip stderr:", ret.stderr)

# 5) Locate u.data and u.item (support various extraction structures)
def find_ml100k_paths(base_dir="./data"):
    """Locate the MovieLens 100K ``u.data`` and ``u.item`` files under ``base_dir``.

    Search order: ``<base_dir>/ml-100k``, then ``base_dir`` itself, then every
    directory below ``base_dir`` (walked in sorted order so the result is
    deterministic). File names found by the walk are matched case-insensitively.

    The first directory that contains *both* files wins, so the returned pair
    always comes from the same extraction. If no single directory holds both,
    the first existing ``u.data`` and the first existing ``u.item`` are
    returned independently.

    Args:
        base_dir: Root directory to search.

    Returns:
        ``(u_data_path, u_item_path)``; either element is ``None`` when the
        corresponding file was not found.
    """
    # Typical locations first, so they take priority over anything the walk finds.
    candidates = [
        os.path.join(base_dir, "ml-100k", "u.data"),
        os.path.join(base_dir, "ml-100k", "u.item"),
        os.path.join(base_dir, "u.data"),
        os.path.join(base_dir, "u.item"),
    ]
    for root, dirnames, filenames in os.walk(base_dir):
        dirnames.sort()  # in-place sort makes os.walk descend in a stable order
        for fname in sorted(filenames):
            if fname.lower() in ("u.data", "u.item"):
                candidates.append(os.path.join(root, fname))

    first_seen = {}  # "u.data" / "u.item" -> first existing path of that kind
    by_dir = {}      # directory -> {kind: path}; dicts keep insertion (= priority) order
    for path in candidates:
        if not os.path.isfile(path):
            continue
        kind = os.path.basename(path).lower()
        first_seen.setdefault(kind, path)
        by_dir.setdefault(os.path.dirname(path), {}).setdefault(kind, path)

    # Prefer a directory that holds the complete pair.
    for found in by_dir.values():
        if "u.data" in found and "u.item" in found:
            return found["u.data"], found["u.item"]
    return first_seen.get("u.data"), first_seen.get("u.item")

# Resolve the two files needed below; abort early with a clear message if
# either one is missing (find_ml100k_paths returns None for a missing file).
u_data_path, u_item_path = find_ml100k_paths("./data")
if not u_data_path or not u_item_path:
    raise FileNotFoundError("Could not locate u.data and u.item after unzipping. Check that the MovieLens 100K files exist under ./data or ./data/ml-100k.")

print("Found u.data at:", u_data_path)
print("Found u.item at:", u_item_path)

# 6) Load the MovieLens files
print("\nLoading dataset into pandas...")
# u.data is read as tab-separated with no header row; column names are supplied here.
ratings = pd.read_csv(u_data_path, sep='\t', names=['user_id','movie_id','rating','timestamp'], encoding='latin-1')
# u.item is read as pipe-separated with no header row.
movies = pd.read_csv(u_item_path, sep='|', header=None, encoding='latin-1', low_memory=False)
# Keep only the first two columns (movie id and title); the remaining columns
# are not used by this notebook and may differ between mirrors.
movies = movies.iloc[:, :2]
movies.columns = ['movie_id', 'title']
print("Ratings:", ratings.shape, "Movies:", movies.shape)

# 7) Build leave-one-out train/test (one holdout per user)
def leave_one_out(df, seed=RND):
    """Split ratings into train/test by holding out one random rating per user.

    Args:
        df: DataFrame with a ``user_id`` column. Its index labels are used to
            select and drop the held-out row, so they should be unique.
        seed: Seed for the ``np.random.RandomState`` that picks held-out rows.

    Returns:
        ``(train_df, test_df)``, each re-indexed 0..n-1. Users with a single
        rating stay entirely in the training set. When nothing can be held out
        (or ``df`` is empty) the corresponding frame is empty but keeps
        ``df``'s columns instead of raising.
    """
    rng = np.random.RandomState(seed)
    train_parts = []
    test_parts = []
    # One pass over the groups instead of re-filtering the whole frame per
    # user. sort=False keeps users in first-appearance order, so the sequence
    # of RNG draws (and therefore the split) is the same as scanning
    # df['user_id'].unique() in order.
    for _, sub in df.groupby('user_id', sort=False):
        if len(sub) <= 1:
            train_parts.append(sub)
            continue
        idx = rng.choice(sub.index, 1, replace=False)
        test_parts.append(sub.loc[idx])
        train_parts.append(sub.drop(idx))
    # pd.concat raises on an empty list; fall back to an empty frame with the
    # same columns.
    empty = df.iloc[0:0]
    train_df = pd.concat(train_parts if train_parts else [empty]).reset_index(drop=True)
    test_df = pd.concat(test_parts if test_parts else [empty]).reset_index(drop=True)
    return train_df, test_df

# Hold out one rating per user for evaluation; everything else is training data.
train_df, test_df = leave_one_out(ratings)
print("Train ratings:", len(train_df), "Test (held-out) ratings:", len(test_df))

# 8) Users x movies rating matrix built from the training split (NaN = not
#    rated). Columns are re-indexed to the full movie catalogue so the matrix
#    shape does not depend on which ratings were held out.
all_movie_ids = movies['movie_id'].unique().tolist()
train_ui = (
    train_df
    .pivot_table(index='user_id', columns='movie_id', values='rating')
    .reindex(columns=all_movie_ids)
)
print("Train user-item matrix shape:", train_ui.shape)

# Lookup tables between ids and matrix positions, plus movie id -> title.
user_to_idx = {user: pos for pos, user in enumerate(train_ui.index)}
idx_to_user = {pos: user for user, pos in user_to_idx.items()}
movie_to_idx = {movie: pos for pos, movie in enumerate(train_ui.columns)}
idx_to_movie = {pos: movie for movie, pos in movie_to_idx.items()}
id2title = {movie: title for movie, title in zip(movies['movie_id'], movies['title'])}

# 9) User-based CF: compute user-user cosine similarity on demeaned ratings (better practice)
def compute_user_sim(train_ui, normalize=True, fillna=0.0):
    """Return the user-user cosine similarity matrix with a zeroed diagonal.

    Args:
        train_ui: Users x items DataFrame of ratings; NaN marks a missing rating.
        normalize: When true, subtract each user's mean rating from that
            user's row before computing similarities.
        fillna: Value substituted for missing entries before the cosine step.

    Returns:
        Square array with one row/column per row of ``train_ui``; the diagonal
        is set to 0.0 so a user is never their own neighbour.
    """
    ratings_matrix = train_ui
    if normalize:
        row_means = train_ui.mean(axis=1)
        ratings_matrix = train_ui.sub(row_means, axis=0)
    dense = ratings_matrix.fillna(fillna).to_numpy()
    sim = cosine_similarity(dense)
    np.fill_diagonal(sim, 0.0)
    return sim

print("Computing user-user similarity (cosine on demeaned ratings)...")
# Row/column order of user_sim follows train_ui.index (see user_to_idx).
user_sim = compute_user_sim(train_ui, normalize=True)

# user-based recommender returning movie_id + score
def recommend_user_based(user_id, top_k=10, n_neighbors=20):
    """Recommend unseen movies from the ratings of the user's most similar users.

    Reads the notebook globals ``user_to_idx``, ``user_sim``, ``train_ui`` and
    ``idx_to_movie``.

    Args:
        user_id: Id of the target user (a label of ``train_ui.index``).
        top_k: Maximum number of recommendations to return.
        n_neighbors: Maximum number of most-similar users to average over.

    Returns:
        List of ``(movie_id, predicted_score)`` tuples, best first. Empty for
        an unknown user; may hold fewer than ``top_k`` entries when few movies
        can be scored.
    """
    if user_id not in user_to_idx:
        return []
    uidx = user_to_idx[user_id]
    sims = user_sim[uidx].copy()
    sims[uidx] = 0.0
    neigh_idx = np.argsort(sims)[::-1][:n_neighbors]
    # Keep only positively similar neighbours. A user with fewer than
    # n_neighbors positive similarities would otherwise pull in negative
    # weights, which can cancel the denominator and produce +/-inf scores.
    # Zero-weight neighbours contribute nothing either way.
    neigh_idx = neigh_idx[sims[neigh_idx] > 0]
    neigh_sims = sims[neigh_idx]
    R = train_ui.values  # users x items (movie_id columns)
    neigh_ratings = R[neigh_idx, :]
    rated = ~np.isnan(neigh_ratings)
    # Similarity-weighted mean over the neighbours that actually rated each movie.
    numer = (neigh_sims[:, None] * np.nan_to_num(neigh_ratings, nan=0.0)).sum(axis=0)
    denom = (neigh_sims[:, None] * rated).sum(axis=0)
    with np.errstate(divide='ignore', invalid='ignore'):
        preds = numer / denom
    # Movies no neighbour rated give 0/0 = NaN; mask every non-finite score so
    # it can never rank.
    preds[~np.isfinite(preds)] = -np.inf
    # Never recommend something the user already rated.
    preds[~np.isnan(R[uidx])] = -np.inf
    top_idx = np.argsort(preds)[::-1][:top_k]
    return [(idx_to_movie[i], float(preds[i])) for i in top_idx if preds[i] != -np.inf]

# 10) Item-based CF
print("Computing item-item similarity (cosine)...")
# Raw ratings with missing entries set to 0 (no mean-centring here),
# transposed so each row is one movie.
item_mat = train_ui.fillna(0.0).values.T  # items x users
item_sim = cosine_similarity(item_mat)
# Zero the diagonal so a movie is never its own neighbour.
np.fill_diagonal(item_sim, 0.0)

def recommend_item_based(user_id, top_k=10, n_neighbors=20):
    """Recommend unseen movies scored from the user's own ratings of similar movies.

    Reads the notebook globals ``user_to_idx``, ``train_ui``, ``item_sim`` and
    ``idx_to_movie``.

    Args:
        user_id: Id of the target user (a label of ``train_ui.index``).
        top_k: Maximum number of recommendations to return.
        n_neighbors: Maximum number of the user's rated movies used to score
            each candidate (those most similar to the candidate).

    Returns:
        List of ``(movie_id, predicted_score)`` tuples, best first. Empty for
        an unknown user, a user who rated everything, or a user with no
        ratings; candidates with no similar rated movie are skipped.
    """
    if user_id not in user_to_idx:
        return []
    user_row = train_ui.loc[user_id].values
    seen = ~np.isnan(user_row)
    unseen_idx = np.where(~seen)[0]
    # The set of rated movies does not change per candidate, so compute it once.
    rated_idx = np.where(seen)[0]
    if unseen_idx.size == 0 or rated_idx.size == 0:
        return []
    preds = np.full(user_row.shape, -np.inf, dtype=float)
    for j in unseen_idx:
        sims_to_rated = item_sim[j, rated_idx]
        # Positions (within rated_idx) of the most similar rated movies.
        order = np.argsort(sims_to_rated)[::-1][:n_neighbors]
        weights = sims_to_rated[order]
        neighbor_ratings = user_row[rated_idx[order]]
        weight_sum = weights.sum()
        if weight_sum == 0:
            # No rated movie is similar to this candidate; leave it unscored.
            continue
        # Similarity-weighted mean rating; the epsilon guards the division.
        preds[j] = np.dot(weights, neighbor_ratings) / (weight_sum + 1e-8)
    top_idx = np.argsort(preds)[::-1][:top_k]
    recs = [(idx_to_movie[i], float(preds[i])) for i in top_idx if preds[i] != -np.inf]
    return recs


Step 1: Please upload your kaggle.json (Colab will prompt a file chooser)...


Saving kaggle.json to kaggle (6).json
kaggle (6).json saved to ~/.kaggle/kaggle.json

Attempting to download dataset: grouplens/movielens-100k ...
403 Client Error: Forbidden for url: https://www.kaggle.com/api/v1/datasets/metadata/grouplens/movielens-100k

Download failed or returned non-zero. stderr:


Attempting to download dataset: prajitdatta/movielens-100k-dataset ...
Dataset URL: https://www.kaggle.com/datasets/prajitdatta/movielens-100k-dataset
License(s): CC0-1.0
movielens-100k-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)

Download command returned success.
Unzipping ./data/movielens-100k-dataset.zip ...
Archive:  ./data/movielens-100k-dataset.zip
  inflating: ./data/ml-100k/README   
  inflating: ./data/ml-100k/allbut.pl  
  inflating: ./data/ml-100k/mku.sh   
  inflating: ./data/ml-100k/u.data   
  inflating: ./data/ml-100k/u.genre  
  inflating: ./data/ml-100k/u.info   
  inflating: ./data/ml-100k/u.item   
  inflating: ./da

In [None]:
def svd_predict(train_ui, k=25):
    """Predict every user-item rating with a rank-``k`` truncated SVD.

    Missing ratings are first filled with the user's mean rating (or the
    global mean for a user with no ratings), the columns are mean-centred,
    and the centred matrix is approximated by its top ``k`` singular triplets.

    Args:
        train_ui: Users x items DataFrame of ratings; NaN marks a missing rating.
        k: Requested number of latent factors. Values below 2 are raised to 2,
            and the result is capped at ``min(train_ui.shape) - 1`` because
            ``svds`` requires ``k < min(shape)``.

    Returns:
        DataFrame of predicted ratings with the same index and columns as
        ``train_ui``.

    Raises:
        ValueError: If ``train_ui`` has fewer than 2 rows or 2 columns.
    """
    if min(train_ui.shape) < 2:
        raise ValueError("svd_predict needs at least 2 users and 2 movies to compute a truncated SVD.")
    user_means = train_ui.mean(axis=1)
    global_mean = train_ui.stack().mean()
    # Per-user fill value, applied to the whole matrix at once instead of
    # assigning row by row.
    fill_vals = user_means.fillna(global_mean).to_numpy(dtype=float)
    M = train_ui.to_numpy(dtype=float)
    M = np.where(np.isnan(M), fill_vals[:, None], M)
    # demean columns for svds stability
    col_mean = M.mean(axis=0)
    M_centered = M - col_mean
    # Keep the "at least 2 factors" floor, but never exceed what svds accepts
    # (a 2-row or 2-column matrix can only take k=1).
    k = min(max(k, 2), min(M_centered.shape) - 1)
    U, s, Vt = svds(M_centered, k=k)
    # (U * s) scales each left singular vector by its singular value, i.e. U @ diag(s).
    M_hat = (U * s) @ Vt + col_mean
    pred_df = pd.DataFrame(M_hat, index=train_ui.index, columns=train_ui.columns)
    return pred_df

print("Computing SVD predictions (this may take a few seconds)...")
# Dense users x movies frame of predicted ratings, indexed like train_ui.
pred_svd = svd_predict(train_ui, k=25)

Computing SVD predictions (this may take a few seconds)...


In [None]:

def recommend_svd(user_id, top_k=10):
    """Recommend the unseen movies with the highest SVD-predicted rating.

    Reads the notebook globals ``pred_svd``, ``train_ui`` and ``idx_to_movie``.

    Args:
        user_id: Id of the target user (a label of ``pred_svd.index``).
        top_k: Maximum number of recommendations to return.

    Returns:
        List of ``(movie_id, predicted_score)`` tuples, best first; empty for
        an unknown user.
    """
    if user_id not in pred_svd.index:
        return []
    scores = pred_svd.loc[user_id].values.copy()
    # Movies the user already rated in the training data are never recommended.
    already_rated = train_ui.loc[user_id].notna().values
    scores[already_rated] = -np.inf
    ranked = np.argsort(scores)[::-1][:top_k]
    return [
        (idx_to_movie[pos], float(scores[pos]))
        for pos in ranked
        if scores[pos] != -np.inf
    ]

In [None]:
# One (user_id, movie_id) pair per held-out rating in test_df.
test_pairs = list(zip(test_df['user_id'].values, test_df['movie_id'].values))
def precision_at_k_recommender(test_pairs, recommender_fn, k=10, **kwargs):
    """Fraction of held-out pairs whose movie appears in the user's top-``k`` list.

    Each pair scores 1.0 when the held-out movie is among the recommendations
    and 0.0 otherwise, so despite the name the value is a hit rate over pairs
    (it is not divided by ``k``).

    Args:
        test_pairs: Iterable of ``(user_id, held_out_movie_id)`` pairs.
        recommender_fn: Callable invoked as ``recommender_fn(user_id, top_k=k,
            **kwargs)`` and returning ``(movie_id, score)`` tuples.
        k: Length of the recommendation list requested per user.
        **kwargs: Extra keyword arguments forwarded to ``recommender_fn``.

    Returns:
        Mean of the per-pair outcomes, or ``0.0`` when there are no pairs.
    """
    outcomes = []
    for uid, true_mid in test_pairs:
        recommended = [mid for mid, _score in recommender_fn(uid, top_k=k, **kwargs)]
        outcomes.append(float(true_mid in recommended))
    if not outcomes:
        return 0.0
    return np.mean(outcomes)

# Length of each recommendation list used for evaluation.
K = 10
print(f"\nEvaluating Precision@{K} on {len(test_pairs)} held-out ratings...")
# NOTE: precision_at_k_recommender scores 1.0 per pair when the held-out movie
# is in the top-K list, so the numbers printed below are the fraction of users
# whose held-out movie was recommended (a hit rate), not hits divided by K.
prec_user = precision_at_k_recommender(test_pairs, recommend_user_based, k=K, n_neighbors=20)
prec_item = precision_at_k_recommender(test_pairs, recommend_item_based, k=K, n_neighbors=20)
prec_svd  = precision_at_k_recommender(test_pairs, recommend_svd, k=K)

print(f"Precision@{K}  -> UserCF: {prec_user:.4f} | ItemCF: {prec_item:.4f} | SVD: {prec_svd:.4f}")



Evaluating Precision@10 on 943 held-out ratings...
Precision@10  -> UserCF: 0.0095 | ItemCF: 0.0148 | SVD: 0.1251


In [None]:
# Write top-10 recommendations from all three models for five randomly chosen
# users to outputs/example_recommendations.csv.
os.makedirs("outputs", exist_ok=True)
sample_users = random.sample(list(train_ui.index), k=5)
# The neighbourhood models take n_neighbors; the SVD recommender does not.
model_specs = [
    ("user", recommend_user_based, {"n_neighbors": 20}),
    ("item", recommend_item_based, {"n_neighbors": 20}),
    ("svd", recommend_svd, {}),
]
rows = []
for uid in sample_users:
    for model_name, fn, extra_kwargs in model_specs:
        recs = fn(uid, top_k=10, **extra_kwargs)
        rows.extend(
            {"user_id": uid, "model": model_name, "rank": rank, "movie_id": mid, "title": id2title.get(mid, ""), "score": score}
            for rank, (mid, score) in enumerate(recs, start=1)
        )
pd.DataFrame(rows).to_csv("outputs/example_recommendations.csv", index=False)
print("Saved example_recommendations.csv to ./outputs/")

Saved example_recommendations.csv to ./outputs/


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme to the figures created below.
sns.set()
fig = plt.figure(figsize=(8,4))
# Histogram of the training ratings; stack() flattens the users x movies
# matrix into one long series (missing cells are presumably dropped by
# stack's default behaviour - confirm for the installed pandas version).
sns.histplot(train_ui.stack().values, bins=6)
plt.title("Ratings distribution (train)")
# Save to disk and close the figure so it is not also rendered inline.
plt.savefig("outputs/ratings_distribution.png")
plt.close(fig)

In [None]:
# Histogram of how densely each user has rated the catalogue.
fig2 = plt.figure(figsize=(8, 4))
# Per-user share of movie columns that hold a rating in the training matrix.
user_density = train_ui.notna().mean(axis=1)
sns.histplot(user_density.to_numpy(), bins=30)
plt.title("User density (fraction of movies rated)")
# Save to disk and close the figure so it is not also rendered inline.
plt.savefig("outputs/user_density.png")
plt.close(fig2)
print("Saved diagnostics images to ./outputs/")

Saved diagnostics images to ./outputs/


In [None]:
# Print the top-8 titles from each model for the sampled users.
print("\nSample recommendations (titles) for a few users:")
# (heading, recommender, extra keyword arguments) for each model, in print order.
sections = [
    (" User-based:", recommend_user_based, {"n_neighbors": 20}),
    (" Item-based:", recommend_item_based, {"n_neighbors": 20}),
    (" SVD-based:", recommend_svd, {}),
]
for uid in sample_users:
    print(f"\nUser {uid}:")
    for heading, recommender, extra_kwargs in sections:
        shown = recommender(uid, top_k=8, **extra_kwargs)
        print(heading)
        for mid, score in shown:
            # Fall back to the raw id when a title is missing.
            print("   ", id2title.get(mid, f"id:{mid}"), f"(score={score:.3f})")

print("\nAll done — outputs are in the ./outputs folder. If Kaggle download failed earlier, accept the dataset license on Kaggle and re-run the cell.")



Sample recommendations (titles) for a few users:

User 655:
 User-based:
    Maya Lin: A Strong Clear Vision (1994) (score=5.000)
    Top Hat (1935) (score=5.000)
    Kansas City (1996) (score=5.000)
    Drop Dead Fred (1991) (score=5.000)
    Old Lady Who Walked in the Sea, The (Vieille qui marchait dans la mer, La) (1991) (score=5.000)
    Billy Madison (1995) (score=5.000)
    Romper Stomper (1992) (score=5.000)
    Someone Else's America (1995) (score=5.000)
 Item-based:
    East of Eden (1955) (score=3.603)
    It's a Wonderful Life (1946) (score=3.587)
    Wild Bunch, The (1969) (score=3.587)
    Cat on a Hot Tin Roof (1958) (score=3.569)
    Singin' in the Rain (1952) (score=3.561)
    Thin Man, The (1934) (score=3.534)
    Apocalypse Now (1979) (score=3.510)
    Bringing Up Baby (1938) (score=3.507)
 SVD-based:
    It's a Wonderful Life (1946) (score=3.665)
    Some Like It Hot (1959) (score=3.635)
    Sunset Blvd. (1950) (score=3.476)
    Ben-Hur (1959) (score=3.420)
    Good