In [16]:
import os
import numpy as np
import pandas as pd
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD


In [17]:
# Configuration
# Directory holding the MovieLens 100K files (u.item, u1.base, u1.test, ...).
# The path can be overridden without editing the notebook by setting the
# ML100K_DIR environment variable; the original local path stays the default.
data_dir = os.environ.get("ML100K_DIR", r"D:\internship\Task 2\ml-100k")
folds = [1,2,3,4,5]  # fold numbers; fold f reads u{f}.base / u{f}.test
K_eval = 10   # Precision@K (K)
TOPN = 10     # number of recommendations to return
NEIGHBORS = 20  # number of neighbors to use in k-NN prediction
SVD_COMPONENTS = 20  # components for TruncatedSVD


In [18]:
# I/O helpers
def load_ratings_from_file(path, sep="\t", names=("userId","movieId","rating","timestamp")):
    """Read a header-less, delimiter-separated ratings file into a DataFrame.

    Parameters
    ----------
    path : path of the file to read.
    sep : field separator (tab by default).
    names : column names to assign, in file order. Any sequence is accepted;
        the default is an immutable tuple so it cannot be altered between calls.

    Returns
    -------
    pandas.DataFrame with the columns given by ``names``.
    """
    # Hand pandas a fresh list so the caller's (or the default) sequence is never shared.
    return pd.read_csv(path, sep=sep, names=list(names), encoding='latin-1')

def load_item_titles(u_item_path):
    """Read a pipe-separated item file and return a ``{movieId: title}`` dict.

    The file is read without a header row as 24 columns: movieId, title,
    release_date, video_release, IMDb_URL and 19 columns named g0..g18.
    Only the first two columns end up in the result.
    """
    genre_cols = ["g%d" % n for n in range(19)]
    header = ["movieId", "title", "release_date", "video_release", "IMDb_URL"]
    header.extend(genre_cols)
    items = pd.read_csv(
        u_item_path,
        sep="|",
        names=header,
        encoding='latin-1',
        header=None,
    )
    title_by_id = items.set_index("movieId")["title"]
    return title_by_id.to_dict()


In [19]:
# Build user-item matrix (fixed item universe from u.item)
def build_user_item_matrix_from_df(df, all_movie_ids):
    """
    Build a dense user x item rating matrix with a fixed set of item columns.

    df: DataFrame with columns userId, movieId, rating
    all_movie_ids: list (or array) of all movie IDs to fix columns across folds
    returns: R (users x items), user_map (orig->index), item_map (orig->index)

    Rows follow the sorted unique userIds found in ``df``; columns follow the
    sorted ``all_movie_ids``. Cells without a rating stay 0.0. Movie ids in
    ``all_movie_ids`` that never occur in ``df`` simply produce all-zero columns.
    If the same (userId, movieId) pair occurs more than once, the last
    occurrence in ``df`` wins.

    Raises KeyError if ``df`` contains a movieId that is not in ``all_movie_ids``.
    """
    users = np.sort(df['userId'].unique())
    items = np.sort(np.array(all_movie_ids))
    user_map = {u:i for i,u in enumerate(users)}
    item_map = {m:j for j,m in enumerate(items)}
    R = np.zeros((len(users), len(items)), dtype=float)
    if len(df) == 0:
        return R, user_map, item_map

    # Keep only the last rating per (user, movie) pair so the vectorised
    # assignment below never writes the same cell twice (the outcome of a
    # repeated index in a NumPy fancy assignment is not guaranteed).
    pairs = df[['userId', 'movieId', 'rating']].drop_duplicates(
        subset=['userId', 'movieId'], keep='last'
    )

    # Translate ids to matrix positions in bulk instead of row-by-row
    # (iterrows() is very slow on ~100k rows); unknown ids come back as NaN.
    col_idx = pairs['movieId'].map(item_map)
    unknown = col_idx.isna()
    if unknown.any():
        raise KeyError(pairs.loc[unknown, 'movieId'].iloc[0])
    row_idx = pairs['userId'].map(user_map)

    R[row_idx.to_numpy(dtype=np.intp), col_idx.to_numpy(dtype=np.intp)] = (
        pairs['rating'].to_numpy(dtype=float)
    )
    return R, user_map, item_map


In [20]:
# Similarity functions
def compute_user_similarity(R):
    """Cosine similarity between the rows (users) of ``R``.

    A constant 1e-9 is added to every entry first so that no row is an
    all-zero vector when the similarity is computed.
    """
    shifted = R + 1e-9
    return cosine_similarity(shifted)

def compute_item_similarity(R):
    """Cosine similarity between the columns (items) of ``R``.

    The matrix is transposed so items become rows, and a constant 1e-9 is
    added to every entry before the similarity is computed.
    """
    items_as_rows = R.T + 1e-9
    return cosine_similarity(items_as_rows)


In [21]:
# Prediction / recommendation
def predict_user_based(R, user_sim, user_index, item_index, k=NEIGHBORS):
    """Predict rating for user_index on item_index using top-k similar users who rated the item.

    Parameters
    ----------
    R : 2-D rating array (users x items); entries > 0 count as "rated".
    user_sim : 2-D array of user-user similarities, indexed like the rows of R.
    user_index : row of the target user.
    item_index : column of the target item.
    k : maximum number of neighbours to use.

    Returns
    -------
    Similarity-weighted average of the neighbours' ratings, or 0.0 when no
    other user rated the item or when ``k`` is not positive.
    """
    if k <= 0:
        # A slice of [-0:] would select *every* candidate rather than none,
        # so a non-positive k is answered explicitly.
        return 0.0
    # users who rated this item
    rated = R[:, item_index] > 0
    rated[user_index] = False  # exclude self
    candidates = np.where(rated)[0]
    if candidates.size == 0:
        return 0.0
    sims = user_sim[user_index, candidates]
    # choose top-k by similarity (argsort is ascending, so take the tail)
    topk_idx = np.argsort(sims)[-k:]
    chosen = candidates[topk_idx]
    s = sims[topk_idx]
    r = R[chosen, item_index]
    # small constant keeps the division defined when all weights are zero
    denom = np.sum(np.abs(s)) + 1e-9
    return np.dot(s, r) / denom


In [22]:
def recommend_user_based(R, user_sim, user_index, N=TOPN, k=NEIGHBORS):
    """Return up to N item indices the user has not rated, best prediction first.

    Every column whose entry in the user's row of ``R`` equals 0 is scored with
    ``predict_user_based``; the highest-scoring N columns are returned.
    Ties keep ascending column order because the sort is stable.
    """
    not_rated = np.nonzero(R[user_index, :] == 0)[0]
    scored = [
        (col, predict_user_based(R, user_sim, user_index, col, k=k))
        for col in not_rated
    ]
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return [col for col, _ in scored[:N]]


In [23]:
def predict_item_based(R, item_sim, user_index, item_index, k=NEIGHBORS):
    """Predict rating for user_index on item_index from the user's own ratings of similar items.

    Parameters
    ----------
    R : 2-D rating array (users x items); entries > 0 count as "rated".
    item_sim : 2-D array of item-item similarities, indexed like the columns of R.
    user_index : row of the target user.
    item_index : column of the target item.
    k : maximum number of neighbouring items to use.

    Returns
    -------
    Similarity-weighted average of the user's ratings on the top-k most similar
    rated items, or 0.0 when the user rated no other item or ``k`` is not positive.
    """
    if k <= 0:
        # A slice of [-0:] would select *every* candidate rather than none,
        # so a non-positive k is answered explicitly.
        return 0.0
    user_r = R[user_index, :]
    rated = user_r > 0
    rated[item_index] = False  # never use the target item as its own neighbour
    candidates = np.where(rated)[0]
    if candidates.size == 0:
        return 0.0
    sims = item_sim[item_index, candidates]
    # argsort is ascending, so the k largest similarities are at the tail
    topk_idx = np.argsort(sims)[-k:]
    chosen = candidates[topk_idx]
    s = sims[topk_idx]
    r = user_r[chosen]
    # small constant keeps the division defined when all weights are zero
    denom = np.sum(np.abs(s)) + 1e-9
    return np.dot(s, r) / denom


In [24]:
def recommend_item_based(R, item_sim, user_index, N=TOPN, k=NEIGHBORS):
    """Return up to N unrated item indices for the user, ranked by item-based prediction.

    Columns with a 0 entry in the user's row of ``R`` are scored with
    ``predict_item_based`` and sorted by descending score (stable, so ties
    keep ascending column order).
    """
    scored = []
    for col in np.nonzero(R[user_index, :] == 0)[0]:
        estimate = predict_item_based(R, item_sim, user_index, col, k=k)
        scored.append((col, estimate))
    ranking = sorted(scored, key=lambda pair: pair[1], reverse=True)
    top = ranking[:N]
    return [col for col, _ in top]


In [25]:
def svd_reconstruct(R, n_components=SVD_COMPONENTS):
    """Return reconstructed matrix R_hat using TruncatedSVD on user-centered data.

    For each row, the mean of its entries > 0 is subtracted from those entries
    only (entries <= 0 are left untouched; a row with no positive entry gets
    mean 0.0). TruncatedSVD is fitted on the centered matrix with at most
    ``min(R.shape) - 1`` components, and the row means are added back to the
    low-rank product.
    """
    observed = (R > 0)
    centered = R.copy().astype(float)
    means = np.zeros(R.shape[0])
    for row, seen in enumerate(observed):
        if seen.any():
            means[row] = R[row, seen].mean()
        # no-op for rows without any observed rating (empty selection)
        centered[row, seen] = centered[row, seen] - means[row]
    rank = min(n_components, min(centered.shape) - 1)
    model = TruncatedSVD(n_components=rank, random_state=42)
    user_factors = model.fit_transform(centered)
    item_factors = model.components_
    return user_factors.dot(item_factors) + means[:, np.newaxis]


In [26]:
# Baselines & evaluation
def popularity_baseline(train_df, all_movies, N=TOPN):
    """Return top-N most popular movies (by number of ratings) as item indices in all_movies order.

    Parameters
    ----------
    train_df : DataFrame with a ``movieId`` column.
    all_movies : sequence of movie ids; the returned values are positions in
        this sequence (first occurrence if an id is repeated).
    N : number of indices to return.

    Returns
    -------
    List of positions, most frequently rated movie first. Movies that are not
    in ``all_movies`` are skipped.

    Note: the positions match the columns produced by
    ``build_user_item_matrix_from_df`` only when ``all_movies`` is sorted,
    because that function sorts the ids before assigning columns.
    """
    # One pass to build id -> position, instead of a linear `in` test plus a
    # linear `.index()` call per movie (quadratic, and list-only).
    position = {}
    for idx, mid in enumerate(all_movies):
        position.setdefault(mid, idx)
    ranked_ids = train_df['movieId'].value_counts().index.tolist()
    top_indices = [position[mid] for mid in ranked_ids if mid in position]
    return top_indices[:N]


In [27]:
def precision_at_k(recommended_items, test_items, k=K_eval):
    """Fraction of the first ``k`` recommendations that appear in ``test_items``.

    Parameters
    ----------
    recommended_items : ranked sequence of recommended items.
    test_items : container of relevant items (membership is tested with ``in``).
    k : cut-off.

    Returns
    -------
    hits / k. The denominator is always ``k``, even when fewer than ``k``
    recommendations are supplied. Returns 0.0 when there are no
    recommendations or when ``k`` is not positive (which would otherwise
    divide by zero or yield a negative value).
    """
    if k <= 0 or len(recommended_items) == 0:
        return 0.0
    hits = sum(1 for r in recommended_items[:k] if r in test_items)
    return hits / k


In [28]:
def evaluate_on_fold(train_df, test_df, all_movies, method="user", k_neighbors=NEIGHBORS, topN=TOPN, svd_components=SVD_COMPONENTS, k_eval=K_eval):
    """
    Evaluate method on a single train/test fold and return average Precision@k across users in test set.
    method in {"user","item","svd","pop"}

    Parameters
    ----------
    train_df, test_df : DataFrames with userId and movieId columns (train_df
        also needs rating).
    all_movies : movie ids that fix the item columns of the rating matrix.
    k_neighbors : neighbour count for the "user" / "item" methods.
    topN : number of recommendations generated per user.
    svd_components : component count for the "svd" method.
    k_eval : cut-off used by precision_at_k.

    Returns
    -------
    Mean Precision@k_eval over the users that have at least one test rating
    whose user and movie are both known to the training matrix; 0.0 if there
    is no such user.

    Raises ValueError for an unknown method.
    """
    # Fail fast, before the comparatively expensive matrix construction.
    if method not in ("user", "item", "svd", "pop"):
        raise ValueError("Unknown method")

    # Build R_train with fixed item universe
    R_train, user_map, item_map = build_user_item_matrix_from_df(train_df, all_movies)

    # Map test items to train indices. Iterating the two columns directly is
    # far cheaper than iterrows(), which builds a Series for every row.
    test_by_user = defaultdict(set)
    for u, m in zip(test_df['userId'], test_df['movieId']):
        if u in user_map and m in item_map:
            test_by_user[user_map[u]].add(item_map[m])

    # Prepare model specifics
    if method == "user":
        user_sim = compute_user_similarity(R_train)
    elif method == "item":
        item_sim = compute_item_similarity(R_train)
    elif method == "svd":
        R_hat = svd_reconstruct(R_train, n_components=svd_components)
    else:  # "pop": the same list is recommended to every user
        pop_top = popularity_baseline(train_df, all_movies, N=topN)

    precisions = []
    # Every set in test_by_user is non-empty: an entry is only created by add().
    for ui, test_items in test_by_user.items():
        if method == "user":
            rec = recommend_user_based(R_train, user_sim, ui, N=topN, k=k_neighbors)
        elif method == "item":
            rec = recommend_item_based(R_train, item_sim, ui, N=topN, k=k_neighbors)
        elif method == "svd":
            # recommend topN unseen from R_hat
            preds = [(j, R_hat[ui, j]) for j in range(R_train.shape[1]) if R_train[ui, j] == 0]
            rec = [j for j,_ in sorted(preds, key=lambda x: x[1], reverse=True)[:topN]]
        else:
            rec = pop_top
        precisions.append(precision_at_k(rec, test_items, k=k_eval))
    return np.mean(precisions) if precisions else 0.0


In [29]:
# Run all folds and produce averaged table
def run_all_folds_and_report(data_dir, folds=folds, methods=("user","item","svd","pop")):
    """Evaluate every method on every fold, print the scores, and return the averages.

    For each fold number ``f`` the files ``u{f}.base`` and ``u{f}.test`` are
    read from ``data_dir``; ``u.item`` supplies the movie universe and titles.

    Returns ``(avg_results, title_map, all_movies)`` where ``avg_results`` maps
    each method name to its mean score over the folds.
    """
    item_file = os.path.join(data_dir, "u.item")
    title_map = load_item_titles(item_file)
    all_movies = sorted(title_map.keys())

    results = {}
    for m in methods:
        results[m] = []

    for f in folds:
        print(f"Evaluating fold u{f}.base / u{f}.test ...")
        base_path = os.path.join(data_dir, f"u{f}.base")
        test_path = os.path.join(data_dir, f"u{f}.test")
        train_df = load_ratings_from_file(base_path)
        test_df = load_ratings_from_file(test_path)
        for m in methods:
            score = evaluate_on_fold(train_df, test_df, all_movies, method=m)
            results[m].append(score)
            print(f"  fold u{f} {m:>4} precision@{K_eval}: {score:.4f}")

    avg_results = {}
    for m in methods:
        avg_results[m] = np.mean(results[m])

    print("\nAverage Precision@{} over {} folds:".format(K_eval, len(folds)))
    for m, s in avg_results.items():
        print(f"  {m:>4} : {s:.4f}")
    return avg_results, title_map, all_movies


In [30]:
# Helper: show top-N titles for a user (using a specific train fold)
def show_recommendations_for_user(train_df, user_id, all_movies, title_map, method="user", topN=TOPN):
    """Return the titles of the top-N recommendations for one user.

    The rating matrix is rebuilt from ``train_df``; ``method`` selects
    "user", "item", "svd" or "pop". Any other value yields an empty list.
    If ``user_id`` is absent from ``train_df`` a message is printed and an
    empty list is returned. Recommended movies without an entry in
    ``title_map`` are shown as ``str(movie_id)``.
    """
    R_train, user_map, item_map = build_user_item_matrix_from_df(train_df, all_movies)
    if user_id not in user_map:
        print("User", user_id, "not in this training fold.")
        return []
    row = user_map[user_id]

    if method == "user":
        rec_indices = recommend_user_based(R_train, compute_user_similarity(R_train), row, N=topN)
    elif method == "item":
        rec_indices = recommend_item_based(R_train, compute_item_similarity(R_train), row, N=topN)
    elif method == "svd":
        R_hat = svd_reconstruct(R_train)
        # score only the columns this user has not rated
        scored = [(col, R_hat[row, col]) for col in range(R_train.shape[1]) if R_train[row, col] == 0]
        scored.sort(key=lambda pair: pair[1], reverse=True)
        rec_indices = [col for col, _ in scored[:topN]]
    elif method == "pop":
        rec_indices = popularity_baseline(train_df, all_movies, N=topN)
    else:
        rec_indices = []

    # column index -> movie id -> title
    index_to_movie = dict(zip(item_map.values(), item_map.keys()))
    movie_ids = [index_to_movie.get(col) for col in rec_indices]
    return [title_map[mid] if mid in title_map else str(mid) for mid in movie_ids]


In [31]:
# Run and display results (main)
if __name__ == "__main__":
    # 1) run evaluation across folds (user, item, svd, pop)
    avg_results, title_map, all_movies = run_all_folds_and_report(
        data_dir, folds=folds, methods=("user","item","svd","pop")
    )

    # 2) show example recommendations for a single user using u1.base
    example_user = 10   # change to any user id you like
    train_df = load_ratings_from_file(os.path.join(data_dir, "u1.base"))

    # (header line, method) for each demo section, printed in this order
    demo_sections = [
        (f"\nTop {TOPN} recommendations for user {example_user} (user-based, fold u1.base):", "user"),
        (f"\nTop {TOPN} recommendations for user {example_user} (item-based, fold u1.base):", "item"),
        (f"\nTop {TOPN} recommendations for user {example_user} (SVD, fold u1.base):", "svd"),
        (f"\nTop {TOPN} popularity baseline (fold u1.base):", "pop"),
    ]
    shown = {}
    for header, section_method in demo_sections:
        print(header)
        shown[section_method] = show_recommendations_for_user(
            train_df, example_user, all_movies, title_map, method=section_method, topN=TOPN
        )
        for t in shown[section_method]:
            print(" -", t)

    # keep the per-method result names available to later cells
    recs_user = shown["user"]
    recs_item = shown["item"]
    recs_svd = shown["svd"]
    recs_pop = shown["pop"]


Evaluating fold u1.base / u1.test ...
  fold u1 user precision@10: 0.0083
  fold u1 item precision@10: 0.0590
  fold u1  svd precision@10: 0.3024
  fold u1  pop precision@10: 0.1959
Evaluating fold u2.base / u2.test ...
  fold u2 user precision@10: 0.0014
  fold u2 item precision@10: 0.0490
  fold u2  svd precision@10: 0.2410
  fold u2  pop precision@10: 0.1470
Evaluating fold u3.base / u3.test ...
  fold u3 user precision@10: 0.0007
  fold u3 item precision@10: 0.0426
  fold u3  svd precision@10: 0.1947
  fold u3  pop precision@10: 0.1147
Evaluating fold u4.base / u4.test ...
  fold u4 user precision@10: 0.0008
  fold u4 item precision@10: 0.0398
  fold u4  svd precision@10: 0.1883
  fold u4  pop precision@10: 0.1062
Evaluating fold u5.base / u5.test ...
  fold u5 user precision@10: 0.0004
  fold u5 item precision@10: 0.0324
  fold u5  svd precision@10: 0.1723
  fold u5  pop precision@10: 0.1069

Average Precision@10 over 5 folds:
  user : 0.0023
  item : 0.0445
   svd : 0.2197
   pop