<a href="https://colab.research.google.com/github/muqeetahmaad9/student-performance-internship/blob/main/task4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Task5: Movie Recommendation System (Colab-ready)
# Run in Google Colab

# 0. Install / Import (no extra installs required)
import os
import zipfile
import urllib.request
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split

# 1. Download MovieLens 100K
data_url = "http://files.grouplens.org/datasets/movielens/ml-100k.zip"
zip_path = "/content/ml-100k.zip"
if not os.path.exists("/content/ml-100k"):
    print("Downloading MovieLens 100k...")
    urllib.request.urlretrieve(data_url, zip_path)
    with zipfile.ZipFile(zip_path, 'r') as z:
        z.extractall("/content/ml-100k")
    print("Done.")

# 2. Load ratings and movie titles
# u.data format: user id | item id | rating | timestamp
ratings_path = "/content/ml-100k/ml-100k/u.data"
columns = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(ratings_path, sep='\t', names=columns, encoding='latin-1')

# u.item contains movie id | movie title | ...
items_path = "/content/ml-100k/ml-100k/u.item"
movies = pd.read_csv(items_path, sep='|', names=range(24), encoding='latin-1', header=None)
movies = movies[[0, 1]]  # first two columns: id, title
movies.columns = ['movie_id', 'title']
movies['movie_id'] = movies['movie_id'].astype(int)

# Merge for readability when showing recommendations
ratings = ratings.merge(movies, on='movie_id', how='left')

# 3. Create user-item matrix (pivot)
user_item = ratings.pivot_table(index='user_id', columns='movie_id', values='rating').fillna(0)
user_item_matrix = user_item.values  # shape (n_users, n_items)
print("user_item_matrix shape:", user_item_matrix.shape)

# Helper: mapping ids to indices
user_ids = list(user_item.index)
movie_ids = list(user_item.columns)
movie_id_to_col = {mid: idx for idx, mid in enumerate(movie_ids)}
col_to_movie_id = {idx: mid for idx, mid in enumerate(movie_ids)}
movieid_to_title = dict(zip(movies['movie_id'], movies['title']))

# 4. USER-BASED Collaborative Filtering (cosine similarity)
# compute user-user cosine similarity
user_sim = cosine_similarity(user_item_matrix)  # (n_users, n_users)

def recommend_user_based(target_user_id, top_k=10, n_neighbors=20):
    """
    Recommend top_k movies for target_user_id using user-based CF
    n_neighbors: number of similar users to consider
    """
    if target_user_id not in user_ids:
        raise ValueError("User id not in dataset")

    u_idx = user_ids.index(target_user_id)
    sim_scores = user_sim[u_idx]  # similarity to all users
    # ignore self
    sim_scores[u_idx] = 0
    # find top neighbors
    neighbors_idx = np.argsort(sim_scores)[-n_neighbors:]
    neighbor_sims = sim_scores[neighbors_idx]

    # Weighted sum of neighbor ratings
    neighbor_ratings = user_item_matrix[neighbors_idx]  # shape (n_neighbors, n_items)
    weighted_sum = np.dot(neighbor_sims, neighbor_ratings)
    sim_sum = np.sum(np.abs(neighbor_sims)) + 1e-9
    predicted_scores = weighted_sum / sim_sum

    # Mask already rated items by target user
    rated = user_item_matrix[u_idx] > 0
    predicted_scores[rated] = -np.inf

    # pick top_k movie indices
    top_indices = np.argsort(predicted_scores)[-top_k:][::-1]
    recommendations = [(col_to_movie_id[i], movieid_to_title[col_to_movie_id[i]], predicted_scores[i]) for i in top_indices]
    return recommendations

# Example: recommend for user 1
print("User-based CF recommendations for user 1:")
for mid, title, score in recommend_user_based(1, top_k=10, n_neighbors=30):
    print(f"{mid} - {title} (score: {score:.3f})")

# 5. ITEM-BASED Collaborative Filtering (bonus)
# compute item-item similarity based on columns of user_item_matrix
item_item = cosine_similarity(user_item_matrix.T)  # (n_items, n_items)

def recommend_item_based(target_user_id, top_k=10, n_sim_items=20):
    u_idx = user_ids.index(target_user_id)
    user_ratings = user_item_matrix[u_idx]  # ratings by the user
    # For each item user has rated, get similar items
    scores = np.zeros(user_item_matrix.shape[1])
    for item_idx, r in enumerate(user_ratings):
        if r > 0:
            sim_items = item_item[item_idx] # similarity of current item to all items
            # Take top similar items (excluding self)
            top_sim_idx = np.argsort(sim_items)[-n_sim_items-1:-1] # Exclude the item itself
            # Predict score for each unrated item based on similarity to rated item
            # and the user's rating of that item
            # We only consider items the user *hasn't* rated yet for recommendations
            unrated_items_idx = np.where(user_ratings == 0)[0]
            for unrated_item_idx in unrated_items_idx:
                # Find similarity between the rated item and the unrated item
                similarity = item_item[item_idx, unrated_item_idx]
                scores[unrated_item_idx] += similarity * r

    # Mask already seen (already handled by only adding to unrated items)
    # Mask items that received no prediction (no similar rated items)
    scores[user_ratings > 0] = -np.inf # ensure already rated items are not recommended
    scores[scores == 0] = -np.inf # ensure items with no similar rated items are not recommended

    top_indices = np.argsort(scores)[-top_k:][::-1]
    recommendations = [(col_to_movie_id[i], movieid_to_title[col_to_movie_id[i]], scores[i]) for i in top_indices if scores[i] > -np.inf]
    return recommendations


print("\nItem-based CF recommendations for user 1:")
for mid, title, score in recommend_item_based(1, top_k=10, n_sim_items=30):
    print(f"{mid} - {title} (score: {score:.3f})")

# 6. SVD (Matrix Factorization) using TruncatedSVD (bonus)
# We'll use user-item sparse representation (centered)
from scipy.sparse import csr_matrix

# Center ratings by subtracting user mean to improve SVD performance
user_means = np.true_divide(user_item_matrix.sum(axis=1), (user_item_matrix > 0).sum(axis=1) + 1e-9)
R_centered = user_item_matrix - user_means.reshape(-1, 1)
R_centered[np.where(user_item_matrix == 0)] = 0  # keep zeros for missing

svd = TruncatedSVD(n_components=50, random_state=42)
U = svd.fit_transform(R_centered)           # user factors (n_users x k)
Sigma = svd.singular_values_                # singular values
VT = svd.components_                        # item factors (k x n_items)

# Reconstruct approximate ratings (add back user means)
R_hat = np.dot(U, VT) + user_means.reshape(-1, 1)

def recommend_svd(target_user_id, top_k=10):
    u_idx = user_ids.index(target_user_id)
    preds = R_hat[u_idx]
    rated = user_item_matrix[u_idx] > 0
    preds[rated] = -np.inf
    top_indices = np.argsort(preds)[-top_k:][::-1]
    recommendations = [(col_to_movie_id[i], movieid_to_title[col_to_movie_id[i]], preds[i]) for i in top_indices]
    return recommendations

print("\nSVD-based recommendations for user 1:")
for mid, title, score in recommend_svd(1, top_k=10):
    print(f"{mid} - {title} (score: {score:.3f})")

# 7. EVALUATION: Precision@K for user-based CF
# Build train/test split of ratings (leave-one-out style or holdout)
def prepare_train_test(ratings_df, test_size=0.2, random_state=42):
    train_df, test_df = train_test_split(ratings_df, test_size=test_size, random_state=random_state)
    # build matrices
    train_mat = train_df.pivot_table(index='user_id', columns='movie_id', values='rating').reindex(index=user_item.index, columns=user_item.columns).fillna(0)
    test_mat = test_df.pivot_table(index='user_id', columns='movie_id', values='rating').reindex(index=user_item.index, columns=user_item.columns).fillna(0)
    return train_mat.values, test_mat.values

train_mat, test_mat = prepare_train_test(ratings, test_size=0.2)

# compute user similarity on train
train_user_sim = cosine_similarity(train_mat)

def precision_at_k_from_matrix(pred_func, train_mat, test_mat, K=10, n_users_eval=100):
    """
    pred_func(user_id) should return list of recommended movie_ids (top K)
    train_mat/test_mat: matrices aligned to user_ids and movie_ids
    """
    n_users = train_mat.shape[0]
    users_to_eval = range(min(n_users, n_users_eval))
    precisions = []
    for u in users_to_eval:
        # get ground truth items in test (relevant items)
        relevant = set(np.where(test_mat[u] > 0)[0])  # column indices
        if len(relevant) == 0:
            continue
        # get top-K recommendations using function adapted to indexed user
        # Our pred_func expects user_id from original ids; convert
        uid = user_ids[u]
        recs = pred_func(uid, top_k=K)
        rec_item_indices = [movie_ids.index(mid) for mid, _, _ in recs if mid in movie_ids]
        hit = len(set(rec_item_indices).intersection(relevant))
        precisions.append(hit / K)
    return np.mean(precisions)

# Wrap recommender to match signature
def user_cf_wrapper(uid, top_k=10):
    return recommend_user_based(uid, top_k=top_k, n_neighbors=30)

print("\nComputing Precision@10 (sample users)...")
p_at_10 = precision_at_k_from_matrix(user_cf_wrapper, train_mat, test_mat, K=10, n_users_eval=200)
print("Precision@10 (User-based CF):", round(p_at_10, 4))

# 8. Save results / show a sample of recommendations for several users
sample_users = [1, 50, 100, 200]
for u in sample_users:
    print(f"\nUser {u} - User-CF recs:")
    for mid, title, score in recommend_user_based(u, top_k=5, n_neighbors=30):
        print(f"  {title} (movie_id={mid}, score={score:.3f})")

user_item_matrix shape: (943, 1682)
User-based CF recommendations for user 1:
423 - E.T. the Extra-Terrestrial (1982) (score: 3.666)
474 - Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963) (score: 3.453)
655 - Stand by Me (1986) (score: 3.442)
568 - Speed (1994) (score: 3.237)
403 - Batman (1989) (score: 3.201)
357 - One Flew Over the Cuckoo's Nest (1975) (score: 3.106)
385 - True Lies (1994) (score: 3.099)
318 - Schindler's List (1993) (score: 3.000)
651 - Glory (1989) (score: 2.998)
433 - Heathers (1989) (score: 2.984)

Item-based CF recommendations for user 1:
423 - E.T. the Extra-Terrestrial (1982) (score: 394.094)
655 - Stand by Me (1986) (score: 368.217)
568 - Speed (1994) (score: 367.805)
403 - Batman (1989) (score: 364.944)
385 - True Lies (1994) (score: 363.508)
318 - Schindler's List (1993) (score: 357.974)
357 - One Flew Over the Cuckoo's Nest (1975) (score: 352.764)
367 - Clueless (1995) (score: 351.735)
393 - Mrs. Doubtfire (1993) (score: 345.905)