In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load ratings data
ratings = pd.read_csv("ml-100k/u.data", sep="\t", names=["user_id", "item_id", "rating", "timestamp"])

# Load movie item data
items = pd.read_csv(
    "ml-100k/u.item",
    sep="|",
    encoding="latin-1",
    names=["movie_id", "movie_title", "release_date", "video_release_date", "IMDb_URL"] +
          [f"genre_{i}" for i in range(19)],
    usecols=range(24)
)

df = pd.merge(ratings, items, left_on="item_id", right_on="movie_id")

# Preview the merged dataset
df.head()

### **1. Non-personalized**

In [None]:
def get_top_non_personalized():
    """
    Returns top 10 non-personalized movie recommendations using IMDb-style weighted rating.
    No input parameters.
    """

    # Compute rating count and average rating per movie
    movie_counts = df.groupby('movie_id')['rating'].count()
    movie_means = df.groupby('movie_id')['rating'].mean()

    movie_stats = pd.DataFrame({
        "rating_count": movie_counts,
        "average_rating": movie_means
    })

    # Merge with movie titles
    movie_stats = movie_stats.merge(items[["movie_id", "movie_title"]], on="movie_id")
    movie_stats.set_index("movie_id", inplace=True)

    # Weighted score formula
    C = movie_stats["average_rating"].mean()  # global average
    m = 100  # minimum votes threshold
    movie_stats["weighted_score"] = (
        (movie_stats["rating_count"] / (movie_stats["rating_count"] + m)) * movie_stats["average_rating"]
        + (m / (movie_stats["rating_count"] + m)) * C
    )

    # Return top 10 movies by weighted score
    top_movies = movie_stats.sort_values(by="weighted_score", ascending=False).head(10).reset_index()
    return top_movies[["movie_title", "average_rating", "rating_count", "weighted_score"]]

In [None]:
get_top_non_personalized()

### **2. Collaborative Filtering (user-based, item-based)**

In [None]:
# Create user-item matrix (entire dataset for now)
user_item_matrix = df.pivot_table(index='user_id', columns='item_id', values='rating')

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def recommend_user_based(user_id, num_recommendations=5):
    """
    Recommend movies using User-Based Collaborative Filtering.

    Returns a pandas DataFrame with movie_title and estimated score.
    """
    # Fill missing with 0 for similarity calc
    user_similarity = cosine_similarity(user_item_matrix.fillna(0))
    user_similarity_df = pd.DataFrame(user_similarity, index=user_item_matrix.index, columns=user_item_matrix.index)

    # Get most similar users (excluding self)
    similar_users = user_similarity_df[user_id].drop(user_id).sort_values(ascending=False)

    weighted_ratings = pd.Series(dtype=float)

    for other_user, sim in similar_users.items():
        other_ratings = user_item_matrix.loc[other_user]
        weighted = other_ratings * sim
        weighted_ratings = weighted_ratings.add(weighted, fill_value=0)

    # Remove movies already rated by user
    seen_movies = user_item_matrix.loc[user_id][user_item_matrix.loc[user_id].notna()].index
    recommendations = weighted_ratings.drop(index=seen_movies, errors='ignore')

    top_n = recommendations.sort_values(ascending=False).head(num_recommendations)
    titles = items.set_index("movie_id").loc[top_n.index]["movie_title"]

    return pd.DataFrame({
        "movie_title": titles.values
    })

In [None]:
def recommend_item_based(user_id, num_recommendations=5):
    """
    Recommend movies using Item-Based Collaborative Filtering.

    Returns a pandas DataFrame with movie_title and estimated score.
    """
    # Transpose to get item-user matrix
    item_similarity = cosine_similarity(user_item_matrix.T.fillna(0))
    item_similarity_df = pd.DataFrame(item_similarity, index=user_item_matrix.columns, columns=user_item_matrix.columns)

    user_ratings = user_item_matrix.loc[user_id].dropna()
    weighted_scores = pd.Series(dtype=float)

    for item_id, rating in user_ratings.items():
        sim_scores = item_similarity_df[item_id] * rating
        weighted_scores = weighted_scores.add(sim_scores, fill_value=0)

    # Remove seen items
    weighted_scores = weighted_scores.drop(index=user_ratings.index, errors='ignore')

    top_n = weighted_scores.sort_values(ascending=False).head(num_recommendations)
    titles = items.set_index("movie_id").loc[top_n.index]["movie_title"]

    return pd.DataFrame({
        "movie_title": titles.values
    })

In [None]:
recommend_user_based(user_id=5, num_recommendations=5), recommend_item_based(user_id=5, num_recommendations=5)


### **3. Content-based Filtering**

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def recommend_content_based(user_id, num_recommendations=5):
    """
    Recommend movies based on content similarity (genre vector).

    Returns a DataFrame with top-N movie titles and similarity scores,
    with index starting at 1.
    """

    # Ensure movie_features is set up
    movie_features = items.set_index("movie_id")[["movie_title"] + [f"genre_{i}" for i in range(19)]]
    movie_features = movie_features.drop_duplicates(subset="movie_title")

    # --- Build user profile ---
    user_ratings = df[df["user_id"] == user_id][["item_id", "rating"]]
    rated_movies = pd.merge(user_ratings, movie_features, left_on="item_id", right_index=True)
    genre_matrix = rated_movies[[f"genre_{i}" for i in range(19)]]
    user_profile = genre_matrix.T.dot(rated_movies["rating"])

    # --- Score all unseen movies ---
    seen_ids = user_ratings["item_id"].tolist()
    unseen_movies = movie_features[~movie_features.index.isin(seen_ids)]
    unseen_features = unseen_movies[[f"genre_{i}" for i in range(19)]]

    # Compute cosine similarity
    similarities = cosine_similarity([user_profile], unseen_features)[0]
    unseen_movies = unseen_movies.copy()
    unseen_movies["similarity"] = similarities

    # --- Return top N recommendations ---
    top_n = unseen_movies.sort_values(by="similarity", ascending=False).head(num_recommendations)
    return pd.DataFrame({
        "movie_title": top_n["movie_title"].values
    }, index=range(1, len(top_n) + 1))


In [None]:
recommend_content_based(user_id=5, num_recommendations=5)

### **4. Matrix Factorisation (collaborative filtering)**

In [None]:
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
import numpy as np

def recommend_svd_sklearn(user_id, num_recommendations=5):
    """
    Recommend movies using Matrix Factorization (Truncated SVD).
    Applies user mean normalization, proper train/test split,
    and returns a top-N DataFrame with index starting at 1.
    Also prints RMSE of predicted ratings on test set.
    """

    # 1. Split data
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

    # 2. Build train matrix (user_id x item_id)
    train_matrix = train_df.pivot(index="user_id", columns="item_id", values="rating")

    # 3. Normalize: subtract user mean rating
    user_means = train_matrix.mean(axis=1)
    train_matrix_norm = train_matrix.sub(user_means, axis=0).fillna(0)

    # 4. Apply Truncated SVD
    svd = TruncatedSVD(n_components=50, random_state=42)
    reduced_matrix = svd.fit_transform(train_matrix_norm)
    approx_matrix = np.dot(reduced_matrix, svd.components_)

    # 5. Denormalize by adding user means back
    predicted_ratings = pd.DataFrame(approx_matrix, index=train_matrix.index, columns=train_matrix.columns)
    predicted_ratings = predicted_ratings.add(user_means, axis=0)

    # 6. Evaluate RMSE on test set
    true_ratings = []
    pred_ratings = []
    for _, row in test_df.iterrows():
        u, i, r = row["user_id"], row["item_id"], row["rating"]
        if u in predicted_ratings.index and i in predicted_ratings.columns:
            true_ratings.append(r)
            pred_ratings.append(predicted_ratings.loc[u, i])
    rmse = sqrt(mean_squared_error(true_ratings, pred_ratings))
    # print("📉 RMSE (SVD, normalized):", round(rmse, 4))

    # 7. Generate top-N for the given user
    if user_id not in predicted_ratings.index:
        return pd.DataFrame({"movie_title": ["User not found"], "predicted_rating": [None]})

    user_row = predicted_ratings.loc[user_id]
    seen_movies = train_matrix.loc[user_id][train_matrix.loc[user_id].notna()].index
    unseen_ratings = user_row.drop(seen_movies, errors='ignore')
    top_n = unseen_ratings.sort_values(ascending=False).head(num_recommendations)

    movie_titles = items.set_index("movie_id").loc[top_n.index]["movie_title"]
    return pd.DataFrame({
        "movie_title": movie_titles.values
    }, index=range(1, len(top_n) + 1))


In [None]:
recommend_svd_sklearn(user_id=5, num_recommendations=5)


### **5. Hybrid Approach**

In [None]:
# assignment.ipynb - Cell 16

def recommend_hybrid(user_id, num_recommendations=5, alpha=0.5):
    if user_id not in predicted_ratings.index:
        print("User ID not found.")
        return pd.DataFrame()

    # --- Collaborative Filtering Predictions ---
    cf_scores = predicted_ratings.loc[user_id]

    # --- Content-Based Predictions ---
    user_profile = build_user_profile(user_id)
    genre_matrix = movie_features[[f"genre_{i}" for i in range(19)]]
    cb_similarities = cosine_similarity([user_profile], genre_matrix)[0]
    cb_scores = pd.Series(cb_similarities, index=movie_features.index)

    # Get movies user hasn't rated
    seen_movies = user_item_matrix.loc[user_id][user_item_matrix.loc[user_id].notna()].index
    unseen_movie_ids = [mid for mid in movie_features.index if mid not in seen_movies]

    # Combine predictions for unseen movies
    hybrid_scores = []
    for mid in unseen_movie_ids:
        if mid in cf_scores and mid in cb_scores:
            hybrid = alpha * cf_scores[mid] + (1 - alpha) * cb_scores[mid]
            hybrid_scores.append((mid, hybrid))

    # Top N
    top_n = sorted(hybrid_scores, key=lambda x: x[1], reverse=True)[:num_recommendations]
    top_movies = [(movie_features.loc[mid, "movie_title"], score) for mid, score in top_n]

    return pd.DataFrame(top_movies, columns=["movie_title", "hybrid_score"])

# Example
recommend_hybrid(user_id=5, num_recommendations=5)

### **6. Generative AI**

In [None]:
# assignment.ipynb - Cell 17

# Simulate movie descriptions (in real app, you'd use GPT or API calls)
movie_descriptions = items[["movie_id", "movie_title"]].copy()
movie_descriptions["description"] = "This is a story about " + movie_descriptions["movie_title"].str.lower() + " with drama, action, and emotion."

movie_descriptions.set_index("movie_id", inplace=True)
movie_descriptions.head()

In [None]:
# assignment.ipynb - Cell 18

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load pre-trained model (small and fast)
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for all movie descriptions
desc_embeddings = model.encode(movie_descriptions["description"].tolist(), show_progress_bar=True)
movie_descriptions["embedding"] = list(desc_embeddings)

movie_descriptions.head()


In [None]:
from sentence_transformers import SentenceTransformer, util

# Load model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Simulate movie descriptions (or load real ones)
movie_descriptions = items["movie_title"].astype(str).tolist()

# Encode user query
user_query = "I like psychological thrillers with a twist ending"
user_vector = model.encode(user_query, convert_to_tensor=True)

# Encode movie descriptions
movie_vectors = model.encode(movie_descriptions, convert_to_tensor=True)

# Compute similarity
cosine_scores = util.cos_sim(user_vector, movie_vectors)

# Top N results
top_n = cosine_scores[0].topk(10)

# Get titles
recommended_titles = [movie_descriptions[idx] for idx in top_n.indices.tolist()]
recommended_scores = top_n.values.tolist()

# Format nicely
import pandas as pd
recommend_df = pd.DataFrame({
    "movie_title": recommended_titles,
    "similarity_score": recommended_scores
}, index=range(1, 11))

recommend_df


In [1]:
def recommend_svd_sklearn_fold(train_df, test_df):
    """
    Trains SVD on a specific train_df, evaluates RMSE on test_df.
    Returns the RMSE score.
    """

    # Build user-item matrix
    train_matrix = train_df.pivot(index="user_id", columns="item_id", values="rating")

    # Normalize: subtract user mean
    user_means = train_matrix.mean(axis=1).fillna(0)
    norm_matrix = train_matrix.sub(user_means, axis=0).fillna(0)

    # Train SVD
    svd = TruncatedSVD(n_components=50, random_state=42)
    reduced = svd.fit_transform(norm_matrix)
    reconstructed = np.dot(reduced, svd.components_)

    # Denormalize predictions
    predicted_ratings = pd.DataFrame(reconstructed, index=train_matrix.index, columns=train_matrix.columns)
    predicted_ratings = predicted_ratings.add(user_means, axis=0)

    # Evaluate RMSE
    true_ratings = []
    pred_ratings = []

    for _, row in test_df.iterrows():
        u, i, r = row["user_id"], row["item_id"], row["rating"]
        if u in predicted_ratings.index and i in predicted_ratings.columns:
            true_ratings.append(r)
            pred_ratings.append(predicted_ratings.loc[u, i])

    if true_ratings:
        return round(sqrt(mean_squared_error(true_ratings, pred_ratings)), 4)
    else:
        return None

In [6]:
def evaluate_svd_rmse_all_folds():
    """
    Runs recommend_svd_sklearn_fold() on u1–u5 and returns RMSE DataFrame.
    """

    rmse_scores = []

    for fold in range(1, 6):
        train_path = f"ml-100k/u{fold}.base"
        test_path = f"ml-100k/u{fold}.test"

        train_df = pd.read_csv(train_path, sep="\t", names=["user_id", "item_id", "rating", "timestamp"])
        test_df = pd.read_csv(test_path, sep="\t", names=["user_id", "item_id", "rating", "timestamp"])

        rmse = recommend_svd_sklearn_fold(train_df, test_df)
        rmse_scores.append(rmse)

    return pd.DataFrame({"Fold": [f"u{f}" for f in range(1, 6)], "RMSE": rmse_scores})

In [3]:
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error
from math import sqrt

evaluate_svd_rmse_all_folds()

NameError: name 'evaluate_svd_rmse_all_folds' is not defined

In [5]:
from sklearn.metrics.pairwise import cosine_similarity

def recommend_user_based_fold(train_df, user_id, num_recommendations=10):
    """
    User-based collaborative filtering recommendation using a specific fold's train_df.
    Returns a DataFrame of top-N recommended movie_ids and movie_titles.
    """

    # Build user-item matrix
    user_item_matrix = train_df.pivot(index="user_id", columns="item_id", values="rating")

    if user_id not in user_item_matrix.index:
        return pd.DataFrame({"movie_id": [], "movie_title": []})

    # Compute user-user similarity
    similarity = cosine_similarity(user_item_matrix.fillna(0))
    similarity_df = pd.DataFrame(similarity, index=user_item_matrix.index, columns=user_item_matrix.index)

    # Sort similar users (excluding self)
    similar_users = similarity_df[user_id].drop(user_id).sort_values(ascending=False)

    weighted_scores = pd.Series(dtype=float)

    for other_user, sim in similar_users.items():
        other_ratings = user_item_matrix.loc[other_user]
        weighted_scores = weighted_scores.add(other_ratings * sim, fill_value=0)

    # Remove already seen items
    seen_items = user_item_matrix.loc[user_id][user_item_matrix.loc[user_id].notna()].index
    weighted_scores = weighted_scores.drop(seen_items, errors='ignore')

    # Top N unseen
    top_n = weighted_scores.sort_values(ascending=False).head(num_recommendations)
    top_n_ids = top_n.index.tolist()

    # Get movie titles safely
    top_n_titles = items.set_index("movie_id").loc[top_n_ids]["movie_title"]

    return pd.DataFrame({
        "movie_id": top_n_ids,
        "movie_title": top_n_titles.values
    }, index=range(1, len(top_n_ids) + 1))


In [6]:
def evaluate_user_cf_precision_recall_all_folds():
    """
    Evaluates user-based CF on u1-u5 folds.
    Returns a DataFrame with Precision@10 and Recall@10 per fold.
    """

    results = []

    for fold in range(1, 6):
        train_path = f"ml-100k/u{fold}.base"
        test_path = f"ml-100k/u{fold}.test"

        train_df = pd.read_csv(train_path, sep="\t", names=["user_id", "item_id", "rating", "timestamp"])
        test_df = pd.read_csv(test_path, sep="\t", names=["user_id", "item_id", "rating", "timestamp"])

        user_ids = test_df["user_id"].unique()
        precisions, recalls = [], []

        for user_id in user_ids:
            relevant_items = test_df[(test_df["user_id"] == user_id) & (test_df["rating"] >= 4)]["item_id"].tolist()
            if not relevant_items:
                continue

            try:
                recs = recommend_user_based_fold(train_df, user_id, num_recommendations=10)
                recommended_ids = recs["movie_id"].tolist()

            except:
                continue

            tp = len([item for item in recommended_ids if item in relevant_items])
            precision = tp / 10
            recall = tp / len(relevant_items)

            precisions.append(precision)
            recalls.append(recall)

        if precisions and recalls:
            avg_precision = round(np.mean(precisions), 4)
            avg_recall = round(np.mean(recalls), 4)
        else:
            avg_precision = 0.0
            avg_recall = 0.0

        fold_result = {
            "Fold": f"u{fold}",
            "Precision@10": avg_precision,
            "Recall@10": avg_recall
        }

        print(f"Fold u{fold}: Precision@10 = {fold_result['Precision@10']}, Recall@10 = {fold_result['Recall@10']}")
        results.append(fold_result)

    return pd.DataFrame(results)


In [None]:
evaluate_user_cf_precision_recall_all_folds()

Fold u1: Precision@10 = 0.0, Recall@10 = 0.0
