In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Read the tab-separated ml-100k `u.data` ratings file, which has no header
# row, naming its four columns while parsing.
# NOTE(review): the path is absolute and machine-specific; consider a
# configurable data directory so the notebook runs elsewhere.
df = pd.read_csv(
    "C:/Users/PMLS/OneDrive/Desktop/ml-100k/u.data",
    sep="\t",
    header=None,
    names=["user_id", "item_id", "rating", "timestamp"],
)

# Derive a datetime column by reading `timestamp` as seconds since the epoch.
df = df.assign(datetime=pd.to_datetime(df["timestamp"], unit="s"))
print(df.head())


   user_id  item_id  rating  timestamp            datetime
0      196      242       3  881250949 1997-12-04 15:55:49
1      186      302       3  891717742 1998-04-04 19:22:22
2       22      377       1  878887116 1997-11-07 07:18:36
3      244       51       2  880606923 1997-11-27 05:02:03
4      166      346       1  886397596 1998-02-02 05:33:16


In [3]:
# Read the pipe-separated, latin-1 encoded ml-100k `u.item` file (no header
# row). All 24 columns are named while parsing: five descriptive fields
# followed by 19 genre columns.
# NOTE(review): the path is absolute and machine-specific; consider a
# configurable data directory so the notebook runs elsewhere.
movies = pd.read_csv(
    "C:/Users/PMLS/OneDrive/Desktop/ml-100k/u.item",
    sep="|",
    header=None,
    encoding="latin-1",
    names=["item_id", "title", "release_date", "video_release_date", "IMDb_URL"]
    + [f"genre_{i}" for i in range(19)],
)

# Only the id -> title mapping is kept for the rest of the notebook.
movies = movies.loc[:, ["item_id", "title"]]


In [4]:
# Attach the movie title to every rating row.
# Any existing "title" column is dropped first so the cell is idempotent:
# without that, re-running the cell merges a second time, pandas suffixes the
# clashing columns as title_x / title_y, and the pivot below fails because
# "title" no longer exists.
df = df.drop(columns="title", errors="ignore").merge(movies, on="item_id")

# Hold out 20% of the individual ratings for evaluation; the fixed seed keeps
# the split reproducible across runs.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# users x titles matrix of training ratings (NaN where a user has no rating
# for a title). pivot_table's default aggregation is the mean, so several
# ratings by one user under the same title are averaged.
train_user_item = train_df.pivot_table(
    index="user_id",
    columns="title",
    values="rating",
)


In [5]:
# Pairwise cosine similarity between users' training rating vectors.
# Missing ratings are filled with 0 so every user has a dense vector.
# `cosine_similarity` is already imported in the notebook's import cell, so
# the redundant re-import that used to sit here has been removed.
user_similarity = cosine_similarity(train_user_item.fillna(0))

# Label both axes with user ids so similarities can be looked up by id
# rather than by matrix position.
user_similarity_df = pd.DataFrame(
    user_similarity,
    index=train_user_item.index,
    columns=train_user_item.index,
)


In [6]:
def recommend_movies_userCF(target_user, num_recommendations=5, num_neighbors=5):
    """Recommend movies to a user from the ratings of their most similar users.

    Reads the notebook-level ``user_similarity_df`` (user x user similarity)
    and ``train_user_item`` (user x title ratings, NaN where unrated).

    Parameters
    ----------
    target_user
        Label of the user in ``user_similarity_df`` / ``train_user_item``.
    num_recommendations : int, default 5
        Maximum number of movies to return.
    num_neighbors : int, default 5
        How many of the most similar users to average over. Previously this
        was hard-coded to 5.

    Returns
    -------
    pandas.Series
        Mean neighbour rating indexed by title, sorted descending. It may hold
        fewer than ``num_recommendations`` entries when the neighbours rated
        few movies the target has not seen.

    Raises
    ------
    KeyError
        If ``target_user`` is not present in the similarity/rating matrices.
    """
    # Rank every other user by similarity to the target (self removed).
    neighbor_sims = (
        user_similarity_df[target_user]
        .drop(target_user)
        .sort_values(ascending=False)
    )
    neighbors = neighbor_sims.head(num_neighbors).index

    # mean() skips NaN, so each movie is averaged over the neighbours who
    # actually rated it. Movies none of them rated come out as NaN and are
    # dropped so they can never be returned as "recommendations".
    avg_ratings = train_user_item.loc[neighbors].mean(axis=0).dropna()

    # Titles the target user already rated in the training data.
    seen_movies = train_user_item.loc[target_user].dropna().index

    # errors="ignore": a seen title may already be gone after the dropna above.
    candidates = avg_ratings.drop(seen_movies, errors="ignore")
    return candidates.sort_values(ascending=False).head(num_recommendations)


In [13]:
def precision_at_k_top_user(model_func, test_data, k=5, threshold=4, train_data=None):
    """Average Precision@k of a recommender over the users in ``test_data``.

    For each test user that also appears in the training data, the
    recommender is asked for ``k`` titles; a recommendation counts as a hit
    when the user rated that title at or above ``threshold`` in ``test_data``.
    The recommendations of the best-scoring user are printed as a side effect.

    Parameters
    ----------
    model_func : callable
        Called as ``model_func(user, num_recommendations=k)``; must return an
        object whose ``.index`` holds the recommended titles.
    test_data : pandas.DataFrame
        Held-out ratings with ``user_id``, ``title`` and ``rating`` columns.
    k : int, default 5
        Number of recommendations requested; also the precision denominator.
    threshold : default 4
        Minimum test rating for a title to count as relevant.
    train_data : pandas.DataFrame, optional
        Frame whose ``user_id`` column defines which users can be evaluated.
        Defaults to the notebook-level ``train_df``.

    Returns
    -------
    float
        Mean Precision@k over the evaluated users, or 0.0 when no test user
        appears in the training data.
    """
    if train_data is None:
        # Backward-compatible default: the notebook-level training split.
        train_data = train_df

    # Computed once instead of once per user; a set gives O(1) membership.
    known_users = set(train_data["user_id"].unique())

    precisions = []
    best = None  # (user, precision, recs, hits) of the best user so far

    # sort=False keeps users in order of first appearance in test_data.
    for user, user_test_data in test_data.groupby("user_id", sort=False):
        if user not in known_users:
            continue

        recs = model_func(user, num_recommendations=k)

        relevant_movies = set(
            user_test_data.loc[user_test_data["rating"] >= threshold, "title"]
        )
        hits = set(recs.index) & relevant_movies

        precision = len(hits) / k
        precisions.append(precision)

        # Strict ">" keeps the first user that reaches the maximum.
        if best is None or precision > best[1]:
            best = (user, precision, recs, hits)

    if best is None:
        # Nothing could be evaluated; avoid max() on an empty collection and
        # a division by zero below.
        print("\nNo test users with training data; nothing to evaluate.")
        return 0.0

    best_user, best_precision, best_recs, best_hits = best

    # Print only best user's recommendations
    print(f"\nMost Relevant User ID: {best_user} with Precision@{k} = {best_precision:.3f}")
    for rank, movie in enumerate(best_recs.index, start=1):
        status = "Relevant" if movie in best_hits else "Not Relevant"
        print(f"  {rank}. {movie} ({status})")

    return sum(precisions) / len(precisions)

# Evaluate the user-based recommender on the held-out ratings.
# The label previously read "Item-CF" although this call scores
# recommend_movies_userCF.
score = precision_at_k_top_user(recommend_movies_userCF, test_df, k=5, threshold=4)
print(f"\nUser-CF Precision@5 (average across all users): {score:.3f}")



Most Relevant User ID: 426 with Precision@5 = 0.600
  1. Fargo (1996) (Relevant)
  2. Band Wagon, The (1953) (Not Relevant)
  3. Chinatown (1974) (Relevant)
  4. Dial M for Murder (1954) (Relevant)
  5. Hamlet (1996) (Not Relevant)

Item-CF Precision@5 (average across all users): 0.047


In [14]:
# titles x users matrix of training ratings (NaN where a user has no rating
# for a title); "mean" is pivot_table's default aggregation, spelled out here.
item_user_matrix = train_df.pivot_table(
    values="rating", index="title", columns="user_id", aggfunc="mean"
)


In [15]:
# Pairwise cosine similarity between titles, computed on their rating vectors
# across users with missing ratings filled as 0.
item_similarity = cosine_similarity(item_user_matrix.fillna(0))

# Label both axes with the titles so similarities can be looked up by name.
title_labels = item_user_matrix.index
item_similarity_df = pd.DataFrame(item_similarity, index=title_labels, columns=title_labels)

In [16]:
def recommend_movies_itemcf(target_user, num_recommendations=5):
    """Recommend movies to a user from item-item similarity.

    Each candidate title is scored as the sum, over the titles the user rated
    in the notebook-level ``train_df``, of ``similarity(candidate, rated) *
    rating``. Similarities come from the notebook-level ``item_similarity_df``.

    Parameters
    ----------
    target_user
        Value matched against ``train_df["user_id"]``.
    num_recommendations : int, default 5
        Maximum number of titles to return.

    Returns
    -------
    pandas.Series
        Scores indexed by title, sorted descending, excluding titles the user
        already rated. Empty when the user has no training ratings.

    Raises
    ------
    KeyError
        If a rated title is missing from ``item_similarity_df``.
    """
    user_ratings = train_df.loc[train_df["user_id"] == target_user, ["title", "rating"]]

    # No history means nothing to score from; return an empty result rather
    # than an arbitrary all-zero ranking.
    if user_ratings.empty:
        return pd.Series(dtype=float)

    rated_titles = user_ratings["title"].tolist()
    weights = user_ratings["rating"].to_numpy(dtype=float)

    # One matrix-vector product replaces the per-movie Series.add loop:
    # column j holds similarities to the j-th rated title, weighted by its
    # rating. A title listed twice contributes twice, as in the loop.
    sim_to_rated = item_similarity_df.loc[:, rated_titles].to_numpy()
    scores = pd.Series(sim_to_rated @ weights, index=item_similarity_df.index)

    # Never recommend something the user has already rated.
    scores = scores[~scores.index.isin(rated_titles)]

    return scores.sort_values(ascending=False).head(num_recommendations)


In [17]:
# Score the item-based recommender on the held-out ratings.
score = precision_at_k_top_user(
    recommend_movies_itemcf,
    test_df,
    k=5,
    threshold=4,
)
print(f"\nItem-CF Precision@5 (average across all users): {score:.3f}")


Most Relevant User ID: 416 with Precision@5 = 1.000
  1. Empire Strikes Back, The (1980) (Relevant)
  2. Monty Python and the Holy Grail (1974) (Relevant)
  3. Return of the Jedi (1983) (Relevant)
  4. Star Wars (1977) (Relevant)
  5. Mrs. Doubtfire (1993) (Relevant)

Item-CF Precision@5 (average across all users): 0.261
