In [62]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [63]:
columns = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv('ml-100k/ua.base', sep='\t', names=columns)
test_df = pd.read_csv('ml-100k/ua.test', sep='\t', names=columns)
n_users = df.user_id.nunique()
n_items = df.item_id.nunique()
print(f"Users: {n_users}, Items: {n_items}, Ratings: {len(df)}")

Users: 943, Items: 1680, Ratings: 90570


In [64]:
item_columns = [
    "movie id", "movie title", "release date", "video release date",
    "IMDb URL", "unknown", "Action", "Adventure", "Animation",
    "Children's", "Comedy", "Crime", "Documentary", "Drama", "Fantasy",
    "Film-Noir", "Horror", "Musical", "Mystery", "Romance", "Sci-Fi",
    "Thriller", "War", "Western"
]
items = pd.read_csv('ml-100k/u.item', sep='|', names=item_columns, encoding='latin-1')

In [65]:
matrix = df.pivot(index='user_id', columns='item_id', values='rating')
user_mean = matrix.mean(axis=1)

normalized_matrix = matrix.sub(user_mean, axis=0).fillna(0)

In [66]:
user_similarity = pd.DataFrame(
    cosine_similarity(normalized_matrix),
    index=normalized_matrix.index, columns=normalized_matrix.index
)

In [67]:
def recommend_movies(user_id, k=5, n_neighbors=10):
    # Find similar users
    similarity_scores = user_similarity.loc[user_id].drop(user_id)
    top_neighbors = similarity_scores.nlargest(n_neighbors).index
    
    # Weighted sum of neighbor ratings
    neighbor_ratings = normalized_matrix.loc[top_neighbors]
    weights = similarity_scores[top_neighbors].values
    weighted_sum = neighbor_ratings.T.dot(weights)
    sim_sums = np.abs(weights).sum()
    scores = weighted_sum / (sim_sums + 1e-8)
    
    # Remove movies already rated by the user in training
    already_rated = normalized_matrix.loc[user_id] > 0
    scores = scores[~already_rated]
    
    # Get top k movie IDs
    return scores.nlargest(k).index.tolist()


In [68]:
def convert_ids_to_movieTitles(ids):
    # Map IDs to titles using `items` DataFrame
    return items.loc[items['movie id'].isin(ids), 'movie title'].tolist()


In [82]:
#Example Usage
print(df[df["user_id"]==6])
ids = recommend_movies(6)
recs = convert_ids_to_movieTitles(ids)
print("Recommendations For the Given User: ", recs, sep='\n')

     user_id  item_id  rating  timestamp
537        6        1       4  883599478
538        6        7       2  883599102
539        6        8       4  883600657
540        6        9       4  883599205
541        6       12       4  883601053
..       ...      ...     ...        ...
733        6      535       2  883600030
734        6      536       4  883599400
735        6      537       4  883601277
736        6      538       2  883268483
737        6      539       2  883681433

[201 rows x 4 columns]
Recommendations For the Given User: 
['Taxi Driver (1976)', 'Secrets & Lies (1996)', 'Rear Window (1954)', 'Ran (1985)', 'Chinatown (1974)']


In [80]:
def precision_at_k(test_df, k=5):
    precisions = []
    
    for user_id in test_df['user_id'].unique():
        # Get top-k recommendations
        recommended_ids = recommend_movies(user_id, k=k) 
        
        # Get the set of relevant items for this user in the test set
        #Relevant Movies Set as Rated 3 or Above
        relevant_items = set(
            test_df[(test_df['user_id'] == user_id) & (test_df['rating'] >= 3)]['item_id']
        )
        
        # Count Relevant Movies that are Recommended
        hits = 0 
        for movie in recommended_ids:
            if movie in relevant_items:
                hits += 1
        
        # Precision: hits / k
        if k > 0:
            precisions.append(hits / k)
    
    return np.mean(precisions)


In [81]:
print("Precision@K (K = 10): ", float(precision_at_k(test_df, k=10)))

Precision@K (K = 10):  0.10127253446447508
