In [15]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [16]:
columns = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv('ml-100k/ua.base', sep='\t', names=columns)
test_df = pd.read_csv('ml-100k/ua.test', sep='\t', names=columns)
n_users = df.user_id.nunique()
n_items = df.item_id.nunique()
print(f"Users: {n_users}, Items: {n_items}, Ratings: {len(df)}")

Users: 943, Items: 1680, Ratings: 90570


In [17]:
item_columns = [
    "movie id", "movie title", "release date", "video release date",
    "IMDb URL", "unknown", "Action", "Adventure", "Animation",
    "Children's", "Comedy", "Crime", "Documentary", "Drama", "Fantasy",
    "Film-Noir", "Horror", "Musical", "Mystery", "Romance", "Sci-Fi",
    "Thriller", "War", "Western"
]
items = pd.read_csv('ml-100k/u.item', sep='|', names=item_columns, encoding='latin-1')

In [18]:
matrix = df.pivot(index='user_id', columns='item_id', values='rating')
normalized_matrix = matrix.subtract(matrix.mean(axis=0), axis=0)

In [23]:
item_similarity = pd.DataFrame(
    cosine_similarity(normalized_matrix.T.fillna(0)),
    index=normalized_matrix.columns,
    columns=normalized_matrix.columns
)

item_similarity

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.142712,0.070968,0.156014,0.182296,0.081251,0.305262,0.284771,0.190762,0.138466,...,0.0,0.0,0.000000,0.000000,0.013249,0.0,0.0,0.0,-0.005521,-0.032733
2,0.142712,1.000000,0.149801,0.182327,0.139889,-0.016387,0.126270,0.124144,-0.018611,0.054265,...,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.068137,0.013964
3,0.070968,0.149801,1.000000,0.000255,0.111543,0.032032,0.083419,-0.030727,0.004141,0.038509,...,0.0,0.0,0.000000,0.000000,0.183692,0.0,0.0,0.0,0.000000,0.013893
4,0.156014,0.182327,0.000255,1.000000,-0.006137,0.043804,0.221732,0.255854,0.178527,0.055254,...,0.0,0.0,-0.080422,-0.080422,0.085784,0.0,0.0,0.0,0.055402,-0.052984
5,0.182296,0.139889,0.111543,-0.006137,1.000000,-0.016161,0.153427,0.173429,0.084098,0.003599,...,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.016301
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1678,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.000000,0.000000,0.000000,1.0,1.0,1.0,0.000000,0.000000
1679,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.000000,0.000000,0.000000,1.0,1.0,1.0,0.000000,0.000000
1680,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.000000,0.000000,0.000000,1.0,1.0,1.0,0.000000,0.000000
1681,-0.005521,0.068137,0.000000,0.055402,0.000000,0.000000,-0.005669,-0.060362,-0.006150,0.000000,...,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,1.000000,0.000000


In [20]:
def recommend_movies(user_id, k=5):
    user_ratings = normalized_matrix.loc[user_id]
    rated_movies = user_ratings[user_ratings.notna()]
    scores = pd.Series(dtype=float)
    for movie, rating in rated_movies.items():
        similar_movies = item_similarity[movie]
        weighted_scores = similar_movies * rating
        scores = scores.add(weighted_scores, fill_value=0)
    
    scores = scores.drop(rated_movies.index, errors='ignore')
    
    return scores.sort_values(ascending=False).head(k)


In [27]:
def convert_ids_to_movieTitles(ids):
    # Map IDs to titles using `items` DataFrame
    return items.loc[items['movie id'].isin(ids.index), 'movie title'].tolist()


In [28]:
#Example Usage
print(df[df["user_id"]==6])
ids = recommend_movies(6)
recs = convert_ids_to_movieTitles(ids)
print("Recommendations For the Given User: ", recs, sep='\n')

     user_id  item_id  rating  timestamp
537        6        1       4  883599478
538        6        7       2  883599102
539        6        8       4  883600657
540        6        9       4  883599205
541        6       12       4  883601053
..       ...      ...     ...        ...
733        6      535       2  883600030
734        6      536       4  883599400
735        6      537       4  883601277
736        6      538       2  883268483
737        6      539       2  883681433

[201 rows x 4 columns]
Recommendations For the Given User: 
['Taxi Driver (1976)', 'Clockwork Orange, A (1971)', 'Rear Window (1954)', 'Chinatown (1974)', 'Manchurian Candidate, The (1962)']


In [31]:
def precision_at_k(test_df, k=5):
    precisions = []
    
    for user_id in test_df['user_id'].unique():
        # Get top-k recommendations
        recommended_ids = recommend_movies(user_id, k=k) 
        
        # Get the set of relevant items for this user in the test set
        #Relevant Movies Set as Rated 3 or Above
        relevant_items = set(
            test_df[(test_df['user_id'] == user_id) & (test_df['rating'] >= 3)]['item_id']
        )
        
        # Count Relevant Movies that are Recommended
        hits = 0 
        for movie in recommended_ids.index:
            if movie in relevant_items:
                hits += 1
        
        # Precision: hits / k
        if k > 0:
            precisions.append(hits / k)
    
    return np.mean(precisions)


In [32]:
print("Precision@K (K = 10): ", float(precision_at_k(test_df, k=10)))

Precision@K (K = 10):  0.08621420996818664
