In [34]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [35]:
columns = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv('ml-100k/u.data', sep='\t', names=columns)
n_users = df.user_id.nunique()
n_items = df.item_id.nunique()
print(f"Users: {n_users}, Items: {n_items}, Ratings: {len(df)}")

Users: 943, Items: 1682, Ratings: 100000


In [36]:
item_columns = [
    "movie id", "movie title", "release date", "video release date",
    "IMDb URL", "unknown", "Action", "Adventure", "Animation",
    "Children's", "Comedy", "Crime", "Documentary", "Drama", "Fantasy",
    "Film-Noir", "Horror", "Musical", "Mystery", "Romance", "Sci-Fi",
    "Thriller", "War", "Western"
]
items = pd.read_csv('ml-100k/u.item', sep='|', names=item_columns, encoding='latin-1')

In [37]:
matrix = df.pivot(index='user_id', columns='item_id', values='rating')
user_mean = matrix.mean(axis=1)

normalized_matrix = matrix.sub(user_mean, axis=0).fillna(0)

In [38]:
user_similarity = pd.DataFrame(
    cosine_similarity(normalized_matrix),
    index=normalized_matrix.index, columns=normalized_matrix.index
)

In [39]:
def recommend_for_user(user_id, k=5, n_neighbors=10):
    # Find similar users
    similarity_scores = user_similarity.loc[user_id].drop(user_id)
    top_neighbors = similarity_scores.nlargest(n_neighbors).index
    
    # Weighted sum of neighbor ratings
    neighbor_ratings = normalized_matrix.loc[top_neighbors]
    weights = similarity_scores[top_neighbors].values
    weighted_sum = neighbor_ratings.T.dot(weights)
    sim_sums = np.abs(weights).sum()
    scores = weighted_sum / (sim_sums + 1e-8)
    
    # Remove movies already rated by the user in training
    already_rated = normalized_matrix.loc[user_id] > 0
    scores = scores[~already_rated]
    
    # Get top k movie IDs
    top_movie_ids = scores.nlargest(k).index.tolist()

    # Map IDs to titles using `items` DataFrame
    recommended_titles = items.loc[items['movie id'].isin(top_movie_ids), 'movie title'].tolist()
    
    return recommended_titles


In [43]:
#Example Usage
print(df[df["user_id"]==6])
recommendations = recommend_for_user(6)
print("Recommendations For the Given User: ", recommendations, sep='\n')

       user_id  item_id  rating  timestamp
9            6       86       3  883603013
231          6       14       5  883599249
612          6       98       5  883600680
866          6      463       4  883601713
877          6      301       2  883600406
...        ...      ...     ...        ...
90756        6      237       2  883599914
93521        6      539       2  883681433
94111        6      537       4  883601277
95462        6      490       5  883601365
97068        6      202       3  883602690

[211 rows x 4 columns]
Recommendations For the Given User: 
['Patton (1970)', 'Secrets & Lies (1996)', 'Rear Window (1954)', 'Ran (1985)', 'Chinatown (1974)']
