In [15]:
# Import all necessary libraries
import pandas as pd
import numpy as np

# For similarity calculations
from sklearn.metrics.pairwise import cosine_similarity

# For matrix factorization (SVD)
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import accuracy


In [16]:
# Load u.data (user_id, item_id, rating, timestamp)
ratings = pd.read_csv("/kaggle/input/movielens-100k-dataset/ml-100k/u.data",
                      sep="\t", names=["user_id", "item_id", "rating", "timestamp"])

# Load u.item to get movie titles
movies = pd.read_csv("/kaggle/input/movielens-100k-dataset/ml-100k/u.item",
                     sep="|", encoding="latin-1", header=None,
                     names=["item_id", "title", "release_date", "video_release_date",
                            "IMDb_URL", "unknown", "Action", "Adventure", "Animation",
                            "Children's", "Comedy", "Crime", "Documentary", "Drama",
                            "Fantasy", "Film-Noir", "Horror", "Musical", "Mystery",
                            "Romance", "Sci-Fi", "Thriller", "War", "Western"])

# Merge ratings with movie titles
ratings = ratings.merge(movies[["item_id", "title"]], on="item_id")
ratings.head()


Unnamed: 0,user_id,item_id,rating,timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,186,302,3,891717742,L.A. Confidential (1997)
2,22,377,1,878887116,Heavyweights (1994)
3,244,51,2,880606923,Legends of the Fall (1994)
4,166,346,1,886397596,Jackie Brown (1997)


In [33]:
# Create the user-item matrix
user_item_matrix = ratings.pivot_table(index='user_id', columns='title', values='rating')

# Fill NaN with 0 for similarity calculation
user_item_filled = user_item_matrix.fillna(0)

user_item_filled.head()


title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,2.0,5.0,0.0,0.0,3.0,4.0,0.0,0.0,...,0.0,0.0,0.0,5.0,3.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,2.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,4.0,0.0


In [34]:
# Compute cosine similarity between users
user_similarity = cosine_similarity(user_item_filled)
user_similarity_df = pd.DataFrame(user_similarity, index=user_item_matrix.index, columns=user_item_matrix.index)

# Function to recommend movies for a user based on similar users
def recommend_user_based(user_id, num_recommendations=5):
    # Find similar users
    similar_users = user_similarity_df[user_id].sort_values(ascending=False).index[1:]
    
    # Get movies the target user has already watched
    watched_movies = user_item_matrix.loc[user_id].dropna().index
    
    # Aggregate ratings from similar users
    recommendations = pd.Series(dtype=float)
    for sim_user in similar_users:
        sim_user_ratings = user_item_matrix.loc[sim_user]
        sim_user_ratings = sim_user_ratings.drop(watched_movies, errors='ignore')
        recommendations = recommendations.add(sim_user_ratings, fill_value=0)
    
    return recommendations.sort_values(ascending=False).head(num_recommendations)

# Example: Recommend for user 1
recommend_user_based(1)


title
English Patient, The (1996)    1759.0
Scream (1996)                  1645.0
Air Force One (1997)           1565.0
Liar Liar (1997)               1531.0
Titanic (1997)                 1486.0
dtype: float64

In [35]:
# Compute cosine similarity between items (movies)
item_similarity = cosine_similarity(user_item_filled.T)
item_similarity_df = pd.DataFrame(item_similarity, index=user_item_matrix.columns, columns=user_item_matrix.columns)

# Function to recommend similar movies based on a given movie
def recommend_item_based(movie_title, num_recommendations=5):
    similar_items = item_similarity_df[movie_title].sort_values(ascending=False).index[1:num_recommendations+1]
    return similar_items

# Example: Recommend similar to "Toy Story (1995)"
recommend_item_based("Toy Story (1995)")


Index(['Star Wars (1977)', 'Return of the Jedi (1983)',
       'Independence Day (ID4) (1996)', 'Rock, The (1996)',
       'Mission: Impossible (1996)'],
      dtype='object', name='title')

In [56]:
# Prepare data for Surprise library
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['user_id', 'item_id', 'rating']], reader)

# Train-test split
trainset, testset = train_test_split(data, test_size=0.2)

# Train SVD model
svd = SVD(
    n_factors=150,    # default 100 
    n_epochs=80,      # default 20 
    lr_all=0.005,     # learning rate 
    reg_all=0.02      # regularization to avoid overfitting
)

svd.fit(trainset)

# Predict on test set
svd_predictions = svd.test(testset)

# Evaluate RMSE
accuracy.rmse(svd_predictions)


RMSE: 0.9690


0.9690281147745594

In [58]:
# Function to compute Precision@K
def precision_at_k(predictions, k=5, threshold=3.5):
    # Map the predictions to each user
    user_est_true = {}
    for uid, iid, true_r, est, _ in predictions:
        user_est_true.setdefault(uid, []).append((est, true_r))
    
    precisions = {}
    for uid, user_ratings in user_est_true.items():
        # Sort by estimated rating
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        
        # Compute precision@K
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings[:k])
        precisions[uid] = n_rel / k
    
    return precisions

# Calculate Precision@k
ks = [1, 3, 5]  # values of k to test

for k in ks:
    precisions = precision_at_k(svd_predictions, k=k)
    avg_precision = np.mean(list(precisions.values()))
    print(f"Precision@{k}: {avg_precision:.4f}")



Precision@1: 0.7728
Precision@3: 0.7307
Precision@5: 0.6892
