In [2]:
#Import Libraries
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split as surprise_train_test_split
import matplotlib.pyplot as plt

print("Libraries imported successfully.")


Libraries imported successfully.


In [None]:
#Load Dataset
# Read the ratings CSV; on success report its shape and preview the first rows.
try:
    df = pd.read_csv('movie_dataset.csv')
except FileNotFoundError:
    # Best-effort message only: `df` stays undefined, so later cells cannot run
    # until the file is placed next to the notebook.
    print("Error: 'movie_dataset.csv' not found. Please place it in the same directory.")
else:
    print("Dataset loaded successfully.")
    print(f"Shape: {df.shape}")
    # A bare `df.head()` nested inside the try statement is not the cell's last
    # top-level expression, so it was never displayed; print the preview explicitly.
    print(df.head())


Dataset loaded successfully.
Shape: (100000, 5)


In [None]:
#Create User-Movie Pivot Table
# Ratings laid out with one row per movie title and one column per user_id;
# title/user pairs that have no rating are filled with 0.
movie_matrix = (
    df.pivot_table(values='rating', index='title', columns='user_id')
      .fillna(0)
)

# CSR sparse copy of the same values, used to fit the kNN model
movie_csr = csr_matrix(movie_matrix.to_numpy())

print("User-Movie Matrix created.")
print(f"Matrix Shape: {movie_matrix.shape}")
movie_matrix.head()


User-Movie Matrix created.
Matrix Shape: (1664, 943)


user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'Til There Was You (1997),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1-900 (1994),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101 Dalmatians (1996),2.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,2.0,4.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men (1957),5.0,0.0,0.0,0.0,0.0,4.0,4.0,0.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
187 (1997),0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
#Train kNN Model (User-Based Collaborative Filtering)
# Brute-force nearest-neighbour search with cosine distance over the rows of movie_csr.
# NOTE(review): movie_csr comes from a title-indexed pivot, so each row (and
# therefore each neighbour) is a movie -- the "User-Based" label looks like a
# misnomer; confirm the intended wording.
model_knn = NearestNeighbors(algorithm='brute', metric='cosine').fit(movie_csr)

print("User-based kNN Model trained successfully.")


User-based kNN Model trained successfully.


In [None]:
#Define Recommendation Function (User-Based)
def recommend_movie(movie_name, n_recommendations=5):
    """
    Print the movies whose rating vectors are closest to a given movie's.

    The row of ``movie_matrix`` belonging to ``movie_name`` is used as the
    query for ``model_knn``; neighbours are printed nearest-first together
    with the distance reported by the model.

    Parameters
    ----------
    movie_name : str
        Title exactly as it appears in ``movie_matrix.index``.
    n_recommendations : int, default 5
        Maximum number of movies to print.

    Returns
    -------
    None
        Results are printed. If the title is not in ``movie_matrix.index``
        a "not found" message is printed instead.
    """
    try:
        query_index = movie_matrix.index.get_loc(movie_name)
    except KeyError:
        print(f"Movie '{movie_name}' not found in the dataset.")
        return

    movie_vector = movie_matrix.iloc[query_index, :].values.reshape(1, -1)

    # One extra neighbour is requested because the query movie itself is
    # normally part of the result; never ask for more rows than the matrix has.
    n_neighbors = min(n_recommendations + 1, movie_matrix.shape[0])
    distances, indices = model_knn.kneighbors(movie_vector, n_neighbors=n_neighbors)

    # Remove the query by index rather than by position: when several movies
    # tie at the same distance the query is not guaranteed to come first.
    neighbours = [
        (idx, dist)
        for idx, dist in zip(indices.flatten(), distances.flatten())
        if idx != query_index
    ][:n_recommendations]

    print(f"Recommendations for '{movie_name}':\n")
    for rank, (idx, dist) in enumerate(neighbours, start=1):
        print(f"{rank}: {movie_matrix.index[idx]} (Distance: {dist:.4f})")


In [None]:
#Test User-Based Recommendations
test_movie = "Star Wars (1977)"

# Query the requested title when the matrix contains it; otherwise fall back
# to the first title in the matrix so the cell still produces output.
if test_movie not in movie_matrix.index:
    recommend_movie(movie_matrix.index[0])
else:
    recommend_movie(test_movie)


Recommendations for 'Star Wars (1977)':

1: Return of the Jedi (1983) (Distance: 0.1155)
2: Raiders of the Lost Ark (1981) (Distance: 0.2351)
3: Empire Strikes Back, The (1980) (Distance: 0.2502)
4: Toy Story (1995) (Distance: 0.2654)
5: Godfather, The (1972) (Distance: 0.3027)


In [8]:
# Transpose matrix for item-based similarity
# Pivot with one row per user_id and one column per title; missing ratings become 0.
item_matrix = df.pivot_table(index='user_id', columns='title', values='rating').fillna(0)

# Item-based kNN needs one sample per movie, so fit on the transpose
# (rows = titles, columns = users). Fitting on the untransposed matrix makes the
# samples users, and the per-movie query vectors built by recommend_item_based
# (one entry per user) are then rejected for having the wrong number of features.
item_csr = csr_matrix(item_matrix.T.values)

# Train kNN for item-based
item_knn = NearestNeighbors(metric='cosine', algorithm='brute')
item_knn.fit(item_csr)

def recommend_item_based(movie_name, n_recommendations=5):
    """
    Print the movies nearest to ``movie_name`` according to ``item_knn``.

    The column of ``item_matrix`` for ``movie_name`` (one entry per user) is
    used as the query, and returned neighbour indices are mapped back to
    titles through ``item_matrix.columns``.

    NOTE(review): this assumes ``item_knn`` was fitted with one row per movie,
    in the same order as ``item_matrix.columns`` and with one feature per
    user -- confirm against the cell that fits ``item_knn``.

    Parameters
    ----------
    movie_name : str
        Title exactly as it appears in ``item_matrix.columns``.
    n_recommendations : int, default 5
        Maximum number of movies to print.

    Returns
    -------
    None
        Results are printed. If the title is not in ``item_matrix.columns``
        a "not found" message is printed instead.
    """
    try:
        query_index = item_matrix.columns.get_loc(movie_name)
    except KeyError:
        print(f"Movie '{movie_name}' not found in the dataset.")
        return

    movie_vector = item_matrix.iloc[:, query_index].values.reshape(1, -1)

    # One extra neighbour is requested because the query movie itself is
    # normally part of the result; never ask for more than there are titles.
    n_neighbors = min(n_recommendations + 1, item_matrix.shape[1])
    distances, indices = item_knn.kneighbors(movie_vector, n_neighbors=n_neighbors)

    # Remove the query by index rather than by position: when several movies
    # tie at the same distance the query is not guaranteed to come first.
    neighbours = [
        (idx, dist)
        for idx, dist in zip(indices.flatten(), distances.flatten())
        if idx != query_index
    ][:n_recommendations]

    print(f"Item-Based Recommendations for '{movie_name}':\n")
    for rank, (idx, dist) in enumerate(neighbours, start=1):
        print(f"{rank}: {item_matrix.columns[idx]} (Distance: {dist:.4f})")


In [16]:
test_movie = "Star Wars (1977)"

# NOTE(review): this calls recommend_movie again, exactly like the earlier test
# cell; if the intent was to exercise recommend_item_based, confirm and switch.
if test_movie not in movie_matrix.index:
    # Fall back to the first title in the matrix when the requested one is absent.
    first_movie = movie_matrix.index[0]
    recommend_movie(first_movie)
else:
    recommend_movie(test_movie)

Recommendations for 'Star Wars (1977)':

1: Return of the Jedi (1983) (Distance: 0.1155)
2: Raiders of the Lost Ark (1981) (Distance: 0.2351)
3: Empire Strikes Back, The (1980) (Distance: 0.2502)
4: Toy Story (1995) (Distance: 0.2654)
5: Godfather, The (1972) (Distance: 0.3027)


In [None]:
# Prepare data for surprise library
# Ratings are declared to lie on a 1-5 scale; titles serve as the item ids.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['user_id', 'title', 'rating']], reader)
trainset, testset = surprise_train_test_split(data, test_size=0.2, random_state=42)

# Train SVD model
# Seed the model as well as the split: SVD initialises its factors randomly, so
# without a fixed random_state the predictions differ on every re-run.
svd_model = SVD(random_state=42)
svd_model.fit(trainset)

# Predict top-n movies for a given user
def recommend_svd(user_id, n_recommendations=5):
    """
    Print the titles with the highest SVD-estimated rating for one user.

    Every distinct title in ``df`` is scored with ``svd_model.predict`` and the
    ``n_recommendations`` highest estimates are printed, best first. Titles are
    not filtered, so movies the user has already rated can appear.
    """
    ranked = sorted(
        (svd_model.predict(user_id, title) for title in df['title'].unique()),
        key=lambda p: p.est,
        reverse=True,
    )

    print(f"Top {n_recommendations} recommended movies for User {user_id}:\n")
    for rank, pred in enumerate(ranked[:n_recommendations], start=1):
        print(f"{rank}: {pred.iid} (Predicted Rating: {pred.est:.2f})")


In [None]:
# Test SVD Recommendations
# Print the five highest-estimated titles for user 242.
recommend_svd(user_id=242, n_recommendations=5)

In [25]:
def precision_at_k(recommended, actual, k=5):
    """
    Fraction of the first ``k`` recommendations that appear in ``actual``.

    Parameters
    ----------
    recommended : sequence
        Recommended items, best first. Only the first ``k`` are considered.
    actual : container
        Items counted as relevant; membership is tested with ``in``.
    k : int, default 5
        Cut-off rank. Must be positive.

    Returns
    -------
    float
        ``hits / k``. The denominator is always ``k``, even when fewer than
        ``k`` items were recommended.

    Raises
    ------
    ValueError
        If ``k`` is zero or negative. Without this check ``k=0`` fails with a
        ZeroDivisionError and a negative ``k`` yields a meaningless negative score.
    """
    if k <= 0:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    hits = sum(1 for movie in recommended[:k] if movie in actual)
    return hits / k

In [20]:
test_user_id = 242

# Movies that the user actually rated positively (rating >=4)
user_actual_movies = df.loc[
    (df['user_id'] == test_user_id) & (df['rating'] >= 4),
    'title',
].tolist()
print(f"User {test_user_id} actually liked {len(user_actual_movies)} movies.")


User 242 actually liked 17 movies.


In [21]:
def get_user_based_recommendations(user_movie_name, n_recommendations=10):
    """
    Return the titles nearest to ``user_movie_name`` according to ``model_knn``.

    Parameters
    ----------
    user_movie_name : str
        Title exactly as it appears in ``movie_matrix.index``.
    n_recommendations : int, default 10
        Maximum number of titles to return.

    Returns
    -------
    list
        Neighbouring titles, nearest first, excluding the query title itself.
        Empty when the title is not in ``movie_matrix.index``.
    """
    try:
        query_index = movie_matrix.index.get_loc(user_movie_name)
    except KeyError:
        return []

    movie_vector = movie_matrix.iloc[query_index, :].values.reshape(1, -1)

    # One extra neighbour is requested because the query movie itself is
    # normally part of the result; never ask for more rows than the matrix has.
    n_neighbors = min(n_recommendations + 1, movie_matrix.shape[0])
    _, indices = model_knn.kneighbors(movie_vector, n_neighbors=n_neighbors)

    # Remove the query by index rather than by position: when several movies
    # tie at the same distance the query is not guaranteed to come first.
    neighbour_ids = [i for i in indices.flatten() if i != query_index]
    return [movie_matrix.index[i] for i in neighbour_ids[:n_recommendations]]

# Example: use one of the user's highly-rated movies
# Only the first liked title seeds the neighbour search; skipped when the user has no liked movies.
if len(user_actual_movies) > 0:
    recommended_movies = get_user_based_recommendations(
        user_actual_movies[0],
        n_recommendations=5,
    )
    print("Recommended Movies:", recommended_movies)

Recommended Movies: ["It's My Party (1995)", 'Celluloid Closet, The (1995)', 'Sum of Us, The (1994)', 'Wild Reeds (1994)', 'Kiss Me, Guido (1997)']


In [22]:
# Score the kNN recommendations against the movies the test user liked
# (skipped when that list is empty).
if len(user_actual_movies) > 0:
    p_at_5 = precision_at_k(recommended=recommended_movies, actual=user_actual_movies, k=5)
    print(f"Precision@5 for user {test_user_id}: {p_at_5:.2f}")

Precision@5 for user 242: 0.00


In [None]:
def get_svd_recommendations(user_id, n_recommendations=10):
    """
    Return the titles with the highest SVD-estimated rating for one user.

    Every distinct title in ``df`` is scored with ``svd_model.predict``; the
    ``n_recommendations`` best-scoring titles are returned, best first. Titles
    are not filtered, so movies the user has already rated can be included.
    """
    ranked = sorted(
        (svd_model.predict(user_id, title) for title in df['title'].unique()),
        key=lambda p: p.est,
        reverse=True,
    )

    top_titles = []
    for pred in ranked[:n_recommendations]:
        top_titles.append(pred.iid)
    return top_titles

# Compute Precision@5 for SVD
# NOTE(review): user_actual_movies is drawn from all of df, which also supplied
# the SVD training split, so this score may count ratings the model was trained
# on -- confirm whether a held-out evaluation is intended.
svd_recommended = get_svd_recommendations(user_id=test_user_id, n_recommendations=5)
p_at_5_svd = precision_at_k(recommended=svd_recommended, actual=user_actual_movies, k=5)
print(f"SVD Precision@5 for user {test_user_id}: {p_at_5_svd:.2f}")