In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Load the dataset
ratings_df = pd.read_csv('./dataset/ratings.csv')
movies_df = pd.read_csv('./dataset/movies.csv')

In [3]:
# Split the ratings data based on user IDs
train_data, test_data = train_test_split(ratings_df, test_size=0.5, stratify=ratings_df['userId'])

In [4]:
# Create a recommendation model (you can use collaborative filtering or other methods)
# For collaborative filtering using surprise library
from surprise import Dataset, Reader
from surprise import SVD
from surprise.model_selection import cross_validate

In [5]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(train_data[['userId', 'movieId', 'rating']], reader)
trainset = data.build_full_trainset()

In [6]:
# Use SVD algorithm
algo = SVD()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7ee4ef655ed0>

In [7]:
# Make recommendations for a user
def get_top_n_recommendations(user_id, n=10):
    user_movies = train_data[train_data['userId'] == user_id]['movieId']
    user_unseen_movies = movies_df[~movies_df['movieId'].isin(user_movies)]['movieId']
    user_unseen_movies = user_unseen_movies[:n] if len(user_unseen_movies) >= n else user_unseen_movies
    user_unseen_movies = [(user_id, movie_id, 3.5) for movie_id in user_unseen_movies]
    predictions = algo.test(user_unseen_movies)
    top_n = [(pred.iid, pred.est) for pred in predictions]
    top_n.sort(key=lambda x: x[1], reverse=True)
    return top_n[:n]

In [8]:
user_id = 1
top_movies = get_top_n_recommendations(user_id)
print(f"Top {len(top_movies)} movie recommendations for user {user_id}:")
for movie_id, rating in top_movies:
    movie_title = movies_df[movies_df['movieId'] == movie_id]['title'].values[0]
    print(f"{movie_title} (predicted rating: {rating})")

Top 10 movie recommendations for user 1:
Heat (1995) (predicted rating: 4.754831314999759)
Toy Story (1995) (predicted rating: 4.626934715034225)
GoldenEye (1995) (predicted rating: 4.189714511846028)
American President, The (1995) (predicted rating: 4.093413326610825)
Waiting to Exhale (1995) (predicted rating: 4.061833572475976)
Jumanji (1995) (predicted rating: 3.9070561309133325)
Tom and Huck (1995) (predicted rating: 3.905631892269317)
Sudden Death (1995) (predicted rating: 3.88437590933953)
Sabrina (1995) (predicted rating: 3.770371508057371)
Father of the Bride Part II (1995) (predicted rating: 3.665155951745617)


In [9]:
from surprise import accuracy

# Make predictions on the test dataset
testset = Dataset.load_from_df(test_data[['userId', 'movieId', 'rating']], reader).build_full_trainset().build_testset()
predictions = algo.test(testset)

# Calculate RMSE
rmse = accuracy.rmse(predictions)
print(f"RMSE: {rmse}")

RMSE: 0.8904
RMSE: 0.8903653270313755
