# Objective:
- Step 1: Identify the top 3 most-rated movies from your dataset to present to the user.
- Step 2: Get the new user's ratings for those movies.
- Step 3: Add the new user and their ratings to the model.
- Step 4: Generate movie recommendations for the new user.

In [1]:
import kagglehub
import os
import pandas as pd

In [2]:
# Fetch the latest version of the MovieLens 100K dataset through kagglehub
# and report the directory the files were placed in.
dataset_handle = "prajitdatta/movielens-100k-dataset"
path = kagglehub.dataset_download(dataset_handle)
print("Path to dataset files:", path)

Path to dataset files: /Users/nicholasqualitza/.cache/kagglehub/datasets/prajitdatta/movielens-100k-dataset/versions/1


In [3]:
# Locate the "ml-100k" folder inside the download directory that
# kagglehub.dataset_download returned (`path`). Deriving it from `path`
# instead of a hard-coded, user-specific absolute path keeps the notebook
# runnable on any machine.
ml_100k_path = os.path.join(path, "ml-100k")
# Directory listing, kept for inspecting which dataset files are available.
files_in_ml_100k = os.listdir(ml_100k_path)

In [4]:
# Load the ratings file (tab-separated, no header row).
ratings_columns = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings_path = os.path.join(ml_100k_path, "u.data")
ratings = pd.read_csv(ratings_path, sep='\t', names=ratings_columns)
print(ratings.head())

# Load the movie catalogue (pipe-separated, ISO-8859-1 encoded).
# Only movie_id and title are kept; the remaining 22 fields receive
# placeholder names (col_0 .. col_21) and are dropped through `usecols`.
kept_columns = ['movie_id', 'title']
placeholder_columns = [f'col_{i}' for i in range(22)]
movies_path = os.path.join(ml_100k_path, "u.item")
movies = pd.read_csv(
    movies_path,
    sep='|',
    encoding='ISO-8859-1',
    names=kept_columns + placeholder_columns,
    usecols=kept_columns,
)
print(movies.head())

   user_id  movie_id  rating  timestamp
0      196       242       3  881250949
1      186       302       3  891717742
2       22       377       1  878887116
3      244        51       2  880606923
4      166       346       1  886397596
   movie_id              title
0         1   Toy Story (1995)
1         2   GoldenEye (1995)
2         3  Four Rooms (1995)
3         4  Get Shorty (1995)
4         5     Copycat (1995)


In [5]:
# Attach the movie title to every rating by joining on the shared movie_id key.
data = ratings.merge(movies, on='movie_id')
data.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)


In [6]:
# Count how many ratings each movie received, then keep the three movies
# with the highest counts.
ratings_per_movie = data.groupby('movie_id').size()
most_rated_movies = ratings_per_movie.sort_values(ascending=False).head(3)
# The index of the counts Series holds the movie IDs.
top_movie_ids = most_rated_movies.index.tolist()
print("Top 3 most-rated movies:", top_movie_ids)

Top 3 most-rated movies: [50, 258, 100]


In [7]:
# In a real-world scenario these ratings would be collected from the user
# through an API; for now they are simulated for testing.
simulated_scores = [4.0, 5.0, 3.5]
new_user_ratings = [
    {'user_id': 'new_user', 'movie_id': top_movie_ids[position], 'rating': score}
    for position, score in enumerate(simulated_scores)
]

In [8]:
# Turn the list of rating records into a DataFrame (one row per rating).
new_user_df = pd.DataFrame(data=new_user_ratings)

In [9]:
# Append the new user's ratings to the full ratings table.
# ignore_index=True rebuilds a clean 0..n-1 index; without it the appended
# rows keep their own labels (0, 1, 2) and collide with existing rows, which
# makes any later label-based lookup ambiguous.
# Note: new_user_df has no 'timestamp' or 'title' columns, so those values
# are NaN for the appended rows.
maindf_with_new_user = pd.concat([data, new_user_df], ignore_index=True)
# Sanity check: the combined frame should be exactly len(new_user_df) rows longer.
print(len(data))
print(len(maindf_with_new_user))

100000
100003


In [10]:
from surprise import Dataset, Reader

# Wrap the updated ratings in a Surprise dataset. Only the user, item and
# rating columns are passed, together with a reader declaring the 1-5 scale.
rating_columns = ['user_id', 'movie_id', 'rating']
reader = Reader(rating_scale=(1, 5))
data_with_new_user = Dataset.load_from_df(maindf_with_new_user[rating_columns], reader)

# Build a trainset from the whole dataset (no hold-out split).
trainset = data_with_new_user.build_full_trainset()

In [11]:
# Load the previously saved model from disk.
import pickle

# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load model files that come from a trusted source.
with open('final_svd_model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

print("Model loaded successfully")

Model loaded successfully


In [12]:
# Fit the loaded SVD model on the trainset that now includes the new user's
# ratings, so the model can produce predictions for 'new_user'.
# NOTE(review): Surprise's fit() presumably retrains from scratch rather than
# updating incrementally, so the pickle would mainly supply hyperparameters -
# confirm against the Surprise documentation.
loaded_model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x122e49f50>

In [13]:
# Candidate pool for recommendations: every distinct movie in the ratings data.
all_movie_ids = data['movie_id'].unique()

# Drop the movies the new user has already rated.
rated_movie_ids = [entry['movie_id'] for entry in new_user_ratings]
unrated_movie_ids = []
for candidate_id in all_movie_ids:
    if candidate_id in rated_movie_ids:
        continue
    unrated_movie_ids.append(candidate_id)

In [14]:
# Predict the new user's rating for every unrated movie and collect
# (movie_id, estimated rating) pairs.
recommendations = [
    (candidate_id, loaded_model.predict('new_user', candidate_id).est)
    for candidate_id in unrated_movie_ids
]

In [15]:
# Order the recommendations by predicted rating, highest first.
recommendations.sort(key=lambda pair: pair[1], reverse=True)

In [16]:
# Show the ten highest-scoring movie IDs for the new user
# (the list is already sorted best-first).
print("top 10 recommendations for new user:")
top_ten = recommendations[:10]
for movie_id, rating in top_ten:
    print(f"Movie ID: {movie_id}, Predicted Rating: {rating:.2f}")

top 10 recommendations for new user:
Movie ID: 1449, Predicted Rating: 4.55
Movie ID: 408, Predicted Rating: 4.51
Movie ID: 318, Predicted Rating: 4.50
Movie ID: 64, Predicted Rating: 4.49
Movie ID: 483, Predicted Rating: 4.45
Movie ID: 169, Predicted Rating: 4.45
Movie ID: 114, Predicted Rating: 4.43
Movie ID: 272, Predicted Rating: 4.37
Movie ID: 603, Predicted Rating: 4.35
Movie ID: 178, Predicted Rating: 4.34


In [19]:
# Turn the (movie_id, predicted rating) pairs into a two-column DataFrame.
recommendation_columns = ['movie_id', 'predicted_rating']
recommendation_df = pd.DataFrame(data=recommendations, columns=recommendation_columns)

In [22]:
# Look up a title for each recommended movie: build a de-duplicated
# (movie_id, title) table from the combined ratings frame and join on movie_id.
title_lookup = maindf_with_new_user[['movie_id', 'title']].drop_duplicates()
top_recommendations = pd.merge(recommendation_df, title_lookup, on='movie_id')

In [23]:
# Order by predicted rating, best first, then show the ten highest-scoring titles.
top_recommendations = top_recommendations.sort_values('predicted_rating', ascending=False)

print("Top 10 recommended movies:")
display_columns = ['title', 'predicted_rating']
print(top_recommendations[display_columns].head(10))

Top 10 recommended movies:
                                               title  predicted_rating
0                             Pather Panchali (1955)          4.549928
1                              Close Shave, A (1995)          4.507649
2                            Schindler's List (1993)          4.497812
3                   Shawshank Redemption, The (1994)          4.491347
4                                  Casablanca (1942)          4.454427
5                         Wrong Trousers, The (1993)          4.448903
6  Wallace & Gromit: The Best of Aardman Animatio...          4.429150
7                           Good Will Hunting (1997)          4.365520
8                                 Rear Window (1954)          4.352151
9                                12 Angry Men (1957)          4.339688


In [31]:
# Show the titles of the movies the new user originally rated, to compare
# them with the recommended list. The IDs come from top_movie_ids (the same
# list the simulated ratings were built from) rather than hard-coded values,
# so this check stays correct if the data or the top-3 selection changes.
data.loc[data['movie_id'].isin(top_movie_ids), 'title'].unique()

array(['Fargo (1996)', 'Contact (1997)', 'Star Wars (1977)'], dtype=object)