# Question 6: Movie Recommendation System

In [3]:
import pandas as pd
import numpy as np

movie_data_path = '../data//MovieData/ratings_small.csv'
movie_data = pd.read_csv(movie_data_path)
movie_data.drop('timestamp', axis=1, inplace=True)
unique_movies = movie_data['movieId'].unique()
uniique_users = movie_data['userId'].unique()

print(movie_data.head())
print(f'SHAPE : {movie_data.shape}')
print(f'DESCRIPTION : \n{movie_data.describe()}')
print(f'INFO : \n{movie_data.info()}')
print(f'Total Unique Movies : {len(unique_movies)}')
print(f'Total Unique Users : {len(uniique_users)}')

   userId  movieId  rating
0       1       31     2.5
1       1     1029     3.0
2       1     1061     3.0
3       1     1129     2.0
4       1     1172     4.0
SHAPE : (100004, 3)
DESCRIPTION : 
              userId        movieId         rating
count  100004.000000  100004.000000  100004.000000
mean      347.011310   12548.664363       3.543608
std       195.163838   26369.198969       1.058064
min         1.000000       1.000000       0.500000
25%       182.000000    1028.000000       3.000000
50%       367.000000    2406.500000       4.000000
75%       520.000000    5418.000000       4.000000
max       671.000000  163949.000000       5.000000
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100004 entries, 0 to 100003
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   userId   100004 non-null  int64  
 1   movieId  100004 non-null  int64  
 2   rating   100004 non-null  float64
dtypes: float64(1), int64(2)
memory usa

In [4]:
active_users = movie_data['userId'].value_counts().head(1000).index
popular_movies = movie_data['movieId'].value_counts().head(1000).index

subset_data = movie_data[movie_data['userId'].isin(active_users) & movie_data['movieId'].isin(popular_movies)]
print(subset_data.head())
print(f'SHAPE : {subset_data.shape}')

   userId  movieId  rating
0       1       31     2.5
1       1     1029     3.0
2       1     1061     3.0
3       1     1129     2.0
4       1     1172     4.0
SHAPE : (62397, 3)


In [5]:
user_movie_matrix = subset_data.pivot_table(index='userId', columns='movieId', values='rating')
user_movie_matrix.fillna(0, inplace=True)
print(user_movie_matrix.head())

movieId  1       2       3       5       6       7       10      11      \
userId                                                                    
1           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
2           0.0     0.0     0.0     0.0     0.0     0.0     4.0     0.0   
3           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
4           0.0     0.0     0.0     0.0     0.0     0.0     4.0     0.0   
5           0.0     0.0     4.0     0.0     0.0     0.0     0.0     0.0   

movieId  14      16      ...  106920  109374  109487  111759  112552  112852  \
userId                   ...                                                   
1           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
2           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
3           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
4           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0

In [6]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

user_similarity_matrix = cosine_similarity(user_movie_matrix)
np.fill_diagonal(user_similarity_matrix, 0)
user_similarity_df = pd.DataFrame(user_similarity_matrix, index=user_movie_matrix.index, columns=user_movie_matrix.index)
print(user_similarity_df.head())

userId       1         2         3         4         5         6         7    \
userId                                                                         
1       0.000000  0.000000  0.000000  0.085142  0.016905  0.000000  0.085997   
2       0.000000  0.000000  0.146985  0.143051  0.109724  0.000000  0.229968   
3       0.000000  0.146985  0.000000  0.104785  0.171020  0.069457  0.178091   
4       0.085142  0.143051  0.104785  0.000000  0.150117  0.092800  0.368138   
5       0.016905  0.109724  0.171020  0.150117  0.000000  0.065360  0.098811   

userId       8         9         10   ...       662       663       664  \
userId                                ...                                 
1       0.000000  0.013781  0.000000  ...  0.000000  0.000000  0.018491   
2       0.123408  0.128084  0.056379  ...  0.511752  0.122564  0.104607   
3       0.277715  0.162022  0.126357  ...  0.184262  0.132724  0.252653   
4       0.226033  0.037311  0.194262  ...  0.133033  0.099404  0

In [7]:
print(user_similarity_df.index.max())

random_user = np.random.choice(user_movie_matrix.index)
print(random_user)

671
170


In [8]:
def get_top_similar_users(userId, user_similarity_df, threshold = 0, n =10):
    user_similarity_scores = user_similarity_df.iloc[userId, :]
    if threshold > 0:
        similar_users = user_similarity_scores[user_similarity_scores > threshold].sort_values(ascending=False). head(n)
    else:
        similar_users = user_similarity_scores.sort_values(ascending=False).head(n)
    return similar_users

In [9]:
similar_users = get_top_similar_users(random_user, user_similarity_df, threshold=0, n=10)
print(similar_users)
similar_users.index.to_list()

userId
476    0.368285
467    0.357991
190    0.351228
644    0.347794
626    0.293841
647    0.276544
457    0.274282
669    0.273889
608    0.266694
95     0.261030
Name: 171, dtype: float64


[476, 467, 190, 644, 626, 647, 457, 669, 608, 95]

In [10]:
similar_user_ratings = movie_data[movie_data['userId'].isin(similar_users.index.to_list())]
print(similar_user_ratings.head())

       userId  movieId  rating
14511      95        6     5.0
14512      95        9     3.0
14513      95       11     4.0
14514      95       16     5.0
14515      95       19     4.0


In [11]:
# Candidate scoring: 
# Add up ratings for each item, weighted by user similarity
from collections import defaultdict
scores=defaultdict(float)
for index, similar_user_rating in similar_user_ratings.iterrows():
    movie_id = similar_user_rating['movieId']
    user_rating = similar_user_rating['rating']
    user_id = similar_user_rating['userId']
    user_similarity_score = similar_users[int(user_id)]
    scores[int(movie_id)] += (user_rating / 5.0) * user_similarity_score

In [12]:
movie_data.loc[movie_data['userId'] == random_user].sort_values(by='rating', ascending=False).head(10)

Unnamed: 0,userId,movieId,rating
23987,170,1805,4.5
23984,170,1373,4.0
23983,170,1299,3.5
23980,170,1240,3.0
23999,170,3703,3.0
23982,170,1272,3.0
23989,170,2020,3.0
23991,170,2144,3.0
23986,170,1682,3.0
23998,170,3702,3.0


In [13]:
watched = {}
for index, row in movie_data.loc[movie_data['userId'] == random_user].iterrows():
    watched[row['movieId']] = row['rating']

In [14]:
from operator import itemgetter

recommendations = {}
pos = 0

for movie, score in sorted(scores.items(), key=itemgetter(1), reverse=True):
    if movie not in watched:
        recommendations[movie] = score
        pos += 1
    if pos >= 10:
        break

for recommendation, score in recommendations.items():
    print(f"Movie : {recommendation} , Score : {score}")

Movie : 2858 , Score : 2.522261375063027
Movie : 2762 , Score : 2.2980588905835555
Movie : 2395 , Score : 2.185399007704273
Movie : 2396 , Score : 2.101008084421222
Movie : 2599 , Score : 2.0422973625828766
Movie : 2580 , Score : 2.0004255568880063
Movie : 2716 , Score : 1.7272531441502688
Movie : 2997 , Score : 1.7270988396799996
Movie : 2712 , Score : 1.7014712955050242
Movie : 2959 , Score : 1.6243966211861844


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

# Load the data
data_path = "../data/MovieData/ratings_small.csv"
movie_data = pd.read_csv(data_path)

# Drop the timestamp column
movie_data.drop("timestamp", axis=1, inplace=True)

# Split the data into training and test sets
train_data, test_data = train_test_split(movie_data, test_size=0.2, random_state=42)

# Create a user-item matrix for training
train_user_item_matrix = train_data.pivot(index="userId", columns="movieId", values="rating").fillna(0)

# Calculate user similarity using cosine similarity
from sklearn.metrics.pairwise import cosine_similarity
user_similarity_matrix = cosine_similarity(train_user_item_matrix)
user_similarity_df = pd.DataFrame(user_similarity_matrix, index=train_user_item_matrix.index, columns=train_user_item_matrix.index)

# Generate recommendations for users in the test set
def recommend_movies(user_id, user_similarity_df, train_user_item_matrix, top_n=10):
    if user_id not in user_similarity_df.index:
        return []  # No recommendations for users not in the training set
    similar_users = user_similarity_df.loc[user_id].sort_values(ascending=False).index
    scores = train_user_item_matrix.loc[similar_users].sum(axis=0)
    scores = scores.sort_values(ascending=False)
    recommended_movies = scores.index[:top_n]
    return recommended_movies

# Evaluate the recommendations
precision_list = []
recall_list = []
f1_list = []

for user_id in test_data["userId"].unique():
    # Ground truth: Movies the user interacted with in the test set
    ground_truth = test_data[test_data["userId"] == user_id]["movieId"].tolist()
    
    # Predicted: Top-N recommended movies
    if user_id in train_user_item_matrix.index:
        recommendations = recommend_movies(user_id, user_similarity_df, train_user_item_matrix, top_n=10)
    else:
        recommendations = []  # No recommendations for users not in the training set

    # Calculate precision, recall, and F1-score for this user
    if len(ground_truth) > 0:  # Avoid division by zero
        true_positives = len(set(recommendations) & set(ground_truth))
        precision = true_positives / len(recommendations) if len(recommendations) > 0 else 0
        recall = true_positives / len(ground_truth)
        f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) > 0 else 0

        precision_list.append(precision)
        recall_list.append(recall)
        f1_list.append(f1)

# Calculate average metrics across all users
average_precision = np.mean(precision_list)
average_recall = np.mean(recall_list)
average_f1 = np.mean(f1_list)

print(f"Precision: {average_precision:.4f}")
print(f"Recall: {average_recall:.4f}")
print(f"F1-score: {average_f1:.4f}")

Precision: 0.0811
Recall: 0.0481
F1-score: 0.0492


In [17]:
import numpy as np

# Create an item-user matrix for training
train_item_user_matrix = train_data.pivot(index="movieId", columns="userId", values="rating").fillna(0)

# Calculate item similarity using cosine similarity
item_similarity_matrix = cosine_similarity(train_item_user_matrix)
item_similarity_df = pd.DataFrame(item_similarity_matrix, index=train_item_user_matrix.index, columns=train_item_user_matrix.index)

# Predict ratings for a user
def predict_ratings(user_id, train_item_user_matrix, item_similarity_df, top_n=10):
    if user_id not in train_item_user_matrix.columns:
        return []  # No recommendations for users not in the training set
    
    user_ratings = train_item_user_matrix[user_id]
    
    # Compute scores for items
    scores = item_similarity_df.dot(user_ratings) / np.abs(item_similarity_df).sum(axis=1)
    scores = pd.Series(scores, index=train_item_user_matrix.index)
    
    # Exclude items the user has already rated
    rated_items = user_ratings[user_ratings > 0].index
    scores = scores.drop(index=rated_items)
    
    # Recommend top-N items
    recommended_items = scores.sort_values(ascending=False).head(top_n).index.tolist()
    return recommended_items

# Evaluate the recommendations
precision_list = []
recall_list = []
f1_list = []

for user_id in test_data["userId"].unique():
    # Ground truth: Movies the user interacted with in the test set
    ground_truth = test_data[test_data["userId"] == user_id]["movieId"].tolist()
    
    # Predicted: Top-N recommended movies
    recommendations = predict_ratings(user_id, train_item_user_matrix, item_similarity_df, top_n=10)
    
    # Calculate precision, recall, and F1-score for this user
    if len(ground_truth) > 0:  # Avoid division by zero
        true_positives = len(set(recommendations) & set(ground_truth))
        precision = true_positives / len(recommendations) if len(recommendations) > 0 else 0
        recall = true_positives / len(ground_truth)
        f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) > 0 else 0

        precision_list.append(precision)
        recall_list.append(recall)
        f1_list.append(f1)

# Calculate average metrics across all users
average_precision = np.mean(precision_list)
average_recall = np.mean(recall_list)
average_f1 = np.mean(f1_list)

print(f"Precision: {average_precision:.4f}")
print(f"Recall: {average_recall:.4f}")
print(f"F1-score: {average_f1:.4f}")

Precision: 0.0033
Recall: 0.0008
F1-score: 0.0011
