# Project 4

In [1]:
import pandas as pd
import numpy as np
from IPython.display import display, HTML

In [2]:
ratings = pd.read_csv(
    "ml-1m/ratings.dat", 
    sep=':',
    header=None,
    usecols=[0, 2, 4, 6],
    names=['UserID', 'MovieID', 'Rating', 'Timestamp'],
    dtype={'UserID': 'int', 'MovieID': 'int', 'Rating': 'int', 'Timestamp': 'int'}
)

ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [3]:
with open("ml-1m/movies.dat", 'r', encoding='latin1') as file:
    movies_raw = file.readlines()

movies = pd.DataFrame([line.strip().split("::") for line in movies_raw], columns=['MovieID', 'Title', 'Genres'])

movies['MovieID'] = movies['MovieID'].astype(int)
movies['Year'] = movies['Title'].str.extract(r'\((\d{4})\)').astype(int)

movies.head()

Unnamed: 0,MovieID,Title,Genres,Year
0,1,Toy Story (1995),Animation|Children's|Comedy,1995
1,2,Jumanji (1995),Adventure|Children's|Fantasy,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,1995
3,4,Waiting to Exhale (1995),Comedy|Drama,1995
4,5,Father of the Bride Part II (1995),Comedy,1995


In [4]:
users = pd.read_csv(
    "ml-1m/users.dat", 
    sep=':',
    header=None,
    usecols=[0, 2, 4, 6, 8],
    names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'],
    dtype={'UserID': 'int', 'Age': 'int'}
)

users.head()

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


## System I: Recommendation Based on Popularity

We create a weighted ranking based on the number of 5-star and 4-star ratings, as well as the total number of ratings. It gives priority to movies with high ratings, while also accounting for overall popularity. 

The weighting score is determined by summing the following weighted components:

* <b>n_5_star</b>: Number of 5-star ratings. It carries the most weight (0.5) since it indicates high satisfaction
* <b>n_4_star</b>: Number of 4-star ratings. It has a moderate weight (0.3) as it also signifies positive feedback
* <b>n_ratings</b>: Total number of ratings. It ensures that popular movies with a broader audience also get attention

Additionally, we only consider movies that have at least 50 ratings

In [5]:
n_min = 50 # min number of ratings to be deemed relevant

rating_counts = ratings.groupby(['MovieID', 'Rating']).size().unstack(fill_value=0)

rating_counts['n_5_star'] = rating_counts.get(5, 0)
rating_counts['n_4_star'] = rating_counts.get(4, 0)
rating_counts['n_ratings'] = rating_counts.sum(axis=1)

rating_counts['WeightedScore'] = (
    0.5 * rating_counts['n_5_star'] +
    0.3 * rating_counts['n_4_star'] +
    0.2 * rating_counts['n_ratings']
)

movies_with_scores = pd.merge(movies, rating_counts.reset_index(), on='MovieID')

movies_with_scores = movies_with_scores[movies_with_scores['n_ratings'] >= n_min]

top_movies = movies_with_scores.sort_values('WeightedScore', ascending=False).head(10)


images_folder = "MovieImages/"

top_movies['Image'] = top_movies['MovieID'].apply(
    lambda x: f'<img src="{images_folder}{x}.jpg" style="width:100px;height:auto;">'
)

columns_to_display = ['Image', 'MovieID', 'Title', 'WeightedScore']
display(HTML(top_movies[columns_to_display].sort_values('WeightedScore', ascending=False).to_html(escape=False, index=False)))

Image,MovieID,Title,WeightedScore
,2858,American Beauty (1999),2504.7
,260,Star Wars: Episode IV - A New Hope (1977),2274.4
,1196,Star Wars: Episode V - The Empire Strikes Back (1980),2149.6
,2028,Saving Private Ryan (1998),1941.6
,1198,Raiders of the Lost Ark (1981),1932.8
,593,"Silence of the Lambs, The (1991)",1911.6
,2571,"Matrix, The (1999)",1889.5
,2762,"Sixth Sense, The (1999)",1850.3
,1210,Star Wars: Episode VI - Return of the Jedi (1983),1845.7
,608,Fargo (1996),1795.2


# System II: Recommendation Based on IBCF

In [6]:
Rmat = pd.read_csv("rmat.csv")

In [7]:
# step 1: normalize
row_means = Rmat.mean(axis=1, skipna=True)
R_centered = Rmat.sub(row_means, axis=0)

In [8]:
R_centered

Unnamed: 0,m1,m10,m100,m1000,m1002,m1003,m1004,m1005,m1006,m1007,...,m99,m990,m991,m992,m993,m994,m996,m997,m998,m999
u1,0.811321,,,,,,,,,,...,,,,,,,,,,
u10,0.885287,,,,,,,,,,...,,,,,,,,,,
u100,,,,,,,,,,,...,,,,,,,,,,
u1000,0.869048,,,,,,,,,,...,,,,,,,,,,
u1001,0.347480,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
u995,,,,,,,,,,,...,,,,,,,,,,
u996,0.064189,,,,,,,,,,...,,,,,,,,,,-0.935811
u997,0.066667,,,,,,,,,,...,,,,,,,,,,
u998,,,,,,,,,,,...,,,,,,,,,,


In [None]:
# step 2 - compute cosine sim matrix
def cosine_sim(movie_i, movie_j, R):
    ratings_i = R.iloc[:, movie_i]
    ratings_j = R.iloc[:, movie_j]
    
    # ignore pairs with less than 3 common ratings
    valid_users = ~ratings_i.isna() & ~ratings_j.isna()
    if valid_users.sum() < 3:
        return np.nan
    
    R_i = ratings_i[valid_users]
    R_j = ratings_j[valid_users]
    
    numerator = np.sum(R_i * R_j)
    denominator = np.sqrt(np.sum(R_i ** 2)) * np.sqrt(np.sum(R_j ** 2))
    
    if denominator == 0:
        return np.nan
    
    return 0.5 + (0.5 * numerator / denominator)

n_movies = R_centered.shape[1]
sim_matrix = np.full((n_movies, n_movies), np.nan)

for i in range(n_movies):
    for j in range(i + 1, n_movies):
        sim = cosine_sim(i, j, R_centered)
        sim_matrix[i, j] = sim
        sim_matrix[j, i] = sim

sim_df = pd.DataFrame(sim_matrix, columns=R_centered.columns, index=R_centered.columns)

sim_df.head()

In [None]:
# step 3
def sort_and_keep_top_k(mat, top_k=30):
    sorted_mat = mat.copy()
    
    for i in range(sorted_mat.shape[0]):
        row = sorted_mat.iloc[i].sort_values(ascending=False, na_position='last')
        row[:top_k] = row[:top_k]
        sorted_mat.iloc[i] = row
        
    return sorted_mat

sorted_sim_mat = sort_and_keep_top_k(sim_df, top_k=30)

sorted_sim_mat.to_csv("transformed_similarity_matrix.csv")

specified_movies = [1, 10, 100, 1510, 260, 3212]

pairwise_similarities = sorted_sim_mat.iloc[specified_movies, specified_movies]

pairwise_similarities_rounded = pairwise_similarities.round(7)

print(pairwise_similarities_rounded)
