# Project 4

In [22]:
import pandas as pd
import numpy as np
from IPython.display import display, HTML


## System I: Recommendation Based on Popularity

In [16]:
ratings = pd.read_csv(
    "ml-1m/ratings.dat", 
    sep=':',
    header=None,
    usecols=[0, 2, 4, 6],
    names=['UserID', 'MovieID', 'Rating', 'Timestamp'],
    dtype={'UserID': 'int', 'MovieID': 'int', 'Rating': 'int', 'Timestamp': 'int'}
)

ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [17]:
with open("ml-1m/movies.dat", 'r', encoding='latin1') as file:
    movies_raw = file.readlines()

movies = pd.DataFrame([line.strip().split("::") for line in movies_raw], columns=['MovieID', 'Title', 'Genres'])

movies['MovieID'] = movies['MovieID'].astype(int)
movies['Year'] = movies['Title'].str.extract(r'\((\d{4})\)').astype(int)

movies.head()

Unnamed: 0,MovieID,Title,Genres,Year
0,1,Toy Story (1995),Animation|Children's|Comedy,1995
1,2,Jumanji (1995),Adventure|Children's|Fantasy,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,1995
3,4,Waiting to Exhale (1995),Comedy|Drama,1995
4,5,Father of the Bride Part II (1995),Comedy,1995


In [20]:
users = pd.read_csv(
    "ml-1m/users.dat", 
    sep=':',
    header=None,
    usecols=[0, 2, 4, 6, 8],
    names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'],
    dtype={'UserID': 'int', 'Age': 'int'}
)

users.head()

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [23]:
n_min = 100 # min number of ratings to be deemed relevant

movie_stats = ratings.groupby('MovieID').agg(
    RatingsCount=('Rating', 'size'), # number of ratings per movie
    AverageRating=('Rating', 'mean') # average rating per movie
).reset_index()

filtered_stats = movie_stats[movie_stats['RatingsCount'] >= n_min]

top_movies = filtered_stats.sort_values('AverageRating', ascending=False).head(10)

# Merge with the movies DataFrame to get movie titles
top_movies = pd.merge(movies_df, top_movies, on='MovieID')

# Display the results
print(top_movies[['Title', 'AverageRating', 'RatingsCount']])


                                               Title  AverageRating  \
0                         Usual Suspects, The (1995)       4.517106   
1                   Shawshank Redemption, The (1994)       4.554558   
2                            Schindler's List (1993)       4.510417   
3                              Close Shave, A (1995)       4.520548   
4                              Godfather, The (1972)       4.524966   
5                                 Rear Window (1954)       4.476190   
6      Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)       4.491489   
7                         Wrong Trousers, The (1993)       4.507937   
8                     Raiders of the Lost Ark (1981)       4.477725   
9  Seven Samurai (The Magnificent Seven) (Shichin...       4.560510   

   RatingsCount  
0          1783  
1          2227  
2          2304  
3           657  
4          2223  
5          1050  
6           470  
7           882  
8          2514  
9           628  


In [24]:
images_folder = "MovieImages/"

# Assume `top_movies` is your DataFrame with the top movie recommendations
# Add a new column with the image HTML
top_movies['Image'] = top_movies['MovieID'].apply(
    lambda x: f'<img src="{images_folder}{x}.jpg" style="width:100px;height:auto;">'
)

# Select columns to display, including the new image column
columns_to_display = ['Title', 'AverageRating', 'RatingsCount', 'Image']

# Render the DataFrame with HTML in a Jupyter Notebook
display(HTML(top_movies[columns_to_display].to_html(escape=False, index=False)))

Title,AverageRating,RatingsCount,Image
"Usual Suspects, The (1995)",4.517106,1783,
"Shawshank Redemption, The (1994)",4.554558,2227,
Schindler's List (1993),4.510417,2304,
"Close Shave, A (1995)",4.520548,657,
"Godfather, The (1972)",4.524966,2223,
Rear Window (1954),4.47619,1050,
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950),4.491489,470,
"Wrong Trousers, The (1993)",4.507937,882,
Raiders of the Lost Ark (1981),4.477725,2514,
Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954),4.56051,628,
