# Project 4

## Olivia Dalglish (od4) and Arindam Saha (saha2)

Contibutions: collaborated on approach to System 1, both coded respective portions and 

In [239]:
import pandas as pd
import numpy as np
from IPython.display import display, HTML

In [240]:
ratings = pd.read_csv(
    "ml-1m/ratings.dat", 
    sep=':',
    header=None,
    usecols=[0, 2, 4, 6],
    names=['UserID', 'MovieID', 'Rating', 'Timestamp'],
    dtype={'UserID': 'int', 'MovieID': 'int', 'Rating': 'int', 'Timestamp': 'int'}
)

ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [241]:
with open("ml-1m/movies.dat", 'r', encoding='latin1') as file:
    movies_raw = file.readlines()

movies = pd.DataFrame([line.strip().split("::") for line in movies_raw], columns=['MovieID', 'Title', 'Genres'])

movies['MovieID'] = movies['MovieID'].astype(int)
movies['MovieIDm'] = movies['MovieID'].apply(lambda x: f"m{x}")
movies['Year'] = movies['Title'].str.extract(r'\((\d{4})\)').astype(int)

movies.head()

Unnamed: 0,MovieID,Title,Genres,MovieIDm,Year
0,1,Toy Story (1995),Animation|Children's|Comedy,m1,1995
1,2,Jumanji (1995),Adventure|Children's|Fantasy,m2,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,m3,1995
3,4,Waiting to Exhale (1995),Comedy|Drama,m4,1995
4,5,Father of the Bride Part II (1995),Comedy,m5,1995


In [242]:
users = pd.read_csv(
    "ml-1m/users.dat", 
    sep=':',
    header=None,
    usecols=[0, 2, 4, 6, 8],
    names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'],
    dtype={'UserID': 'int', 'Age': 'int'}
)

users.head()

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


## System I: Recommendation Based on Popularity

We create a weighted ranking based on the number of 5-star and 4-star ratings, as well as the total number of ratings. It gives priority to movies with high ratings, while also accounting for overall popularity. 

The weighting score is determined by summing the following weighted components:

* <b>n_5_star</b>: Number of 5-star ratings. It carries the most weight (0.5) since it indicates high satisfaction
* <b>n_4_star</b>: Number of 4-star ratings. It has a moderate weight (0.3) as it also signifies positive feedback
* <b>n_ratings</b>: Total number of ratings. It ensures that popular movies with a broader audience also get attention

Additionally, we only consider movies that have at least 50 ratings

In [243]:
n_min = 50 # min number of ratings to be deemed relevant

rating_counts = ratings.groupby(['MovieID', 'Rating']).size().unstack(fill_value=0)

rating_counts['n_5_star'] = rating_counts.get(5, 0)
rating_counts['n_4_star'] = rating_counts.get(4, 0)
rating_counts['n_ratings'] = rating_counts.sum(axis=1)

rating_counts['WeightedScore'] = (
    0.5 * rating_counts['n_5_star'] +
    0.3 * rating_counts['n_4_star'] +
    0.2 * rating_counts['n_ratings']
)

movies_with_scores = pd.merge(movies, rating_counts.reset_index(), on='MovieID')

movies_with_scores = movies_with_scores[movies_with_scores['n_ratings'] >= n_min]

sorted_movies = movies_with_scores.sort_values('WeightedScore', ascending=False)
sorted_movies.to_csv("movies_ranked_by_popularity.csv")
top_movies = sorted_movies.head(10)


images_folder = "MovieImages/"

top_movies['Image'] = top_movies['MovieID'].apply(
    lambda x: f'<img src="{images_folder}{x}.jpg" style="width:100px;height:auto;">'
)

columns_to_display = ['Image', 'MovieID', 'Title', 'WeightedScore']
display(HTML(top_movies[columns_to_display].sort_values('WeightedScore', ascending=False).to_html(escape=False, index=False)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_movies['Image'] = top_movies['MovieID'].apply(


Image,MovieID,Title,WeightedScore
,2858,American Beauty (1999),2504.7
,260,Star Wars: Episode IV - A New Hope (1977),2274.4
,1196,Star Wars: Episode V - The Empire Strikes Back (1980),2149.6
,2028,Saving Private Ryan (1998),1941.6
,1198,Raiders of the Lost Ark (1981),1932.8
,593,"Silence of the Lambs, The (1991)",1911.6
,2571,"Matrix, The (1999)",1889.5
,2762,"Sixth Sense, The (1999)",1850.3
,1210,Star Wars: Episode VI - Return of the Jedi (1983),1845.7
,608,Fargo (1996),1795.2


# System II: Recommendation Based on IBCF

In [244]:
Rmat = pd.read_csv("rmat.csv")

In [245]:
# step 1: normalize
row_means = Rmat.mean(axis=1, skipna=True)
R_centered = Rmat.sub(row_means, axis=0)

In [246]:
R_centered

Unnamed: 0,m1,m10,m100,m1000,m1002,m1003,m1004,m1005,m1006,m1007,...,m99,m990,m991,m992,m993,m994,m996,m997,m998,m999
u1,0.811321,,,,,,,,,,...,,,,,,,,,,
u10,0.885287,,,,,,,,,,...,,,,,,,,,,
u100,,,,,,,,,,,...,,,,,,,,,,
u1000,0.869048,,,,,,,,,,...,,,,,,,,,,
u1001,0.347480,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
u995,,,,,,,,,,,...,,,,,,,,,,
u996,0.064189,,,,,,,,,,...,,,,,,,,,,-0.935811
u997,0.066667,,,,,,,,,,...,,,,,,,,,,
u998,,,,,,,,,,,...,,,,,,,,,,


In [247]:
import numpy as np
import pandas as pd

# followed zoom recording https://campuswire.com/c/GB46E5679/feed/1145
def cosine_similarity_matrix(R_centered):
    
    num_movies = R_centered.shape[1]
    cosine_sim_matrix = np.full((num_movies, num_movies), np.nan)

    for i in range(num_movies):
        if i % 500 == 0:
            print(f"Processing row {i}...")
        for j in range(i + 1, num_movies):
            
            m1 = R_centered.iloc[:, i]
            m2 = R_centered.iloc[:, j]
            
            # get common ratings between the two movies (both non-NaN)
            common = m1.notna() & m2.notna()
            
            if common.sum() >= 3:  # only compute similarity if at least 3 common ratings
                m1_notna = m1.fillna(0) * common
                m2_notna = m2.fillna(0) * common

                dot = np.dot(m1_notna, m2_notna)
                
                m1_den = np.sqrt(np.sum(m1_notna**2))
                m2_den = np.sqrt(np.sum(m2_notna**2))

                cosine_sim_matrix[i, j] = 0.5 * (1 + dot / (m1_den * m2_den))
                cosine_sim_matrix[j, i] = cosine_sim_matrix[i, j]  # symmetric matrix
    
    return cosine_sim_matrix

s = cosine_similarity_matrix(R_centered)

Processing row 0...
Processing row 500...
Processing row 1000...
Processing row 1500...
Processing row 2000...
Processing row 2500...
Processing row 3000...
Processing row 3500...


In [248]:
sim_df = pd.DataFrame(s, columns=R_centered.columns, index=R_centered.columns.values)
sim_df.to_csv("similarity_matrix.csv")

In [249]:
# step 3
def retain_top_n(df, n):
    top_30_indices = np.argsort(-df.values, axis=1)[:, :n]
    
    mask = np.zeros(df.shape, dtype=bool)
    
    # set True for the top n indices in each row
    row_indices = np.arange(df.shape[0])[:, None]
    mask[row_indices, top_30_indices] = True
    
    # apply mask top N values, set others to nan
    df_filtered = df.where(mask)
    
    return df_filtered

transformed_sim_df = retain_top_n(sim_df, n=30)

transformed_sim_df.to_csv("transformed_similarity_matrix.csv")

In [250]:
specified_modies = ["m1", "m10", "m100", "m1510", "m260", "m3212"]

pairwise_similarities = sim_df.loc[specified_movies, specified_movies]
pairwise_similarities_rounded = pairwise_similarities.round(7)
pairwise_similarities_rounded

Unnamed: 0,m1,m10,m100,m1510,m260
m1,,0.512105,0.392,,0.741148
m10,0.512105,,0.547458,,0.534334
m100,0.392,0.547458,,,0.329694
m1510,,,,,
m260,0.741148,0.534334,0.329694,,


In [251]:
def IBCF(newuser, similarity_matrix):
    predicted_ratings = pd.Series(index=newuser.index)
    
    for i, idx in enumerate(newuser.index.values):
        if np.isnan(newuser[i]):
            S_i = similarity_matrix.iloc[i].dropna().index  # movies that are similar to movie i
            
            rated_indices = newuser.index[~newuser.isna()]
            common_indices = rated_indices.intersection(S_i)
            
            if len(common_indices) > 0:
                numerator = np.sum([similarity_matrix.iloc[i][j] * newuser[j] for j in common_indices])
                denominator = np.sum([similarity_matrix.iloc[i][j] for j in common_indices])

                predicted_rating = np.nan if denominator == 0 else numerator / denominator
                predicted_ratings.loc[idx] = predicted_rating
            else:
                predicted_ratings.loc[idx] = np.nan

    predicted_ratings.name = "pred"
    return predicted_ratings.sort_values(ascending=False)

def myIBCF(newuser, similarity_matrix):  
    icbf_ranking = IBCF(newuser, similarity_matrix).head(10).dropna()
    
    popularity_ranking = pd.read_csv("movies_ranked_by_popularity.csv", usecols=['MovieIDm', 'WeightedScore'])
    popularity_ranking['ibcf'] = False
    popularity_ranking.columns = ['movie_id', 'rating', 'ibcf']
    
    icbf_ranking = icbf_ranking.reset_index()
    icbf_ranking['ibcf'] = True
    icbf_ranking.columns = ['movie_id', 'rating', 'ibcf']
    
    output = pd.concat([icbf_ranking, popularity_ranking]).drop_duplicates(subset=['movie_id'], keep='first').head(10)
    return output.head(10)

### Test your function

In [252]:
test_user = pd.Series(index=sim_df.index)
test_user.loc["m1613"] = 5
test_user.loc["m1755"] = 4

print("Top movie predictions for test user")
predictions = myIBCF(test_user, transformed_sim_df)
print(predictions)

Top movie predictions for test user


  if np.isnan(newuser[i]):


  movie_id  rating  ibcf
0    m1017     5.0  True
1     m340     5.0  True
2      m74     5.0  True
3    m3269     5.0  True
4    m2870     5.0  True
5    m3466     5.0  True
6     m338     5.0  True
7    m3258     5.0  True
8    m3254     5.0  True
9     m316     5.0  True


In [253]:
test_user = Rmat.loc["u1181"]

print("Top movie predictions for user 1181")
predictions = myIBCF(test_user, transformed_sim_df)
print(predictions)

Top movie predictions for user 1181


  if np.isnan(newuser[i]):


  movie_id    rating  ibcf
0    m3732  5.000000  True
1     m749  4.526559  True
2    m3899  4.526066  True
3    m1235  4.000000  True
4    m1914  4.000000  True
5    m2082  4.000000  True
6     m249  4.000000  True
7     m504  4.000000  True
8    m1039  4.000000  True
9    m2793  4.000000  True
