# Project 4

In [1]:
import pandas as pd
import numpy as np
from IPython.display import display, HTML

In [2]:
ratings = pd.read_csv(
    "ml-1m/ratings.dat", 
    sep=':',
    header=None,
    usecols=[0, 2, 4, 6],
    names=['UserID', 'MovieID', 'Rating', 'Timestamp'],
    dtype={'UserID': 'int', 'MovieID': 'int', 'Rating': 'int', 'Timestamp': 'int'}
)

ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [3]:
with open("ml-1m/movies.dat", 'r', encoding='latin1') as file:
    movies_raw = file.readlines()

movies = pd.DataFrame([line.strip().split("::") for line in movies_raw], columns=['MovieID', 'Title', 'Genres'])

movies['MovieID'] = movies['MovieID'].astype(int)
movies['MovieIDm'] = movies['MovieID'].apply(lambda x: f"m{x}")
movies['Year'] = movies['Title'].str.extract(r'\((\d{4})\)').astype(int)

movies.head()

Unnamed: 0,MovieID,Title,Genres,MovieIDm,Year
0,1,Toy Story (1995),Animation|Children's|Comedy,m1,1995
1,2,Jumanji (1995),Adventure|Children's|Fantasy,m2,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,m3,1995
3,4,Waiting to Exhale (1995),Comedy|Drama,m4,1995
4,5,Father of the Bride Part II (1995),Comedy,m5,1995


In [4]:
users = pd.read_csv(
    "ml-1m/users.dat", 
    sep=':',
    header=None,
    usecols=[0, 2, 4, 6, 8],
    names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'],
    dtype={'UserID': 'int', 'Age': 'int'}
)

users.head()

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


## System I: Recommendation Based on Popularity

We create a weighted ranking based on the number of 5-star and 4-star ratings, as well as the total number of ratings. It gives priority to movies with high ratings, while also accounting for overall popularity. 

The weighting score is determined by summing the following weighted components:

* <b>n_5_star</b>: Number of 5-star ratings. It carries the most weight (0.5) since it indicates high satisfaction
* <b>n_4_star</b>: Number of 4-star ratings. It has a moderate weight (0.3) as it also signifies positive feedback
* <b>n_ratings</b>: Total number of ratings. It ensures that popular movies with a broader audience also get attention

Additionally, we only consider movies that have at least 50 ratings

In [20]:
n_min = 50 # min number of ratings to be deemed relevant

rating_counts = ratings.groupby(['MovieID', 'Rating']).size().unstack(fill_value=0)

rating_counts['n_5_star'] = rating_counts.get(5, 0)
rating_counts['n_4_star'] = rating_counts.get(4, 0)
rating_counts['n_ratings'] = rating_counts.sum(axis=1)

rating_counts['WeightedScore'] = (
    0.5 * rating_counts['n_5_star'] +
    0.3 * rating_counts['n_4_star'] +
    0.2 * rating_counts['n_ratings']
)

movies_with_scores = pd.merge(movies, rating_counts.reset_index(), on='MovieID')

movies_with_scores = movies_with_scores[movies_with_scores['n_ratings'] >= n_min]

top_movies = movies_with_scores.sort_values('WeightedScore', ascending=False).head(10)


images_folder = "MovieImages/"

top_movies['Image'] = top_movies['MovieID'].apply(
    lambda x: f'<img src="{images_folder}{x}.jpg" style="width:100px;height:auto;">'
)

columns_to_display = ['Image', 'MovieID', 'Title', 'WeightedScore']
display(HTML(top_movies[columns_to_display].sort_values('WeightedScore', ascending=False).to_html(escape=False, index=False)))

Image,MovieID,Title,WeightedScore
,2858,American Beauty (1999),2504.7
,260,Star Wars: Episode IV - A New Hope (1977),2274.4
,1196,Star Wars: Episode V - The Empire Strikes Back (1980),2149.6
,2028,Saving Private Ryan (1998),1941.6
,1198,Raiders of the Lost Ark (1981),1932.8
,593,"Silence of the Lambs, The (1991)",1911.6
,2571,"Matrix, The (1999)",1889.5
,2762,"Sixth Sense, The (1999)",1850.3
,1210,Star Wars: Episode VI - Return of the Jedi (1983),1845.7
,608,Fargo (1996),1795.2


# System II: Recommendation Based on IBCF

In [59]:
Rmat = pd.read_csv("rmat.csv")

In [60]:
# step 1: normalize
row_means = Rmat.mean(axis=1, skipna=True)
R_centered = Rmat.sub(row_means, axis=0)

In [61]:
R_centered

Unnamed: 0,m1,m10,m100,m1000,m1002,m1003,m1004,m1005,m1006,m1007,...,m99,m990,m991,m992,m993,m994,m996,m997,m998,m999
u1,0.811321,,,,,,,,,,...,,,,,,,,,,
u10,0.885287,,,,,,,,,,...,,,,,,,,,,
u100,,,,,,,,,,,...,,,,,,,,,,
u1000,0.869048,,,,,,,,,,...,,,,,,,,,,
u1001,0.347480,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
u995,,,,,,,,,,,...,,,,,,,,,,
u996,0.064189,,,,,,,,,,...,,,,,,,,,,-0.935811
u997,0.066667,,,,,,,,,,...,,,,,,,,,,
u998,,,,,,,,,,,...,,,,,,,,,,


In [62]:
import numpy as np
import pandas as pd

# followed zoom recording https://campuswire.com/c/GB46E5679/feed/1145
def cosine_similarity_matrix(R_centered):
    
    num_movies = R_centered.shape[1]
    cosine_sim_matrix = np.full((num_movies, num_movies), np.nan)

    for i in range(num_movies):
        print(i)
        for j in range(i + 1, num_movies):
            
            m1 = R_centered.iloc[:, i]
            m2 = R_centered.iloc[:, j]
            
            # get common ratings between the two movies (both non-NaN)
            common = m1.notna() & m2.notna()
            
            if common.sum() >= 3:  # only compute similarity if at least 3 common ratings
                m1_notna = m1.fillna(0) * common
                m2_notna = m2.fillna(0) * common

                dot = np.dot(m1_notna, m2_notna)
                
                m1_den = np.sqrt(np.sum(m1_notna**2))
                m2_den = np.sqrt(np.sum(m2_notna**2))

                cosine_sim_matrix[i, j] = 0.5 * (1 + dot / (m1_den * m2_den))
                cosine_sim_matrix[j, i] = cosine_sim_matrix[i, j]  # symmetric matrix
    
    return cosine_sim_matrix

s = cosine_similarity_matrix(R_centered)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [63]:
sim_df = pd.DataFrame(s, columns=R_centered.columns, index=R_centered.columns.values)
sim_df.to_csv("similarity_matrix.csv")

In [64]:
sim_df

Unnamed: 0,m1,m10,m100,m1000,m1002,m1003,m1004,m1005,m1006,m1007,...,m99,m990,m991,m992,m993,m994,m996,m997,m998,m999
m1,,0.512106,0.392000,0.729637,0.405249,0.344362,0.193479,0.292097,0.275762,0.434214,...,0.525635,0.167886,0.438244,0.204408,0.551756,0.683422,0.290653,0.514043,0.383772,0.414505
m10,0.512106,,0.547458,0.490472,,0.610983,0.423742,0.460659,0.657699,0.549540,...,0.261701,0.465863,0.448079,0.385735,,0.454464,0.547504,0.668733,0.448290,0.600812
m100,0.392000,0.547458,,0.482965,,0.836584,0.629538,0.568282,0.811807,0.488525,...,0.410753,0.642616,0.493640,0.193671,0.802844,0.306743,0.629374,0.269576,0.478923,0.612815
m1000,0.729637,0.490472,0.482965,,,0.180765,,,,0.705223,...,,,0.207393,0.901521,,0.226027,0.668436,,0.725336,0.680574
m1002,0.405249,,,,,,,,,,...,,,,,,0.722766,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
m994,0.683422,0.454464,0.306743,0.226027,0.722766,0.251738,0.227186,0.140286,0.249062,0.274397,...,0.401180,0.148686,0.470518,0.192858,0.539714,,0.215561,0.449014,0.307824,0.398517
m996,0.290653,0.547504,0.629374,0.668436,,0.790889,0.711965,0.691134,0.806075,0.621695,...,0.618137,0.779649,0.478071,0.797518,,0.215561,,0.077113,0.556378,0.622558
m997,0.514043,0.668733,0.269576,,,0.366023,0.932724,0.949228,0.214426,0.210009,...,0.215711,0.866121,0.416222,,0.412018,0.449014,0.077113,,0.642635,0.460646
m998,0.383772,0.448290,0.478923,0.725336,,0.445008,0.843772,0.604815,0.354571,0.504146,...,,0.698391,0.662904,0.852328,,0.307824,0.556378,0.642635,,0.642727


In [111]:
# step 3
def retain_top_n(df, n):
    top_30_indices = np.argsort(-df.values, axis=1)[:, :n]
    
    mask = np.zeros(df.shape, dtype=bool)
    
    # set True for the top n indices in each row
    row_indices = np.arange(df.shape[0])[:, None]
    mask[row_indices, top_30_indices] = True
    
    # apply mask top N values, set others to nan
    df_filtered = df.where(mask)
    
    return df_filtered

transformed_sim_df = retain_top_n(sim_df, n=30)

transformed_sim_df.to_csv("transformed_similarity_matrix.csv")

In [114]:
specified_modies = ["m1", "m10", "m100", "m1510", "m260", "m3212"]

pairwise_similarities = sim_df.loc[specified_movies, specified_movies]
pairwise_similarities_rounded = pairwise_similarities.round(7)
pairwise_similarities_rounded

Unnamed: 0,m1,m10,m100,m1510,m260
m1,,0.512105,0.392,,0.741148
m10,0.512105,,0.547458,,0.534334
m100,0.392,0.547458,,,0.329694
m1510,,,,,
m260,0.741148,0.534334,0.329694,,


In [177]:
def IBCF(newuser, similarity_matrix):  
    predicted_ratings = pd.Series(index=newuser.index)
    
    for i, idx in enumerate(newuser.index.values):
        if np.isnan(newuser[i]):
            S_i = similarity_matrix.iloc[i].dropna().index  # movies that are similar to movie i
            
            rated_indices = newuser.index[~newuser.isna()]
            common_indices = rated_indices.intersection(S_i)
            
            if len(common_indices) > 0:
                numerator = np.sum([similarity_matrix.iloc[i][j] * newuser[j] for j in common_indices])
                denominator = np.sum([similarity_matrix.iloc[i][j] for j in common_indices])

                predicted_rating = np.nan if denominator == 0 else numerator / denominator
                predicted_ratings.loc[idx] = predicted_rating
            else:
                predicted_ratings.loc[idx] = np.nan

    predicted_ratings.name = "pred"
    return predicted_ratings


newuser = Rmat.loc["u1181"]
preds = myIBCF(newuser, transformed_sim_df)
preds.sort_values(ascending=False).head(10)

m3732    5.000000
m749     4.526559
m3899    4.526066
m1235    4.000000
m1914    4.000000
m2082    4.000000
m249     4.000000
m504     4.000000
m1039    4.000000
m2793    4.000000
dtype: float64

In [168]:
#predictions.sort_values(by='pred').head(50)
predictions.sort_values(by='pred', ascending=False).head(10)

Unnamed: 0,movie_id,title,pred
660,m1661,Switchback (1997),5.0
452,m1446,Kolya (1996),5.0
887,m1904,Henry Fool (1997),5.0
372,m1366,"Crucible, The (1996)",5.0
2673,m3567,Bossa Nova (1999),5.0
2494,m340,"War, The (1994)",5.0
3651,m947,My Man Godfrey (1936),5.0
2311,m3224,Woman in the Dunes (Suna no onna) (1964),5.0
3324,m61,Eye for an Eye (1996),5.0
1777,m2729,Lolita (1962),5.0


### Test your function

In [137]:
test_user = pd.Series(index=sim_df.index)
test_user.loc["m1613"] = 5
test_user.loc["m1755"] = 4

print("Top movie predictions for test user")
predictions = IBCF(test_user, sim_df).reset_index()
predictions = pd.merge(predictions, movies, left_on="index", right_on="MovieIDm")[["index", "Title", "pred"]]
predictions.columns = ['movie_id', 'title', 'pred']
predictions.sort_values(by='pred', ascending=False).head(10)

Top movie predictions for test user


  if np.isnan(newuser[i]):


Unnamed: 0,movie_id,title,pred
660,m1661,Switchback (1997),5.0
452,m1446,Kolya (1996),5.0
887,m1904,Henry Fool (1997),5.0
372,m1366,"Crucible, The (1996)",5.0
2673,m3567,Bossa Nova (1999),5.0
2494,m340,"War, The (1994)",5.0
3651,m947,My Man Godfrey (1936),5.0
2311,m3224,Woman in the Dunes (Suna no onna) (1964),5.0
3324,m61,Eye for an Eye (1996),5.0
1777,m2729,Lolita (1962),5.0


In [138]:
#However, for user "u1181," the recommendation list must include movies m3732, m749, and m3899.
Rmat.loc["u1181"]
predictions = IBCF(Rmat.loc["u1181"], sim_df)

print("Top movie predictions for user 1181")
predictions = IBCF(test_user, sim_df).reset_index()
predictions = pd.merge(predictions, movies, left_on="index", right_on="MovieIDm")[["index", "Title", "pred"]]
predictions.columns = ['movie_id', 'title', 'pred']
predictions.sort_values(by='pred', ascending=False).head(10)

  if np.isnan(newuser[i]):


Top movie predictions for user 1181


  if np.isnan(newuser[i]):


Unnamed: 0,movie_id,title,pred
660,m1661,Switchback (1997),5.0
452,m1446,Kolya (1996),5.0
887,m1904,Henry Fool (1997),5.0
372,m1366,"Crucible, The (1996)",5.0
2673,m3567,Bossa Nova (1999),5.0
2494,m340,"War, The (1994)",5.0
3651,m947,My Man Godfrey (1936),5.0
2311,m3224,Woman in the Dunes (Suna no onna) (1964),5.0
3324,m61,Eye for an Eye (1996),5.0
1777,m2729,Lolita (1962),5.0


In [86]:

indices = []
for idx, row in sim_df.iterrows():
    sorted_row = row.fillna(-1).sort_values(ascending=False)
    sorted_row.iloc[30:] = np.nan
    indices.append(list(sorted_row.index.values))

pd.DataFrame(indices, index=sim_df.index)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3696,3697,3698,3699,3700,3701,3702,3703,3704,3705
m1,m53,m2487,m2304,m3880,m755,m3644,m3293,m2127,m567,m3292,...,m744,m3641,m286,m2869,m749,m2887,m2895,m2909,m2911,m1
m10,m3292,m767,m2630,m1651,m503,m2185,m1872,m137,m2341,m3050,...,m120,m1903,m3151,m1749,m2994,m3202,m2685,m2219,m3353,m3337
m100,m1585,m3806,m3339,m2209,m964,m963,m2689,m2626,m3374,m2582,...,m3333,m1369,m3312,m3323,m3322,m3321,m137,m3315,m3314,m3228
m1000,m605,m158,m2374,m3732,m1025,m1959,m1623,m2454,m3061,m2779,...,m3096,m3094,m3093,m3092,m3091,m309,m1830,m1832,m3086,m2938
m1002,m45,m1207,m47,m3358,m1244,m1721,m924,m1719,m3608,m899,...,m2318,m2320,m2322,m2323,m2325,m2326,m2327,m2328,m2330,m999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
m994,m1067,m2833,m3934,m2627,m3092,m962,m960,m3670,m3808,m2304,...,m220,m1494,m2198,m624,m623,m618,m3443,m2172,m3454,m3665
m996,m3496,m3402,m2317,m958,m722,m626,m1312,m649,m3047,m1864,...,m3609,m1811,m3611,m2358,m129,m3621,m3625,m1815,m363,m3656
m997,m2227,m2911,m2260,m2511,m976,m3902,m2191,m2615,m715,m2204,...,m3377,m3376,m3375,m3374,m3373,m3372,m1731,m1739,m336,m2798
m998,m3680,m3013,m2509,m1631,m263,m3571,m3537,m726,m3898,m1501,...,m2659,m3575,m1444,m3585,m1450,m3580,m2666,m1455,m2667,m3720


In [140]:
newuser = Rmat.loc["u1181"]
newuser

m1       3.0
m10      4.0
m100     NaN
m1000    NaN
m1002    NaN
        ... 
m994     4.0
m996     2.0
m997     3.0
m998     NaN
m999     2.0
Name: u1181, Length: 3706, dtype: float64

In [148]:
movie_id = "m749"
if np.isnan(newuser.loc[movie_id]):
    similar_movies = sim_df.loc[movie_id].dropna().index.values
    print(similar_movies)
    # Filter for movies rated by the user
    user_rated_movies = [m_id for m_id in similar_movies if not np.isnan(newuser.loc[m_id])]
    print(user_rated_movies)
    # If no overlap, skip this movie
    if user_rated_movies:

        # Compute weighted average prediction
        numer = sum(sim_df.loc[movie_id, j] * newuser[j] for j in user_rated_movies)
        denom = sum(sim_df.loc[movie_id, j] for j in user_rated_movies)

        print(numer / denom)
        #if denom != 0:
        #    predictions[i] = numer / denom
    else:
        print("no rated indices")

['m1617' 'm1945' 'm480' 'm589' 'm858' 'm963']
['m1617', 'm1945', 'm480', 'm589', 'm858']
4.5265591633724265
