In [1]:
import html
import math

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import HTML, display

In [2]:
# Both files use the multi-character '::' field separator and carry no header
# row, so the python parsing engine is requested and column names are attached
# right after reading.
ratings = pd.read_csv(
    'ratings.dat', sep='::', engine='python', header=None
).set_axis(['UserID', 'MovieID', 'Rating', 'Timestamp'], axis=1)

movies = pd.read_csv(
    'movies.dat', sep='::', engine='python', encoding="ISO-8859-1", header=None
).set_axis(['MovieID', 'Title', 'Genres'], axis=1)

In [3]:
# Collapse every multi-genre label (genres joined by "|") into the single
# category 'Multiple'; single-genre labels are left untouched.
multiple_idx = movies['Genres'].map(lambda genres: "|" in genres)
movies.loc[multiple_idx, 'Genres'] = 'Multiple'

In [4]:
# Render figures at 300 dpi, then apply seaborn's default theme.
mpl.rcParams.update({'figure.dpi': 300})
sns.set()

In [5]:
# Attach each movie's title and genre to its ratings by joining on MovieID.
rating_merged = pd.merge(ratings, movies, on='MovieID')
rating_merged

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Title,Genres
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama
1,1,661,3,978302109,James and the Giant Peach (1996),Multiple
2,1,914,3,978301968,My Fair Lady (1964),Multiple
3,1,3408,4,978300275,Erin Brockovich (2000),Drama
4,1,2355,5,978824291,"Bug's Life, A (1998)",Multiple
...,...,...,...,...,...,...
1000204,6040,1091,1,956716541,Weekend at Bernie's (1989),Comedy
1000205,6040,1094,5,956704887,"Crying Game, The (1992)",Multiple
1000206,6040,562,5,956704746,Welcome to the Dollhouse (1995),Multiple
1000207,6040,1096,4,956715648,Sophie's Choice (1982),Drama


### System I: Recommendation Based on Popularity

The top 10 "most popular" movies are chosen by ranking movies by the number of ratings they received at or above a specified rating threshold (set to 4.0 here); the movies with the most such ratings are recommended.

In [6]:
# Minimum rating a review needs in order to count toward popularity.
rating_threshold = 4.0

# Keep only the reviews rated at or above the threshold.
is_high_rating = rating_merged['Rating'] >= rating_threshold
filtered_ratings = rating_merged[is_high_rating]

# Count the qualifying reviews per movie, then order movies from most to fewest.
qualifying_counts = filtered_ratings.groupby('MovieID')['Rating'].count()
popularity_ranking = qualifying_counts.sort_values(ascending=False)
top_10_movies = popularity_ranking.head(10)

In [7]:
# Export the ranking with 'm'-prefixed movie labels (the same label style as
# the columns of the rating matrix, e.g. 'm1'), for later use by myIBCF.
# The prefix is applied to a copy so that `popularity_ranking` keeps its
# integer MovieID index and re-running this cell never produces 'mm...' labels.
popularity_export = popularity_ranking.copy()
popularity_export.index = 'm' + popularity_export.index.astype(str)
popularity_export.to_csv('popularity_ranking.csv')

In [8]:
# Show the ten movies with the most ratings at or above the threshold.
top_10_movies

MovieID
2858    2853
260     2622
1196    2510
2028    2260
1198    2260
593     2252
2571    2171
2762    2163
1210    2127
608     2074
Name: Rating, dtype: int64

In [9]:
# Turn the MovieIDs of the ten most popular movies into a one-column frame
# so they can be joined against the movie metadata.
top_10_movie_ids = top_10_movies.index
top_10_df = top_10_movie_ids.to_frame(index=False, name='MovieID')
top_10_df

Unnamed: 0,MovieID
0,2858
1,260
2,1196
3,2028
4,1198
5,593
6,2571
7,2762
8,1210
9,608


In [10]:
# Look up title and genre for each of the ten most popular movies.
merged_top_10 = pd.merge(top_10_df, movies, on='MovieID')
merged_top_10

Unnamed: 0,MovieID,Title,Genres
0,2858,American Beauty (1999),Multiple
1,260,Star Wars: Episode IV - A New Hope (1977),Multiple
2,1196,Star Wars: Episode V - The Empire Strikes Back...,Multiple
3,2028,Saving Private Ryan (1998),Multiple
4,1198,Raiders of the Lost Ark (1981),Multiple
5,593,"Silence of the Lambs, The (1991)",Multiple
6,2571,"Matrix, The (1999)",Multiple
7,2762,"Sixth Sense, The (1999)",Thriller
8,1210,Star Wars: Episode VI - Return of the Jedi (1983),Multiple
9,608,Fargo (1996),Multiple


In [11]:
# Render the top-10 movies as an HTML table with one poster thumbnail per row.
small_image_url = "https://liangfgithub.github.io/MovieImages/"

table_rows = []
for row in merged_top_10.itertuples(index=False):
    image_url = f"{small_image_url}{row.MovieID}.jpg?raw=true"
    # Titles come straight from the data file; escape them so characters such
    # as '&' or '<' cannot break the generated markup.
    safe_title = html.escape(str(row.Title))
    table_rows.append(f"""
    <tr>
      <td>{row.MovieID}</td>
      <td>{safe_title}</td>
      <td><img src="{image_url}" alt="Movie Poster" width="100"></td>
    </tr>
    """)

# Assemble header, rows and footer once instead of growing a string in the loop.
html_table = (
    """
<table class="nowrap hover row-border">
   <thead>
       <tr>
          <th>MovieID</th>
          <th>Title</th>
          <th>Image</th>
        </tr>
    </thead>
    <tbody>
"""
    + "".join(table_rows)
    + """
  </tbody>
</table>
"""
)

display(HTML(html_table))

MovieID,Title,Image
2858,American Beauty (1999),
260,Star Wars: Episode IV - A New Hope (1977),
1196,Star Wars: Episode V - The Empire Strikes Back (1980),
2028,Saving Private Ryan (1998),
1198,Raiders of the Lost Ark (1981),
593,"Silence of the Lambs, The (1991)",
2571,"Matrix, The (1999)",
2762,"Sixth Sense, The (1999)",
1210,Star Wars: Episode VI - Return of the Jedi (1983),
608,Fargo (1996),


### System II: Recommendation Based on IBCF

In [12]:
# Load the user-by-movie rating matrix: rows are labelled 'u<id>', columns
# 'm<id>', and unrated entries are NaN (see the displayed frame).
# NOTE(review): no index_col is passed, so the user labels presumably become
# the index because the header row has one field fewer than the data rows --
# confirm against Rmat.csv.
R = pd.read_csv('Rmat.csv')

In [13]:
# Inspect the rating matrix (pandas truncates the display of large frames).
R

Unnamed: 0,m1,m10,m100,m1000,m1002,m1003,m1004,m1005,m1006,m1007,...,m99,m990,m991,m992,m993,m994,m996,m997,m998,m999
u1,5.0,,,,,,,,,,...,,,,,,,,,,
u10,5.0,,,,,,,,,,...,,,,,,,,,,
u100,,,,,,,,,,,...,,,,,,,,,,
u1000,5.0,,,,,,,,,,...,,,,,,,,,,
u1001,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
u995,,,,,,,,,,,...,,,,,,,,,,
u996,4.0,,,,,,,,,,...,,,,,,,,,,3.0
u997,4.0,,,,,,,,,,...,,,,,,,,,,
u998,,,,,,,,,,,...,,,,,,,,,,


Step 1: Normalize the rating matrix by centering each row.

In [14]:
def normalize_rating_matrix(R):
    """Center every row of a rating matrix on that row's mean rating.

    Parameters
    ----------
    R : array-like of shape (n_users, n_movies)
        Ratings, with NaN marking entries that were not rated.

    Returns
    -------
    numpy.ndarray
        Row-centered ratings in which every missing entry has been
        replaced by 0.
    """
    # Mean of the observed (non-NaN) entries per row, kept as a column
    # vector so it broadcasts across the movies.
    user_means = np.nanmean(R, axis=1, keepdims=True)

    # Missing ratings stay NaN through the subtraction and are zero-filled
    # only afterwards.
    return np.nan_to_num(R - user_means)

# Row-center the rating matrix; the result feeds the similarity computation.
normalized_R = normalize_rating_matrix(R)

Step 2: Compute the (transformed) Cosine similarity among the 3,706 movies.

In [15]:
def compute_cosine_similarity(R):
    """Compute the transformed cosine similarity between all movie pairs.

    For movies i and j, the cosine is taken over the users that have a
    non-zero entry for both, and mapped to [0, 1] with ``0.5 + 0.5 * cos``.
    Pairs with fewer than 3 such users, and the diagonal, are NaN.

    The pairwise sums are obtained with matrix products instead of a Python
    loop over every movie pair. This yields the same values (up to
    floating-point rounding) at the cost of a few temporary
    (n_movies, n_movies) and (n_users, n_movies) arrays.

    Parameters
    ----------
    R : array-like of shape (n_users, n_movies)
        Centered ratings in which 0 marks "not rated". The input is expected
        to be NaN-free (e.g. the output of ``normalize_rating_matrix``).

    Returns
    -------
    numpy.ndarray of shape (n_movies, n_movies)
        Symmetric similarity matrix with NaN where no similarity is defined.
    """
    R = np.asarray(R, dtype=float)
    num_movies = R.shape[1]
    S = np.full((num_movies, num_movies), np.nan)

    # 1.0 where an entry counts as rated, 0.0 otherwise.
    rated = (R != 0).astype(float)

    # [i, j] = number of users with a non-zero entry for both i and j.
    common_counts = rated.T @ rated

    # Unrated entries are 0 and contribute nothing, so the full dot product
    # equals the dot product restricted to the common users.
    numerator = R.T @ R

    # [i, j] = sum of R[u, i]**2 over users u that also rated movie j.
    squared_norms = (R * R).T @ rated
    denominator = np.sqrt(squared_norms) * np.sqrt(squared_norms.T)

    # Evaluate the upper triangle only and mirror it, so the result is
    # exactly symmetric and the diagonal stays NaN.
    rows, cols = np.triu_indices(num_movies, k=1)
    enough_users = common_counts[rows, cols] >= 3
    rows, cols = rows[enough_users], cols[enough_users]

    similarity = 0.5 + 0.5 * (numerator[rows, cols] / denominator[rows, cols])
    S[rows, cols] = similarity
    S[cols, rows] = similarity

    return S

# Pairwise movie similarities computed from the row-centered ratings.
S = compute_cosine_similarity(normalized_R)

In [16]:
# Inspect the raw similarity array.
S

array([[       nan, 0.51210553, 0.39199995, ..., 0.5140432 , 0.38377183,
        0.41450545],
       [0.51210553,        nan, 0.54745829, ..., 0.66873273, 0.44828951,
        0.60081163],
       [0.39199995, 0.54745829,        nan, ..., 0.26957569, 0.47892265,
        0.6128149 ],
       ...,
       [0.5140432 , 0.66873273, 0.26957569, ...,        nan, 0.64263547,
        0.4606457 ],
       [0.38377183, 0.44828951, 0.47892265, ..., 0.64263547,        nan,
        0.64272702],
       [0.41450545, 0.60081163, 0.6128149 , ..., 0.4606457 , 0.64272702,
               nan]])

Step 3: Let $S$ denote the 3706-by-3706 similarity matrix computed in the previous step. For each row, sort the non-NA similarity measures and keep the top 30, setting the rest to NA. This new similarity matrix, still denoted $S$, is no longer symmetric. Save this matrix. Note that some rows of $S$ may contain fewer than 30 non-NA values.

In [17]:
def filter_similarity_matrix(S, top_k=30):
    """Keep only the ``top_k`` largest similarities in each row.

    Parameters
    ----------
    S : numpy.ndarray
        2-D similarity matrix; NaN marks undefined similarities.
    top_k : int, default 30
        Number of largest entries to retain per row.

    Returns
    -------
    numpy.ndarray
        A copy of ``S`` in which every entry outside a row's ``top_k``
        largest values is NaN. ``S`` itself is not modified.
    """
    # Column order per row from largest to smallest similarity. The sort is
    # stable, and argsort places NaN entries last.
    ranked_columns = np.argsort(-S, axis=1, kind='mergesort')

    pruned = S.copy()
    # Everything ranked after the first top_k positions is blanked out.
    np.put_along_axis(pruned, ranked_columns[:, top_k:], np.nan, axis=1)

    return pruned

# Keep the 30 largest similarities per movie (row); all other entries become NaN.
filtered_S = filter_similarity_matrix(S, top_k=30)

In [18]:
# Inspect the pruned similarity array.
filtered_S

array([[nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       ...,
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan]])

In [19]:
# Label the rows and columns of the pruned similarity matrix with the movie
# ids taken from the rating matrix, and show values with 7 decimals.
movie_names = R.columns
filtered_S = pd.DataFrame(data=filtered_S, index=movie_names, columns=movie_names)
pd.options.display.precision = 7
filtered_S

Unnamed: 0,m1,m10,m100,m1000,m1002,m1003,m1004,m1005,m1006,m1007,...,m99,m990,m991,m992,m993,m994,m996,m997,m998,m999
m1,,,,,,,,,,,...,,,,,,,,,,
m10,,,,,,,,,,,...,,,,,,,,,,
m100,,,,,,,,,,,...,,,,,,,,,,
m1000,,,,,,,,,,,...,,,,,,,,,,
m1002,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
m994,,,,,,,,,,,...,,,,,,,,,,
m996,,,,,,,,,,,...,,,,,,,,,,
m997,,,,,,,,,,,...,,,,,,,,,,
m998,,,,,,,,,,,...,,,,,,,,,,


Display the pairwise similarity values from the $S$ matrix (obtained at Step 2) for the following specified movies: “m1”, “m10”, “m100”, “m1510”, “m260”, “m3212”. Please round the results to 7 decimal places.

In [20]:
# Label the rows and columns of the full (Step 2) similarity matrix with the
# movie ids taken from the rating matrix, and show values with 7 decimals.
movie_names = R.columns
S = pd.DataFrame(data=S, index=movie_names, columns=movie_names)
pd.options.display.precision = 7
S

Unnamed: 0,m1,m10,m100,m1000,m1002,m1003,m1004,m1005,m1006,m1007,...,m99,m990,m991,m992,m993,m994,m996,m997,m998,m999
m1,,0.5121055,0.3919999,0.7296371,0.4052488,0.3443622,0.1934793,0.2920968,0.2757620,0.4342140,...,0.5256346,0.1678860,0.4382444,0.2044081,0.5517557,0.6838283,0.2906526,0.5140432,0.3837718,0.4145054
m10,0.5121055,,0.5474583,0.4904717,,0.6109830,0.4237425,0.4606591,0.6576989,0.5495395,...,0.2617006,0.4658628,0.4480788,0.3857350,,0.4544643,0.5475044,0.6687327,0.4482895,0.6008116
m100,0.3919999,0.5474583,,0.4829650,,0.8365839,0.6295382,0.5682818,0.8118070,0.4885245,...,0.4107531,0.6426157,0.4936404,0.1936714,0.8028437,0.3067432,0.6293738,0.2695757,0.4789227,0.6128149
m1000,0.7296371,0.4904717,0.4829650,,,0.1807649,,,,0.7052228,...,,,0.2073925,0.9015211,,0.2260270,0.6684361,,0.7253362,0.6805737
m1002,0.4052488,,,,,,,,,,...,,,,,,0.7227661,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
m994,0.6838283,0.4544643,0.3067432,0.2260270,0.7227661,0.2517379,0.2271863,0.1402858,0.2490622,0.2743969,...,0.4011803,0.1486861,0.4705176,0.1928585,0.5397140,,0.2155611,0.4490137,0.3078245,0.3985167
m996,0.2906526,0.5475044,0.6293738,0.6684361,,0.7908892,0.7119653,0.6911337,0.8060751,0.6216948,...,0.6181369,0.7796494,0.4780710,0.7975184,,0.2155611,,0.0771135,0.5563784,0.6225577
m997,0.5140432,0.6687327,0.2695757,,,0.3660229,0.9327237,0.9492277,0.2144257,0.2100087,...,0.2157111,0.8661206,0.4162218,,0.4120181,0.4490137,0.0771135,,0.6426355,0.4606457
m998,0.3837718,0.4482895,0.4789227,0.7253362,,0.4450076,0.8437724,0.6048153,0.3545709,0.5041464,...,,0.6983913,0.6629043,0.8523279,,0.3078245,0.5563784,0.6426355,,0.6427270


In [21]:
# Select the desired movies
movies = ["m1", "m10", "m100", "m1510", "m260", "m3212"]

# Extract the submatrix for the selected movies
S_subset = S.loc[movies, movies]
S_subset

Unnamed: 0,m1,m10,m100,m1510,m260,m3212
m1,,0.5121055,0.3919999,,0.7415971,
m10,0.5121055,,0.5474583,,0.5343487,
m100,0.3919999,0.5474583,,,0.3296943,
m1510,,,,,,
m260,0.7415971,0.5343487,0.3296943,,,
m3212,,,,,,


Step 4: Create a function named myIBCF

In [22]:
def myIBCF(w, similarity=None, popularity=None, top_n=10):
    """Recommend movies for one user with item-based collaborative filtering.

    For every movie the user has not rated, the prediction is the
    similarity-weighted average of the user's ratings over the movie's
    retained neighbours that the user has rated. Movies for which no
    prediction can be formed are never recommended on the strength of a
    missing value; if fewer than ``top_n`` predictions exist, the list is
    completed with the most popular movies the user has not rated.

    Parameters
    ----------
    w : pandas.Series
        The user's ratings indexed by movie label, NaN where unrated.
    similarity : pandas.DataFrame, optional
        Movie-by-movie similarity matrix labelled like ``w`` (NaN where no
        similarity is retained). Defaults to the notebook-level
        ``filtered_S``, which must already be a labelled DataFrame.
    popularity : iterable of movie labels, or Series/DataFrame indexed by
        movie label, optional
        Movies ordered from most to least popular. Defaults to the index of
        'popularity_ranking.csv', read only when the fallback is needed.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    list
        Up to ``top_n`` movie labels: predicted movies in descending order of
        predicted rating, followed by popularity-based fillers if needed.
    """
    if similarity is None:
        similarity = filtered_S

    rated = w.dropna()
    unrated_movies = w.index[w.isna()]

    # Collect only the movies for which a finite prediction exists.
    predictions = []
    for movie_idx in unrated_movies:
        neighbours = similarity.loc[movie_idx].dropna().index.intersection(rated.index)
        if len(neighbours) == 0:
            continue

        weights = similarity.loc[movie_idx, neighbours]
        denominator = weights.sum()
        if denominator == 0:
            # All retained similarities are zero: the weighted average is undefined.
            continue

        prediction = (weights * rated.loc[neighbours]).sum() / denominator
        if not math.isnan(prediction):
            predictions.append((movie_idx, prediction))

    # Highest predicted rating first; the sort is stable, so ties keep the
    # order in which the movies appear in ``w``.
    predictions.sort(key=lambda item: item[1], reverse=True)
    recommended = [movie_idx for movie_idx, _ in predictions[:top_n]]

    # Fill any remaining slots with popular movies the user has not rated
    # and that are not already recommended.
    if len(recommended) < top_n:
        if popularity is None:
            popularity = pd.read_csv('popularity_ranking.csv', index_col=0).index
        elif isinstance(popularity, (pd.Series, pd.DataFrame)):
            popularity = popularity.index

        excluded = set(rated.index) | set(recommended)
        fillers = [movie for movie in popularity if movie not in excluded]
        recommended.extend(fillers[:top_n - len(recommended)])

    return recommended

Step 5 (Test): Using your function myIBCF, print the top 10 recommendations for the following two users:

- User “u1181” from the rating matrix 𝑅
- A hypothetical user who rates movie “m1613” with 5 and movie “m1755” with 4.

In [23]:
# Ratings of existing user "u1181", taken as one row of the rating matrix.
newuser1 = R.loc["u1181"]
myIBCF(newuser1)

['m3732',
 'm749',
 'm3899',
 'm1039',
 'm1235',
 'm1253',
 'm1734',
 'm1914',
 'm2082',
 'm2361']

In [24]:
# Hypothetical user: every movie unrated except the two ratings given below.
newuser2 = pd.Series(np.nan, index=R.columns)
for movie_label, rating in (('m1613', 5.0), ('m1755', 4.0)):
    newuser2[movie_label] = rating
myIBCF(newuser2)

['m1017',
 'm3269',
 'm340',
 'm46',
 'm74',
 'm765',
 'm1100',
 'm1468',
 'm1541',
 'm158']

References: https://liangfgithub.github.io/Python_W13_Movie_RS.html