In [192]:
# Petar Micevski
# Professor Avirappattu
# DASC 8211WA
# May 18th, 2022
#--------------------------------------------------------------

import pandas as pd 
import numpy as np

from itertools import permutations # For permutation function in popular movies section
from scipy.sparse.linalg import svds # For Matrix Factorization (SVD)
from sklearn.metrics import mean_squared_error # For evaluating engine


# Load the two source tables used by every recommendation engine below:
# movie metadata (movieId / title / genres) and the per-user ratings
movie_genre_df = pd.read_csv('~\\Jupyter Notebooks\\CSV Files\\movies.csv')
user_ratings_df = pd.read_csv('~\\Jupyter Notebooks\\CSV Files\\user_ratings.csv')

# Number of ratings each title received, most-rated titles first
movie_popularity = user_ratings_df['title'].value_counts()

# Peek at the five most-rated titles and at the first rows of the metadata
print(movie_popularity.index[:5])
print(movie_genre_df.iloc[:5])

Index(['Forrest Gump (1994)', 'Shawshank Redemption, The (1994)',
       'Pulp Fiction (1994)', 'Silence of the Lambs, The (1991)',
       'Matrix, The (1999)'],
      dtype='object')
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


In [193]:
# Mean rating per title, taken over every rating that title received
average_rating_df = user_ratings_df.groupby('title')[['rating']].mean()

# Best-rated titles first
sorted_average_ratings = average_rating_df.sort_values('rating', ascending=False)

# The raw top of the list is dominated by titles with very few ratings
print(sorted_average_ratings.head())

# Keep only titles that were rated more than 50 times
movie_popularity = user_ratings_df['title'].value_counts()
popular_movies = movie_popularity.loc[movie_popularity.gt(50)].index

# Restrict the ratings table to those frequently rated titles
popular_movies_rankings = user_ratings_df.loc[user_ratings_df['title'].isin(popular_movies)]

# Average rating per title among the frequently rated films, best first
popular_movies_average_rankings = popular_movies_rankings.groupby('title')[['rating']].mean()
print(popular_movies_average_rankings.sort_values('rating', ascending=False).head())

                                     rating
title                                      
Gena the Crocodile (1969)               5.0
True Stories (1986)                     5.0
Cosmic Scrat-tastrophe (2015)           5.0
Love and Pigeons (1985)                 5.0
Red Sorghum (Hong gao liang) (1987)     5.0
                                                      rating
title                                                       
Shawshank Redemption, The (1994)                    4.429022
Godfather, The (1972)                               4.289062
Fight Club (1999)                                   4.272936
Cool Hand Luke (1967)                               4.271930
Dr. Strangelove or: How I Learned to Stop Worry...  4.268041


In [194]:
# Non personalized suggestions and commonly paired movies by user


# Create the function to find all permutations
def find_movie_pairs(x):
  """Return every ordered pair of distinct positions in ``x`` as a DataFrame.

  Parameters
  ----------
  x : pandas.Series
      The values to pair up (here: the titles rated by a single user).

  Returns
  -------
  pandas.DataFrame
      Two columns, ``movie_a`` and ``movie_b``, one row per 2-permutation
      of ``x.values``. Both (A, B) and (B, A) are present. Empty (but with
      both columns) when ``x`` holds fewer than two values.
  """
  ordered_pairs = [pair for pair in permutations(x.values, 2)]
  return pd.DataFrame(ordered_pairs, columns=['movie_a', 'movie_b'])

# Build every ordered (movie_a, movie_b) pair within each user's rated titles,
# then stack the per-user results into one frame with a fresh 0..n-1 index
movie_combinations = (
  user_ratings_df
  .groupby('userId')['title']
  .apply(find_movie_pairs)
  .reset_index(drop=True)
)

print(movie_combinations)

# Count how many times each (movie_a, movie_b) pair occurs across all users
combination_counts = movie_combinations.groupby(['movie_a', 'movie_b']).size()

# Flatten the counts into a regular table: movie_a, movie_b, size
combination_counts_df = combination_counts.reset_index(name='size')
print(combination_counts_df.head())

                   movie_a                           movie_b
0         Toy Story (1995)           Grumpier Old Men (1995)
1         Toy Story (1995)                       Heat (1995)
2         Toy Story (1995)       Seven (a.k.a. Se7en) (1995)
3         Toy Story (1995)        Usual Suspects, The (1995)
4         Toy Story (1995)        From Dusk Till Dawn (1996)
...                    ...                               ...
60793295         31 (2016)                 Gen-X Cops (1999)
60793296         31 (2016)                  Bloodmoon (1997)
60793297         31 (2016)  Sympathy for the Underdog (1971)
60793298         31 (2016)                     Hazard (2005)
60793299         31 (2016)                Blair Witch (2016)

[60793300 rows x 2 columns]
      movie_a                                     movie_b  size
0  '71 (2014)                 (500) Days of Summer (2009)     1
1  '71 (2014)                  10 Cloverfield Lane (2016)     1
2  '71 (2014)                            127 Ho

In [195]:
# Matrix Transformation and Engine Evaluation

# Reshape the long ratings table into a user x title matrix: one row per
# userId, one column per title, the rating as the cell value, and NaN
# wherever a user has not rated a title.
# The source must be user_ratings_df (the frame loaded from user_ratings.csv);
# the name `user_ratings` is never defined in this notebook, so the previous
# version only ran when that variable happened to be left over in the kernel.
# NOTE: this rebinds user_ratings_df from the long table to the pivoted one,
# so re-run the loading cell before running this cell a second time.
user_ratings_df = user_ratings_df.pivot_table(index='userId', columns='title', values='rating')

In [196]:
# Each user's mean rating, computed across that user's row of the matrix
avg_ratings = user_ratings_df.mean(axis=1)

# Subtract every user's own mean from their row so ratings are centered on 0,
# then replace the still-missing (unrated) cells with 0
user_ratings_centered = user_ratings_df.sub(avg_ratings, axis=0).fillna(0)


In [197]:
# Sanity check on the centered matrix: rows are users, columns are titles
print(user_ratings_centered.shape)

(610, 9719)


In [198]:
# Decompose the centered ratings matrix with a truncated SVD.
# svds is documented for ndarray / sparse matrix / LinearOperator input, so
# pass the underlying NumPy array instead of the DataFrame itself (newer
# SciPy versions may reject a DataFrame).
# k is the number of singular values / latent factors kept; 6 is SciPy's
# default and is spelled out here because it fixes the shapes used below.
U, sigma, Vt = svds(user_ratings_centered.to_numpy(), k=6)


In [199]:
# svds returns the singular values as a 1-D vector; expand them into a square
# diagonal matrix so they can be multiplied between U and Vt
sigma = np.diag(sigma)

# Shapes of the three factors, one per line
print(U.shape, sigma.shape, Vt.shape, sep='\n')

(610, 6)
(6, 6)
(6, 9719)


In [200]:
# Scale the user factors by the singular values
U_sigma = U @ sigma

# Multiply by the title factors to get the low-rank approximation of the
# centered ratings matrix
U_sigma_Vt = U_sigma @ Vt

# Undo the centering: add each user's mean back onto every cell of their row
uncentered_ratings = U_sigma_Vt + avg_ratings.to_numpy()[:, np.newaxis]

# Re-attach the user and title labels from the pivoted ratings table
calc_pred_ratings_df = pd.DataFrame(
    data=uncentered_ratings,
    index=user_ratings_df.index,
    columns=user_ratings_df.columns,
)

# Show the recalculated (fully populated) ratings matrix
print(calc_pred_ratings_df)

title   '71 (2014)  'Hellboy': The Seeds of Creation (2004)  \
userId                                                        
1         4.366758                                 4.366864   
2         3.948463                                 3.948253   
3         2.436697                                 2.435659   
4         3.553547                                 3.556060   
5         3.636042                                 3.636611   
...            ...                                      ...   
606       3.656747                                 3.660728   
607       3.785770                                 3.786352   
608       3.144360                                 3.135605   
609       3.270412                                 3.270386   
610       3.711661                                 3.689777   

title   'Round Midnight (1986)  'Salem's Lot (2004)  \
userId                                                
1                     4.366117             4.366425   
2              

In [201]:
# Compare actual ratings with the predictions reconstructed in the previous section.
# The comparison is restricted to a slice: the first 20 users and the first
# 100 titles. This is NOT a held-out test split; the SVD above was fit on
# these same ratings, so the number below is an in-sample reconstruction error.
actual_values = user_ratings_df.iloc[:20, :100].values
predicted_values = calc_pred_ratings_df.iloc[:20, :100].values

# Only cells the user actually rated have a ground truth to compare against
mask = ~np.isnan(actual_values)

# Root-mean-squared error over the rated cells.
# The square root is taken explicitly because the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
print(np.sqrt(mean_squared_error(actual_values[mask], predicted_values[mask])))

# Output: 0.8967

0.8967747616983752
