In [1]:
import numpy as np
import pandas as pd
import matrix_factorization_utilities

In [2]:
# Movie catalogue, indexed by movie_id so a movie can be looked up by its ID with .loc
movies_df = pd.read_csv('movies.csv', index_col='movie_id')

# Running list of individual ratings (columns include user_id, movie_id and value)
df = pd.read_csv('movie_ratings_data_set.csv')

In [3]:
# Preview the first five rating rows to sanity-check the load
df.head(n=5)

Unnamed: 0,user_id,movie_id,value
0,1,28,4
1,1,26,4
2,1,9,4
3,1,1,4
4,1,14,4


In [4]:
# Preview the first five movies (indexed by movie_id) to sanity-check the load
movies_df.head(n=5)

Unnamed: 0_level_0,title,genre
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,The Sheriff 1,"crime drama, western"
2,The Big City Judge 1,legal drama
3,The Sheriff 2,"crime drama, western"
4,Just a Regular Family,reality
5,The Big City Judge 2,legal drama


In [5]:
# Convert the running list of user ratings into a (user x movie) matrix.
# - values=['value'] pins the pivot to the rating column only, so any extra column
#   in df can never silently widen the matrix; passing it as a list keeps the
#   two-level ('value', movie_id) column layout the pivot produced before.
# - aggfunc='max' keeps the highest rating when a user rated a movie more than once;
#   the string form avoids the FutureWarning newer pandas raises for np.max.
# User/movie pairs without a rating become NaN.
ratings_df = pd.pivot_table(df,
                            index='user_id',
                            columns='movie_id',
                            values=['value'],
                            aggfunc='max')

# Apply matrix factorization to find the latent features.
# U holds the per-user factors and M the per-movie factors.
# NOTE(review): M is presumably (num_features x num_movies) with movies in the same
# order as ratings_df's columns -- later cells rely on this; confirm in the utility.
U, M = matrix_factorization_utilities.low_rank_matrix_factorization(ratings_df.values,
                                                                    num_features=15,
                                                                    regularization_amount=1.0)


Optimization terminated successfully.
         Current function value: 312.762757
         Iterations: 1563
         Function evaluations: 2344
         Gradient evaluations: 2344


In [6]:
# Flip M so that each row (rather than each column) holds one movie's latent features,
# which makes per-movie indexing and row-wise arithmetic simpler below.
# Caution: this reassigns M, so running this cell twice flips it back again.
M = np.asarray(M).T

In [7]:
# The movie we want recommendations for; change this ID to explore another title.
movie_id = 5

# Look up the selected movie's title and genre by its movie_id label
movie_information = movies_df.loc[movie_id, :]
movie_information

title    The Big City Judge 2
genre             legal drama
Name: 5, dtype: object

In [8]:
# Get the latent features we found via matrix factorization for the selected movie.
# Rows of M are assumed to follow the movie order of ratings_df's columns, so look up
# the row by movie_id label instead of assuming IDs are contiguous and start at 1
# (M[movie_id - 1] silently picks the wrong movie if any ID is missing from the ratings).
# get_level_values(-1) yields the movie_ids whether the columns are flat or multi-level.
# Raises KeyError if the selected movie has no ratings at all.
movie_row = ratings_df.columns.get_level_values(-1).get_loc(movie_id)
current_movie_features = M[movie_row]
current_movie_features

array([ 0.6655182 , -0.82916139, -0.72697674,  0.52215399, -0.8482545 ,
       -1.84166238, -0.78719117,  0.25967982, -0.11945904,  0.11391648,
       -0.15064774, -0.17666695, -0.23299691, -0.81292525,  1.082787  ])

In [9]:
# The main logic for finding similar movies:

# 1. Subtract the current movie's features from every other movie's features
#    (broadcasts the 1-D feature vector across every row of M)
difference = M - current_movie_features

# 2. Take the absolute value of that difference (so all numbers are positive)
absolute_difference = np.abs(difference)

# 3. Each movie has 15 features. Sum those 15 features to get a total 'difference score' for each movie
total_difference = np.sum(absolute_difference, axis=1)

# 4. Create a new column in the movie list with the difference score for each movie.
#    The scores are labelled with the movie_ids of the ratings matrix columns (rows of M
#    are assumed to follow that order) so pandas aligns them by movie_id rather than by
#    row position. Movies that never received a rating get NaN instead of a wrong score.
rated_movie_ids = ratings_df.columns.get_level_values(-1)
movies_df['difference_score'] = pd.Series(total_difference, index=rated_movie_ids)

# 5. Sort the movie list by difference score, from least different to most different
#    (NaN scores, if any, are placed last)
sorted_movie_list = movies_df.sort_values('difference_score')

# 6. Show the 5 most similar movies to the selected movie_id
#    (the selected movie itself comes first, with a score of 0)
sorted_movie_list.head()

Unnamed: 0_level_0,title,genre,difference_score
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5,The Big City Judge 2,legal drama,0.0
10,Surrounded by Zombies 1,"horror, zombie fiction",1.872546
9,Biker Gangs,"crime drama, action",2.600013
3,The Sheriff 2,"crime drama, western",2.695921
24,The Big City Judge 3,legal drama,2.787909
