# Movie Reccomendations

In [32]:
# Importing libraries
import pandas as pd
import numpy as np
import math
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [3]:
# Importing data
df_users = pd.read_csv("users.csv")
df_movies = pd.read_csv('movies.csv', encoding='latin-1') # Had to use special encoding for this to fix an error
df_ratings = pd.read_csv("ratings.csv")

In [4]:
df_users.head()

Unnamed: 0,user_id,gender,age,occupation,zipcode,age_desc,occ_desc
0,1,F,1,10,48067,Under 18,K-12 student
1,2,M,56,16,70072,56+,self-employed
2,3,M,25,15,55117,25-34,scientist
3,4,M,45,7,2460,45-49,executive/managerial
4,5,M,25,20,55455,25-34,writer


In [5]:
df_ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,1,1,3
1,1,3,5
2,1,4,4
3,1,5,3
4,1,6,3


In [6]:
df_movies.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


## Content-Based Reccomendation Model
* Find list of used genres which is used to category the movies
* Vectorize the relationship between movies and genres and put them into Ij.
* Vectorize the relationship between users and genres and put them into Uj (if user rate for a movie, he/she has the related history with the movies’ genres).
* Compute the cosine_similarity between movies and users

In [7]:
# Copy list of genres that aren't NaN
df_genres = df_movies['genres'].dropna()

In [8]:
# Find set of all genres
genres_set = []

for g in df_genres:
    g_list = g.split('|')
    for i in g_list:
        genres_set.append(i)

genres_set = set(genres_set)

In [9]:
# Display our genres
print(genres_set)

{"Children's", 'Crime', 'Romance', 'Thriller', 'Musical', 'Animation', 'War', 'Comedy', 'Sci-Fi', 'Documentary', 'Drama', 'Action', 'Horror', 'Fantasy', 'Adventure'}


In [10]:
# Vectorizing relationship between movies and genres
movie_genre_matrix = []

# Loop through each movies' string of genres
for movie_genres in df_movies['genres']:
    
    row = []
    
    # Loop through our 15 genres
    for genre in genres_set:
        
        # If null -> 0
        if pd.isnull(movie_genres):
            row.append(0)
        
        # If the genre is found in the set -> 1
        elif genre in movie_genres.split('|'):
            row.append(1)
        
        # If the genre is not found in the set -> 0
        else:
            row.append(0)
    
    # Add the row to our matrix
    movie_genre_matrix.append(row)


# Print the relationship matrix
print("Movie Genre Matrix (first 4 rows):")
for row in movie_genre_matrix[:4]:
    print(row)


Movie Genre Matrix (first 4 rows):
[1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0]
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1]
[0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0]


In [11]:
# Vectorizing relationship between users and genres

user_genre_matrix = []

# Loop through all users (there are 100)
for user in df_users['user_id']:
    
    # Create a blank list the size of our genre set
    user_row = [0 for i in range(len(genres_set))]
    
    # Get us the rating records belonging to our current user
    user_records = df_ratings.loc[df_ratings['user_id'] == user].values
    
    # Loop through each rating record
    for record in user_records:
    
        current_movie_id = record[1] # movie our user has seen
        movie_record = df_movies.loc[df_movies['movie_id'] == current_movie_id].values
        movie_genres = movie_record[0][2] # genres of the movie
        
        # Collect all genres for the movie and update the user row accordingly
        for index, genre in enumerate(genres_set):
            
            # Skip NaN values
            if pd.isnull(movie_genres):
                continue
            
            # Update row if we find a genre
            if genre in movie_genres.split('|'):
                user_row[index] = 1
                
    # Append our row            
    user_genre_matrix.append(user_row)
        
# Print the relationship matrix
print("User Genre Matrix (first 4 rows):")
for row in user_genre_matrix[:4]:
    print(row)

User Genre Matrix (first 4 rows):
[1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1]


I'm hoping I've done the above User Genre matrix correctly. My aim was to ask "What movies has this user seen?" and from that "What genres out of all these movies has the user seen?"

In [12]:
# Calculating cosine_similarity between movies and users.
cos_sim = cosine_similarity(movie_genre_matrix, user_genre_matrix)

print(cos_sim)

[[0.46291005 0.46291005 0.4472136  ... 0.46291005 0.4472136  0.4472136 ]
 [0.46291005 0.46291005 0.4472136  ... 0.46291005 0.4472136  0.4472136 ]
 [0.37796447 0.37796447 0.36514837 ... 0.37796447 0.36514837 0.36514837]
 ...
 [0.37796447 0.37796447 0.36514837 ... 0.37796447 0.36514837 0.36514837]
 [0.26726124 0.26726124 0.25819889 ... 0.26726124 0.25819889 0.25819889]
 [0.37796447 0.37796447 0.36514837 ... 0.37796447 0.36514837 0.36514837]]


## Collaborative Filtering Recommendation Model by Users
* Use train_test_split to split above dataset with the ratio 50/50. The test dataset will be used as groundtruth to evaluate the rating calculated by using the train dataset.
* Create matrix for users, movies and ratings in both training and testing datasets.
* Calculate the user correlation.
* Implement a predict based on user correlation coefficient.
* Predict on train dataset and compare the RMSE with the test dataset.

In [34]:
# Similarity matrix function
def sim_matrix(M, dimension='user'):
    N = M.shape[0] if dimension == 'user' else M.shape[1]
    print(N)
    sim = np.zeros([N,N])
    for i in range(N):
        for j in range(N):
            if i == j:
                sim[i,j] = 0 #Cancel out the effect of self-similarity in the sums later
                continue
            if dimension == 'user':
                v1, v2 = M[i,:], M[j,:]
            else:
                v1, v2 = M[:,i], M[:,j]
            sim[i][j] = correlation_similarity(v1,v2)
    print(sim.shape)
    return sim

In [14]:
# Cleaning & Splitting the data (Massive help from Thanh working through this)

# Import a version of ratings dataframe with only user_id, movie_id, and the rating
ratings = pd.read_csv('ratings.csv', encoding='latin-1', usecols=['user_id', 'movie_id', 'rating'])

# Replace user_id & movie_id NaN values with 0
ratings['user_id'] = ratings['user_id'].fillna(0)
ratings['movie_id'] = ratings['movie_id'].fillna(0)

# Replace rating NaN values with average of all values
ratings['rating'] = ratings['rating'].fillna(ratings['rating'].mean())

# Splitting data 50% into training and testing
train_data, test_data = train_test_split(ratings, test_size=0.5)

# Taking a look at our newly split data
print("Testing Data:\n", test_data.head(), "\n")
print("Training Data:\n", train_data.head())

Testing Data:
       user_id  movie_id  rating
1889       26        10       4
2658       36        49       3
6133       82        64       5
5216       70        60       5
535         7        92       2 

Training Data:
       user_id  movie_id  rating
6136       82        67       3
7331       98        46       2
1518       20        90       3
884        12        38       4
3703       50        51       1


In [15]:
# Creating our user matrix
train_matrix = train_data.pivot_table(index='movie_id', columns='user_id', values='rating').astype('float64')

# Looking at one of our matricies
train_matrix

user_id,1,2,3,4,5,6,7,8,9,10,...,91,92,93,94,95,96,97,98,99,100
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.0,,3.0,,,5.0,,,,,...,,,4.0,,3.0,,4.0,,,
2,,,,,4.0,2.0,,,,4.0,...,4.0,,,,,,,3.0,,4.0
3,5.0,,,3.0,,,,3.0,5.0,,...,,,2.0,,,,,5.0,,
4,,,3.0,,4.0,,4.0,,,,...,,,,,3.0,,4.0,,,
5,,,,3.0,,,,4.0,,,...,1.0,5.0,,,,1.0,,3.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,3.0,,4.0,4.0,2.0,3.0,,,1.0,,...,2.0,,3.0,,,3.0,,4.0,,
97,4.0,,,,5.0,,,4.0,,4.0,...,,,3.0,5.0,,,5.0,,4.0,
98,5.0,,,,,4.0,,,,2.0,...,,,,,3.0,5.0,4.0,4.0,,
99,,,,,,,3.0,2.0,,4.0,...,,3.0,4.0,,,,4.0,,,


In [16]:
# Functions for calculating correlation
def correlation_similarity(v1,v2):
    v1 = v1 - np.nanmean(v1)
    v2 = v2 - np.nanmean(v2)
    "compute similarity of v1 to v2: (v1 dot v2)/{||v1||*||v2||)"
    sumxx, sumxy, sumyy = 0, 0, 0
    for i in range(len(v1)):
        x = v1[i]; y = v2[i]
        if np.isnan(x) or np.isnan(y) or x == 0 or y ==0: continue
        sumxx += x*x
        sumyy += y*y
        sumxy += x*y
    if sumxx == 0 and sumyy == 0:
        sumxx = sumyy = 1
    return sumxy/math.sqrt(sumxx*sumyy)

def sim_matrix(M, dimension='user'):
    N = M.shape[0] if dimension == 'user' else M.shape[1]
    sim = np.zeros([N,N])
    for i in range(N):
        for j in range(N):
            if i == j:
                sim[i,j] = 0 #Cancel out the effect of self-similarity in the sums later
                continue
            if dimension == 'user':
                v1, v2 = M[i,:], M[j,:]
            else:
                v1, v2 = M[:,i], M[:,j]
            sim[i][j] = correlation_similarity(v1,v2)
    return sim

In [17]:
# Calculate the user similarity matrix
user_corr = sim_matrix(train_matrix.values)
user_corr[np.isnan(user_corr)] = 0

print("User similarity matrix:\n", user_corr)

User similarity matrix:
 [[ 0.         -0.17126248 -0.45493983 ... -0.11125558 -0.30347798
   0.28447222]
 [-0.17126248  0.          0.04591422 ... -0.08033823  0.19254631
  -0.51778112]
 [-0.45493983  0.04591422  0.         ... -0.00981859 -0.11645925
  -0.55050039]
 ...
 [-0.11125558 -0.08033823 -0.00981859 ...  0.          0.16529112
  -0.35022379]
 [-0.30347798  0.19254631 -0.11645925 ...  0.16529112  0.
  -0.39193538]
 [ 0.28447222 -0.51778112 -0.55050039 ... -0.35022379 -0.39193538
   0.        ]]


In [18]:
# Function for user collaborative filtering
def user_cf(M, sim_users):
    pred = np.copy(M)
    n_users, n_items = M.shape
    avg_ratings = np.nanmean(M, axis=1)
    for i in range(n_users):
        for j in range(n_items):
            if np.isnan(M[i,j]):
                # We are to use the actual formula for this as in the lecture slides
                pred[i,j] = avg_ratings[i] + np.nansum(sim_users[i] * (M[:,j] - avg_ratings)) / sum(sim_users[i])                                                   
    return pred

In [33]:
# Making prediction
pred_user = user_cf(train_matrix.values, user_corr)

print("Prediction matrix:\n", pred_user)

Prediction matrix:
 [[ 3.          3.64647217  3.         ...  3.85086687  3.79224685
   4.03916539]
 [ 1.68158295  5.61236793 -0.44693422 ...  3.         12.30342401
   4.        ]
 [ 5.          3.02293089  3.61722194 ...  5.          4.23312687
   4.09764858]
 ...
 [ 5.          7.56693036 -0.598787   ...  4.         10.12761754
   8.82042202]
 [ 3.64832925  3.74455149  3.2417313  ...  3.42595553  3.65699251
   3.85160319]
 [ 1.          0.90307967  6.70010754 ...  5.51901314  3.
   5.        ]]


<b><font color='purple'>Not sure where the test_data comes into play here for making the predictions...</b></font>

In [None]:
# RMSE

## Collaborative Filtering Recommendation Model by Items
* Calculate the item correlation.
* Implement function to predict ratings based on Item Similarity.
* Predict on train dataset and compare the RMSE with the test dataset.
* Compare the results between User-based and Item-based. Make conclusion.

In [20]:
item_corr = sim_matrix(train_matrix.values)
item_corr[np.isnan(item_corr)] = 0

item_corr

array([[ 0.        , -0.17126248, -0.45493983, ..., -0.11125558,
        -0.30347798,  0.28447222],
       [-0.17126248,  0.        ,  0.04591422, ..., -0.08033823,
         0.19254631, -0.51778112],
       [-0.45493983,  0.04591422,  0.        , ..., -0.00981859,
        -0.11645925, -0.55050039],
       ...,
       [-0.11125558, -0.08033823, -0.00981859, ...,  0.        ,
         0.16529112, -0.35022379],
       [-0.30347798,  0.19254631, -0.11645925, ...,  0.16529112,
         0.        , -0.39193538],
       [ 0.28447222, -0.51778112, -0.55050039, ..., -0.35022379,
        -0.39193538,  0.        ]])

<b><font color='purple'>Item_corr is exactly the same as user_corr, is this okay? I understand they are recieving the exact same parameters...</b></font>

In [21]:
# Function for item collaborative filtering
def item_cf(M, sim_items):
    pred = np.copy(M)
    n_users, n_items = M.shape
    avg_ratings = np.nanmean(M, axis=0)
    for i in range(n_users):
        for j in range(n_items):
            if np.isnan(M[i,j]):
                # very similar to the previous, just using sim_items now instead
                pred[i,j] = avg_ratings[i] + np.nansum(sim_items[i] * (M[:,j] - avg_ratings)) / sum(sim_items[i])
    return pred

In [22]:
# Making prediction
pred_item = item_cf(train_matrix.values, item_corr)

print("Prediction matrix:\n", pred_item)

Prediction matrix:
 [[ 3.          3.40568071  3.         ...  3.63555806  3.46587365
   3.66061511]
 [ 1.31724003  3.21703745 -2.78931697 ...  3.         10.71351082
   4.        ]
 [ 5.          3.03046165  3.52157511 ...  5.          4.01271435
   3.89304943]
 ...
 [ 5.          8.00695771  0.61516591 ...  4.          9.36140409
  10.1716739 ]
 [ 3.77180985  3.79512039  3.40535564 ...  3.56060648  3.77365496
   3.8382189 ]
 [ 1.          1.53665376  7.11304289 ...  7.13906285  3.
   5.        ]]


<b><font color='purple'>Not sure where the test_data comes into play here for making the predictions...</b></font>

In [None]:
# RMSE

### Comparison between User-based and Item-based results
We see that ... <b><font color='purple'>Missing RMSE</b></font>