# Movie Reccomendations

helpful video: https://www.youtube.com/watch?v=umSM8rFtVMs

In [53]:
# Importing libraries
import pandas as pd
import numpy as np
import math
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
# Importing data
df_users = pd.read_csv("users.csv")
df_movies = pd.read_csv('movies.csv', encoding='latin-1') # Had to use special encoding for this to fix an error
df_ratings = pd.read_csv("ratings.csv")

In [5]:
df_users.head()

Unnamed: 0,user_id,gender,age,occupation,zipcode,age_desc,occ_desc
0,1,F,1,10,48067,Under 18,K-12 student
1,2,M,56,16,70072,56+,self-employed
2,3,M,25,15,55117,25-34,scientist
3,4,M,45,7,2460,45-49,executive/managerial
4,5,M,25,20,55455,25-34,writer


In [47]:
df_ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,1,1,3
1,1,3,5
2,1,4,4
3,1,5,3
4,1,6,3


In [7]:
df_movies.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


## Content-Based Reccomendation Model
* Find list of used genres which is used to category the movies
* Vectorize the relationship between movies and genres and put them into Ij.
* Vectorize the relationship between users and genres and put them into Uj (if user rate for a movie, he/she has the related history with the movies’ genres).
* Compute the cosine_similarity between movies and users

In [8]:
# Copy list of genres that aren't NaN
df_genres = df_movies['genres'].dropna()

In [9]:
# Find set of all genres
genres_set = []

for g in df_genres:
    g_list = g.split('|')
    for i in g_list:
        genres_set.append(i)

genres_set = set(genres_set)

In [10]:
# Display our genres
print(genres_set)

{'Action', 'Romance', "Children's", 'Comedy', 'Drama', 'Adventure', 'Musical', 'War', 'Crime', 'Fantasy', 'Horror', 'Documentary', 'Animation', 'Thriller', 'Sci-Fi'}


In [11]:
# Vectorizing relationship between movies and genres
movie_genre_matrix = []

# Loop through each movies' string of genres
for movie_genres in df_movies['genres']:
    
    row = []
    
    # Loop through our 15 genres
    for genre in genres_set:
        
        # If null -> 0
        if pd.isnull(movie_genres):
            row.append(0)
        
        # If the genre is found in the set -> 1
        elif genre in movie_genres.split('|'):
            row.append(1)
        
        # If the genre is not found in the set -> 0
        else:
            row.append(0)
    
    # Add the row to our matrix
    movie_genre_matrix.append(row)


# Print the relationship matrix
print("Movie Genre Matrix (first 4 rows):")
for row in movie_genre_matrix[:4]:
    print(row)


Movie Genre Matrix (first 4 rows):
[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]
[0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0]
[0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [48]:
# Vectorizing relationship between users and genres

user_genre_matrix = []

# Loop through all users (there are 100)
for user in df_users['user_id']:
    
    # Create a blank list the size of our genre set
    user_row = [0 for i in range(len(genres_set))]
    
    # Get us the rating records belonging to our current user
    user_records = df_ratings.loc[df_ratings['user_id'] == user].values
    
    # Loop through each rating record
    for record in user_records:
    
        current_movie_id = record[1] # movie our user has seen
        movie_record = df_movies.loc[df_movies['movie_id'] == current_movie_id].values
        movie_genres = movie_record[0][2] # genres of the movie
        
        # Collect all genres for the movie and update the user row accordingly
        for index, genre in enumerate(genres_set):
            
            # Skip NaN values
            if pd.isnull(movie_genres):
                continue
            
            # Update row if we find a genre
            if genre in movie_genres.split('|'):
                user_row[index] = 1
                
    # Append our row            
    user_genre_matrix.append(user_row)
        
# Print the relationship matrix
print("User Genre Matrix (first 4 rows):")
for row in user_genre_matrix[:4]:
    print(row)

User Genre Matrix (first 4 rows):
[1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1]


I'm hoping I've done the above User Genre matrix correctly. My aim was to ask "What movies has this user seen?" and from that "What genres out of all these movies has the user seen?"

In [56]:
# Calculating cosine_similarity between movies and users.
cos_sim = cosine_similarity(movie_genre_matrix, user_genre_matrix)

print(cos_sim)

[[0.46291005 0.46291005 0.4472136  ... 0.46291005 0.4472136  0.4472136 ]
 [0.46291005 0.46291005 0.4472136  ... 0.46291005 0.4472136  0.4472136 ]
 [0.37796447 0.37796447 0.36514837 ... 0.37796447 0.36514837 0.36514837]
 ...
 [0.37796447 0.37796447 0.36514837 ... 0.37796447 0.36514837 0.36514837]
 [0.26726124 0.26726124 0.25819889 ... 0.26726124 0.25819889 0.25819889]
 [0.37796447 0.37796447 0.36514837 ... 0.37796447 0.36514837 0.36514837]]


## Collaborative Filtering Recommendation Model by Users
* Use train_test_split to split above dataset with the ratio 50/50. The test dataset will be used as groundtruth to evaluate the rating calculated by using the train dataset.
* Create matrix for users, movies and ratings in both training and testing datasets.
* Calculate the user correlation.
* Implement a predict based on user correlation coefficient.
* Predict on train dataset and compare the RMSE with the test dataset.

## Collaborative Filtering Recommendation Model by Items
* Calculate the item correlation.
* Implement function to predict ratings based on Item Similarity.
* Predict on train dataset and compare the RMSE with the test dataset.
* Compare the results between User-based and Item-based. Make conclusion.