# Movie Recommendations

In [19]:
# Importing libraries
import pandas as pd
import numpy as np
import math
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

In [3]:
# Importing data: users, movies and ratings each live in their own CSV file
df_users = pd.read_csv("users.csv")

# Reading movies.csv with the default encoding raised an error, hence latin-1
df_movies = pd.read_csv('movies.csv', encoding='latin-1')

df_ratings = pd.read_csv("ratings.csv")

In [4]:
# Preview the first rows of the users table
df_users.head()

Unnamed: 0,user_id,gender,age,occupation,zipcode,age_desc,occ_desc
0,1,F,1,10,48067,Under 18,K-12 student
1,2,M,56,16,70072,56+,self-employed
2,3,M,25,15,55117,25-34,scientist
3,4,M,45,7,2460,45-49,executive/managerial
4,5,M,25,20,55455,25-34,writer


In [5]:
# Preview the first rows of the ratings table (one row per user/movie rating)
df_ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,1,1,3
1,1,3,5
2,1,4,4
3,1,5,3
4,1,6,3


In [6]:
# Preview the first rows of the movies table; genres are '|'-separated strings
df_movies.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


## Content-Based Recommendation Model
* Find the list of genres used to categorize the movies.
* Vectorize the relationship between movies and genres and put them into Ij.
* Vectorize the relationship between users and genres and put them into Uj (if a user has rated a movie, they have a history with that movie's genres).
* Compute the cosine_similarity between movies and users

In [7]:
# Copy list of genres that aren't NaN
# (each remaining entry is one movie's '|'-separated genre string)
df_genres = df_movies['genres'].dropna()

In [8]:
# Find set of all genres: split every movie's '|'-separated genre string and
# collect the distinct names.
# NOTE: a set has no guaranteed order across kernel sessions; every cell that
# builds a matrix column-per-genre must iterate this same object.
genres_set = {
    genre_name
    for genre_string in df_genres
    for genre_name in genre_string.split('|')
}

In [9]:
# Display our genres (set iteration order, not alphabetical)
print(genres_set)

{'Animation', 'Sci-Fi', 'Comedy', 'Documentary', 'Drama', 'Action', 'Fantasy', "Children's", 'Crime', 'Romance', 'Musical', 'War', 'Thriller', 'Adventure', 'Horror'}


In [10]:
# Vectorizing relationship between movies and genres
# Result: one row per movie (in df_movies order), one 0/1 column per genre
# (in the iteration order of genres_set).
movie_genre_matrix = []

# Loop through each movie's '|'-separated string of genres
for movie_genres in df_movies['genres']:

    # Split once per movie rather than once per genre; a missing (NaN) genre
    # string matches no genre, which yields an all-zero row.
    if pd.isnull(movie_genres):
        genres_of_movie = set()
    else:
        genres_of_movie = set(movie_genres.split('|'))

    # 1 if the movie carries the genre, 0 otherwise
    movie_genre_matrix.append(
        [1 if genre in genres_of_movie else 0 for genre in genres_set]
    )


# Print the relationship matrix
print("Movie Genre Matrix (first 4 rows):")
for row in movie_genre_matrix[:4]:
    print(row)


Movie Genre Matrix (first 4 rows):
[1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0]
[0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]
[0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [11]:
# Vectorizing relationship between users and genres
# Result: one row per user (in df_users order), one 0/1 column per genre
# (in the iteration order of genres_set). A 1 means the user has rated at
# least one movie carrying that genre.

user_genre_matrix = []

# Build a movie_id -> set-of-genres lookup once, instead of re-scanning
# df_movies and re-splitting the genre string for every single rating.
# Columns are addressed by name so a change in column order cannot silently
# pick up the wrong field. The first row wins if a movie_id is duplicated,
# and a missing (NaN) genre string maps to "no genres".
genres_by_movie = {}
for movie_id, movie_genres in zip(df_movies['movie_id'], df_movies['genres']):
    if movie_id in genres_by_movie:
        continue
    if pd.isnull(movie_genres):
        genres_by_movie[movie_id] = set()
    else:
        genres_by_movie[movie_id] = set(movie_genres.split('|'))

# Loop through all users
for user in df_users['user_id']:

    # Movies this user has rated
    rated_movie_ids = df_ratings.loc[df_ratings['user_id'] == user, 'movie_id']

    # Union of the genres of every rated movie; ratings that point at a
    # movie_id absent from df_movies are skipped instead of raising IndexError
    seen_genres = set()
    for movie_id in rated_movie_ids:
        seen_genres |= genres_by_movie.get(movie_id, set())

    # Append our row
    user_genre_matrix.append(
        [1 if genre in seen_genres else 0 for genre in genres_set]
    )

# Print the relationship matrix
print("User Genre Matrix (first 4 rows):")
for row in user_genre_matrix[:4]:
    print(row)

User Genre Matrix (first 4 rows):
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1]


I'm hoping I've done the above User Genre matrix correctly. My aim was to ask "What movies has this user seen?" and from that "What genres out of all these movies has the user seen?"

In [12]:
# Calculating cosine_similarity between movies and users.
# Both inputs have one column per genre; the result has one row per movie
# and one column per user.
cos_sim = cosine_similarity(movie_genre_matrix, user_genre_matrix)

print(cos_sim)

[[0.46291005 0.46291005 0.4472136  ... 0.46291005 0.4472136  0.4472136 ]
 [0.46291005 0.46291005 0.4472136  ... 0.46291005 0.4472136  0.4472136 ]
 [0.37796447 0.37796447 0.36514837 ... 0.37796447 0.36514837 0.36514837]
 ...
 [0.37796447 0.37796447 0.36514837 ... 0.37796447 0.36514837 0.36514837]
 [0.26726124 0.26726124 0.25819889 ... 0.26726124 0.25819889 0.25819889]
 [0.37796447 0.37796447 0.36514837 ... 0.37796447 0.36514837 0.36514837]]


## Collaborative Filtering Recommendation Model by Users
* Use train_test_split to split the ratings dataset 50/50. The test dataset will be used as ground truth to evaluate the ratings predicted from the train dataset.
* Create matrix for users, movies and ratings in both training and testing datasets.
* Calculate the user correlation.
* Implement a prediction function based on the user correlation coefficient.
* Predict on train dataset and compare the RMSE with the test dataset.

In [13]:


def sim_matrix(M, dimension='user', similarity=None):
    """Build a pairwise similarity matrix over the rows or columns of ``M``.

    Parameters
    ----------
    M : 2-D array
        Must expose ``.shape`` and support ``M[i, :]`` / ``M[:, i]`` indexing.
        Presumably a user x item rating matrix (rows = users) -- confirm
        against the caller.
    dimension : str, default 'user'
        ``'user'`` compares rows of ``M``; any other value compares columns.
    similarity : callable, optional
        ``similarity(v1, v2)`` returning a scalar for two vectors of ``M``.
        When omitted, the notebook-level ``correlation_similarity`` helper is
        used; it is looked up only when a pair is actually compared, so it
        must be defined before this function is called on more than one
        row/column.

    Returns
    -------
    numpy.ndarray
        ``(N, N)`` array of pairwise similarities, where ``N`` is the number
        of rows (or columns) compared. The diagonal is 0.
    """
    by_user = dimension == 'user'
    N = M.shape[0] if by_user else M.shape[1]

    if similarity is None:
        # Late-bound wrapper: keeps the original behaviour of resolving the
        # helper at call time rather than at definition time.
        def similarity(v1, v2):
            return correlation_similarity(v1, v2)

    sim = np.zeros([N, N])
    for i in range(N):
        for j in range(N):
            if i == j:
                # Leave the diagonal at 0 to cancel out the effect of
                # self-similarity in the sums later
                continue
            if by_user:
                v1, v2 = M[i, :], M[j, :]
            else:
                v1, v2 = M[:, i], M[:, j]
            sim[i, j] = similarity(v1, v2)
    return sim

In [32]:
# Cleaning & Splitting the data (Massive help from Thanh working through this)

# Import a version of ratings dataframe with only user_id, movie_id, and the rating
ratings = pd.read_csv('ratings.csv', encoding='latin-1', usecols=['user_id', 'movie_id', 'rating'])

# Replace user_id & movie_id NaN values with 0
# NOTE(review): this files any such rows under a placeholder user/movie 0;
# consider dropping them instead if 0 is not meant to be a real id.
ratings['user_id'] = ratings['user_id'].fillna(0)
ratings['movie_id'] = ratings['movie_id'].fillna(0)

# Replace rating NaN values with average of all values
ratings['rating'] = ratings['rating'].fillna(ratings['rating'].mean())

# Splitting data 50% into training and testing.
# A fixed random_state makes the split (and every matrix, correlation and
# RMSE derived from it) identical on each Restart & Run All.
train_data, test_data = train_test_split(ratings, test_size=0.5, random_state=42)

# Taking a look at our newly split data
print("Testing Data:\n", test_data.head(), "\n")
print("Training Data:\n", train_data.head())

Testing Data:
       user_id  movie_id  rating
4624       62        72       2
886        12        40       3
347         5        59       4
7055       94        77       2
2242       30        81       4 

Training Data:
       user_id  movie_id  rating
2955       40        57       3
6969       93        58       4
3903       53         7       3
4812       65        23       4
1263       17        42       5


In [65]:
# Creating our matrices from the data (user-based):
# one row per movie, one column per user, NaN where a user has no rating.
train_matrix_user = (
    train_data
    .pivot_table(values='rating', index='movie_id', columns='user_id')
    .astype('float64')
)
test_matrix_user = (
    test_data
    .pivot_table(values='rating', index='movie_id', columns='user_id')
    .astype('float64')
)

# Looking at one of our matrices
train_matrix_user

user_id,1,2,3,4,5,6,7,8,9,10,...,91,92,93,94,95,96,97,98,99,100
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.0,3.0,3.0,1.0,,,,,,5.0,...,1.0,,,,3.0,,,4.0,,4.0
2,,4.0,,3.0,,2.0,3.0,1.0,,4.0,...,,,,,,,,,,
3,,5.0,4.0,3.0,,,4.0,,5.0,,...,4.0,,,5.0,3.0,,,5.0,,4.0
4,4.0,,3.0,,4.0,,,,,,...,2.0,4.0,,,3.0,,4.0,,,
5,,5.0,,3.0,,,2.0,4.0,,4.0,...,1.0,,4.0,,,,1.0,3.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,,,4.0,,,,5.0,,,2.0,...,2.0,,3.0,,,,,,4.0,4.0
97,4.0,2.0,,,,,,,,4.0,...,4.0,,,,5.0,,5.0,5.0,4.0,4.0
98,,,,,,,,,,2.0,...,,5.0,,5.0,3.0,,,,,
99,,4.0,4.0,,,,,,,,...,4.0,3.0,,,,2.0,4.0,,3.0,5.0


In [68]:
# Create correlation matrix: pairwise Pearson correlation between user
# columns; pairs with fewer than 10 movies rated by both users are left as NaN
corr_matrix_user = train_matrix_user.corr(min_periods=10, method='pearson')

# Display our user correlation matrix
corr_matrix_user

user_id,1,2,3,4,5,6,7,8,9,10,...,91,92,93,94,95,96,97,98,99,100
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.165751,-0.406587,-0.256148,-0.276780,-0.121673,-0.301174,-1.632130e-01,0.075023,0.546481,...,0.700028,-0.171959,0.142857,-0.132745,-0.158032,0.393042,-0.033137,-0.407323,0.032810,-0.240583
2,0.165751,1.000000,0.107381,-0.215438,0.125281,-0.416631,0.059761,-5.110997e-02,-0.233734,-0.215097,...,0.243688,0.296667,-0.237438,0.719589,0.025683,-0.190330,-0.322609,0.162062,-0.206538,-0.148675
3,-0.406587,0.107381,1.000000,-0.025842,-0.077161,-0.027690,0.044103,5.076553e-01,-0.162980,-0.355076,...,-0.190832,0.193970,0.102151,0.434594,0.300235,0.096674,-0.303418,0.100000,-0.022907,-0.032693
4,-0.256148,-0.215438,-0.025842,1.000000,-0.331819,-0.011389,-0.420808,1.550987e-01,0.488270,-0.666283,...,0.347908,,-0.206746,-0.225956,0.595835,-0.339480,0.203776,-0.289196,,-0.070899
5,-0.276780,0.125281,-0.077161,-0.331819,1.000000,0.303046,0.071040,3.470740e-01,0.233756,-0.291615,...,-0.497273,,0.036539,0.463517,-0.392857,0.417365,0.304572,0.162400,-0.484274,0.596520
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,0.393042,-0.190330,0.096674,-0.339480,0.417365,-0.241274,-0.281718,-1.941007e-01,0.493755,0.037814,...,0.114708,,0.289689,0.087002,0.159656,1.000000,-0.277035,0.214834,-0.234861,-0.326210
97,-0.033137,-0.322609,-0.303418,0.203776,0.304572,0.515153,0.236113,3.348411e-01,0.317491,0.150824,...,0.321055,-0.098304,0.022355,-0.208979,0.084791,-0.277035,1.000000,0.290684,0.609404,0.248792
98,-0.407323,0.162062,0.100000,-0.289196,0.162400,0.119352,0.009948,-4.186913e-17,-0.105272,-0.182938,...,-0.198826,0.080208,-0.313946,0.356026,0.241790,0.214834,0.290684,1.000000,-0.530931,0.158252
99,0.032810,-0.206538,-0.022907,,-0.484274,-0.579141,0.362012,,,0.159711,...,0.145689,,-0.158665,-0.373939,-0.096044,-0.234861,0.609404,-0.530931,1.000000,0.277102


## Collaborative Filtering Recommendation Model by Items
* Calculate the item correlation.
* Implement function to predict ratings based on Item Similarity.
* Predict on train dataset and compare the RMSE with the test dataset.
* Compare the results between User-based and Item-based. Draw a conclusion.

In [69]:
# Creating our matrices from the data (item-based):
# one row per user, one column per movie, NaN where a user has no rating.
train_matrix_item = (
    train_data
    .pivot_table(values='rating', index='user_id', columns='movie_id')
    .astype('float64')
)
test_matrix_item = (
    test_data
    .pivot_table(values='rating', index='user_id', columns='movie_id')
    .astype('float64')
)

In [70]:
# Create correlation matrix: pairwise Pearson correlation between movie
# columns; pairs with fewer than 10 users who rated both movies are left as NaN
corr_matrix_item = train_matrix_item.corr(min_periods=10, method='pearson')

# Display our item correlation matrix
corr_matrix_item

movie_id,1,2,3,4,5,6,7,8,9,10,...,91,92,93,94,95,96,97,98,99,100
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.254495,0.147794,0.395161,0.307848,-0.065717,0.166212,-0.212068,-0.006161,-0.498807,...,0.076194,-0.121578,-0.132521,0.076218,-0.170328,0.097460,0.041222,0.101888,-0.240766,-0.303978
2,0.254495,1.000000,-0.184900,-0.181207,0.075605,0.139285,-0.045455,-0.376504,-0.065099,-0.208797,...,-0.158114,-0.118872,-0.032870,-0.523788,0.039034,0.246728,-0.062904,,0.053441,-0.309837
3,0.147794,-0.184900,1.000000,-0.250743,-0.161605,0.066336,-0.194554,0.332956,0.145992,-0.287169,...,0.255876,0.327743,-0.683349,0.099504,-0.060193,0.454273,0.056394,-0.235864,0.117892,-0.280634
4,0.395161,-0.181207,-0.250743,1.000000,0.092962,-0.376785,0.398113,0.176947,0.000000,-0.022249,...,0.441034,-0.300005,-0.412217,-0.388379,-0.216598,0.067011,0.030216,0.247149,-0.080322,0.237245
5,0.307848,0.075605,-0.161605,0.092962,1.000000,,0.099281,0.077498,-0.017437,0.228606,...,0.363450,-0.475923,-0.403664,0.223952,,0.076234,-0.485840,,-0.129353,-0.327805
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,0.097460,0.246728,0.454273,0.067011,0.076234,,-0.161084,0.145865,-0.348868,-0.120657,...,,0.172357,-0.167454,-0.274721,0.074068,1.000000,-0.164980,,-0.325893,-0.264372
97,0.041222,-0.062904,0.056394,0.030216,-0.485840,-0.307860,-0.302117,0.067106,-0.368617,-0.149112,...,-0.232598,0.027420,0.067107,-0.267261,0.449503,-0.164980,1.000000,-0.241667,0.300939,-0.689500
98,0.101888,,-0.235864,0.247149,,,0.252523,,,0.410117,...,0.119681,,-0.245676,0.364900,,,-0.241667,1.000000,,
99,-0.240766,0.053441,0.117892,-0.080322,-0.129353,-0.171780,-0.131716,-0.360810,0.156098,0.066959,...,0.000000,-0.240754,-0.346993,,0.349975,-0.325893,0.300939,,1.000000,-0.239722
