In [7]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split

In [31]:
# Load the MovieLens CSV tables, then drop the timestamp column from the
# ratings table before it is used any further.
ratings = pd.read_csv("./data/ml-latest-small/ratings.csv")
movies = pd.read_csv("./data/ml-latest-small/movies.csv")
tags = pd.read_csv("./data/ml-latest-small/tags.csv")
ratings = ratings.drop(columns="timestamp")
print(ratings.head(5))

       userId  movieId  rating
43321     290     1357     4.0
27351     186      969     5.0
13899      89   105540     4.5
63852     414     5064     4.0
28202     195     2770     4.0
       userId  movieId  rating
91446     593     2336     4.5
66730     430     1372     4.0
98658     607     3994     3.0
16699     105    74458     5.0
52201     339    45210     4.5


In [32]:
# Hold out 20% of the ratings as a test set. The fixed random_state makes the
# split (and every result derived from it) identical on each kernel restart.
ratings_train, ratings_test = train_test_split(ratings, test_size=0.2, random_state=42)

print(ratings_train.head(5))
print(ratings_test.head(5))


       userId  movieId  rating
43321     290     1357     4.0
27351     186      969     5.0
13899      89   105540     4.5
63852     414     5064     4.0
28202     195     2770     4.0
       userId  movieId  rating
91446     593     2336     4.5
66730     430     1372     4.0
98658     607     3994     3.0
16699     105    74458     5.0
52201     339    45210     4.5


In [34]:
# Centre every training rating on the mean rating of its movie.
# NOTE(review): the grouping key is movieId, so this is a per-movie mean, not a
# per-user mean — confirm this is the intended centring for the similarity step.
mean = (
    ratings_train
    .groupby('movieId', sort=False)['rating']
    .mean()
    .rename('rating_mean')
    .reset_index()
)
print(mean.head(5))
adjusted_ratings = ratings_train.merge(mean, on="movieId", how='left', sort=False)
adjusted_ratings["adjusted_rating"] = adjusted_ratings["rating"] - adjusted_ratings["rating_mean"]
# Ratings that sit exactly on the mean are nudged to a tiny non-zero value
# (presumably so later products/norms do not collapse to exactly zero).
sits_on_mean = adjusted_ratings['adjusted_rating'] == 0
adjusted_ratings.loc[sits_on_mean, 'adjusted_rating'] = 1e-8
print(adjusted_ratings.head(5))

   movieId  rating_mean
0     1357     3.944444
1      969     4.038462
2   105540     4.500000
3     5064     3.840000
4     2770     2.940000
   userId  movieId  rating  rating_mean  adjusted_rating
0     290     1357     4.0     3.944444     5.555556e-02
1     186      969     5.0     4.038462     9.615385e-01
2      89   105540     4.5     4.500000     1.000000e-08
3     414     5064     4.0     3.840000     1.600000e-01
4     195     2770     4.0     2.940000     1.060000e+00


In [35]:
# Build item-item similarity matrix
def build_similarity_matrix(adjusted_ratings):
    """Build the item-item similarity table from mean-adjusted ratings.

    For every ordered pair of distinct movies (movie_1, movie_2) that share at
    least one rater, the weight is the cosine of the two adjusted-rating
    vectors restricted to the users who rated both movies:

        sum(r1 * r2) / (sqrt(sum(r1 ** 2)) * sqrt(sum(r2 ** 2)))

    A zero denominator is replaced by 1e-8 to avoid division by zero.

    Parameters
    ----------
    adjusted_ratings : pandas.DataFrame
        Must contain the columns 'userId', 'movieId' and 'adjusted_rating'.
        If a (userId, movieId) pair occurs more than once, only its first
        row is used.

    Returns
    -------
    pandas.DataFrame
        Columns 'movie_1', 'movie_2', 'weight'; rows are ordered by movie_1
        and then movie_2. Empty (with the same columns) when no two movies
        share a rater.
    """
    w_matrix_columns = ['movie_1', 'movie_2', 'weight']

    # Work on the three relevant columns only and keep the first rating for
    # each (user, movie) pair, so every user contributes one value per movie.
    ratings = adjusted_ratings[['userId', 'movieId', 'adjusted_rating']].drop_duplicates(
        subset=['userId', 'movieId'], keep='first'
    )

    distinct_movies = np.unique(ratings['movieId'])
    no_of_movies = len(distinct_movies)

    # One small frame per movie_1; concatenated once at the end instead of
    # growing a DataFrame row by row (DataFrame.append returned a new frame
    # that was being discarded, and no longer exists in pandas >= 2.0).
    per_movie_frames = []

    for i, movie_1 in enumerate(distinct_movies):

        if i % 10 == 0:
            print("Processed ", i, "out of", no_of_movies)

        # Ratings of movie_1, one row per user who rated it
        movie_1_ratings = ratings.loc[ratings['movieId'] == movie_1, ['userId', 'adjusted_rating']]

        # Pair each of those ratings with every other movie the same user rated:
        # adjusted_rating_1 belongs to movie_1, adjusted_rating_2 to 'movieId'.
        paired = movie_1_ratings.merge(ratings, on='userId', suffixes=('_1', '_2'))
        paired = paired[paired['movieId'] != movie_1]
        if paired.empty:
            continue

        # Per co-rated movie: the dot product and both squared norms
        terms = pd.DataFrame({
            'movie_2': paired['movieId'],
            'cross': paired['adjusted_rating_1'] * paired['adjusted_rating_2'],
            'sq_1': np.square(paired['adjusted_rating_1']),
            'sq_2': np.square(paired['adjusted_rating_2']),
        })
        sums = terms.groupby('movie_2', sort=True)[['cross', 'sq_1', 'sq_2']].sum()

        denominator = np.sqrt(sums['sq_1']) * np.sqrt(sums['sq_2'])
        # Guard against division by zero
        denominator = denominator.where(denominator != 0, 1e-8)

        per_movie_frames.append(pd.DataFrame({
            'movie_1': movie_1,
            'movie_2': sums.index.to_numpy(),
            'weight': (sums['cross'] / denominator).to_numpy(),
        }))

    if not per_movie_frames:
        return pd.DataFrame(columns=w_matrix_columns)

    return pd.concat(per_movie_frames, ignore_index=True)[w_matrix_columns]
            
            
            
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    


SyntaxError: unexpected EOF while parsing (<ipython-input-35-7f4e0a7492d8>, line 53)