In [65]:
import pandas as pd
import numpy as np
import math

In [3]:
# Movie metadata. low_memory=False makes pandas infer column dtypes from the
# whole file instead of chunk by chunk.
movies = pd.read_csv("../archive/movies_metadata.csv", low_memory=False)

# Explicit user ratings. movieId is read as a string so that it joins cleanly
# against the (string) identifiers in links.csv.
user_ratings = pd.read_csv(
    "../archive/ratings.csv",
    dtype={
        'userId': int,
        'movieId': str,
        'rating': float,
        'timestamp': int,
    },
)

# Mapping from the ratings' movieId to the external imdbId / tmdbId.
id_links = pd.read_csv(
    "../archive/links.csv",
    dtype={
        'movieId': str,
        'imdbId': str,
        'tmdbId': str,
    },
)

# Attach imdbId / tmdbId to every rating; the left join keeps ratings whose
# movieId has no entry in links.csv.
user_ratings = user_ratings.merge(id_links, on='movieId', how='left')

In [84]:
# Preview the first two rows of the ratings table to check its columns.
user_ratings.head(2)

Unnamed: 0,userId,movieId,rating,timestamp,imdbId,tmdbId
0,1,110,1.0,1425941529,112573,197
1,1,147,4.5,1425942435,112461,10474


In [123]:
def pearson_for_ratings(my_ratings, other_ratings):
    """Pearson correlation between two users over the movies both have rated.

    Ratings are paired by ``movieId`` (inner join), so neither frame needs to
    be pre-filtered or pre-sorted, and either user may have rated movies the
    other has not. Both means are taken over the co-rated movies only.

    Parameters
    ----------
    my_ratings : pandas.DataFrame
        Ratings of the reference user; must contain ``movieId`` and ``rating``.
    other_ratings : pandas.DataFrame
        Ratings of the user to compare against; same required columns.

    Returns
    -------
    float or int
        The correlation coefficient in [-1, 1], or ``0`` when it is undefined
        (no co-rated movies, or one of the users gave every co-rated movie the
        same rating).

    Notes
    -----
    Each user is expected to have at most one rating per ``movieId``;
    duplicated ids would be paired combinatorially by the join.
    """
    # I may have ratings that the other user does not have (and vice versa):
    # keep only the movies rated by both, aligned row by row on movieId.
    paired = pd.merge(
        my_ratings[['movieId', 'rating']],
        other_ratings[['movieId', 'rating']],
        on='movieId',
        how='inner',
        suffixes=('_mine', '_other'),
    )
    if paired.empty:
        return 0

    my_distance = paired['rating_mine'] - paired['rating_mine'].mean()
    other_distance = paired['rating_other'] - paired['rating_other'].mean()

    Sxy = float((my_distance * other_distance).sum())
    Sxx = float((my_distance ** 2).sum())
    Syy = float((other_distance ** 2).sum())

    # A user with zero variance makes the coefficient undefined; treat it as
    # "no correlation" rather than dividing by zero.
    if Sxx == 0 or Syy == 0:
        return 0
    return Sxy / math.sqrt(Sxx * Syy)

In [124]:
def get_collab_recommendations(user, max_users=1000, top_n=10):
    """Recommend movies to ``user`` with user-based collaborative filtering.

    Steps:
      1. Collect every rating given to a movie that ``user`` has rated.
      2. Keep the ``max_users`` users with the most such ratings.
      3. Score each of them (except ``user``) with ``pearson_for_ratings``.
      4. Weight all of their ratings by that score, average per movie, and
         return the best-scoring movies ``user`` has not rated yet.

    Reads the notebook-level ``user_ratings`` and ``movies`` frames.

    Parameters
    ----------
    user : int
        ``userId`` to build recommendations for.
    max_users : int, default 1000
        Number of users with the largest rating overlap to consider. The
        target user counts towards this limit when among them.
    top_n : int, default 10
        Number of movies to recommend.

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``tmdbId`` and ``mean_weighted_rating``, sorted by
        descending score. Empty when ``user`` has no ratings or no other user
        shares a rated movie.
    """
    result_columns = ['title', 'tmdbId', 'mean_weighted_rating']

    my_ratings = user_ratings[user_ratings['userId'] == user]
    my_movie_ids = my_ratings['movieId'].tolist()

    # Only ratings of movies the target user has rated matter for similarity.
    shared_ratings = user_ratings[user_ratings['movieId'].isin(my_movie_ids)]

    # Group by the column name rather than a one-element list: with a list,
    # pandas 2.x yields 1-tuple keys on iteration, which would not match the
    # scalar userId values in the merge below.
    ratings_grouped = sorted(
        shared_ratings.groupby('userId'),
        key=lambda group: len(group[1]),
        reverse=True,
    )[:max_users]

    # Exclude the target user by id instead of by position: on a tie in
    # overlap size another user may be sorted ahead of them.
    users_pearson = [
        {'userId': other_user,
         'pearson': pearson_for_ratings(my_ratings, ratings_df)}
        for other_user, ratings_df in ratings_grouped
        if other_user != user
    ]
    if not users_pearson:
        return pd.DataFrame(columns=result_columns)
    near_users = pd.DataFrame.from_records(users_pearson)

    ratings_with_pearson = pd.merge(user_ratings, near_users, on='userId', how='right')
    ratings_with_pearson['weighted_rating'] = (
        ratings_with_pearson['pearson'] * ratings_with_pearson['rating']
    )

    # Group on tmdbId as well so it is not lost in the aggregation.
    out = (
        ratings_with_pearson
        .groupby(['movieId', 'tmdbId'])
        .agg({'weighted_rating': 'mean'})
        .rename(columns={'weighted_rating': 'mean_weighted_rating'})
        .reset_index()
        .sort_values(by='mean_weighted_rating', ascending=False)
    )

    # Do not recommend what the user has already rated.
    out = out[~out['movieId'].isin(my_movie_ids)].head(top_n)

    # Bring in the titles from the movie metadata.
    out = pd.merge(out, movies, left_on='tmdbId', right_on='id', how='left')

    return out[result_columns]

In [125]:
# Compute collaborative-filtering recommendations for the user with userId 1.
rec = get_collab_recommendations(1)

In [126]:
# Display the recommendations computed above (title, tmdbId, score).
rec

Unnamed: 0,title,tmdbId,mean_weighted_rating
0,The 24 Hour War,359093,2.32746
1,Peppermint Soda,80220,2.326161
2,Kara,200558,2.298639
3,Reggie Watts: Why Shit So Crazy?,56275,2.262762
4,In Celebration,158741,2.191781
5,Political Animals,200035,2.191781
6,Taxi for Tobruk,33336,2.148286
7,Classic Albums: Iron Maiden - The Number of th...,102622,2.148286
8,AC/DC- Let There Be Rock,50125,2.148286
9,This Is What They Want,232048,2.094714
