In [101]:
import pandas as pd
import numpy as np
import math
import scipy.stats as stats
from scipy import spatial

from sklearn.metrics.pairwise import cosine_similarity

In [2]:
movies = pd.read_csv("../archive/movies_metadata.csv", low_memory=False)
user_ratings = pd.read_csv("../archive/ratings.csv", dtype={'userId': int, 'movieId': str, 'rating': float,'timestamp': int})
id_links = pd.read_csv("../archive/links.csv", dtype={'movieId': str, 'imdbId': str, 'tmdbId': str})
user_ratings = pd.merge(user_ratings, id_links, left_on='movieId', right_on='movieId', how='left')

In [84]:
user_ratings.head(2)

Unnamed: 0,userId,movieId,rating,timestamp,imdbId,tmdbId
0,1,110,1.0,1425941529,112573,197
1,1,147,4.5,1425942435,112461,10474


In [67]:
def pearson_for_ratings(my_ratings, other_ratings):
    # Puedo tener ratings que el otro usuario no tiene
    my_ratings = my_ratings[my_ratings['movieId'].isin(other_ratings['movieId'].tolist())]

    my_ratings = my_ratings.sort_values(by='movieId', ascending=False)['rating'].tolist()
    other_ratings = other_ratings.sort_values(by='movieId', ascending=False)['rating'].tolist()
    
    correlation, _ = stats.pearsonr(my_ratings,other_ratings)
    #correlation = spatial.distance.euclidean(my_ratings,other_ratings)
    #correlation = spatial.distance.cosine(my_ratings,other_ratings)
    return correlation

In [None]:
def get_collab_recommendations_2(user):
    my_ratings = user_ratings[user_ratings['userId']==user]
    similar_users = user_ratings[user_ratings['movieId'].isin(my_ratings['movieId'].to_list())]

    # TODO Hay mejor manera de obtener los 100 users mayor interseccion de peliculas
    ratings_grouped = similar_users.groupby(['userId'])
    ratings_grouped = sorted(ratings_grouped,  key=lambda x: len(x[1]), reverse=True)
    ratings_grouped = ratings_grouped[0:100]
    # Pisamos nos interesan solos los ids
    users = [user_df[0] for user_df in ratings_grouped]
    ratings_grouped = ":("
    less_ratings = user_ratings[user_ratings['userId'].isin(users)]
    # Movies X Users matrix
    user_data = less_ratings.pivot(index = 'userId', columns = 'movieId', values = 'rating').fillna(0)
    
    user_similarity = cosine_similarity(user_data)
    user_similarity[np.isnan(user_similarity)] = 0
    user_predicted_ratings = np.dot(user_similarity, user_data)
    
    user_predicted_ratings = user_predicted_ratings.loc[user].sort_values(ascending = False)
    
    to_remove = my_ratings['movieId'].to_list()
    
    out = out[~out['movieId'].isin(to_remove)][0:10]

    out = pd.merge(out, movies, left_on='tmdbId', right_on='id', how='left')
    
    return out[['title', 'tmdbId', 'mean_weighted_rating']]

In [61]:
def get_collab_recommendations(user):
    my_ratings = user_ratings[user_ratings['userId']==user]
    similar_users = user_ratings[user_ratings['movieId'].isin(my_ratings['movieId'].to_list())]
    
    ratings_grouped = similar_users.groupby(['userId'])
    ratings_grouped = sorted(ratings_grouped,  key=lambda x: len(x[1]), reverse=True)
    ratings_grouped = ratings_grouped[0:100]
    
    users_pearson = []
    for user, ratings_df in ratings_grouped:
        pearson_row = {'userId':user, 'pearson': pearson_for_ratings(my_ratings, ratings_df)}
        users_pearson.append(pearson_row)
    
    # Removemos el mismo usuario
    users_pearson = [up for up in users_pearson if up['userId']!=user]
    near_users = pd.DataFrame.from_records(users_pearson)

    ratings_with_pearson = pd.merge(user_ratings, near_users, left_on='userId', right_on='userId', how='right')
    ratings_with_pearson['weighted_rating'] = ratings_with_pearson['pearson'] * ratings_with_pearson['rating']
    
    # Para no perder el tmdbId en la agregacion
    out = ratings_with_pearson.groupby(['movieId','tmdbId'])
    out = out.agg({'weighted_rating':'mean'}).rename(columns={'weighted_rating':'mean_weighted_rating'}).reset_index()
    
    out.sort_values(by='mean_weighted_rating', ascending=False, inplace=True)
    to_remove = my_ratings['movieId'].to_list()
    out = out[~out['movieId'].isin(to_remove)][0:10]

    out = pd.merge(out, movies, left_on='tmdbId', right_on='id', how='left')
    
    return out[['title', 'tmdbId', 'mean_weighted_rating']]

In [68]:
rec = get_collab_recommendations(2)

In [69]:
rec

Unnamed: 0,title,tmdbId,mean_weighted_rating
0,Entity,131887,1.98188
1,Rudolph's Shiny New Year,30059,1.98188
2,The Ghost,21792,1.98188
3,Pearl Jam: Immagine in Cornice,18968,1.884463
4,The Smashing Pumpkins: Vieuphoria,57644,1.884463
5,A Haunting at Silver Falls,197919,1.783692
6,Oliver's Story,163631,1.783692
7,The War Tapes,23495,1.696016
8,Smashing Pumpkins: If All Goes Wrong,20996,1.696016
9,Salvador (Puig Antich),1896,1.662431


## Esto es un asco la idea es hacer cosine similarity con las peliculas de los que tengo en comun y coso

In [154]:
    my_ratings = user_ratings[user_ratings['userId']==2862]
    similar_users = user_ratings[user_ratings['movieId'].isin(my_ratings['movieId'].to_list())]

    # TODO Hay mejor manera de obtener los 100 users mayor interseccion de peliculas
    ratings_grouped = similar_users.groupby(['userId'])
    ratings_grouped = sorted(ratings_grouped,  key=lambda x: len(x[1]), reverse=True)
    ratings_grouped = ratings_grouped[0:100]
    # Pisamos nos interesan solos los ids
    users = [user_df[0] for user_df in ratings_grouped]
    ratings_grouped = ":("
    less_ratings = user_ratings[user_ratings['userId'].isin(users)]
    # Movies X Users matrix
    user_data = less_ratings.pivot(index = 'userId', columns = 'movieId', values = 'rating').fillna(0)
    
    aux = user_data.reset_index()
    index_of_user = aux[aux['userId']==2862].index[0]
    
    user_similarity = cosine_similarity(user_data)
    user_similarity[np.isnan(user_similarity)] = 0
    user_predicted_ratings = np.dot(user_similarity, user_data)
    predictions = user_predicted_ratings[index_of_user].tolist().sort(ascending = False)
    

AttributeError: 'list' object has no attribute 'sort_values'

In [158]:
user_predicted_ratings

array([[164.51648734, 137.00035256,  49.82170075, ...,   4.49095806,
          4.21840435,   2.80172795],
       [171.75781165, 145.04180305,  51.23844915, ...,   3.52220957,
          3.45056042,   2.2645039 ],
       [ 31.81036537,  26.89558894,   8.94130721, ...,   0.67692318,
          0.59815499,   0.34146169],
       ...,
       [203.60197958, 170.24239027,  63.51575533, ...,   4.57702244,
          4.61339061,   3.31938519],
       [176.81822193, 149.31732752,  52.9758087 , ...,   3.53337633,
          3.64764367,   2.39384144],
       [144.49794307, 120.01193603,  49.05633895, ...,   4.14405101,
          6.01028413,   3.5323429 ]])

In [152]:
type(predictions)

NameError: name 'predictions' is not defined

In [134]:
asd = user_data.reset_index()

In [145]:
user_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,imdbId,tmdbId
0,1,110,1.0,1425941529,112573,197
1,1,147,4.5,1425942435,112461,10474
2,1,858,5.0,1425941523,68646,238
3,1,1221,5.0,1425941546,71562,240
4,1,1246,5.0,1425941556,97165,207
