---
jupyter: python3
---

Importamos las librerías necesarias

In [None]:
import pandas as pd
import numpy as np
from functools import reduce
from scipy import sparse
from surprise import Dataset, Reader, accuracy, SVD, NMF
from surprise.model_selection import train_test_split
import surprise.prediction_algorithms.knns as knns
import surprise.prediction_algorithms.matrix_factorization
from sklearn.metrics import ndcg_score

## 1. Importamos los datos del dataset de manera manual para disponer de movies.csv

In [None]:
# Read the four MovieLens CSV tables in one pass; later cells use `ratings` and `movies`.
ratings, movies, tags, links = (
    pd.read_csv(csv_path)
    for csv_path in (
        "ml-100k/ratings.csv",
        "ml-100k/movies.csv",
        "ml-100k/tags.csv",
        "ml-100k/links.csv",
    )
)

Observamos la forma de los datos

In [None]:
# Preview the first rows with every column rather than the `rating` column alone,
# so the output shows the fields described in the text that follows.
ratings.head()

Vemos que tenemos:
- userId: ID del usuario que dejó la calificación
- movieId: ID de la película calificada
- rating: Calificación de 1 a 5
- timestamp: Marca de tiempo en la que se dejó la calificación

Convertimos los datos para la librería Surprise

In [None]:
# Declare the rating range that Surprise should assume for this dataset.
# NOTE(review): the ratings/movies/tags/links CSV layout resembles MovieLens
# "latest-small", whose ratings run from 0.5 to 5.0 — confirm that (1, 5) is right.
reader = Reader(rating_scale=(1, 5))
# Only the (user, item, rating) columns are handed to Surprise, in that order.
data = Dataset.load_from_df(
    df=ratings[['userId', 'movieId', 'rating']],
    reader=reader,
)

## 2. Separamos los conjuntos de train y test

In [None]:
# Hold out 25% of the ratings for evaluation; the fixed seed pins the split.
trainset, testset = train_test_split(
    data,
    test_size=0.25,
    random_state=1234,
)

## 3. Realizamos las predicciones con diferentes algoritmos

Obtenemos predicciones con KNNBasic

In [None]:
# KNNBasic with Pearson similarity (reported as "User Based" in the results table).
knn = knns.KNNBasic(sim_options=dict(name="pearson"))
# fit() hands back the fitted algorithm, so training and testing are chained.
knn_pred = knn.fit(trainset).test(testset)

Con KNNBasic basado en productos

In [None]:
# Same KNNBasic/Pearson setup, but with similarities computed between items
# (user_based=False) instead of between users.
knn_prod = knns.KNNBasic(sim_options=dict(name="pearson", user_based=False))
# fit() hands back the fitted algorithm, so training and testing are chained.
knn_prod_pred = knn_prod.fit(trainset).test(testset)

Con SVD

In [None]:
# Seed the model so its predictions are the same on every
# "Restart & Run All", matching the seeded train/test split.
svd = SVD(random_state=1234)
svd.fit(trainset)
svd_predictions = svd.test(testset)

Con NMF

In [None]:
# Seed the model so the results are reproducible across runs.
nmf = NMF(random_state=1234)
nmf.fit(trainset)
# Predict with the NMF model itself; the previous code called svd.test() here,
# so the "NMF" results were just a copy of the SVD predictions.
nmf_predictions = nmf.test(testset)

In [None]:
# Map a display label to each algorithm's test-set predictions; the labels
# become the row index of the metrics table.
predictions = {
    "KNNSBasic User Based": knn_pred,
    "KNNSBasic Product Based": knn_prod_pred,
    "SVD": svd_predictions,
    "NMF": nmf_predictions,
}

Funcion auxiliar para obtener los resultados de las predicciones

In [None]:
def get_titles(prediction, n, movies):
    """Return the movie titles for the first ``n`` predictions.

    Args:
        prediction: Sequence of objects exposing an ``iid`` attribute.
        n: Number of leading predictions to translate into titles.
        movies: DataFrame with ``movieId`` and ``title`` columns.

    Returns:
        A list of titles, in the same order as ``prediction[0:n]``. The
        predictions are taken in the order given; they are not sorted by
        estimated rating here.

    Raises:
        ValueError: If an ``iid`` does not match exactly one row of
            ``movies`` (raised by ``Series.item()``).
    """
    titles = []
    for pred in prediction[0:n]:
        is_this_movie = movies['movieId'] == pred.iid
        titles.append(movies.loc[is_this_movie, 'title'].item())
    return titles

## 5. Obtenemos las predicciones para cada algoritmo
KNNSBasic

In [None]:
# Titles of the first 30 entries of knn_pred, in the order they are stored.
get_titles(prediction=knn_pred, n=30, movies=movies)

KNNSBasic products

In [None]:
# Titles of the first 10 entries of knn_prod_pred, in the order they are stored.
get_titles(prediction=knn_prod_pred, n=10, movies=movies)

SVD

In [None]:
# Titles of the first 30 entries of svd_predictions, in the order they are stored.
get_titles(prediction=svd_predictions, n=30, movies=movies)

NMF

In [None]:
# Titles of the first 10 entries of nmf_predictions, in the order they are stored.
get_titles(prediction=nmf_predictions, n=10, movies=movies)

Función auxiliar para generar el diccionario de usuarios y predicciones

In [None]:
def groupby_id(d, p):
    """Append prediction ``p`` to the list stored under ``p.uid`` in ``d``.

    Written as a fold step: it mutates ``d`` in place and returns that same
    dictionary, so it can be used as the function argument of ``reduce``.

    Args:
        d: Dictionary mapping a user id to a list of predictions.
        p: Object exposing a ``uid`` attribute.

    Returns:
        The same dictionary ``d``, updated.
    """
    # setdefault creates the user's list on first sight, then we append to it.
    d.setdefault(p.uid, []).append(p)
    return d

Generamos nuestra lista de recomendaciones por usuario y la ordenamos de mayor a menor valoración estimada

In [None]:
def get_recomendation_list(predictions):
    """Group predictions by user and rank each user's list by estimated rating.

    Args:
        predictions: Iterable of objects exposing ``uid`` and ``est``.

    Returns:
        A dictionary mapping each user id to that user's predictions, sorted
        by ``est`` from highest to lowest. Users appear in the order in which
        they are first seen in ``predictions``.
    """
    per_user = reduce(groupby_id, predictions, {})
    return {
        uid: sorted(user_preds, key=lambda pred: pred.est, reverse=True)
        for uid, user_preds in per_user.items()
    }

In [None]:
def recallAtK(rec_list, k = 10, threshold = 4):
    """Compute recall@k for every user that has at least ``k`` predictions.

    For one user, recall@k is the number of items in the first ``k``
    predictions that are both relevant (``r_ui >= threshold``) and
    recommended (``est >= threshold``), divided by the number of relevant
    items among all of that user's predictions.

    Args:
        rec_list: Dictionary mapping a user id to a list of predictions
            exposing ``r_ui`` and ``est``. The lists are used in the order
            given; this function does not sort them.
        k: Size of the leading slice that is evaluated.
        threshold: Minimum rating for an item to count as relevant or as
            recommended. Defaults to 4, the value that used to be hard-coded.

    Returns:
        A dictionary mapping user id to recall@k. Users with fewer than ``k``
        predictions are left out; users with no relevant item get 0.
    """
    out = {}
    for user, user_preds in rec_list.items():
        # Skip short lists before doing any counting.
        if len(user_preds) < k:
            continue
        n_rel = sum(1 for p in user_preds if p.r_ui >= threshold)
        relevant_k = sum(
            1 for p in user_preds[:k]
            if p.r_ui >= threshold and p.est >= threshold
        )
        out[user] = relevant_k / n_rel if n_rel != 0 else 0
    return out

def precisionAtK(rec_list, k = 10, threshold = 4):
    """Compute precision@k for every user that has at least ``k`` predictions.

    For one user, precision@k is the number of items in the first ``k``
    predictions that are both relevant (``r_ui >= threshold``) and
    recommended (``est >= threshold``), divided by the number of recommended
    items in those first ``k`` predictions.

    Args:
        rec_list: Dictionary mapping a user id to a list of predictions
            exposing ``r_ui`` and ``est``. The lists are used in the order
            given; this function does not sort them.
        k: Size of the leading slice that is evaluated.
        threshold: Minimum rating for an item to count as relevant or as
            recommended. Defaults to 4, the value that used to be hard-coded.

    Returns:
        A dictionary mapping user id to precision@k. Users with fewer than
        ``k`` predictions are left out; users with no recommended item in
        the first ``k`` get 0.
    """
    out = {}
    for user, user_preds in rec_list.items():
        # Skip short lists before doing any counting.
        if len(user_preds) < k:
            continue
        top_k = user_preds[:k]
        n_rec_k = sum(1 for p in top_k if p.est >= threshold)
        relevant_k = sum(
            1 for p in top_k
            if p.r_ui >= threshold and p.est >= threshold
        )
        out[user] = relevant_k / n_rec_k if n_rec_k != 0 else 0
    return out

In [None]:
# NDCG helper adapted from a snippet posted in the Surprise GitHub issues.
def get_ndcg(surprise_predictions, k_highest_scores=None):
    """Compute the mean NDCG over users for a list of predictions.

    A dense (users x items) matrix of true ratings and another of estimated
    ratings are built and passed to ``sklearn.metrics.ndcg_score``. Cells
    for (user, item) pairs that have no prediction are zero in both matrices.

    User and item ids are first remapped to contiguous row/column indices.
    Using the raw ids as indices (as before) sized the matrices by the
    largest id and added all-zero rows for ids with no predictions (row 0,
    for instance, when ids start at 1); those rows were averaged into the
    score as NDCG = 0. Remapping also lifts the need for integer-like ids.

    Args:
        surprise_predictions: Sequence of objects exposing ``uid``, ``iid``,
            ``r_ui`` and ``est``.
        k_highest_scores: Forwarded as ``k`` to ``ndcg_score``; ``None``
            means every item is considered.

    Returns:
        The value returned by ``ndcg_score`` for the two matrices.

    Raises:
        ValueError: If ``surprise_predictions`` is empty.
    """
    uids = [p.uid for p in surprise_predictions]
    iids = [p.iid for p in surprise_predictions]
    r_uis = [p.r_ui for p in surprise_predictions]
    ests = [p.est for p in surprise_predictions]

    if len(uids) == 0:
        raise ValueError("get_ndcg needs at least one prediction")

    # Contiguous indices: one row per distinct user, one column per distinct item.
    user_keys, rows = np.unique(uids, return_inverse=True)
    item_keys, cols = np.unique(iids, return_inverse=True)
    shape = (len(user_keys), len(item_keys))

    dense_preds = sparse.coo_matrix((ests, (rows, cols)), shape=shape).toarray()
    dense_vals = sparse.coo_matrix((r_uis, (rows, cols)), shape=shape).toarray()

    return ndcg_score(y_true=dense_vals, y_score=dense_preds, k=k_highest_scores)

Calculamos RMSE (columna «Accuracy»), Recall@10, Precision@10 y NDCG

In [None]:
# Collect one row of evaluation metrics per algorithm.
metricas_evaluacion = {"Accuracy": [], "Recall@k": [], "Precision@k": [], "NDGC": []}
for pred in predictions.values():
    # The "Accuracy" column stores the RMSE returned by accuracy.rmse,
    # so for this column lower values are better.
    metricas_evaluacion["Accuracy"].append(accuracy.rmse(pred))
    recs = get_recomendation_list(pred)
    recalls = recallAtK(recs)
    # Guard the averages: the per-user dicts are empty when no user has
    # enough predictions to be scored.
    avg_recall = sum(recalls.values()) / len(recalls) if recalls else 0
    metricas_evaluacion["Recall@k"].append(avg_recall)
    precision = precisionAtK(recs)
    # Average over the precision dict's own length (previously len(recalls)).
    avg_precision = sum(precision.values()) / len(precision) if precision else 0
    metricas_evaluacion["Precision@k"].append(avg_precision)
    ndgc = get_ndcg(pred)
    metricas_evaluacion["NDGC"].append(ndgc)
metricas = pd.DataFrame(data=metricas_evaluacion, index=predictions.keys())
metricas

## 7. Conclusion

Parece ser que en todas las métricas el que mejor funciona para este dataset es KNNSBasic basado en usuarios.
