---
jupyter: python3
---

Importamos las librerias necesarias

In [22]:
import pandas as pd
import numpy as np
from functools import reduce
from scipy import sparse
from surprise import Dataset, Reader, accuracy, SVD, NMF
from surprise.model_selection import train_test_split
import surprise.prediction_algorithms.knns as knns
import surprise.prediction_algorithms.matrix_factorization
from sklearn.metrics import ndcg_score

## 1. Importamos los datos del dataset de manera manual para disponer de movies.csv

In [23]:
# We load all the variables we care about
ratings= pd.read_csv("ml-100k/ratings.csv")
movies = pd.read_csv('ml-100k/movies.csv')
tags = pd.read_csv('ml-100k/tags.csv')
links = pd.read_csv('ml-100k/links.csv')

Observamos la forma de los datos

In [24]:
ratings['rating']

0         4.0
1         4.0
2         4.0
3         5.0
4         5.0
         ... 
100831    4.0
100832    5.0
100833    5.0
100834    5.0
100835    3.0
Name: rating, Length: 100836, dtype: float64

Vemos que tenemos:
- userId: ID del usuario que dejó la calificación
- movieId: ID de la pelicula calificada
- rating: Calificación de 1 a 5 
- timestamp: Unidad de tiempo en la que se dejó la clasificación

Convertimos los datos para la libreria Surprise

In [25]:
reader = Reader(rating_scale=(1,5))
# Train/Test split using pandas alone
#train = ratings.sample(frac=0.75, random_state=1234)
#test = ratings.drop(train.index)
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

## 2. Separamos los conjuntos de train y test

In [26]:
trainset, testset = train_test_split(data, test_size=0.25, random_state=1234)

## 3. Realizamos las predicciones con diferentes algoritmos

Obtenemos predicciones con KNNBasic

In [27]:
knn = knns.KNNBasic(sim_options={"name": "pearson"})
knn.fit(trainset)
knn_pred = knn.test(testset)

Computing the pearson similarity matrix...
Done computing similarity matrix.


Con KNNBasic basado en productos

In [28]:
knn_prod = knns.KNNBasic(sim_options={"name": "pearson", 'user_based': False})
knn_prod.fit(trainset)
knn_prod_pred = knn_prod.test(testset)

Computing the pearson similarity matrix...
Done computing similarity matrix.


Con SVD

In [29]:
svd = SVD()
svd.fit(trainset)
svd_predictions = svd.test(testset)

Con NMF

In [30]:
nmf = NMF()
nmf.fit(trainset)
nmf_predictions = svd.test(testset)

In [31]:
predictions = {"KNNSBasic User Based": knn_pred, "KNNSBasic Product Based": knn_prod_pred, "SVD": svd_predictions, "NMF": nmf_predictions}

Funcion auxiliar para obtener los resultados de las predicciones

In [32]:
def get_titles(prediction, n, movies):
    return [movies.loc[movies['movieId'] == recomended_id, 'title'].item() for recomended_id in [item.iid for item in prediction[0:n]]]

## 5. Obtenemos las predicciones para cada algoritmo
KNNSBasic

In [33]:
get_titles(knn_pred, 5ls, movies)

SyntaxError: invalid decimal literal (281762232.py, line 1)

KNNSBasic products

In [None]:
get_titles(knn_prod_pred, 5, movies)

['Rogue One: A Star Wars Story (2016)',
 'Godfather: Part II, The (1974)',
 'Breakfast Club, The (1985)',
 'Apollo 13 (1995)',
 'Office Space (1999)',
 'Logan (2017)',
 'Tucker & Dale vs Evil (2010)',
 'Easy Rider (1969)',
 'Purge, The (2013)',
 'Guardians of the Galaxy (2014)']

SVD

In [None]:
get_titles(svd_predictions, 5, movies)

['Rogue One: A Star Wars Story (2016)',
 'Godfather: Part II, The (1974)',
 'Breakfast Club, The (1985)',
 'Apollo 13 (1995)',
 'Office Space (1999)',
 'Logan (2017)',
 'Tucker & Dale vs Evil (2010)',
 'Easy Rider (1969)',
 'Purge, The (2013)',
 'Guardians of the Galaxy (2014)',
 'Dreamers, The (2003)',
 'Congo (1995)',
 'Talented Mr. Ripley, The (1999)',
 'Scanner Darkly, A (2006)',
 'Get Shorty (1995)',
 'Twelve Monkeys (a.k.a. 12 Monkeys) (1995)',
 'Dogtown and Z-Boyz (2001)',
 'Fight Club (1999)',
 'Hard Candy (2005)',
 'Bruce Almighty (2003)',
 'Chamber, The (1996)',
 'American President, The (1995)',
 'Mission: Impossible (1996)',
 'Forrest Gump (1994)',
 'Taxi 4 (2007)',
 'Snowpiercer (2013)',
 'Truman Show, The (1998)',
 'Desperately Seeking Susan (1985)',
 'Twister (1996)',
 'Beauty and the Beast (1991)']

NMF

In [None]:
get_titles(nmf_predictions, 5, movies)

['Rogue One: A Star Wars Story (2016)',
 'Godfather: Part II, The (1974)',
 'Breakfast Club, The (1985)',
 'Apollo 13 (1995)',
 'Office Space (1999)',
 'Logan (2017)',
 'Tucker & Dale vs Evil (2010)',
 'Easy Rider (1969)',
 'Purge, The (2013)',
 'Guardians of the Galaxy (2014)']

## 6. Generamos la tabla de métricas
Función auxiliar para generar el diccionario de usuarios y predicciones

In [34]:
def groupby_id(d, p):
    if p.uid in d:
        d[p.uid].append(p)
        return d
    else:
        d[p.uid] = [p]
        return d

Generamos nuestra lista de recomendaciones y la ordenamos por positivas

In [35]:
def get_recomendation_list(predictions):
    recomendation_list = reduce(groupby_id, predictions, {})
    for recomendations in recomendation_list.values():
        recomendations.sort(key=lambda x: x.est, reverse=True)
    return recomendation_list

In [36]:
def recallAtK(rec_list, k = 10):
    out = {}
    for user, user_preds in rec_list.items():
        relevant_k = len(list(filter(lambda p: p.r_ui >= 4 and p.est >=4, user_preds[:k])))
        n_rel = len(list(filter(lambda p: p.r_ui >= 4, user_preds)))
        if len(user_preds) >= k:
            out[user] = relevant_k / n_rel if n_rel != 0 else 0
    return out

def precisionAtK(rec_list, k = 10):
    out = {}
    for user, user_preds in rec_list.items():
        relevant_k = len(list(filter(lambda p: p.r_ui >= 4 and p.est >=4, user_preds[:k])))
        n_rec_k = len(list(filter(lambda p: p.est >= 4, user_preds[:k])))
        if len(user_preds) >= k:
            out[user] = relevant_k / n_rec_k if n_rec_k != 0 else 0
    return out

In [37]:
#No entiendo NDCG esto está sacado de github issues para surprise
def get_ndcg(surprise_predictions, k_highest_scores=None):
    uids = [int(p.uid) for p in surprise_predictions ]
    iids = [int(p.iid) for p in surprise_predictions ]
    r_uis = [p.r_ui for p in surprise_predictions ]
    ests = [p.est for p in surprise_predictions ]
    
    assert(len(uids) == len(iids) == len(r_uis) == len(ests) )    
    
    sparse_preds = sparse.coo_matrix( (ests, (uids , iids )) )
    sparse_vals = sparse.coo_matrix( (r_uis, (uids , iids )) )
    
    dense_preds = sparse_preds.toarray()
    dense_vals = sparse_vals.toarray()
    
    return ndcg_score(y_true= dense_vals , y_score= dense_preds, k=k_highest_scores)

Calculamos accuracy, Recall y precision @ 10

In [None]:
metricas_evaluacion = {"Accuracy": [], "Recall@k": [], "Precision@k": [], "NDGC": []}
for pred in predictions.values():
    metricas_evaluacion["Accuracy"].append(accuracy.rmse(pred))
    recs = get_recomendation_list(pred)
    recalls = recallAtK(recs)
    avg_recall = sum(recalls.values()) / len(recalls)
    metricas_evaluacion["Recall@k"].append(avg_recall)
    precision = precisionAtK(recs)
    avg_precision = sum(precision.values()) / len(recalls)
    metricas_evaluacion["Precision@k"].append(avg_precision)
    ndgc = get_ndcg(pred)
    metricas_evaluacion["NDGC"].append(ndgc)
metricas = pd.DataFrame(data=metricas_evaluacion, index=predictions.keys())
metricas

RMSE: 0.9772
RMSE: 0.9678


## 7. Conclusion

Parece ser que en todas las métricas el que mejor funciona para este dataset es KNNSBasic basado en usuarios.
