# Tutorial BPR con Implicit I

MAN 3160 - Sistemas Recomendadores

En este tutorial vamos el modelo BPR proporcionado por la librería Implicit.


## Importar Librerías

In [6]:
# Instalamos librerías para descarcar y descomprimir archivos.

!pip install wget
!pip install zipfile36
!pip3 install implicit --upgrade


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m


In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import zipfile
import numpy as np
import implicit
import scipy.sparse as sparse

## Entrenamiento de agente BPR

In [7]:
!python -m wget http://files.grouplens.org/datasets/movielens/ml-100k.zip

/Users/ldavico/.pyenv/versions/3.11.2/bin/python: No module named wget


In [4]:
with zipfile.ZipFile("ml-100k.zip", 'r') as zip_ref:
    zip_ref.extractall(".")

In [5]:
dir_train = 'ml-100k'

# Generamos los títulos de las columnas del archivo items.

columns = ['movieid', 'title', 'release_date', 'video_release_date', \
           'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation', \
           'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', \
           'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', \
           'Thriller', 'War', 'Western']

In [6]:
# Primero creamos el dataframe con los datos
df_train = pd.read_csv(f'{dir_train}/u2.base',
                         sep='\t',
                         names=['userid', 'itemid', 'rating', 'timestamp'],
                         header=None)

In [7]:
# rating >= 3 , relevante (1) y rating menor a 3 es no relevante (0)
df_train.rating = [1 if x >=3 else 0 for x in df_train.rating ]

In [8]:
# Cargamos el dataset con los items
df_items = pd.read_csv(f'{dir_train}/u.item',
                        sep='|',
                        index_col=0,
                        names = columns,
                        header=None,
                        encoding='latin-1')

In [9]:
# Cargamos el dataset de testing
df_test = pd.read_csv(f'{dir_train}/u2.test',
                      sep='\t',
                      names=['userid', 'itemid', 'rating', 'timestamp'],
                      header=None)


# rating >= 3 es relevante (1) y rating menor a 3 es no relevante (0)
df_test.rating = [1 if x >= 3 else 0 for x in df_test.rating ]


user_items_test = {}

for row in df_test.itertuples():
    if row[3]:
        if row[1] not in user_items_test:
            user_items_test[row[1]] = []

        user_items_test[row[1]].append(row[2])

In [10]:
user_items = {}
itemset = set()

for row in df_train.itertuples():
    if row[3]:
        if row[1] not in user_items:
            user_items[row[1]] = []

        user_items[row[1]].append(row[2])
        itemset.add(row[2])

itemset = np.sort(list(itemset))

sparse_matrix = np.zeros((len(user_items), len(itemset)))

for i, items in enumerate(user_items.values()):
    sparse_matrix[i] = np.isin(itemset, items, assume_unique=True).astype(int)

user_item_matrix = sparse.csr_matrix(sparse_matrix)

user_ids = {key: i for i, key in enumerate(user_items.keys())}
items_ids = {key: i for i, key in enumerate(itemset)}

In [11]:
# Definimos y entrenamos el modelo con optimización BPR
model_bpr = implicit.bpr.BayesianPersonalizedRanking()
model_bpr.fit(user_item_matrix)

100%|██████████████████████████████████████████████| 100/100 [00:01<00:00, 51.16it/s, train_auc=89.18%, skipped=22.80%]


### Métricas

In [12]:
# Definicion de métricas (No editar)
# Obtenido de https://gist.github.com/bwhite/3726239

def precision_at_k(r, k):
    assert k >= 1
    r = np.asarray(r)[:k] != 0
    if r.size != k:
        raise ValueError('Relevance score length < k')
    return np.mean(r)

def average_precision(r):
    r = np.asarray(r) != 0
    out = [precision_at_k(r, k + 1) for k in range(r.size) if r[k]]
    if not out:
        return 0.
    return np.mean(out)

def mean_average_precision(rs):
    return np.mean([average_precision(r) for r in rs])

def dcg_at_k(r, k):
    r = np.asfarray(r)[:k]
    if r.size:
        return np.sum(np.subtract(np.power(2, r), 1) / np.log2(np.arange(2, r.size + 2)))
    return 0.


def ndcg_at_k(r, k):
    idcg = dcg_at_k(sorted(r, reverse=True), k)

    if not idcg:
        return 0.
    return dcg_at_k(r, k) / idcg

In [13]:
def evaluate_model(model, n):
    mean_map = 0.
    mean_ndcg = 0.
    for u in user_items_test.keys():
        rec = model.recommend(user_ids[u], user_item_matrix[user_ids[u]], n)[0]
        rec = [itemset[r] for r in rec]
        rel_vector = [np.isin(rec, user_items_test[u], assume_unique=True).astype(int)]
        mean_map += mean_average_precision(rel_vector)
        mean_ndcg += ndcg_at_k(rel_vector, n)

    mean_map /= len(user_items_test)
    mean_ndcg /= len(user_items_test)

    return mean_map, mean_ndcg

In [14]:
def show_recommendations(model, user, n):
    recommendations = model.recommend(userid=user_ids[user], user_items=user_item_matrix[user_ids[user]], N=n)
    return df_items.loc[recommendations[0]]['title']

## Evaluación del modelo

In [15]:
for n in [5, 10, 15, 20]:
    mmap, ndcg = evaluate_model(model_bpr, n)
    print(f'map@{n}: {mmap} \nndcg@{n}: {ndcg}')

map@5: 0.5121901709401702 
ndcg@5: 0.7676923076923077
map@10: 0.47731450302343126 
ndcg@10: 0.8630769230769231
map@15: 0.44302460978830344 
ndcg@15: 0.9123076923076923
map@20: 0.41482785948422757 
ndcg@20: 0.9261538461538461


# Explorar distintas señales de feedback

In [16]:
df_train = pd.read_csv(f'{dir_train}/u2.base',
                         sep='\t',
                         names=['userid', 'itemid', 'rating', 'timestamp'],
                         header=None)

df_test = pd.read_csv(f'{dir_train}/u2.test',
                      sep='\t',
                      names=['userid', 'itemid', 'rating', 'timestamp'],
                      header=None)

In [17]:
df_train_original = df_train.copy()
df_test_original = df_test.copy()
for r in range(1,6):
    df_train.rating = [1 if x >= r else 0 for x in df_train_original.rating ]

    df_test.rating = [1 if x >= r else 0 for x in df_test_original.rating ]


    user_items_test = {}


    user_items = {}
    itemset = set()

    for row in df_train.itertuples():
        if row[3]:
            if row[1] not in user_items:
                user_items[row[1]] = []

            user_items[row[1]].append(row[2])
            itemset.add(row[2])

    itemset = np.sort(list(itemset))
    
    for row in df_test.itertuples():
        if row[3] and row[1] in user_items: # Solo tomamos filas con rating (row[3]) = 1 y usuarios que tengan interacciones en el set de entrenamiento
            if row[1] not in user_items_test:
                user_items_test[row[1]] = []

            user_items_test[row[1]].append(row[2])

    sparse_matrix = np.zeros((len(user_items), len(itemset)))

    for i, items in enumerate(user_items.values()):
        sparse_matrix[i] = np.isin(itemset, items, assume_unique=True).astype(int)

    user_item_matrix = sparse.csr_matrix(sparse_matrix)

    user_ids = {key: i for i, key in enumerate(user_items.keys())}
    items_ids = {key: i for i, key in enumerate(itemset)}
    
    model_bpr = implicit.bpr.BayesianPersonalizedRanking()
    model_bpr.fit(user_item_matrix)
    print(f"Evaluando para ítems con calificación superior o igual a {r}:")
    mmap, ndcg = evaluate_model(model_bpr, 10)
    print(f'map@{10}: {mmap} \nndcg@{10}: {ndcg}')

100%|█████████████████████████████████████████████| 100/100 [00:00<00:00, 155.76it/s, train_auc=89.49%, skipped=24.76%]


Evaluando para ítems con calificación superior o igual a 1:
map@10: 0.4885171589196057 
ndcg@10: 0.889739663093415


100%|█████████████████████████████████████████████| 100/100 [00:00<00:00, 158.32it/s, train_auc=89.08%, skipped=24.26%]


Evaluando para ítems con calificación superior o igual a 2:
map@10: 0.48799528907507084 
ndcg@10: 0.8742331288343558


100%|█████████████████████████████████████████████| 100/100 [00:00<00:00, 176.28it/s, train_auc=89.07%, skipped=22.64%]


Evaluando para ítems con calificación superior o igual a 3:
map@10: 0.46516735057270747 
ndcg@10: 0.8569230769230769


100%|█████████████████████████████████████████████| 100/100 [00:00<00:00, 224.51it/s, train_auc=88.54%, skipped=18.81%]


Evaluando para ítems con calificación superior o igual a 4:
map@10: 0.4019901499822381 
ndcg@10: 0.7841614906832298


100%|█████████████████████████████████████████████| 100/100 [00:00<00:00, 354.05it/s, train_auc=84.78%, skipped=11.21%]


Evaluando para ítems con calificación superior o igual a 5:
map@10: 0.26456913639298324 
ndcg@10: 0.5798611111111112
