# **Filtrado basado en contenido**

Álvaro Fraile, Jaime Álvarez, Alejandro Mendoza

## Imports

In [1]:
import pandas as pd
from tqdm import tqdm

## Dataset

In [2]:
usuarios_df = pd.read_csv('./data/usuarios.csv')
negocios_df = pd.read_csv('./data/negocios.csv')
test_reviews_df = pd.read_csv('./data/test_reviews.csv')
train_reviews_df = pd.read_csv('./data/train_reviews.csv')

  usuarios_df = pd.read_csv('./data/usuarios.csv')


In [9]:
train_reviews_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 967784 entries, 0 to 967783
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   review_id    967784 non-null  object 
 1   user_id      967784 non-null  object 
 2   business_id  967784 non-null  object 
 3   stars        967784 non-null  float64
 4   useful       967784 non-null  int64  
 5   funny        967784 non-null  int64  
 6   cool         967784 non-null  int64  
 7   text         967784 non-null  object 
 8   date         967784 non-null  object 
dtypes: float64(1), int64(3), object(5)
memory usage: 896.9 MB


In [8]:
train_reviews_df.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,ZZO43qKB-s65zplC8RfJqw,-1BSu2dt_rOAqllw9ZDXtA,smkZq4G1AOm4V6p3id5sww,5.0,0,0,0,Fantastic fresh food. The greek salad is amazi...,2016-09-30 15:49:32
1,vojXOF_VOgvuKD95gCO8_Q,xpe178ng_gj5X6HgqtOing,96_c_7twb7hYRZ9HHrq01g,1.0,2,0,1,Been a patient at Largo Med/Diagnostic Clinic ...,2020-12-09 14:39:51
2,KwxdbiseRlIRNzpgvyjY0Q,axbaerf2Fk92OB4b9_peVA,e0AYjKfSF0DL-5C1CpOq6Q,4.0,0,0,0,The location is convenient to my campus so I d...,2013-09-04 16:19:51
3,3mwoBcTy-2gMh0L91uaIeA,_GOiybb0rImYKJfwyxEaGg,vF-uptiQ34pVLHJKzPHUlA,5.0,0,0,0,I agree with all the other compliments posted ...,2019-03-02 12:24:14
4,XfWf7XsBWs3kYyYq7Ns1ZQ,ojWKg3B5pH3ncAsxun3kUw,X28XK71RuEXPapeyUOwNzg,5.0,10,4,7,"Wanting to help out the local economy, I thoug...",2020-04-23 18:26:29


In [None]:
negocios_df.head()

In [7]:
train_reviews_df.shape

(967784, 9)

## Modelos

## Aproximación 1 - Media del negocio

In [3]:
# Calculate the average rating for each business
avg_ratings = train_reviews_df.groupby('business_id')['stars'].mean().reset_index()
avg_ratings.columns = ['business_id', 'avg_stars']
print("Length of avg_ratings:", len(avg_ratings))
print("Length of negocios_df:", len(negocios_df))
avg_ratings.head()

Length of avg_ratings: 30064
Length of negocios_df: 30069


Unnamed: 0,business_id,avg_stars
0,--7PUidqRWpRSpXebiyxTg,1.9
1,--ARBQr1WMsTWiwOKOj-FQ,4.666667
2,--LC8cIrALInl2vyo701tg,4.6
3,--N9yp3ZWqQIm7DqKRvorg,2.5
4,--S43ruInmIsGrnnkmavRw,3.380952


In [4]:
# Create a DataFrame with the required columns
output_df = test_reviews_df[['review_id']].copy()

global_avg = train_reviews_df['stars'].mean()

In [None]:
# Wrap the DataFrame in tqdm for progress tracking
for index, review in tqdm(test_reviews_df.iterrows(), total=len(test_reviews_df)):
    output_df.loc[index, 'stars'] = (
        avg_ratings.loc[avg_ratings.business_id == review.business_id, 'avg_stars'].values[0]
        if review.business_id in avg_ratings.business_id.values else global_avg
    )

# Save the DataFrame to a CSV file
output_df.to_csv('./results/submission_business_avg.csv', index=False)


100%|██████████| 414765/414765 [18:09<00:00, 380.54it/s]


MAE Publico obtenido: 
    · Usando 3 como default: 1.0433
    · Usando media global como default: 1.0433

## Aproximación 1.1 - Con redondeo

In [6]:
output_df['stars'] = output_df['stars'].round()
output_df.to_csv('./results/submission_business_avg_rounded.csv', index=False)

MAE Publico obtenido con redondeo: 1.0286

## Aproximación 2 - Embeddings con TFIDF

In [6]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import numpy as np

# Paso 1: Agrupar reviews por negocio
business_reviews = train_reviews_df.groupby('business_id')['text'].apply(lambda x: ' '.join(x)).reset_index()

# Paso 2: Vectorizar con TF-IDF
print("Vectorizando con TF-IDF...")
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(business_reviews['text'])

# Paso 3: Calcular similitud entre negocios
print("Calculando similitud...")
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Paso 4: Índice para acceder por business_id
business_indices = pd.Series(business_reviews.index, index=business_reviews['business_id'])

# Paso 5: Crear un diccionario de ratings por usuario
print("Creando diccionario de ratings por usuario...")  
user_ratings = train_reviews_df.groupby('user_id').apply(
    lambda x: x[['business_id', 'stars']].set_index('business_id').to_dict()['stars']
).to_dict()

# Paso 6: Función para predecir rating
def predict_rating(user_id, target_business_id):
    if user_id not in user_ratings or target_business_id not in business_indices:
        return global_avg

    rated_items = user_ratings[user_id]
    similarities = []
    ratings = []

    target_idx = business_indices[target_business_id]

    for rated_business_id, rating in rated_items.items():
        if rated_business_id in business_indices:
            rated_idx = business_indices[rated_business_id]
            sim = cosine_sim[target_idx, rated_idx]
            if sim > 0:  # Considerar solo similares positivos
                similarities.append(sim)
                ratings.append(rating)

    if not similarities:
        return global_avg # No hay similitud con los ítems que ha valorado

    # Promedio ponderado
    weighted_avg = np.dot(similarities, ratings) / np.sum(similarities)
    return weighted_avg

# Wrap the DataFrame in tqdm for progress tracking
print("Prediciendo ratings...")
for index, review in tqdm(test_reviews_df.iterrows(), total=len(test_reviews_df)):
    output_df.loc[index, 'stars'] = (
        predict_rating(review.user_id, review.business_id)
    )


output_df['stars'] = output_df['stars'].round()
output_df.to_csv('./results/submission_tfidf_rounded.csv', index=False)

Vectorizando con TF-IDF...
Calculando similitud...
Creando diccionario de ratings por usuario...


  user_ratings = train_reviews_df.groupby('user_id').apply(


Prediciendo ratings...


100%|██████████| 414765/414765 [00:43<00:00, 9531.04it/s]


MAE publico obtenido con TFIDF: 1.1597

In [5]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
import torch
from tqdm import tqdm

# Paso 1: Agrupar reviews por negocio
business_reviews = train_reviews_df.groupby('business_id')['text'].apply(lambda x: ' '.join(x)).reset_index()

# Paso 2: Vectorizar con SentenceTransformer en GPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Usando dispositivo: {device}")

print("Cargando modelo de sentence-transformers...")
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

print("Vectorizando con sentence-transformers...")
embeddings = model.encode(business_reviews['text'].tolist(), convert_to_tensor=True, device=device)

# Paso 3: Calcular similitud entre negocios
print("Calculando similitud...")
cosine_sim = util.pytorch_cos_sim(embeddings, embeddings).cpu().numpy()  # Para usarlo como matriz normal

# Paso 4: Índice para acceder por business_id
business_indices = pd.Series(business_reviews.index, index=business_reviews['business_id'])

# Paso 5: Crear un diccionario de ratings por usuario
print("Creando diccionario de ratings por usuario...")  
user_ratings = train_reviews_df.groupby('user_id').apply(
    lambda x: x[['business_id', 'stars']].set_index('business_id').to_dict()['stars']
).to_dict()

# Paso 6: Función para predecir rating
def predict_rating(user_id, target_business_id):
    if user_id not in user_ratings or target_business_id not in business_indices:
        return global_avg 
    rated_items = user_ratings[user_id]
    similarities = []
    ratings = []

    target_idx = business_indices[target_business_id]

    for rated_business_id, rating in rated_items.items():
        if rated_business_id in business_indices:
            rated_idx = business_indices[rated_business_id]
            sim = cosine_sim[target_idx, rated_idx]
            if sim > 0:  # Considerar solo similares positivos
                similarities.append(sim)
                ratings.append(rating)

    if not similarities:
        return global_avg  # No hay similitud con los ítems que ha valorado

    # Promedio ponderado
    weighted_avg = np.dot(similarities, ratings) / np.sum(similarities)
    return weighted_avg

# Paso 7: Predecir ratings
print("Prediciendo ratings...")
for index, review in tqdm(test_reviews_df.iterrows(), total=len(test_reviews_df)):
    output_df.loc[index, 'stars'] = (
        predict_rating(review.user_id, review.business_id)
    )

# Redondear resultados
output_df['stars'] = output_df['stars'].round()
output_df.to_csv('./results/submission_sentence_transformers_rounded.csv', index=False)


  from .autonotebook import tqdm as notebook_tqdm


Usando dispositivo: cuda
Cargando modelo de sentence-transformers...
Vectorizando con sentence-transformers...
Calculando similitud...
Creando diccionario de ratings por usuario...


  user_ratings = train_reviews_df.groupby('user_id').apply(


Prediciendo ratings...


100%|██████████| 414765/414765 [00:47<00:00, 8738.12it/s]


MAE publico obtenido: 1.1594