# **Filtrado basado en contenido**

Álvaro Fraile, Jaime Álvarez, Alejandro Mendoza

https://www.kaggle.com/competitions/recsys-filtrado-basado-en-contenido-2425

## Imports

In [1]:
import time
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

  from .autonotebook import tqdm as notebook_tqdm


## **Dataset**

### Negocios

In [3]:
start = time.time()

negocios_df = pd.read_csv('data/recsys-filtrado-basado-en-contenido-24-25/negocios.csv')
initial_memory = negocios_df.memory_usage(deep=True).sum() / 1024**2
print(f'Negocios initial memory usage: {initial_memory:.2f} MB')

negocios_df.drop(columns=['address', 'postal_code', 'is_open', 'hours'], inplace=True)
for col in negocios_df.select_dtypes(include=['object']):
    negocios_df[col] = negocios_df[col].astype("category")
negocios_df['latitude'] = negocios_df['latitude'].astype('float16')
negocios_df['longitude'] = negocios_df['longitude'].astype('float16')
negocios_df['stars'] = negocios_df['stars'].astype('float16')
negocios_df['review_count'] = negocios_df['review_count'].astype('int16')

final_memory = negocios_df.memory_usage(deep=True).sum() / 1024**2
print(f'Negocios final memory usage: {final_memory:.2f} MB')

minutos, segundos = divmod(time.time() - start, 60)
print(f"***** Preprocesamiento negocios: {int(minutos)} minutos y {int(segundos)} segundos *****")

Negocios initial memory usage: 33.01 MB
Negocios final memory usage: 19.96 MB
***** Preprocesamiento negocios: 0 minutos y 0 segundos *****


### Usuarios

In [4]:
start = time.time()

usuarios_df = pd.read_csv('data/recsys-filtrado-basado-en-contenido-24-25/usuarios.csv')
initial_memory = usuarios_df.memory_usage(deep=True).sum() / 1024**2
print(f'Negocios initial memory usage: {initial_memory:.2f} MB')

usuarios_df.drop(columns=['elite', 'yelping_since'], inplace=True)
usuarios_df['user_id'] = usuarios_df['user_id'].astype('string')
usuarios_df['name'] = usuarios_df['name'].astype('category')
usuarios_df['friends'] = usuarios_df['friends'].astype('category')
usuarios_df['useful'] = usuarios_df['useful'].astype('int32')
usuarios_df['funny'] = usuarios_df['funny'].astype('int32')
usuarios_df['cool'] = usuarios_df['cool'].astype('int32')
usuarios_df['average_stars'] = usuarios_df['average_stars'].astype('float16')
for col in usuarios_df.select_dtypes(include=['int64']):
    usuarios_df[col] = usuarios_df[col].astype('uint16')

final_memory = usuarios_df.memory_usage(deep=True).sum() / 1024**2
print(f'Negocios final memory usage: {final_memory:.2f} MB')

minutos, segundos = divmod(time.time() - start, 60)
print(f"***** Preprocesamiento usuarios: {int(minutos)} minutos y {int(segundos)} segundos *****")

  usuarios_df = pd.read_csv('data/recsys-filtrado-basado-en-contenido-24-25/usuarios.csv')


Negocios initial memory usage: 1285.09 MB
Negocios final memory usage: 1123.49 MB
***** Preprocesamiento usuarios: 0 minutos y 8 segundos *****


### Train_reviews

In [5]:
start = time.time()

train_reviews_df = pd.read_csv('data/recsys-filtrado-basado-en-contenido-24-25/train_reviews.csv')
initial_memory = train_reviews_df.memory_usage(deep=True).sum() / 1024**2
print(f'Train reviews initial memory usage: {initial_memory:.2f} MB')

train_reviews_df.drop(columns=['date'], inplace=True)
train_reviews_df['review_id'] = train_reviews_df['review_id'].astype('string')
train_reviews_df['user_id'] = train_reviews_df['user_id'].astype('category')
train_reviews_df['business_id'] = train_reviews_df['business_id'].astype('category')
train_reviews_df['text'] = train_reviews_df['text'].astype('string')

final_memory = train_reviews_df.memory_usage(deep=True).sum() / 1024**2
print(f'Train reviews final memory usage: {final_memory:.2f} MB')

minutos, segundos = divmod(time.time() - start, 60)
print(f"***** Preprocesamiento train reviews: {int(minutos)} minutos y {int(segundos)} segundos *****")

Train reviews initial memory usage: 896.88 MB
Train reviews final memory usage: 746.68 MB
***** Preprocesamiento train reviews: 0 minutos y 6 segundos *****


### Test_reviews

In [6]:
start = time.time()

test_reviews_df = pd.read_csv('data/recsys-filtrado-basado-en-contenido-24-25/test_reviews.csv')
initial_memory = test_reviews_df.memory_usage(deep=True).sum() / 1024**2
print(f'Train reviews initial memory usage: {initial_memory:.2f} MB')

test_reviews_df.drop(columns=['date'], inplace=True)
test_reviews_df['review_id'] = test_reviews_df['review_id'].astype('string')
test_reviews_df['user_id'] = test_reviews_df['user_id'].astype('category')
test_reviews_df['business_id'] = test_reviews_df['business_id'].astype('category')
test_reviews_df['text'] = test_reviews_df['text'].astype('string')

final_memory = test_reviews_df.memory_usage(deep=True).sum() / 1024**2
print(f'Train reviews final memory usage: {final_memory:.2f} MB')

minutos, segundos = divmod(time.time() - start, 60)
print(f"***** Preprocesamiento test reviews: {int(minutos)} minutos y {int(segundos)} segundos *****")

Train reviews initial memory usage: 381.26 MB
Train reviews final memory usage: 323.57 MB
***** Preprocesamiento test reviews: 0 minutos y 2 segundos *****


## **Submission DataFrame skeleton**

In [7]:
output_df = test_reviews_df[['review_id']].copy() # dataframe with review_id column
global_avg = train_reviews_df['stars'].mean() # global average rating value

## Aproximación 1 - Media del negocio

In [8]:
# Calculate the average rating for each business
avg_ratings = train_reviews_df.groupby('business_id', observed=True)['stars'].mean().reset_index()
avg_ratings.columns = ['business_id', 'avg_stars']
print("Length of avg_ratings:", len(avg_ratings))
print("Length of negocios_df:", len(negocios_df))
avg_ratings.head()

Length of avg_ratings: 30064
Length of negocios_df: 30069


Unnamed: 0,business_id,avg_stars
0,--7PUidqRWpRSpXebiyxTg,1.9
1,--ARBQr1WMsTWiwOKOj-FQ,4.666667
2,--LC8cIrALInl2vyo701tg,4.6
3,--N9yp3ZWqQIm7DqKRvorg,2.5
4,--S43ruInmIsGrnnkmavRw,3.380952


In [9]:
output_df_1 = output_df.copy()
for index, review in tqdm(test_reviews_df.iterrows(), total=len(test_reviews_df)):
    output_df_1.loc[index, 'stars'] = (
        avg_ratings.loc[avg_ratings['business_id'] == review['business_id'], 'avg_stars'].values[0]
        if review['business_id'] in avg_ratings['business_id'].values else global_avg
    )

output_df_1.to_csv('results_tournament_2/submission_business_avg.csv', index=False)

100%|██████████| 414765/414765 [01:19<00:00, 5246.84it/s]


MAE Público obtenido: 
* Usando 3 como default: **1.0433**
* Usando media global como default: **1.0433**

## Aproximación 1.1 - Con redondeo

In [10]:
output_df_2 = output_df_1.copy() 
output_df_2['stars'] = output_df_2['stars'].round()
output_df_2.to_csv('results_tournament_2/submission_business_avg_rounded.csv', index=False)

MAE Público obtenido con redondeo: **1.0286**

## Aproximación 2 - Embeddings con TFIDF

In [None]:
total_time = time.time()

# Paso 1: Agrupar reviews por negocio
business_reviews = train_reviews_df.groupby('business_id', observed=True)['text'].apply(lambda x: ' '.join(x)).reset_index()

# Paso 2: Vectorizar con TF-IDF
print("Vectorizando con TF-IDF...")
start = time.time()
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(business_reviews['text'])
minutos, segundos = divmod(time.time() - start, 60)
print(f"***** TF-IDF vectorization: {int(minutos)} minutes {int(segundos)} seconds *****")

# Paso 3: Calcular similitud entre negocios
print("Calculando similitud...")
start = time.time()
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
minutos, segundos = divmod(time.time() - start, 60)
print(f"***** Cosine similarity: {int(minutos)} minutes {int(segundos)} seconds *****")

# Paso 4: Índice para acceder por business_id
business_indices = pd.Series(business_reviews.index, index=business_reviews['business_id'])

# Paso 5: Crear un diccionario de ratings por usuario
print("Creando diccionario de ratings por usuario...")
start = time.time()
user_ratings = train_reviews_df.groupby('user_id', observed=True)
user_ratings = user_ratings.apply(lambda x: x[['business_id', 'stars']].set_index('business_id').to_dict()['stars'], include_groups=False).to_dict()
minutos, segundos = divmod(time.time() - start, 60)
print(f"***** User ratings dictionary: {int(minutos)} minutes {int(segundos)} seconds *****")

# Paso 6: Función para predecir rating
def predict_rating(user_id: str, target_business_id: str) -> float:
    if user_id not in user_ratings or target_business_id not in business_indices:
        return global_avg

    rated_items = user_ratings[user_id]
    similarities = []
    ratings = []

    target_idx = business_indices[target_business_id]

    for rated_business_id, rating in rated_items.items():
        if rated_business_id in business_indices:
            rated_idx = business_indices[rated_business_id]
            sim = cosine_sim[target_idx, rated_idx]
            if sim > 0:  # Considerar solo similares positivos
                similarities.append(sim)
                ratings.append(rating)

    if not similarities:
        return global_avg # No hay similitud con los ítems que ha valorado

    # Promedio ponderado
    weighted_avg = np.dot(similarities, ratings) / np.sum(similarities)
    return weighted_avg

# Paso 7: Predecir ratings
print("Prediciendo ratings...")
output_df_3 = output_df.copy()
for index, review in tqdm(test_reviews_df.iterrows(), total=len(test_reviews_df)):
    output_df_3.loc[index, 'stars'] = predict_rating(review['user_id'], review['business_id'])

total_minutes, total_seconds = divmod(time.time() - total_time, 60)
print(f"***** Total time: {int(total_minutes)} minutes {int(total_seconds)} seconds *****")

output_df_3['stars'] = output_df_3['stars'].round()
output_df_3.to_csv('results_tournament_2/submission_tfidf_rounded.csv', index=False)

Vectorizando con TF-IDF...
***** TF-IDF vectorization: 0 minutes 34 seconds *****
Calculando similitud...
***** Cosine similarity: 2 minutes 24 seconds *****
Creando diccionario de ratings por usuario...
***** User ratings dictionary: 4 minutes 5 seconds *****
Prediciendo ratings...


100%|██████████| 414765/414765 [00:54<00:00, 7542.67it/s]


***** Total time: 8 minutes 0 seconds *****


MAE público obtenido con TFIDF: **1.1597**

## Aproximación 3 - Embeddings con Sentence Transformers

In [43]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
import torch
from tqdm import tqdm

Usando dispositivo: cuda
Cargando modelo de sentence-transformers...


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [None]:
total_time = time.time()


# Paso 1: Agrupar reviews por negocio
business_reviews = train_reviews_df.groupby('business_id', observed=True)['text'].apply(lambda x: ' '.join(x)).reset_index()

# Paso 2: Vectorizar con SentenceTransformer en GPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Usando dispositivo: {device}")

print("Cargando modelo de sentence-transformers...")
start = time.time()
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)
minutos, segundos = divmod(time.time() - start, 60)
print(f"***** Carga de modelo: {int(minutos)} minutos {int(segundos)} segundos *****")

print("Vectorizando con sentence-transformers...")
start = time.time()
embeddings = model.encode(business_reviews['text'].tolist(), convert_to_tensor=True, device=device)
minutos, segundos = divmod(time.time() - start, 60)
print(f"***** Vectorización: {int(minutos)} minutos {int(segundos)} segundos *****")

# Paso 3: Calcular similitud entre negocios
print("Calculando similitud...")
start = time.time()
cosine_sim = util.pytorch_cos_sim(embeddings, embeddings).cpu().numpy()  # Para usarlo como matriz normal
minutos, segundos = divmod(time.time() - start, 60)
print(f"***** Cosine similarity: {int(minutos)} minutos {int(segundos)} segundos *****")

# Paso 4: Índice para acceder por business_id
business_indices = pd.Series(business_reviews.index, index=business_reviews['business_id'])

# Paso 5: Crear un diccionario de ratings por usuario
print("Creando diccionario de ratings por usuario...")
start = time.time()
user_ratings = train_reviews_df.groupby('user_id', observed=True)
user_ratings = user_ratings.apply(lambda x: x[['business_id', 'stars']].set_index('business_id').to_dict()['stars'], include_groups=False).to_dict()
minutos, segundos = divmod(time.time() - start, 60)
print(f"***** User ratings dictionary: {int(minutos)} minutos {int(segundos)} segundos *****")

# Paso 6: Función para predecir rating
def predict_rating(user_id: str, target_business_id: str) -> float:
    if user_id not in user_ratings or target_business_id not in business_indices:
        return global_avg 
    rated_items = user_ratings[user_id]
    similarities = []
    ratings = []

    target_idx = business_indices[target_business_id]

    for rated_business_id, rating in rated_items.items():
        if rated_business_id in business_indices:
            rated_idx = business_indices[rated_business_id]
            sim = cosine_sim[target_idx, rated_idx]
            if sim > 0:  # Considerar solo similares positivos
                similarities.append(sim)
                ratings.append(rating)

    if not similarities:
        return global_avg  # No hay similitud con los ítems que ha valorado

    # Promedio ponderado
    weighted_avg = np.dot(similarities, ratings) / np.sum(similarities)
    return weighted_avg

# Paso 7: Predecir ratings
print("Prediciendo ratings...")
output_df_4 = output_df.copy()
for index, review in tqdm(test_reviews_df.iterrows(), total=len(test_reviews_df)):
    output_df_4.loc[index, 'stars'] = predict_rating(review['user_id'], review['business_id'])

total_minutes, total_seconds = divmod(time.time() - total_time, 60)
print(f"***** Total time: {int(total_minutes)} minutes {int(total_seconds)} seconds *****")

output_df_4['stars'] = output_df_4['stars'].round()
output_df_4.to_csv('results_tournament_2/submission_tfidf_rounded.csv', index=False)

Usando dispositivo: cuda
Cargando modelo de sentence-transformers...


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


***** Carga de modelo: 0 minutos 7 segundos *****
Vectorizando con sentence-transformers...
***** Vectorización: 1 minutos 23 segundos *****
Calculando similitud...
***** Cosine similarity: 0 minutos 1 segundos *****
Creando diccionario de ratings por usuario...
***** User ratings dictionary: 4 minutos 14 segundos *****
Prediciendo ratings...


100%|██████████| 414765/414765 [00:57<00:00, 7207.04it/s]


***** Total time: 6 minutes 45 seconds *****


MAE publico obtenido: 1.1594

In [None]:
# Paso 1: Vectorizar cada review individual
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Usando dispositivo: {device}")

print("Cargando modelo de sentence-transformers...")
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

print("Vectorizando cada review individual...")
review_embeddings = model.encode(
    train_reviews_df['text'].tolist(),
    convert_to_tensor=True,
    device=device,
    show_progress_bar=True,
    batch_size=512
)



Usando dispositivo: cuda
Cargando modelo de sentence-transformers...
Vectorizando cada review individual...


Batches: 100%|██████████| 1891/1891 [07:55<00:00,  3.97it/s]


TypeError: can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.

In [None]:
# Paso 2: Asociar embeddings a cada review
train_reviews_df['embedding'] = list(review_embeddings.cpu())

# Paso 3: Agrupar por negocio y sacar el embedding medio
print("Calculando embedding medio por negocio...")
business_embeddings = {}
for business_id, group in tqdm(train_reviews_df.groupby('business_id')):
    embs = torch.stack(group['embedding'].tolist())
    business_embeddings[business_id] = embs.mean(dim=0)

# Paso 4: Crear lista de embeddings en orden
print("Creando matriz de embeddings...")
business_ids = list(business_embeddings.keys())
embedding_matrix = torch.stack([business_embeddings[b_id] for b_id in business_ids])

# Paso 5: Calcular similitud coseno entre negocios
print("Calculando similitud coseno...")
cosine_sim = util.pytorch_cos_sim(embedding_matrix, embedding_matrix).cpu().numpy()

# Paso 6: Índice para acceder por business_id
business_indices = pd.Series(range(len(business_ids)), index=business_ids)

# Paso 7: Crear diccionario de ratings por usuario
print("Creando diccionario de ratings por usuario...")
user_ratings = train_reviews_df.groupby('user_id').apply(
    lambda x: x[['business_id', 'stars']].set_index('business_id').to_dict()['stars']
).to_dict()

# Paso 8: Función para predecir rating
def predict_rating(user_id, target_business_id):
    if user_id not in user_ratings or target_business_id not in business_indices:
        return global_avg
    
    rated_items = user_ratings[user_id]
    similarities = []
    ratings = []

    target_idx = business_indices[target_business_id]

    for rated_business_id, rating in rated_items.items():
        if rated_business_id in business_indices:
            rated_idx = business_indices[rated_business_id]
            sim = cosine_sim[target_idx, rated_idx]
            if sim > 0.8:
                similarities.append(sim)
                ratings.append(rating)

    if not similarities:
        return global_avg

    weighted_avg = np.dot(similarities, ratings) / np.sum(similarities)
    return weighted_avg

# Paso 9: Predecir ratings
print("Prediciendo ratings...")
for index, review in tqdm(test_reviews_df.iterrows(), total=len(test_reviews_df)):
    output_df.loc[index, 'stars'] = predict_rating(review.user_id, review.business_id)

# Paso 10: Limpiar NaNs y redondear
output_df['stars'] = output_df['stars'].fillna(global_avg).round()
output_df.to_csv('./results/submission_sentence_transformers_rounded.csv', index=False)

Calculando embedding medio por negocio...


100%|██████████| 30064/30064 [00:02<00:00, 11034.70it/s]


Creando matriz de embeddings...
Calculando similitud coseno...
Creando diccionario de ratings por usuario...


  user_ratings = train_reviews_df.groupby('user_id').apply(


Prediciendo ratings...


100%|██████████| 414765/414765 [00:47<00:00, 8710.73it/s]


In [14]:
print(cosine_sim)

[[0.9999999  0.4427101  0.29980582 ... 0.6780672  0.6563667  0.59870124]
 [0.4427101  1.0000001  0.44541562 ... 0.24072593 0.3650766  0.31967145]
 [0.29980582 0.44541562 0.9999999  ... 0.1681978  0.25902703 0.21196766]
 ...
 [0.6780672  0.24072593 0.1681978  ... 1.0000002  0.5487293  0.6100424 ]
 [0.6563667  0.3650766  0.25902703 ... 0.5487293  0.9999999  0.52822644]
 [0.59870124 0.31967145 0.21196766 ... 0.6100424  0.52822644 0.99999964]]


MAE publico: 1.1614

In [15]:
# Paso 8: Función para predecir rating
def predict_rating(user_id, target_business_id):
    if user_id not in user_ratings or target_business_id not in business_indices:
        return global_avg
    
    rated_items = user_ratings[user_id]
    similarities = []
    ratings = []

    target_idx = business_indices[target_business_id]

    for rated_business_id, rating in rated_items.items():
        if rated_business_id in business_indices:
            rated_idx = business_indices[rated_business_id]
            sim = cosine_sim[target_idx, rated_idx]
            if sim > 0.8:
                similarities.append(sim)
                ratings.append(rating)

    if not similarities:
        return global_avg

    weighted_avg = np.dot(similarities, ratings) / np.sum(similarities)
    return weighted_avg

# Paso 9: Predecir ratings
print("Prediciendo ratings...")
for index, review in tqdm(test_reviews_df.iterrows(), total=len(test_reviews_df)):
    output_df.loc[index, 'stars'] = predict_rating(review.user_id, review.business_id)

# Paso 10: Limpiar NaNs y redondear
output_df['stars'] = output_df['stars'].fillna(global_avg).round()
output_df.to_csv('./results/submission_sentence_transformers_rounded.csv', index=False)

Prediciendo ratings...


100%|██████████| 414765/414765 [00:45<00:00, 9182.06it/s]


In [None]:
# Calculate Euclidean distance instead of cosine similarity
print("Calculando distancia euclidiana...")
euclidean_dist = torch.cdist(embedding_matrix, embedding_matrix, p=2).cpu().numpy()

# Convert Euclidean distance to similarity (1 / (1 + distance))
euclidean_sim = 1 / (1 + euclidean_dist)

# Paso 6: Índice para acceder por business_id
business_indices = pd.Series(range(len(business_ids)), index=business_ids)

# Paso 7: Crear diccionario de ratings por usuario
print("Creando diccionario de ratings por usuario...")
user_ratings = train_reviews_df.groupby('user_id').apply(
    lambda x: x[['business_id', 'stars']].set_index('business_id').to_dict()['stars']
).to_dict()

# Paso 8: Función para predecir rating
def predict_rating(user_id, target_business_id):
    if user_id not in user_ratings or target_business_id not in business_indices:
        return global_avg
    
    rated_items = user_ratings[user_id]
    similarities = []
    ratings = []

    target_idx = business_indices[target_business_id]

    for rated_business_id, rating in rated_items.items():
        if rated_business_id in business_indices:
            rated_idx = business_indices[rated_business_id]
            sim = euclidean_sim[target_idx, rated_idx]
            if sim > 0.8:
                similarities.append(sim)
                ratings.append(rating)

    if not similarities:
        return global_avg

    weighted_avg = np.dot(similarities, ratings) / np.sum(similarities)
    return weighted_avg



Calculando distancia euclidiana...
Creando diccionario de ratings por usuario...


  user_ratings = train_reviews_df.groupby('user_id').apply(


Prediciendo ratings...


100%|██████████| 414765/414765 [00:45<00:00, 9091.42it/s]


In [45]:
# Paso 9: Predecir ratings
print("Prediciendo ratings...")
for index, review in tqdm(test_reviews_df.iterrows(), total=len(test_reviews_df)):
    output_df.loc[index, 'stars'] = predict_rating(review.user_id, review.business_id)

# Paso 10: Limpiar NaNs y redondear
output_df['stars'] = output_df['stars'].fillna(global_avg).round()
output_df.to_csv('./results/submission_sentence_transformers_rounded.csv', index=False)

Prediciendo ratings...


100%|██████████| 414765/414765 [00:44<00:00, 9239.29it/s]


### Aproximación 4 - Analisis de sentimientos

In [13]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax
from tqdm import tqdm

# Cargar modelo y tokenizer
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model.eval()

# Asegurar uso de GPU si está disponible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Preprocesar textos
MAX_LEN = 512
texts = test_reviews_df['text'].apply(lambda x: x[:MAX_LEN]).tolist()

# Función para procesar en lotes
def get_sentiment_scores_batched(texts, batch_size=256):
    scores = []

    for i in tqdm(range(0, len(texts), batch_size), desc="Procesando batches"):
        batch_texts = texts[i:i+batch_size]
        
        encoded_batch = tokenizer(
            batch_texts,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=MAX_LEN
        )
        encoded_batch = {k: v.to(device) for k, v in encoded_batch.items()}

        with torch.no_grad():
            outputs = model(**encoded_batch)
            logits = outputs.logits.cpu().numpy()
            probs = softmax(logits, axis=1)

            # Map: negative → 0, neutral → 2.5, positive → 5
            batch_scores = probs[:, 0]*0 + probs[:, 1]*2.5 + probs[:, 2]*5
            scores.extend(batch_scores.tolist())

    return scores

# Calcular los scores en batch
print("Calculando sentimiento por batches...")
sentiment_scores = get_sentiment_scores_batched(texts, batch_size=512)

# Guardar en el DataFrame
test_reviews_df['stars'] = [round(s, 0) for s in sentiment_scores]

# Exportar resultado
test_reviews_df[['review_id', 'stars']].to_csv('./results_tournament_2/sentiment_analysis_predictions.csv', index=False)


Calculando sentimiento por batches...


Procesando batches: 100%|██████████| 811/811 [30:33<00:00,  2.26s/it] 
