# Baselines modelos Proyecto Final

Nombres:
- Randall Fabrizio Biermann Olivari
- Martín Alonso Muñoz Arévalo
- Eduardo Vicente Soto Rojas

Importamos algunas librerías que se utilizarán a lo largo del desarrollo del proyecto.

In [9]:
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
import torch


Subimos el archivo comprimido con el conjunto de datos ReDial (`redial_dataset.zip`).

In [None]:
# Requires the google.colab package, so this cell is presumably meant to run
# inside Google Colab. The recorded cell output shows redial_dataset.zip being
# saved, which is the file the extraction step expects.
from google.colab import files
uploaded = files.upload()

Saving redial_dataset.zip to redial_dataset.zip


Se extrae el contenido del zip y se cargan las conversaciones con el script proporcionado en la siguiente referencia:

ReDialData. (s. f.). website/load_data.py at data · ReDialData/website. GitHub. https://github.com/ReDialData/website/blob/data/load_data.py

In [None]:
import zipfile
import json

# Unpack the uploaded archive into the current working directory.
with zipfile.ZipFile('redial_dataset.zip', 'r') as z:
    z.extractall()


def _load_jsonl(path):
    """Return the list of JSON objects stored one per line in *path*.

    The file is opened in a context manager so the handle is closed even if a
    line fails to parse, and with an explicit UTF-8 encoding so the result does
    not depend on the platform's default encoding.
    """
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f]


train_data = _load_jsonl("train_data.jsonl")
print("Loaded {} train conversations".format(len(train_data)))

test_data = _load_jsonl("test_data.jsonl")
print("Loaded {} test conversations".format(len(test_data)))



Loaded 10006 train conversations
Loaded 1342 test conversations


Ahora, pasamos los datos de entrenamiento y de prueba a DataFrames de pandas.

In [None]:
import pandas as pd

# Wrap the parsed JSONL records (one per conversation line) in DataFrames.
df_train = pd.DataFrame(train_data)
df_test = pd.DataFrame(test_data)

print("Train shape:", df_train.shape)
print("Test shape:", df_test.shape)

# Last expression of the cell: the notebook displays the first rows.
df_train.head()


Train shape: (10006, 7)
Test shape: (1342, 7)


Unnamed: 0,movieMentions,respondentQuestions,messages,conversationId,respondentWorkerId,initiatorWorkerId,initiatorQuestions
0,{'203371': 'Final Fantasy: The Spirits Within ...,"{'203371': {'suggested': 1, 'seen': 0, 'liked'...","[{'timeOffset': 0, 'text': 'Hi there, how are ...",391,1,0,"{'203371': {'suggested': 1, 'seen': 0, 'liked'..."
1,"{'196336': 'Scarface (1983)', '204322': 'Sout...","{'196336': {'suggested': 0, 'seen': 1, 'liked'...","[{'timeOffset': 0, 'text': 'Hi, did you see @1...",395,1,2,[]
2,"{'79320': 'Contact (2009)', '84001': 'Donnie ...","{'79320': {'suggested': 0, 'seen': 1, 'liked':...","[{'timeOffset': 0, 'text': '@163606 It is a ni...",397,1,3,"{'79320': {'suggested': 0, 'seen': 0, 'liked':..."
3,"{'188302': 'Ever After (1998)', '195904': 'Cha...",[],"[{'timeOffset': 0, 'text': 'I'm in the mood to...",405,5,4,[]
4,"{'119144': 'Wonder Woman (2017)', '125431': '...","{'119144': {'suggested': 0, 'seen': 1, 'liked'...","[{'timeOffset': 0, 'text': 'Hello', 'senderWor...",407,6,7,"{'119144': {'suggested': 0, 'seen': 1, 'liked'..."


# Implementación de Modelos de Referencia

Preprocesamiento

In [46]:
def build_rating_matrix_from_dataframes(df_train, df_test):
    """Build user x movie rating matrices for the train and test splits.

    The movie vocabulary is the union of every key found in the
    ``movieMentions``, ``initiatorQuestions`` and ``respondentQuestions``
    columns of BOTH dataframes, so the two matrices share the same column
    layout. Users are kept separate per split and are named
    ``"user_<workerId>"``.

    A cell holds the ``liked`` value of a questionnaire entry whose ``seen``
    value is greater than 0, and ``-1`` when the user has no such entry for
    that movie. When the same user answers about the same movie more than
    once, the last answer in row order wins.

    NOTE(review): the ReDial documentation appears to use 2 for "did not say"
    in both ``seen`` and ``liked``; if so, those entries are currently stored
    as a rating of 2 -- confirm whether they should be excluded.

    Args:
        df_train: DataFrame of training conversations.
        df_test: DataFrame of test conversations.

    Returns:
        Tuple ``(train_matrix, test_matrix, all_movies, movie_to_idx,
        train_users, test_users)`` where the matrices are float arrays of
        shape ``(n_users, n_movies)``, ``all_movies`` is the sorted list of
        movie ids, ``movie_to_idx`` maps a movie id to its column, and the
        user lists give the (sorted) row order of each matrix.
    """
    movie_columns = ('movieMentions', 'initiatorQuestions', 'respondentQuestions')
    # (worker-id column, questionnaire column) for each conversation role.
    role_columns = (
        ('initiatorWorkerId', 'initiatorQuestions'),
        ('respondentWorkerId', 'respondentQuestions'),
    )

    def collect_movie_ids(df):
        # Gather every movie id referenced by any dict-valued movie column.
        movie_ids = set()
        for _, row in df.iterrows():
            for column in movie_columns:
                entries = row.get(column)
                # Missing answers are stored as non-dict values (e.g. []).
                if isinstance(entries, dict):
                    movie_ids.update(entries)
        return movie_ids

    all_movies = sorted(collect_movie_ids(df_train) | collect_movie_ids(df_test))
    movie_to_idx = {movie: idx for idx, movie in enumerate(all_movies)}

    def build_matrix_from_df(df):
        # Build one split's matrix plus the user order of its rows.
        user_ratings = defaultdict(dict)
        user_ids = set()

        for _, row in df.iterrows():
            for id_column, questions_column in role_columns:
                user = f"user_{row[id_column]}"
                # Register the user even without answers so they get a row.
                user_ids.add(user)
                answers = row.get(questions_column)
                if not isinstance(answers, dict):
                    continue
                for movie_id, answer in answers.items():
                    if answer['seen'] > 0:
                        user_ratings[user][movie_id] = answer['liked']

        user_list = sorted(user_ids)
        user_to_idx = {user: idx for idx, user in enumerate(user_list)}
        rating_matrix = np.full((len(user_list), len(all_movies)), -1.0)

        for user, ratings in user_ratings.items():
            row_idx = user_to_idx[user]
            for movie_id, rating in ratings.items():
                # Every questionnaire key is part of the vocabulary built
                # above, so the column lookup cannot miss.
                rating_matrix[row_idx, movie_to_idx[movie_id]] = rating

        return rating_matrix, user_list

    train_matrix, train_users = build_matrix_from_df(df_train)
    test_matrix, test_users = build_matrix_from_df(df_test)

    return train_matrix, test_matrix, all_movies, movie_to_idx, train_users, test_users

# Build the shared movie vocabulary and one rating matrix per split; both
# matrices use the same movie columns (see all_movies / movie_to_idx).
train_matrix, test_matrix, all_movies, movie_to_idx, train_users, test_users = build_rating_matrix_from_dataframes(df_train, df_test)

# Quick sanity report of the resulting dimensions.
print(f"Train matrix: {train_matrix.shape}")
print(f"Test matrix: {test_matrix.shape}")
print(f"Total películas únicas: {len(all_movies)}")
print(f"Usuarios en train: {len(train_users)}")
print(f"Usuarios en test: {len(test_users)}")


Train matrix: (956, 6637)
Test matrix: (156, 6637)
Total películas únicas: 6637
Usuarios en train: 956
Usuarios en test: 156


Definimos los modelos

In [47]:
class RandomRecommender:
    """Baseline that recommends uniformly random movies the user has not rated."""

    def __init__(self, all_movies, seed=None):
        """Store the candidate pool.

        Args:
            all_movies: Sequence of movie ids to sample from.
            seed: Optional seed. When given, this recommender draws from its
                own random generator so results are reproducible; when
                omitted it keeps using NumPy's global random state.
        """
        self.all_movies = all_movies
        self._rng = None if seed is None else np.random.RandomState(seed)

    def fit(self, df_train):
        """Do nothing; present so all baselines share the same interface."""
        pass

    def recommend(self, user_ratings, top_k=10):
        """Return up to ``top_k`` distinct random movies not in ``user_ratings``.

        Args:
            user_ratings: Container of movie ids to exclude (membership is
                tested with ``in``).
            top_k: Maximum number of recommendations.

        Returns:
            List of movie ids as native Python values; empty when every movie
            is already in ``user_ratings``.
        """
        unseen = [m for m in self.all_movies if m not in user_ratings]
        if not unseen:
            return []
        rng = np.random if self._rng is None else self._rng
        selected = rng.choice(unseen, size=min(top_k, len(unseen)), replace=False)
        # tolist() converts NumPy scalars back to plain Python values.
        return selected.tolist()

class MostPopularRecommender:
    """Baseline that recommends the most frequently referenced movies."""

    # Columns whose dict keys are movie ids.
    _MOVIE_COLUMNS = ('movieMentions', 'initiatorQuestions', 'respondentQuestions')

    def __init__(self):
        self.popularity = None
        self.sorted_movies = None

    def fit(self, df_train):
        """Count how often each movie id appears in the training conversations.

        A movie is counted once for every column of a conversation in which it
        appears as a key, so a single conversation can add up to three counts
        for the same movie.

        Args:
            df_train: DataFrame of training conversations.
        """
        self.popularity = defaultdict(int)

        for _, row in df_train.iterrows():
            for column in self._MOVIE_COLUMNS:
                entries = row.get(column)
                # Missing answers are stored as non-dict values (e.g. []).
                if isinstance(entries, dict):
                    for movie_id in entries:
                        self.popularity[movie_id] += 1

        # Stable sort: movies with equal counts keep first-seen order.
        self.sorted_movies = sorted(self.popularity.items(), key=lambda x: x[1], reverse=True)

    def recommend(self, user_ratings, top_k=10):
        """Return the ``top_k`` most popular movies not in ``user_ratings``.

        Args:
            user_ratings: Container of movie ids to exclude.
            top_k: Maximum number of recommendations; values <= 0 yield [].

        Returns:
            List of movie ids ordered from most to least popular.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if self.sorted_movies is None:
            raise RuntimeError("MostPopularRecommender.recommend() called before fit()")
        # Without this guard the loop below would still emit one movie,
        # because it appends before checking the length.
        if top_k <= 0:
            return []

        recommendations = []
        for movie_id, _count in self.sorted_movies:
            if movie_id in user_ratings:
                continue
            recommendations.append(movie_id)
            if len(recommendations) >= top_k:
                break
        return recommendations

class UserUserCF:
    """User-user collaborative filtering over a dense rating matrix.

    The rating matrix uses ``-1`` to mark "no rating". Similarity between two
    users is the cosine of their rating vectors restricted to the movies both
    have rated.
    """

    def __init__(self, min_overlap=2, similarity_threshold=0.1, n_neighbors=5):
        """Configure the model; the defaults reproduce the original constants.

        Args:
            min_overlap: Two users get a non-zero similarity only when they
                share strictly more than this many co-rated movies.
            similarity_threshold: Only users whose similarity is strictly
                greater than this value can act as neighbours.
            n_neighbors: Maximum number of most-similar neighbours used to
                score movies.
        """
        self.min_overlap = min_overlap
        self.similarity_threshold = similarity_threshold
        self.n_neighbors = n_neighbors
        self.rating_matrix = None
        self.user_similarities = None
        self.user_ids = None

    def fit(self, rating_matrix, user_ids):
        """Compute the symmetric user-user similarity matrix.

        Args:
            rating_matrix: Array of shape (n_users, n_movies) with ``-1`` for
                missing ratings.
            user_ids: Sequence of user identifiers in row order.
        """
        self.rating_matrix = rating_matrix
        self.user_ids = user_ids

        ratings = np.asarray(rating_matrix, dtype=float)
        rated = ratings != -1
        rated_f = rated.astype(float)
        # Zero out missing cells so they contribute nothing to the products.
        known = np.where(rated, ratings, 0.0)

        # [i, j] = number of movies rated by both i and j.
        overlap = rated_f @ rated_f.T
        # [i, j] = dot product of i and j over their co-rated movies.
        dot = known @ known.T
        # [i, j] = squared norm of user i's ratings over movies j also rated.
        sq_norm = (known ** 2) @ rated_f.T
        denom = np.sqrt(sq_norm) * np.sqrt(sq_norm.T) + 1e-8

        similarities = np.where(overlap > self.min_overlap, dot / denom, 0.0)
        # A user is never their own neighbour.
        np.fill_diagonal(similarities, 0.0)
        self.user_similarities = similarities

    def recommend(self, user_idx, top_k=10):
        """Return column indices of the best-scoring movies the user has not rated.

        Each candidate movie is scored with the similarity-weighted average of
        the ratings given by the selected neighbours; movies none of them
        rated are not recommended. Ties keep ascending column order.

        Args:
            user_idx: Row index of the target user in the fitted matrix.
            top_k: Maximum number of recommendations.

        Returns:
            List of movie column indices (ints), best first.
        """
        user_ratings = self.rating_matrix[user_idx]
        user_sims = self.user_similarities[user_idx]

        neighbours = [
            (j, user_sims[j])
            for j in range(len(self.user_ids))
            if j != user_idx and user_sims[j] > self.similarity_threshold
        ]
        neighbours.sort(key=lambda pair: pair[1], reverse=True)
        neighbours = neighbours[:self.n_neighbors]

        n_movies = self.rating_matrix.shape[1]
        weighted_sum = np.zeros(n_movies)
        total_similarity = np.zeros(n_movies)
        # Accumulate neighbour by neighbour (most similar first).
        for neighbour_idx, similarity in neighbours:
            neighbour_ratings = self.rating_matrix[neighbour_idx]
            has_rating = neighbour_ratings != -1
            weighted_sum[has_rating] += similarity * neighbour_ratings[has_rating]
            total_similarity[has_rating] += similarity

        candidates = np.flatnonzero((user_ratings == -1) & (total_similarity > 0))
        scores = weighted_sum[candidates] / total_similarity[candidates]
        # Stable sort on the negated scores = descending order, with ties
        # kept in ascending column order.
        order = np.argsort(-scores, kind="stable")
        return candidates[order[:top_k]].tolist()

Entrenamos

In [48]:
print("\n=== ENTRENANDO MODELOS ===")

# The random baseline only needs the movie vocabulary; its fit() is a no-op.
print("1. Random Recommender...")
random_model = RandomRecommender(all_movies)
random_model.fit(df_train)

# The popularity baseline counts movie references in the raw conversations.
print("2. Most Popular Recommender...")
popular_model = MostPopularRecommender()
popular_model.fit(df_train)

# The CF model works on the dense rating matrix; train_users gives its row order.
print("3. User-User Collaborative Filtering...")
user_user_model = UserUserCF()
user_user_model.fit(train_matrix, train_users)



=== ENTRENANDO MODELOS ===
1. Random Recommender...
2. Most Popular Recommender...
3. User-User Collaborative Filtering...
✅ Todos los modelos entrenados exitosamente!


Recomendaciones

In [49]:
def get_movie_title(movie_id, df):
    """Look up the title of *movie_id* in the conversations of *df*.

    Rows are scanned in order and the title comes from the first row whose
    ``movieMentions`` value is a dict containing the id. When no row matches,
    the placeholder "Título no encontrado" is returned.
    """
    mentions_per_row = (row.get('movieMentions') for _, row in df.iterrows())
    matching_titles = (
        mentions[movie_id]
        for mentions in mentions_per_row
        if isinstance(mentions, dict) and movie_id in mentions
    )
    return next(matching_titles, "Título no encontrado")

def get_user_history(user_worker_id, df, user_type='initiator'):
    """Collect the ratings a worker gave while acting in one conversation role.

    Args:
        user_worker_id: Worker id to look for.
        df: DataFrame of conversations.
        user_type: 'initiator' or 'respondent'; selects which id column and
            questionnaire column are read. Any other value yields {}.

    Returns:
        Dict mapping movie id to its ``liked`` value for every questionnaire
        entry with ``seen`` greater than 0; later conversations overwrite
        earlier ones for the same movie.
    """
    columns_by_role = {
        'initiator': ('initiatorWorkerId', 'initiatorQuestions'),
        'respondent': ('respondentWorkerId', 'respondentQuestions'),
    }
    history = {}
    if user_type not in columns_by_role:
        return history

    id_column, questions_column = columns_by_role[user_type]
    for _, conversation in df.iterrows():
        if not (conversation[id_column] == user_worker_id):
            continue
        answers = conversation.get(questions_column)
        # Missing answers are stored as non-dict values (e.g. []).
        if not isinstance(answers, dict):
            continue
        for movie_id, answer in answers.items():
            if answer['seen'] > 0:
                history[movie_id] = answer['liked']

    return history

print("\n=== RECOMENDACIONES ===")

# Use the initiator of the first training conversation as the example user.
# Only the ratings this worker gave as an initiator are collected here.
example_user_id = df_train.iloc[0]['initiatorWorkerId']
user_ratings = get_user_history(example_user_id, df_train, 'initiator')

# Show at most five of the user's rated movies.
print(f"Usuario {example_user_id} tiene {len(user_ratings)} películas vistas:")
for movie_id, rating in list(user_ratings.items())[:5]:
    title = get_movie_title(movie_id, df_train)
    print(f"  - {title} (rating: {rating})")

# Random and popularity baselines exclude the movies in user_ratings.
print("\n🎲 1. RECOMENDACIONES ALEATORIAS:")
random_recs = random_model.recommend(user_ratings, top_k=5)
for i, movie_id in enumerate(random_recs, 1):
    title = get_movie_title(movie_id, df_train)
    print(f"   {i}. {title}")

print("\n🔥 2. RECOMENDACIONES POPULARES:")
popular_recs = popular_model.recommend(user_ratings, top_k=5)
for i, movie_id in enumerate(popular_recs, 1):
    title = get_movie_title(movie_id, df_train)
    count = popular_model.popularity[movie_id]
    print(f"   {i}. {title} (mencionada {count} veces)")

# The CF model is addressed by matrix row, so find the row of the example
# user in train_users (entries are named "user_<workerId>").
print("\n👥 3. RECOMENDACIONES USER-USER CF:")
user_idx = None
for idx, user_str in enumerate(train_users):
    if user_str == f"user_{example_user_id}":
        user_idx = idx
        break

if user_idx is not None:
    # recommend() returns column indices; map them back to movie ids.
    cf_recs_indices = user_user_model.recommend(user_idx, top_k=5)
    cf_recs = [all_movies[idx] for idx in cf_recs_indices]

    for i, movie_id in enumerate(cf_recs, 1):
        title = get_movie_title(movie_id, df_train)
        print(f"   {i}. {title}")
else:
    print("   Usuario no encontrado en la matriz de entrenamiento")


=== RECOMENDACIONES ===
Usuario 0 tiene 4 películas vistas:
  - The Triplets of Belleville (2003) (rating: 1)
  - Mary and Max (2009) (rating: 1)
  - A Scanner Darkly  (2006) (rating: 1)
  - Waking Life (2001) (rating: 1)

🎲 1. RECOMENDACIONES ALEATORIAS:
   1. Fame  (1980)
   2. Focus  (2015)
   3. Early Man 
   4. The Craft  (1996)
   5. Three Stooges Fun-O-Rama (1959)

🔥 2. RECOMENDACIONES POPULARES:
   1. Black Panther (2018) (mencionada 1776 veces)
   2. It  (2017) (mencionada 1612 veces)
   3. Jumanji  (2017) (mencionada 1337 veces)
   4. Get Out (2017) (mencionada 959 veces)
   5. Coco  (2017) (mencionada 882 veces)

👥 3. RECOMENDACIONES USER-USER CF:
   1. Inception (2010)
   2. Lethal Weapon (1987)
   3. A Christmas Story (1983)
   4. Kung Fu Panda (2008)
   5. Les Misérables  (2012)


Código creado con asistencia de IA: https://chat.deepseek.com/share/uluv49g1m1ere7usah