In [None]:
import pandas as pd

# Tabla de metadata
movies = pd.read_csv("../data/raw/rotten_tomatoes_movies.csv")  

# Tabla de reseñas
reviews = pd.read_csv("../data/raw/rotten_tomatoes_critic_reviews.csv") 

In [None]:
import pickle 

# Cargo el modelo de reseñas
ruta_pipeline = "../models/rotten_pipeline.pkl"
with open(ruta_pipeline, "rb") as f:
    pipeline_cargado = pickle.load(f)

reviews["review_content"] = reviews["review_content"].fillna("")

reviews["sentiment_prob"] = pipeline_cargado.predict_proba(reviews["review_content"])[:, 1]


In [None]:
# Aqui se argupan por el id y se saca la media
sentiment_scores = reviews.groupby("rotten_tomatoes_link")["sentiment_prob"].mean().reset_index()
sentiment_scores.rename(columns={"sentiment_prob": "avg_sentiment_score"}, inplace=True)

# Unir al dataset de películas
movies = movies.merge(sentiment_scores, on="rotten_tomatoes_link", how="left")


In [5]:
movies.head()

Unnamed: 0,rotten_tomatoes_link,movie_title,movie_info,critics_consensus,content_rating,genres,directors,authors,actors,original_release_date,...,tomatometer_status,tomatometer_rating,tomatometer_count,audience_status,audience_rating,audience_count,tomatometer_top_critics_count,tomatometer_fresh_critics_count,tomatometer_rotten_critics_count,avg_sentiment_score
0,m/0814255,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",Though it may seem like just another Harry Pot...,PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,"Craig Titley, Chris Columbus, Rick Riordan","Logan Lerman, Brandon T. Jackson, Alexandra Da...",2010-02-12,...,Rotten,49.0,149.0,Spilled,53.0,254421.0,43,73,76,0.552726
1,m/0878835,Please Give,Kate (Catherine Keener) and her husband Alex (...,Nicole Holofcener's newest might seem slight i...,R,Comedy,Nicole Holofcener,Nicole Holofcener,"Catherine Keener, Amanda Peet, Oliver Platt, R...",2010-04-30,...,Certified-Fresh,87.0,142.0,Upright,64.0,11574.0,44,123,19,0.808221
2,m/10,10,"A successful, middle-aged Hollywood songwriter...",Blake Edwards' bawdy comedy may not score a pe...,R,"Comedy, Romance",Blake Edwards,Blake Edwards,"Dudley Moore, Bo Derek, Julie Andrews, Robert ...",1979-10-05,...,Fresh,67.0,24.0,Spilled,53.0,14684.0,2,16,8,0.582091
3,m/1000013-12_angry_men,12 Angry Men (Twelve Angry Men),Following the closing arguments in a murder tr...,Sidney Lumet's feature debut is a superbly wri...,NR,"Classics, Drama",Sidney Lumet,Reginald Rose,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",1957-04-13,...,Certified-Fresh,100.0,54.0,Upright,97.0,105386.0,6,54,0,0.771389
4,m/1000079-20000_leagues_under_the_sea,"20,000 Leagues Under The Sea","In 1866, Professor Pierre M. Aronnax (Paul Luk...","One of Disney's finest live-action adventures,...",G,"Action & Adventure, Drama, Kids & Family",Richard Fleischer,Earl Felton,"James Mason, Kirk Douglas, Paul Lukas, Peter L...",1954-01-01,...,Fresh,89.0,27.0,Upright,74.0,68918.0,5,24,3,0.719479


In [None]:
##Estos son ejemplos, pueden estar abajo
selected_genres = ["Action & Adventure", "Fantasy"]
selected_actor = "Logan Lerman"
selected_director = "Chris Columbus"
selected_year_range = (2005, 2015)
selected_tomatometer = 50
selected_movie = "Harry Potter and the Goblet of Fire"
top_n = 5
alpha = 0.7   # peso a la pelicula elegida (el titulo)
beta = 0.2    # peso al resto de valores de imput
gamma = 0.1   # peso sentimiento del crítico


In [None]:
selected_genres = ["Action & Adventure", "Animation"]
selected_actor = ""
selected_director = "Zack Snyder"
selected_year_range = ""
selected_tomatometer = 50
selected_movie = "300"
top_n = 5
alpha = 0.7   # peso a la pelicula elegida (el titulo)
beta = 0.2    # peso al resto de valores de imput
gamma = 0.1   # peso sentimiento del crítico


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np

# Preparar dataset
filtered_movies = movies.copy()

# Se rellenan los nulos para que no de error
for col in ["genres", "actors", "directors", "movie_info", "critics_consensus"]:
    filtered_movies[col] = filtered_movies[col].fillna("")
filtered_movies["original_release_date"] = filtered_movies["original_release_date"].fillna("1900-01-01")
filtered_movies["tomatometer_rating"] = filtered_movies["tomatometer_rating"].fillna(0)

# Aqui se crea la coincidencia que tiene con los imput puestos y se da un puntaje con cada coincidencia
filtered_movies["match_score"] = 0
if selected_genres:
    filtered_movies["match_score"] += filtered_movies["genres"].apply(lambda x: sum(g in x for g in selected_genres))
if selected_actor:
    filtered_movies["match_score"] += filtered_movies["actors"].apply(lambda x: 2 if selected_actor in x else 0)
if selected_director:
    filtered_movies["match_score"] += filtered_movies["directors"].apply(lambda x: 2 if selected_director in x else 0)
if selected_year_range:
    filtered_movies["match_score"] += filtered_movies["original_release_date"].apply(
        lambda x: 1 if selected_year_range[0] <= int(str(x)[:4]) <= selected_year_range[1] else 0
    )
if selected_tomatometer is not None:
    filtered_movies["match_score"] += filtered_movies["tomatometer_rating"].apply(lambda x: 1 if x >= selected_tomatometer else 0)

# Normalizar match_score que sirve para los calculos de mas abajo
filtered_movies["match_norm"] = filtered_movies["match_score"] / filtered_movies["match_score"].max()

# Aplicar modelo de sentimiento a critics_consensus (recuerden que la importancia la controla gamma)
filtered_movies["consensus_sentiment_prob"] = pipeline_cargado.predict_proba(
    filtered_movies["critics_consensus"]
)[:, 1]
filtered_movies["consensus_sentiment_norm"] = filtered_movies["consensus_sentiment_prob"] / filtered_movies["consensus_sentiment_prob"].max()

# Resetear índices para poder usar iloc después
filtered_movies = filtered_movies.reset_index(drop=True)

# TF-IDF sobre combined_features que reune en un solo campo los textos 
filtered_movies["combined_features"] = (
    filtered_movies["genres"] + " " +
    filtered_movies["directors"] + " " +
    filtered_movies["actors"] + " " +
    filtered_movies["movie_info"]
)
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000, ngram_range=(1,2))
tfidf_matrix = vectorizer.fit_transform(filtered_movies["combined_features"])


# Función recomendación
def recomendar_por_similitud(selected_movie=None, top_n=5, alpha=0.7, beta=0.2, gamma=0.1):
    # caso de que no haya películas que cumplan los filtros o no haya info de referencia
    if filtered_movies.empty:
        print("No hay películas que cumplan los filtros.")
        return pd.DataFrame(columns=["movie_title", "genres", "actors", "directors", "match_score"])
    
    # Si no hay película de referencia o no está en el dataset
    if selected_movie is None or selected_movie not in filtered_movies["movie_title"].values:

        # Si no hay película de referencia, solo usar match_score + sentimiento (por si el camo de titulo esta vacio o no existe)
        print("⚠️ No se proporcionó película de referencia o no está en el dataset. Usando match_score + sentimiento.")

        # se usa solo match_score y sentimiento (es importante mostrar el print por que puede que los resultados no tengan que ver y sea por que estaba mal escrita el nombre de la pelicula)
        final_score = beta * filtered_movies["match_norm"] + gamma * filtered_movies["consensus_sentiment_norm"]
        top_indices = final_score.argsort()[::-1][:top_n]
        return filtered_movies.iloc[top_indices][
            ["movie_title", "genres", "actors", "directors", "match_score", "consensus_sentiment_prob"]
        ]
    
    # Película de referencia existe
    idx = filtered_movies[filtered_movies["movie_title"] == selected_movie].index[0]
    # Similaridad coseno entre la película seleccionada y todas las demás
    cosine_sim = cosine_similarity(tfidf_matrix[idx], tfidf_matrix).flatten()
    
    # Score combinado
    final_score = (
        alpha * cosine_sim +
        beta * filtered_movies["match_norm"] +
        gamma * filtered_movies["consensus_sentiment_norm"]
    )
    
    # Obtener los índices de las películas con mayor puntaje combinado
    top_indices = final_score.argsort()[::-1]
    top_indices = [i for i in top_indices if i != idx]  # excluir la misma película
    top_indices = top_indices[:top_n]
    top_indices = filtered_movies["match_score"].argsort()[::-1][:top_n] #Lo oredena por match score para que las primeras sean las que mas coincidan con los filtros que es lo mas importante

    
    # Devuelve las recomendaciones, si quieren mostrar los datos busquen los titulos en el dataset original y muestren lo que vean necesario
    return filtered_movies.iloc[top_indices][
        ["movie_title", "genres", "actors", "directors", "match_score", "consensus_sentiment_prob"]
    ]


In [64]:
#Aqui se aplica los ejemplos
recommendations = recomendar_por_similitud(selected_movie, top_n=top_n, alpha=alpha, beta=beta, gamma=gamma)
recommendations

Unnamed: 0,movie_title,genres,actors,directors,match_score,consensus_sentiment_prob
9376,Legend of the Guardians: The Owls of Ga'Hoole,"Action & Adventure, Animation, Science Fiction...","Abbie Cornish, Miriam Margolyes, Helen Mirren,...",Zack Snyder,5,0.672336
17066,Watchmen,"Action & Adventure, Drama, Science Fiction & F...","Billy Crudup, Jeffrey Dean Morgan, Malin Akerm...",Zack Snyder,4,0.881251
14095,Man of Steel,"Action & Adventure, Science Fiction & Fantasy","Henry Cavill, Amy Adams, Michael Shannon, Dian...",Zack Snyder,4,0.798288
1956,300,"Action & Adventure, Drama","Gerard Butler, Lena Headey, David Wenham, Rodr...",Zack Snyder,4,0.694349
8855,Justice League,"Action & Adventure, Drama, Science Fiction & F...","Ben Affleck, Henry Cavill, Amy Adams, Gal Gado...",Zack Snyder,3,0.205442
