In [3]:
import pandas as pd
import json as json

import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [19]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to C:\Users\Pablo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Pablo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [79]:
# Load the movies dataset from parquet.
# NOTE(review): a later cell writes dfMoviesSintetico back to this same path,
# so the file read here changes after the notebook has been run once.
dfMoviesFinal = pd.read_parquet('DataSets/dfMoviesSintetico.parquet')

In [80]:
dfMoviesFinal.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32184 entries, 0 to 581956
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   budget        32184 non-null  float64
 1   id_pelicula   32184 non-null  int64  
 2   overview      32184 non-null  object 
 3   release_date  32184 non-null  object 
 4   revenue       32184 non-null  int64  
 5   title         32184 non-null  object 
 6   vote_average  32184 non-null  float64
 7   vote_count    32184 non-null  float64
 8   release_year  32184 non-null  float64
 9   return        32184 non-null  float64
 10  name_genres   32184 non-null  object 
dtypes: float64(5), int64(2), object(4)
memory usage: 2.9+ MB


In [81]:
# Check which Python types occur in the 'overview' column
print(dfMoviesFinal['overview'].apply(type).value_counts())
print(dfMoviesFinal['overview'].head())
# NOTE(review): this repeats the first check verbatim; nothing modifies the
# column between the two calls.
print(dfMoviesFinal['overview'].apply(type).value_counts())


overview
<class 'str'>    32184
Name: count, dtype: int64
0     Led by Woody, Andy's toys live happily in his ...
12    When siblings Judy and Peter discover an encha...
30    A family wedding reignites the ancient feud be...
46    Cheated on, mistreated and stepped on, the wom...
49    Just when George Banks has recovered from his ...
Name: overview, dtype: object
overview
<class 'str'>    32184
Name: count, dtype: int64


In [82]:
# Convert the column to string.
# NOTE(review): the preceding type check already reports only str values, so
# on the data shown this cast does not change anything.
dfMoviesFinal['overview'] = dfMoviesFinal['overview'].astype(str)

In [83]:
# Re-check which Python types occur in the 'overview' column after the cast
print(dfMoviesFinal['overview'].apply(type).value_counts())

overview
<class 'str'>    32184
Name: count, dtype: int64


In [84]:
# Helper functions for processing the free-text column
def process_tokens(text_series):
    """Tokenize, drop English stopwords, and stem every text in a Series.

    Parameters
    ----------
    text_series : pandas.Series
        Series of strings; each element is passed to ``word_tokenize``.

    Returns
    -------
    pandas.Series
        Series with the same index in which every text is replaced by its
        remaining stemmed tokens joined by single spaces.
    """
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    def process_row(text):
        # Compare the lower-cased token against the stopword list: the list
        # is lower-case and the stemmer lower-cases its output, so without
        # this a capitalised stopword such as "The" would slip through the
        # filter and come out as "the".
        kept = (tok for tok in word_tokenize(text) if tok.lower() not in stop_words)
        return ' '.join(stemmer.stem(tok) for tok in kept)

    return text_series.apply(process_row)

def preprocess_column(df):
    """Clean the ``overview`` text and add a ``processed_overview`` column.

    Every character of ``overview`` that is not an ASCII letter is replaced
    by a space and the text is lower-cased; the cleaned text is then run
    through ``process_tokens``.

    Note that ``df`` is modified in place: ``overview`` is overwritten with
    the cleaned text and ``processed_overview`` is added. The returned object
    is that same DataFrame, not a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named ``overview``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` that was passed in.
    """
    # Step 1: keep letters only, then normalise case
    letters_only = df['overview'].str.replace(r'[^a-zA-Z]', ' ', regex=True)
    df.loc[:, 'overview'] = letters_only.str.lower()

    # Step 2: tokenize / filter / stem the cleaned text
    stemmed_text = process_tokens(df['overview'])
    df.loc[:, 'processed_overview'] = stemmed_text

    return df

In [85]:
# Process the overview column.
# preprocess_column modifies its argument and returns it, so after this line
# dfMoviesSintetico and dfMoviesFinal refer to the same DataFrame.
dfMoviesSintetico = preprocess_column(dfMoviesFinal)

In [97]:
def recomendacion(title, top_n=5):
    """Recommend the movies whose overviews are most similar to ``title``.

    A TF-IDF model is fitted on ``dfMoviesSintetico['processed_overview']``
    and every movie is ranked by cosine similarity to the queried movie.

    Parameters
    ----------
    title : str
        Exact title to look up in ``dfMoviesSintetico['title']``. If several
        rows share the title, the first one is used as the query.
    top_n : int, default 5
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Titles of the ``top_n`` most similar movies, most similar first.

    Raises
    ------
    ValueError
        If ``title`` does not appear in the DataFrame.
    """
    # Validate before paying for the TF-IDF fit over the whole corpus.
    title_matches = (dfMoviesSintetico['title'] == title).to_numpy()
    if not title_matches.any():
        raise ValueError(f"El título '{title}' no se encuentra en el DataFrame.")

    # 0-based row position of the first movie carrying this title
    # (argmax on a boolean array returns the first True).
    query_pos = int(title_matches.argmax())

    # Vectorize the processed overviews with TF-IDF
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(dfMoviesSintetico['processed_overview'])

    # Vector for the queried movie's processed overview
    processed_summary = dfMoviesSintetico['processed_overview'].iloc[query_pos]
    query_vector = vectorizer.transform([processed_summary])

    # Cosine similarity between the queried movie and every movie
    cosine_sim = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # Rank from most to least similar and drop the queried row explicitly.
    # Slicing off "the last element" is only correct when the query is the
    # unique top score; ties (e.g. duplicated overviews) break that.
    ranked = cosine_sim.argsort()[::-1]
    similar_indices = ranked[ranked != query_pos][:top_n]

    recommended_movies = dfMoviesSintetico.iloc[similar_indices]
    return recommended_movies['title']

In [98]:
# Try the recommender with a title that exists in the dataset
recommendations = recomendacion('Batman')
print(recommendations)

159955    Batman Beyond: Return of the Joker
334181                 The Dark Knight Rises
435798                     Batman vs Dracula
300061            Batman: Under the Red Hood
558971                 DC Showcase: Catwoman
Name: title, dtype: object


In [None]:
# Drop the column that will not be used by any function.
# NOTE(review): drop() returns a new frame, so only dfMoviesSintetico loses
# 'overview'; dfMoviesFinal keeps it, and the recomendacion defined at the end
# of the notebook still returns dfMoviesFinal[['title', 'overview']].
dfMoviesSintetico = dfMoviesSintetico.drop(columns=['overview'])

In [91]:
dfMoviesSintetico.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32184 entries, 0 to 581956
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   budget              32184 non-null  float64
 1   id_pelicula         32184 non-null  int64  
 2   release_date        32184 non-null  object 
 3   revenue             32184 non-null  int64  
 4   title               32184 non-null  object 
 5   vote_average        32184 non-null  float64
 6   vote_count          32184 non-null  float64
 7   release_year        32184 non-null  float64
 8   return              32184 non-null  float64
 9   name_genres         32184 non-null  object 
 10  processed_overview  32184 non-null  object 
dtypes: float64(5), int64(2), object(4)
memory usage: 2.9+ MB


In [92]:
# NOTE(review): this writes to the same path the notebook loads its input
# from, now without the 'overview' column. Re-running the notebook from the
# top afterwards would fail at the cells that read dfMoviesFinal['overview'].
dfMoviesSintetico.to_parquet('DataSets/dfMoviesSintetico.parquet')

In [93]:
dfMoviesSintetico.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32184 entries, 0 to 581956
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   budget              32184 non-null  float64
 1   id_pelicula         32184 non-null  int64  
 2   release_date        32184 non-null  object 
 3   revenue             32184 non-null  int64  
 4   title               32184 non-null  object 
 5   vote_average        32184 non-null  float64
 6   vote_count          32184 non-null  float64
 7   release_year        32184 non-null  float64
 8   return              32184 non-null  float64
 9   name_genres         32184 non-null  object 
 10  processed_overview  32184 non-null  object 
dtypes: float64(5), int64(2), object(4)
memory usage: 2.9+ MB


In [None]:

import joblib

# Initialise the vectorizer and the TF-IDF matrix
def setup_recommendation_system(df):
    """Fit the TF-IDF model on ``df`` and persist it to disk.

    Sets the module-level globals ``vectorizer`` and ``tfidf_matrix`` and
    writes them to ``vectorizer.joblib`` and ``tfidf_matrix.joblib`` in the
    current working directory so they can be reloaded later.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``processed_overview`` text column.
    """
    global vectorizer, tfidf_matrix

    # Fit on the processed overviews and keep the resulting document matrix
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(df['processed_overview'])

    # Persist both artifacts, vectorizer first
    artifacts = (
        (vectorizer, 'vectorizer.joblib'),
        (tfidf_matrix, 'tfidf_matrix.joblib'),
    )
    for artifact, target in artifacts:
        joblib.dump(artifact, target)

# Load the vectorizer and the TF-IDF matrix from disk
def load_recommendation_system():
    """Restore the module-level ``vectorizer`` and ``tfidf_matrix`` globals.

    Reads ``vectorizer.joblib`` and then ``tfidf_matrix.joblib`` from the
    current working directory.

    NOTE(review): joblib files are pickle-based; only load files that this
    project produced itself.
    """
    module_globals = globals()
    sources = (
        ('vectorizer', 'vectorizer.joblib'),
        ('tfidf_matrix', 'tfidf_matrix.joblib'),
    )
    # Assign one at a time so a failure on the second file still leaves the
    # first global set.
    for global_name, source in sources:
        module_globals[global_name] = joblib.load(source)

# Recommendation function
def recomendacion(title, top_n=5):
    """Recommend the movies whose overviews are most similar to ``title``.

    Uses the module-level ``vectorizer`` and ``tfidf_matrix``, so
    ``setup_recommendation_system`` or ``load_recommendation_system`` must
    have been called first. Rows are looked up by position in
    ``dfMoviesFinal``.

    NOTE(review): this assumes ``tfidf_matrix`` was built from a frame with
    the same row order as ``dfMoviesFinal``, and that ``dfMoviesFinal``
    carries both ``processed_overview`` and ``overview`` -- confirm against
    how the globals were initialised.

    Parameters
    ----------
    title : str
        Exact title to look up in ``dfMoviesFinal['title']``. If several rows
        share the title, the first one is used as the query.
    top_n : int, default 5
        Number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        ``title`` and ``overview`` of the ``top_n`` most similar movies, most
        similar first.

    Raises
    ------
    ValueError
        If ``title`` does not appear in the DataFrame.
    """
    # Check that the title exists and locate it in a single pass
    title_matches = (dfMoviesFinal['title'] == title).to_numpy()
    if not title_matches.any():
        raise ValueError(f"El título '{title}' no se encuentra en el DataFrame.")

    # 0-based row position of the first movie carrying this title
    # (argmax on a boolean array returns the first True).
    query_pos = int(title_matches.argmax())

    # Vector for the queried movie's processed overview
    processed_summary = dfMoviesFinal['processed_overview'].iloc[query_pos]
    query_vector = vectorizer.transform([processed_summary])

    # Cosine similarity between the queried movie and every movie
    cosine_sim = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # Rank from most to least similar and drop the queried row explicitly.
    # Slicing off "the last element" is only correct when the query is the
    # unique top score; ties (e.g. duplicated overviews) break that.
    ranked = cosine_sim.argsort()[::-1]
    similar_indices = ranked[ranked != query_pos][:top_n]

    recommended_movies = dfMoviesFinal.iloc[similar_indices]

    return recommended_movies[['title', 'overview']]
