In [3]:
import pandas as pd
import json as json

import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [19]:
import nltk

# Download the NLTK data packages this notebook needs: 'stopwords' backs
# stopwords.words('english'); 'punkt' is presumably the tokenizer data that
# word_tokenize relies on (confirm against the installed NLTK version).
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to C:\Users\Pablo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Pablo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [56]:
# Load the movies dataset; the path is relative to the current working directory.
dfMoviesFinal = pd.read_parquet('DataSets/dfMoviesSintetico.parquet')

In [57]:
# Summarise the loaded frame: columns, non-null counts and dtypes.
dfMoviesFinal.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45346 entries, 0 to 45345
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   budget              45346 non-null  float64
 1   id_pelicula         45346 non-null  int64  
 2   overview            45346 non-null  object 
 3   release_date        45346 non-null  object 
 4   revenue             45346 non-null  int64  
 5   title               45346 non-null  object 
 6   vote_average        45346 non-null  float64
 7   vote_count          45346 non-null  float64
 8   release_year        45346 non-null  float64
 9   return              45346 non-null  float64
 10  name_genres         42962 non-null  object 
 11  processed_overview  45346 non-null  object 
dtypes: float64(5), int64(2), object(5)
memory usage: 4.2+ MB


In [50]:
# Check which Python types occur in the 'overview' column.
# The type summary is printed both before and after a sample of the values.
print(
    dfMoviesFinal['overview'].apply(type).value_counts(),
    dfMoviesFinal['overview'].head(),
    dfMoviesFinal['overview'].apply(type).value_counts(),
    sep='\n',
)


overview
<class 'str'>    45346
Name: count, dtype: int64
0    led by woody  andy s toys live happily in his ...
1    when siblings judy and peter discover an encha...
2    a family wedding reignites the ancient feud be...
3    cheated on  mistreated and stepped on  the wom...
4    just when george banks has recovered from his ...
Name: overview, dtype: object
overview
<class 'str'>    45346
Name: count, dtype: int64


In [42]:
# Convert every 'overview' value to str (overwrites the column on dfMoviesFinal).
dfMoviesFinal['overview'] = dfMoviesFinal['overview'].astype(str)

In [43]:
# Check which Python types occur in the 'overview' column.
print(dfMoviesFinal['overview'].apply(type).value_counts())

overview
<class 'str'>    45346
Name: count, dtype: int64


In [44]:
# Functions for processing the text column
def process_tokens(text_series):
    """Tokenize each text, drop English stopwords and stem the remaining words.

    Parameters
    ----------
    text_series : pandas.Series
        Series of strings; every value is passed to ``word_tokenize``.

    Returns
    -------
    pandas.Series
        Series with the same index, where each value is the stemmed,
        stopword-free tokens joined by single spaces.
    """
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    def process_row(text):
        # NLTK's English stopword list is lowercase, so match on the lowercased
        # token; otherwise capitalised stopwords such as 'The' would slip
        # through the filter. Already-lowercased input is unaffected.
        return ' '.join(
            stemmer.stem(word)
            for word in word_tokenize(text)
            if word.lower() not in stop_words
        )

    return text_series.apply(process_row)

def preprocess_column(df):
    """Return a new DataFrame with a cleaned 'overview' and a 'processed_overview'.

    The 'overview' text has every character outside a-z/A-Z replaced by a
    space and is lowercased; 'processed_overview' is the result of running
    ``process_tokens`` on that cleaned text.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a string 'overview' column.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is left unmodified, so re-running the
        calling cell always starts from the same input.
    """
    # Preliminary cleaning: keep ASCII letters only, then lowercase.
    cleaned_overview = (
        df['overview']
        .str.replace(r'[^a-zA-Z]', ' ', regex=True)
        .str.lower()
    )

    # assign() builds a new frame instead of writing into the caller's object.
    return df.assign(
        overview=cleaned_overview,
        processed_overview=process_tokens(cleaned_overview),
    )

In [45]:
# Process the column: clean 'overview' and build 'processed_overview'.
dfMoviesSintetico = preprocess_column(dfMoviesFinal)

In [22]:
def recomendacion(title, n=5):
    """Recommend the movies whose overviews are most similar to that of ``title``.

    Similarity is the cosine similarity between TF-IDF vectors built from the
    'processed_overview' column of the module-level ``dfMoviesSintetico``.

    Parameters
    ----------
    title : str
        Exact title to look up in ``dfMoviesSintetico['title']``. If several
        rows share the title, the first one is used as the query.
    n : int, optional
        Number of recommendations to return (default 5).

    Returns
    -------
    pandas.DataFrame
        The 'title' and 'overview' columns of the ``n`` most similar rows,
        most similar first. The query row itself is never included.

    Raises
    ------
    ValueError
        If ``title`` does not appear in the 'title' column.
    """
    # Check the title first so an unknown title fails before the TF-IDF fit.
    title_mask = (dfMoviesSintetico['title'] == title).to_numpy()
    if not title_mask.any():
        raise ValueError(f"El título '{title}' no se encuentra en el DataFrame.")
    query_pos = int(title_mask.argmax())  # position of the first matching row

    # Vectorize the processed overviews with TF-IDF.
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(dfMoviesSintetico['processed_overview'])

    # Vector for the queried movie's processed overview.
    processed_summary = dfMoviesSintetico['processed_overview'].iloc[query_pos]
    query_vector = vectorizer.transform([processed_summary])

    # Cosine similarity between the queried movie and every row.
    cosine_sim = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # Rank all rows from most to least similar, then drop the query row by
    # position instead of assuming it sorts last: a row with an identical
    # overview ties with it at the top, in which case slicing off the last
    # element could discard a genuine match and keep the query itself.
    ranked = cosine_sim.argsort()[::-1]
    similar_indices = ranked[ranked != query_pos][:max(n, 0)]

    recommended_movies = dfMoviesSintetico.iloc[similar_indices]
    return recommended_movies[['title', 'overview']]

In [24]:

# Example query: recommendations for the title 'Batman'.
recommendations = recomendacion('Batman')
print(recommendations)

                                    title  \
9223   Batman Beyond: Return of the Joker   
18232               The Dark Knight Rises   
25220                   Batman vs Dracula   
15498          Batman: Under the Red Hood   
41884               DC Showcase: Catwoman   

                                                overview  
9223   the joker is back with a vengeance  and gotham...  
18232  following the death of district attorney harve...  
25220  gotham city is terrorized not only by recent e...  
15498  batman faces his ultimate challenge as the mys...  
41884  catwoman attempts to track down a mysterious c...  


In [39]:
# Display the processed frame (the notebook renders a truncated preview).
dfMoviesSintetico

Unnamed: 0,budget,id_pelicula,overview,release_date,revenue,title,vote_average,vote_count,release_year,return,name_genres,processed_overview
0,30000000.0,862,led by woody andy s toys live happily in his ...,1995-10-30,373554033,Toy Story,7.7,5415.0,1995.0,12.451801,Animation,led woodi andi toy live happili room andi birt...
1,65000000.0,8844,when siblings judy and peter discover an encha...,1995-12-15,262797249,Jumanji,6.9,2413.0,1995.0,4.043035,Adventure,sibl judi peter discov enchant board game open...
2,0.0,15602,a family wedding reignites the ancient feud be...,1995-12-22,0,Grumpier Old Men,6.5,92.0,1995.0,0.000000,Romance,famili wed reignit ancient feud next door neig...
3,16000000.0,31357,cheated on mistreated and stepped on the wom...,1995-12-22,81452156,Waiting to Exhale,6.1,34.0,1995.0,5.090760,Comedy,cheat mistreat step women hold breath wait elu...
4,0.0,11862,just when george banks has recovered from his ...,1995-02-10,76578911,Father of the Bride Part II,5.7,173.0,1995.0,0.000000,Comedy,georg bank recov daughter wed receiv news preg...
...,...,...,...,...,...,...,...,...,...,...,...,...
45341,0.0,30840,yet another version of the classic epic with ...,1991-05-13,0,Robin Hood,5.7,26.0,1991.0,0.000000,Drama,yet anoth version classic epic enough variat m...
45342,0.0,111109,an artist struggles to finish his work while a...,2011-11-17,0,Century of Birthing,9.0,3.0,2011.0,0.000000,Drama,artist struggl finish work storylin cult play ...
45343,0.0,67758,when one of her hits goes wrong a professiona...,2003-08-01,0,Betrayal,3.8,6.0,2003.0,0.000000,Action,one hit goe wrong profession assassin end suit...
45344,0.0,227506,in a small town live two brothers one a minis...,1917-10-21,0,Satan Triumphant,0.0,0.0,1917.0,0.000000,,small town live two brother one minist one hun...
