In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
df = pd.read_csv('Datasets/dataset_limpio.csv')

In [4]:
df.head()

Unnamed: 0,id,title,genres,overview,tagline,spoken_languages,decade
0,862,Toy Story,"['Animation', 'Comedy', 'Family']","Led by Woody, Andy's toys live happily in his ...",,['English'],1990
1,8844,Jumanji,"['Adventure', 'Fantasy', 'Family']",When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,"['English', 'Français']",1990
2,15602,Grumpier Old Men,"['Romance', 'Comedy']",A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,['English'],1990
3,31357,Waiting to Exhale,"['Comedy', 'Drama', 'Romance']","Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...,['English'],1990
4,11862,Father of the Bride Part II,['Comedy'],Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...,['English'],1990


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42196 entries, 0 to 42195
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                42196 non-null  object
 1   title             42196 non-null  object
 2   genres            42196 non-null  object
 3   overview          42196 non-null  object
 4   tagline           42196 non-null  object
 5   spoken_languages  42196 non-null  object
 6   decade            42196 non-null  object
dtypes: object(7)
memory usage: 2.3+ MB


In [6]:
df.isnull().sum()

id                  0
title               0
genres              0
overview            0
tagline             0
spoken_languages    0
decade              0
dtype: int64

In [7]:
#eliminación de caracteres especiales y conversión a minúsculas:
df['overview'] = df['overview'].str.replace('[^\w\s]', '').str.lower().str.replace(",", "").str.replace(".", "").str.replace("!", "")
df['tagline'] = df['tagline'].str.replace("[^\w\s]", "").str.lower().str.replace(",", "").str.replace(".", "").str.replace("!", "")

In [8]:
#eliminación de palabras vacías (stopwords) y lematización:

nltk.download('stopwords')
nltk.download('wordnet')
#nltk.download('omw-1.4')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

df['overview'] = df['overview'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split() if word not in stop_words]))
df['tagline'] = df['tagline'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split() if word not in stop_words]))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ssanjua/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/ssanjua/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [9]:
df['overview'].iloc[0]

"led woody andy's toy live happily room andy's birthday brings buzz lightyear onto scene afraid losing place andy's heart woody plot buzz circumstance separate buzz woody owner duo eventually learns put aside difference"

In [10]:
df['tagline'].iloc[1]

'roll dice unleash excitement'

In [11]:
# Seleccionar los campos 'overview' y 'tagline'
text_data = df[['overview', 'tagline']]
# Combinar los campos en un solo texto
df['texto_combinado'] = text_data['overview'] + ' ' + text_data['tagline']

In [12]:
df = df.drop('overview', axis=1)
df = df.drop('tagline', axis=1)

In [14]:
df.head()

Unnamed: 0,id,title,genres,spoken_languages,decade,texto_combinado
0,862,Toy Story,"['Animation', 'Comedy', 'Family']",['English'],1990,led woody andy's toy live happily room andy's ...
1,8844,Jumanji,"['Adventure', 'Fantasy', 'Family']","['English', 'Français']",1990,sibling judy peter discover enchanted board ga...
2,15602,Grumpier Old Men,"['Romance', 'Comedy']",['English'],1990,family wedding reignites ancient feud next-doo...
3,31357,Waiting to Exhale,"['Comedy', 'Drama', 'Romance']",['English'],1990,cheated mistreated stepped woman holding breat...
4,11862,Father of the Bride Part II,['Comedy'],['English'],1990,george bank recovered daughter's wedding recei...


In [15]:
# Crear el vectorizador TF-IDF. lo limito a 50000 para que no sea tan grande
vectorizer = TfidfVectorizer(analyzer='word', stop_words='english', max_features=50000)

# Aplicar el vectorizador a los datos de texto
tfidf_matrix = vectorizer.fit_transform(df['texto_combinado'])

In [16]:
tfidf_matrix.shape

(42196, 50000)

In [17]:
#similaritud del coseno: 
coseno_sim_text = cosine_similarity(tfidf_matrix)

COUNT MATRIX

In [18]:
#quito los espacios en los generos formados por mas de una silaba
df['genres'] = df['genres'].str.replace(" ", "")

In [19]:
#duplico 'genres' para darle mas importancia y las uno en un campo
df['combined_features'] = df['genres'] + df['genres'] + df['decade'] + df['spoken_languages']

In [20]:
df['combined_features'][7]

"['Action','Adventure','Drama','Family']['Action','Adventure','Drama','Family']1990['English', 'Deutsch']"

In [None]:
df = df.drop('genres', axis=1)
df = df.drop('decade', axis=1)
df = df.drop('id', axis=1)
df = df.drop('spoken_languages', axis=1)

In [27]:
df['combined_features'] = df['combined_features'].str.replace("'", " ").str.replace("[", " ").str.replace("]", " ").str.replace(",", " ").str.lower()

In [28]:
df.head(5)

Unnamed: 0,title,texto_combinado,combined_features
0,Toy Story,led woody andy's toy live happily room andy's ...,animation comedy family animation c...
1,Jumanji,sibling judy peter discover enchanted board ga...,adventure fantasy family adventure ...
2,Grumpier Old Men,family wedding reignites ancient feud next-doo...,romance comedy romance comedy 1990 ...
3,Waiting to Exhale,cheated mistreated stepped woman holding breat...,comedy drama romance comedy drama ...
4,Father of the Bride Part II,george bank recovered daughter's wedding recei...,comedy comedy 1990 english


In [31]:
#Vectorización de texto
vectorizer = CountVectorizer(stop_words='english')
feature_matrix = vectorizer.fit_transform(df['combined_features'])

In [32]:
feature_matrix.shape

(42196, 247)

In [33]:
coseno_sim_features = cosine_similarity(feature_matrix)

In [34]:
# Combina las similitudes utilizando algún método (por ejemplo, suma ponderada)
combined_similarity = 0.6 * coseno_sim_text + 0.4 * coseno_sim_features

In [35]:
def get_recommendations(title):
    titles = df['title']
    indices = pd.Series(df.index, index=df['title'])  
    idx = indices[title]
    sim_scores = list(enumerate(combined_similarity[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:6]  # Obtenemos las 20 películas más similares
    movie_indices = [i[0] for i in sim_scores]
    
    return df['title'].iloc[movie_indices]

In [36]:
get_recommendations('Toy Story')

2961              Toy Story 2
14764             Toy Story 3
23200               Small Fry
20865    Toy Story of Terror!
726             A Close Shave
Name: title, dtype: object

In [37]:
get_recommendations('Minions')

20145    Despicable Me 2
5271     Stuart Little 2
40892    Despicable Me 3
18828     Wreck-It Ralph
41898             Banana
Name: title, dtype: object

In [38]:
get_recommendations('Avatar')

6320     Lara Croft Tomb Raider: The Cradle of Life
25040                                Thor: Ragnarok
9904                                 Fantastic Four
14579                                  Solomon Kane
2502                                    Superman II
Name: title, dtype: object

In [39]:
get_recommendations('Titanic')

2045      Return to Paradise
342              Bitter Moon
652                     Fear
34752    Allyson Is Watching
14595                Voyager
Name: title, dtype: object

In [40]:
get_recommendations('Superman')

2502                          Superman II
10733                    Superman Returns
30221                Atom Man vs Superman
2504     Superman IV: The Quest for Peace
20058                        Man of Steel
Name: title, dtype: object

In [41]:
df.shape

(42196, 3)

In [42]:
small_df = df.sample(n=10000, random_state=42)

In [66]:
small_df=small_df.reset_index(drop=True)

In [91]:
small_df

Unnamed: 0,title,texto_combinado,combined_features
0,The Business of Fancydancing,seymour polatkin successful gay indian poet sp...,music drama music drama 2000 english
1,Dollman,brick bardo traveller outer space forced land ...,action comedy crime sciencefiction ...
2,The Dukes,dukesa doo wop group top world 17 struggling s...,comedy crime drama music comedy c...
3,Dragon Wars: D-War,based korean legend unknown creature return de...,fantasy drama horror action thriller...
4,Tom and Huck,mischievous young boy tom sawyer witness murde...,action adventure drama family actio...
...,...,...,...
9995,Slaughterhouse,owner slaughterhouse facing foreclosure instru...,horror horror 1980 english
9996,Mother's Heart,lorenz three young child victim medium wish tu...,drama drama 1960 italiano
9997,Antisocial,five university friend gather house party ring...,horror thriller horror thriller 2010...
9998,Shadows of the Dead,group teenager try escape creature life among ...,horror horror 2010 english


In [68]:
# Crear el vectorizador TF-IDF. lo limito a 50000 para que no sea tan grande
vectorizerSML = TfidfVectorizer(analyzer='word', stop_words='english', max_features=50000)

# Aplicar el vectorizador a los datos de texto
tfidf_matrixSML = vectorizerSML.fit_transform(small_df['texto_combinado'])

In [69]:
tfidf_matrixSML.shape

(10000, 34041)

In [70]:
#similaritud del coseno: 
coseno_sim_textdSML = cosine_similarity(tfidf_matrixSML)

In [71]:
#Vectorización de texto
vectorizerSML_features = CountVectorizer(stop_words='english')
feature_matrixSML = vectorizerSML_features.fit_transform(small_df['combined_features'])

In [72]:
feature_matrixSML.shape

(10000, 177)

In [73]:
coseno_sim_featuresSML = cosine_similarity(feature_matrixSML)

In [74]:
# Combina las similitudes utilizando algún método (por ejemplo, suma ponderada)
combined_similaritySML = 0.6 * coseno_sim_textdSML + 0.4 * coseno_sim_featuresSML

In [75]:
def get_recommendationsSML(title):
    titles = df['title']
    indices = pd.Series(df.index, index=df['title'])  
    idx = indices[title]
    sim_scores = list(enumerate(combined_similaritySML[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:6]  # Obtenemos las 20 películas más similares
    movie_indices = [i[0] for i in sim_scores]
    
    return df['title'].iloc[movie_indices]

In [77]:
get_recommendationsSML('Titanic')

879                     North by Northwest
8460                            The Devils
4171    Whatever Happened to Harold Smith?
4102                        The Lost World
4507                            Black Robe
Name: title, dtype: object

In [89]:
get_recommendationsSML('Toy Story')

429                   Cliffhanger
2485    A Midsummer Night's Dream
2591             The Golden Child
6767                      Revenge
9827              Innocent Voices
Name: title, dtype: object

In [106]:
#definicion final para API:
def recomendacion(title:str):
    indices = pd.Series(df.index, index=df['title'])  
    if title not in indices:
        return "La película no está en el dataset reducido"

    idx = indices[title]
    vectorizer = TfidfVectorizer(analyzer='word', stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(small_df['texto_combinado'])
    coseno_sim_text = cosine_similarity(tfidf_matrix)
    vectorizer_features = CountVectorizer(stop_words='english')
    feature_matrix = vectorizer_features.fit_transform(small_df['combined_features'])
    coseno_sim_features = cosine_similarity(feature_matrix)
    combined_similarity = 0.6 * coseno_sim_text + 0.4 * coseno_sim_features

    sim_scores = list(enumerate(combined_similarity[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:6]  # Obtenemos las 5 películas más similares
    movie_indices = [i[0] for i in sim_scores]
    recommendations=list(small_df['title'].iloc[movie_indices].str.title())
    
    return {'lista recomendada': recommendations} 

In [107]:
recomendacion('Minion')

'La película no está en el dataset reducido'

In [108]:
recomendacion('Toy Story')

{'lista recomendada': ['Winter Solstice',
  'Hendrix',
  'Masked And Anonymous',
  'Passing Strange',
  'Prey For Rock & Roll']}

In [109]:
#small_df.to_csv('Datasets/dataset_reducido.csv', index=False)

In [120]:
small_df = pd.read_csv('Datasets/dataset_reducido.csv')

In [123]:
def recomendacion2(title:str):
    '''Ingresas un nombre de pelicula y te recomienda las similares en una lista'''
    #small_df = pd.read_csv('../Datasets/dataset_reducido.csv')
    indices = pd.Series(df.index, index=df['title'])  
    if title not in indices:
        return "La película no está en el dataset reducido"
    idx = indices[title]
    vectorizer = TfidfVectorizer(analyzer='word', stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(small_df['texto_combinado'])
    coseno_sim_text = cosine_similarity(tfidf_matrix)
    vectorizer_features = CountVectorizer(stop_words='english')
    feature_matrix = vectorizer_features.fit_transform(small_df['combined_features'])
    coseno_sim_features = cosine_similarity(feature_matrix)
    combined_similarity = 0.6 * coseno_sim_text + 0.4 * coseno_sim_features

    sim_scores = list(enumerate(combined_similarity[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:6]  # Obtenemos las 5 películas más similares
    movie_indices = [i[0] for i in sim_scores]
    recommendations=list(small_df['title'].iloc[movie_indices].str.title())
    
    return {'lista recomendada': recommendations} 

In [124]:
recomendacion2('Titanic')

{'lista recomendada': ['Porn Star: The Legend Of Ron Jeremy',
  'American Scary',
  'I Knew It Was You: Rediscovering John Cazale',
  'Cinemania',
  'American Movie']}

La siguiente etapa implica la construcción de una matriz de similitud entre las películas. Esta matriz nos ayudará a calcular la similitud entre películas y así generar recomendaciones basadas en películas similares.