In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ast
import datetime
from wordcloud import WordCloud, STOPWORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import warnings
warnings.filterwarnings('ignore')

In [3]:
pelis = pd.read_csv('moviesclean.csv')

In [8]:
def desanidar_genres(row):
    genres_dict = ast.literal_eval(row)
    genres_list = [genre['name'] for genre in genres_dict]
    return genres_list
pelis['genres'] = pelis['genres'].apply(desanidar_genres)


In [13]:
import ast
import math

def obtener_nombre(row):
    try:
        diccionario = ast.literal_eval(row)
        if isinstance(diccionario, dict) and not math.isnan(diccionario.get('id')):
            nombre = diccionario['name']
            return nombre
        else:
            return None
    except (SyntaxError, ValueError):
        return None

pelis['belongs_to_collection_name'] = pelis['belongs_to_collection'].apply(obtener_nombre)



In [16]:
#Desanidar listas
#production_companies
def desanidar_production_companies(row):
    try:
        production_companies_list = ast.literal_eval(row)
        companies = [companie['name'] for companie in production_companies_list]
        return ', '.join(companies)
    except (SyntaxError, ValueError, TypeError):
        return ''


pelis['companies'] = pelis['production_companies'].apply(desanidar_production_companies)

#production_countries
def desanidar_production_countries(row):
    try:
        production_countries_list = ast.literal_eval(row)
        countries = [countrie['name'] for countrie in production_countries_list]
        return ', '.join(countries)
    except (SyntaxError, ValueError, TypeError):
        return ''

pelis['countries'] = pelis['production_countries'].apply(desanidar_production_countries)

#spoken_languages
def desanidar_spoken_languages(row):
    try:
        spoken_languages_list = ast.literal_eval(row)
        languages = [language['name'] for language in spoken_languages_list]
        return ', '.join(languages)
    except (SyntaxError, ValueError):
        return ''

pelis['languages'] = pelis['spoken_languages'].apply(desanidar_spoken_languages)





In [19]:
#Los valores nulos de los campos revenue, budget deben ser rellenados por el número 0.
pelis['revenue'].fillna(0, inplace=True)
pelis['budget'].fillna(0, inplace=True)

In [21]:
#Los valores nulos del campo release date deben eliminarse.


pelis.dropna(subset=['release_date'], inplace=True)

In [29]:
#Año de estreno
from datetime import datetime

def obtener_anio(fecha):
    try:
        fecha_objeto = datetime.strptime(fecha, "%Y-%m-%d")
        anio = fecha_objeto.year
        return int(anio)
    except ValueError:
        return None

pelis['release_year'] = pelis['release_date'].apply(obtener_anio).astype('Int64')

In [32]:
#retorno
pelis['revenue']=pelis['revenue'].replace(0,np.nan)
pelis['budget']=pd.to_numeric(pelis['budget'],errors='coerce')
pelis['budget']=pelis['budget'].replace(0,np.nan)

pelis['return']=pelis['revenue']/pelis['budget']
print(pelis[pelis['return'].isnull()].shape)
print(pelis[pelis['budget'].isnull()].shape)

(39998, 30)
(36493, 30)


In [34]:
columnas_eliminar = ['video', 'imdb_id', 'adult', 'original_title', 'poster_path', 'homepage']
pelis = pelis.drop(columnas_eliminar, axis=1)

In [36]:
columnas_eliminar = ['belongs_to_collection', 'spoken_languages', 'production_companies', 'production_countries']
pelis = pelis.drop(columnas_eliminar, axis=1)

In [38]:
pelis.to_csv('moviesclean.csv', index=False)

In [4]:
pelis.head()

Unnamed: 0,budget,genres,id,original_language,overview,popularity,release_date,revenue,runtime,status,tagline,title,vote_average,vote_count,belongs_to_collection_name,companies,countries,languages,release_year,return
0,30000000.0,"['Animation', 'Comedy', 'Family']",862,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,1995-10-30,373554033.0,81.0,Released,,Toy Story,7.7,5415.0,Toy Story Collection,Pixar Animation Studios,United States of America,English,1995.0,12.451801
1,65000000.0,"['Adventure', 'Fantasy', 'Family']",8844,en,When siblings Judy and Peter discover an encha...,17.015539,1995-12-15,262797249.0,104.0,Released,Roll the dice and unleash the excitement!,Jumanji,6.9,2413.0,,"TriStar Pictures, Teitler Film, Interscope Com...",United States of America,"English, Français",1995.0,4.043035
2,,"['Romance', 'Comedy']",15602,en,A family wedding reignites the ancient feud be...,11.7129,1995-12-22,,101.0,Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,6.5,92.0,Grumpy Old Men Collection,"Warner Bros., Lancaster Gate",United States of America,English,1995.0,
3,16000000.0,"['Comedy', 'Drama', 'Romance']",31357,en,"Cheated on, mistreated and stepped on, the wom...",3.859495,1995-12-22,81452156.0,127.0,Released,Friends are the people who let you be yourself...,Waiting to Exhale,6.1,34.0,,Twentieth Century Fox Film Corporation,United States of America,English,1995.0,5.09076
4,,['Comedy'],11862,en,Just when George Banks has recovered from his ...,8.387519,1995-02-10,76578911.0,106.0,Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,5.7,173.0,Father of the Bride Collection,"Sandollar Productions, Touchstone Pictures",United States of America,English,1995.0,


In [7]:
#peliculas por idioma de estreno
conteo_lenguage = pelis['original_language'].value_counts().reset_index()
conteo_lenguage.columns = ['Lenguaje', 'Cantidad']
conteo_lenguage.head(20)
conteo_lenguage.to_csv('conteo_lenguage.csv', index=False)


In [50]:
conteo_peliculas_por_idioma = pelis['languages'].str.split(',').str[0].value_counts()


In [65]:
    print(conteo_peliculas_por_idioma)

{'English': 28729, 'Français': 4194, '广州话 / 廣州話': 473, '普通话': 790, 'Pусский': 1562, 'Español': 2412, '': 45379, 'shqip': 29, 'Italiano': 2366, 'Deutsch': 2624, 'فارسی': 141, 'Nederlands': 339, 'Dansk': 302, 'العربية': 341, 'Magyar': 359, 'Český': 284, 'svenska': 560, '日本語': 1758, 'Português': 591, 'Català': 37, '한국어/조선말': 542, 'Afrikaans': 28, 'বাংলা': 47, 'עִבְרִית': 215, 'Latin': 140, 'Cymraeg': 9, 'Tiếng Việt': 61, 'Polski': 524, 'български език': 32, 'ελληνικά': 213, 'Norsk': 171, 'Bosanski': 33, 'Gaeilge': 21, 'Bokmål': 3, 'Український': 53, 'No Language': 319, 'Kiswahili': 24, 'Srpski': 108, 'हिन्दी': 707, 'Azərbaycan': 4, 'ภาษาไทย': 176, 'Bamanankan': 6, 'suomi': 371, 'Română': 128, 'Hrvatski': 52, 'Türkçe': 247, 'ქართული': 33, 'Slovenčina': 25, 'беларуская мова': 2, 'Esperanto': 7, 'Galego': 5, 'Íslenska': 60, 'isiZulu': 18, 'Eesti': 50, 'Latviešu': 20, 'қазақ': 11, 'Slovenščina': 22, 'Bahasa indonesia': 37, 'Wolof': 14, 'اردو': 55, 'Kinyarwanda': 3, 'euskera': 16, 'Bahasa mela

In [66]:
conteo_peliculas_por_idioma = {'English': 28729, 'Français': 4194, '广州话 / 廣州話': 473, '普通话': 790, 'Pусский': 1562, 'Español': 2412, '': 45379, 'shqip': 29, 'Italiano': 2366, 'Deutsch': 2624, 'فارسی': 141, 'Nederlands': 339, 'Dansk': 302, 'العربية': 341, 'Magyar': 359, 'Český': 284, 'svenska': 560, '日本語': 1758, 'Português': 591, 'Català': 37, '한국어/조선말': 542, 'Afrikaans': 28, 'বাংলা': 47, 'עִבְרִית': 215, 'Latin': 140, 'Cymraeg': 9, 'Tiếng Việt': 61, 'Polski': 524, 'български език': 32, 'ελληνικά': 213, 'Norsk': 171, 'Bosanski': 33, 'Gaeilge': 21, 'Bokmål': 3, 'Український': 53, 'No Language': 319, 'Kiswahili': 24, 'Srpski': 108, 'हिन्दी': 707, 'Azərbaycan': 4, 'ภาษาไทย': 176, 'Bamanankan': 6, 'suomi': 371, 'Română': 128, 'Hrvatski': 52, 'Türkçe': 247, 'ქართული': 33, 'Slovenčina': 25, 'беларуская мова': 2, 'Esperanto': 7, 'Galego': 5, 'Íslenska': 60, 'isiZulu': 18, 'Eesti': 50, 'Latviešu': 20, 'қазақ': 11, 'Slovenščina': 22, 'Bahasa indonesia': 37, 'Wolof': 14, 'اردو': 55, 'Kinyarwanda': 3, 'euskera': 16, 'Bahasa melayu': 16, 'தமிழ்': 111, 'తెలుగు': 68, 'Lietuvi\x9akai': 26}

df = pd.DataFrame(list(conteo_peliculas_por_idioma.items()), columns=['idioma', 'cantidad_peliculas'])

In [68]:
df.to_csv('peliculas_por_idioma.csv', index=False)

In [69]:
#def peliculas_duracion( Pelicula: str ): Se ingresa una pelicula. Debe devolver la duracion y el año.
duracion_peliculas = pelis.loc[:, ['title', 'release_year', 'runtime']]
duracion_peliculas.to_csv('duracion_peliculas.csv', index=False)

In [81]:

# Crear una lista de la columna 'belongs_to_collection_name' y contar la cantidad de veces que se repite
# Agrupar por 'belongs_to_collection_name' y sumar 'revenue' para cada grupo
pd.set_option('display.float_format', '{:.2f}'.format)
agrupado = pelis.groupby('belongs_to_collection_name')['revenue'].sum().reset_index()

# Contar la cantidad de veces que se repite cada valor de 'belongs_to_collection_name'
conteo_peliculas = pelis['belongs_to_collection_name'].value_counts().reset_index()
conteo_peliculas.columns = ['belongs_to_collection_name', 'peliculas']

# Combinar los datos de la suma de 'revenue' y el conteo de películas
nuevo_dataset = pd.merge(conteo_peliculas, agrupado, on='belongs_to_collection_name')

# Calcular el promedio dividiendo 'revenue' entre 'peliculas'
nuevo_dataset['promedio'] = nuevo_dataset['revenue'] / nuevo_dataset['peliculas']

In [83]:
nuevo_dataset.to_csv('franquicia.csv', index=False)

In [91]:
#def peliculas_pais( Pais: str ):
# Agrupar por 'production_countries' y contar la cantidad de veces que aparece cada país
conteo_paises = pelis['countries'].str.split(',').explode().str.strip().value_counts()
print(conteo_paises)
conteo_paises.to_csv('conteo_paises.csv', index=True)

United States of America                21147
                                         6214
United Kingdom                           4091
France                                   3939
Germany                                  2254
                                        ...  
Somalia                                     1
United States Minor Outlying Islands        1
Martinique                                  1
Kuwait                                      1
Guinea                                      1
Name: countries, Length: 161, dtype: int64


In [None]:
conteo_paises = pelis['countries'].str.split(',').explode().str.strip().value_counts()


In [10]:
#def productoras_exitosas( Productora: str ):
conteocomp = pelis['companies'].str.split(',').str[0].value_counts()
print(conteocomp)

Paramount Pictures                        998
Metro-Goldwyn-Mayer (MGM)                 852
Twentieth Century Fox Film Corporation    780
Warner Bros.                              757
Universal Pictures                        754
                                         ... 
29 fevralya                                 1
Full Stealth Films                          1
William Cagney Productions                  1
MiniFlix Films                              1
Yermoliev                                   1
Name: companies, Length: 10600, dtype: int64


In [18]:
conteocomp.to_csv('conteocomp.csv', index=True)

In [46]:
#Este codigo halla la  suma de revenue por companie
nombre_compani= 'Paramount Pictures'
filtered_rows = pelis[pelis['companies'].str.contains(nombre_compani, na=False, case=False)]

# Seleccionar las columnas 'companies' y 'revenue' del DataFrame filtrado
tabla = filtered_rows[['companies', 'revenue']]



# Mostrar la tabla y el total de 'revenue'

total_revenue = tabla['revenue'].sum()

# Mostrar la tabla y el total de 'revenue'
print(tabla)

print(f"La productora {nombre_compani} ha tenido un revenue de {total_revenue} con un total de {len(filtered_rows)} peliculas")



                                               companies      revenue
6      Paramount Pictures, Scott Rudin Productions, M...          NaN
38                                    Paramount Pictures          NaN
56     Paramount Pictures, Egg Pictures, PolyGram Fil...   17519169.0
59     Paramount Pictures, Columbia Pictures Corporat...          NaN
60                                    Paramount Pictures          NaN
...                                                  ...          ...
43822  Paramount Pictures, Paramount Television, Ubu ...          NaN
44762  Paramount Pictures, Di Bonaventura Pictures, A...  604942143.0
44863                                 Paramount Pictures          NaN
44875                                 Paramount Pictures          NaN
44920              Paramount Pictures, Fleischer Studios          NaN

[1007 rows x 2 columns]
La productora Paramount Pictures ha tenido un revenue de 48828192922.0 con un total de 1007 peliculas


In [None]:
#def get_director( nombre_director ): Se ingresa el nombre de un director que se encuentre dentro de un 
# dataset debiendo devolver el éxito del mismo medido a través del retorno. Además, deberá devolver el
#  nombre de cada película con la fecha de lanzamiento, retorno individual, costo y ganancia de la misma, 
# en formato lista.

In [49]:
get_director = pd.read_csv('dir_pel.csv', low_memory=False)

In [53]:
director = 'Quentin Tarantino'

# Filtrar las filas correspondientes al director especificado
filtered_rows = get_director[get_director['director'] == director]

# Obtener las columnas 'title', 'return', 'budget' y 'revenue' como lista
result = filtered_rows[['title', 'return', 'budget', 'revenue', 'year']].values.tolist()

# Mostrar el resultado
print(result)

[['Pulp Fiction', 26.74109525, 8000000.0, 213928762.0, '1994'], ['Reservoir Dogs', 12.217505833333334, 1200000.0, 14661007.0, '1992'], ['Jackie Brown', 3.306096833333333, 12000000.0, 39673162.0, '1997'], ['Kill Bill: Vol. 1', 6.031633333333334, 30000000.0, 180949000.0, '2003'], ['Kill Bill: Vol. 2', 5.071982033333334, 30000000.0, 152159461.0, '2004'], ['Death Proof', 1.00151588, 25000000.0, 25037897.0, '2007'], ['Inglourious Basterds', 4.559015, 70000000.0, 319131050.0, '2009'], ['Django Unchained', 4.25368238, 100000000.0, 425368238.0, '2012'], ['The Hateful Eight', 3.5400026590909093, 44000000.0, 155760117.0, '2015'], ["My Best Friend's Birthday", nan, nan, nan, '1987']]


In [48]:
read_csv.head()

Unnamed: 0,director,title,budget,revenue,return,year
0,John Lasseter,Toy Story,30000000.0,373554033.0,12.451801,1995
1,Joe Johnston,Jumanji,65000000.0,262797249.0,4.043035,1995
2,Howard Deutch,Grumpier Old Men,,,,1995
3,Forest Whitaker,Waiting to Exhale,16000000.0,81452156.0,5.09076,1995
4,Charles Shyer,Father of the Bride Part II,,76578911.0,,1995


In [37]:
pelis.head()

Unnamed: 0,budget,genres,id,original_language,overview,popularity,release_date,revenue,runtime,status,tagline,title,vote_average,vote_count,belongs_to_collection_name,companies,countries,languages,release_year,return
0,30000000.0,"[Animation, Comedy, Family]",862,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,1995-10-30,373554033.0,81.0,Released,,Toy Story,7.7,5415.0,Toy Story Collection,Pixar Animation Studios,United States of America,English,1995,12.451801
1,65000000.0,"[Adventure, Fantasy, Family]",8844,en,When siblings Judy and Peter discover an encha...,17.015539,1995-12-15,262797249.0,104.0,Released,Roll the dice and unleash the excitement!,Jumanji,6.9,2413.0,,"TriStar Pictures, Teitler Film, Interscope Com...",United States of America,"English, Français",1995,4.043035
2,,"[Romance, Comedy]",15602,en,A family wedding reignites the ancient feud be...,11.7129,1995-12-22,,101.0,Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,6.5,92.0,Grumpy Old Men Collection,"Warner Bros., Lancaster Gate",United States of America,English,1995,
3,16000000.0,"[Comedy, Drama, Romance]",31357,en,"Cheated on, mistreated and stepped on, the wom...",3.859495,1995-12-22,81452156.0,127.0,Released,Friends are the people who let you be yourself...,Waiting to Exhale,6.1,34.0,,Twentieth Century Fox Film Corporation,United States of America,English,1995,5.09076
4,,[Comedy],11862,en,Just when George Banks has recovered from his ...,8.387519,1995-02-10,76578911.0,106.0,Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,5.7,173.0,Father of the Bride Collection,"Sandollar Productions, Touchstone Pictures",United States of America,English,1995,
