### ETL credits ###

Procedemos a importar librerias necesarias, cargar el archivo, configurar vistas de impresiones. Luego visualizaremos la estructura del dataset.


In [1]:
import pandas as pd
import ast


In [4]:
# Cargamos el dataset de créditos
credits_df = pd.read_csv('credits.csv')

credits_df = credits_df.copy()   # Hacemos una copia para asegurarnos que se guarden los cambios que hagamos.



Se inspeccionará que columnas hay y que tipo de dato tienen


In [5]:
# Visualizamos la información del DataFrame
print(credits_df.info())

# Verificamos si hay valores nulos
print(credits_df.isnull().sum())

# Aplicamos type a todos los elementos de las columnas seleccionadas
type_data_columns = credits_df[['cast', 'crew', 'id']].applymap(type)

# Muestra los tipos de datos de cada elemento en las columnas seleccionadas
print(type_data_columns)

# Verificamos los tipos de datos de las columnas
tipos_por_columna = credits_df.applymap(type).nunique()
print("Cantidad de tipos de datos únicos por columna:")
print(tipos_por_columna)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45476 entries, 0 to 45475
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   cast    45476 non-null  object
 1   crew    45476 non-null  object
 2   id      45476 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.0+ MB
None
cast    0
crew    0
id      0
dtype: int64
                cast           crew             id
0      <class 'str'>  <class 'str'>  <class 'int'>
1      <class 'str'>  <class 'str'>  <class 'int'>
2      <class 'str'>  <class 'str'>  <class 'int'>
3      <class 'str'>  <class 'str'>  <class 'int'>
4      <class 'str'>  <class 'str'>  <class 'int'>
...              ...            ...            ...
45471  <class 'str'>  <class 'str'>  <class 'int'>
45472  <class 'str'>  <class 'str'>  <class 'int'>
45473  <class 'str'>  <class 'str'>  <class 'int'>
45474  <class 'str'>  <class 'str'>  <class 'int'>
45475  <class 'str'>  <class 'str'>  <class 'int'>

[

  type_data_columns = credits_df[['cast', 'crew', 'id']].applymap(type)
  tipos_por_columna = credits_df.applymap(type).nunique()


Como vamos a usar el id para nuestro modelo de recomendación, vamos a revisar bien dicha columna para prevenir errores

In [None]:
# Verificamos el tipo de dato de la columna 'id'
print(type(credits_df['id'][0]))

<class 'numpy.int64'>


In [7]:
#Eliminamos duplicados
credits_df.drop_duplicates(subset='id', inplace=True)

Dado que son cadenas que representan listas de dict, se realiza la conversion necesaria para poder extraer la informacion útil 

In [8]:
# Convertimos columnas 'cast' y 'crew' de string a listas de diccionarios
credits_df['cast'] = credits_df['cast'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else None)
credits_df['crew'] = credits_df['crew'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else None)

# Extraemos nombres de actores en 'cast'
credits_df['actor_names'] = credits_df['cast'].apply(lambda x: [actor['name'] for actor in x if 'name' in actor] if x else pd.NA)

# Extraemos nombre de los directores en 'crew'
credits_df['director_names'] = credits_df['crew'].apply(lambda x: [director['name'] for director in x if director.get('job') == 'Director'] if x else pd.NA)

# Visualizamos algunas filas para verificar las transformaciones
print(credits_df[['actor_names', 'director_names']].head())



                                         actor_names     director_names
0  [Tom Hanks, Tim Allen, Don Rickles, Jim Varney...    [John Lasseter]
1  [Robin Williams, Jonathan Hyde, Kirsten Dunst,...     [Joe Johnston]
2  [Walter Matthau, Jack Lemmon, Ann-Margret, Sop...    [Howard Deutch]
3  [Whitney Houston, Angela Bassett, Loretta Devi...  [Forest Whitaker]
4  [Steve Martin, Diane Keaton, Martin Short, Kim...    [Charles Shyer]


Corroboramos si hay nulos luego de extraer la informacion que necesitamos

In [9]:
# Contabilizamos los valores nulos en 'actor_names' y 'director_names'
nulos_actor_names = credits_df['actor_names'].isna().sum()
nulos_director_names = credits_df['director_names'].isna().sum()

# Imprimimos los resultados
print(f"Cantidad de valores nulos en 'actor_names': {nulos_actor_names}")
print(f"Cantidad de valores nulos en 'director_names': {nulos_director_names}")


Cantidad de valores nulos en 'actor_names': 2414
Cantidad de valores nulos en 'director_names': 771


In [10]:
# Verificar el tipo de dato en cada fila para 'actor_names' y 'director_names'
tipos_actor_names = credits_df['actor_names'].apply(type)
tipos_director_names = credits_df['director_names'].apply(type)

# Mostrar los tipos
print("Tipos de 'actor_names':")
print(tipos_actor_names.value_counts())

print("\nTipos de 'director_names':")
print(tipos_director_names.value_counts())


Tipos de 'actor_names':
actor_names
<class 'list'>                           43018
<class 'pandas._libs.missing.NAType'>     2414
Name: count, dtype: int64

Tipos de 'director_names':
director_names
<class 'list'>                           44661
<class 'pandas._libs.missing.NAType'>      771
Name: count, dtype: int64


Al toparme con valores nulos en las columnas nuevas de actor_names y director_name, como son porcentajes minimos decido excluirlos.

In [11]:
# Eliminamos filas con valores nulos en 'actor_names' y 'director_names'
credits_df.dropna(subset=['actor_names', 'director_names'], inplace=True)

# Verificamos cuántas filas quedan después de la eliminación
print(f"Filas restantes en credits_df: {credits_df.shape[0]}")


Filas restantes en credits_df: 42668


In [12]:
# Contabilizamos los valores nulos en 'actor_names' y 'director_names' luego del proceso
nulos_actor_names = credits_df['actor_names'].isna().sum()
nulos_director_names = credits_df['director_names'].isna().sum()
print(nulos_actor_names)
print(nulos_director_names)

0
0


Elimino las columnas innecesarias

In [13]:
credits_df.drop(columns=['cast', 'crew'], inplace=True)

# Verificamos el nuevo DataFrame
print(credits_df.head())


      id                                        actor_names     director_names
0    862  [Tom Hanks, Tim Allen, Don Rickles, Jim Varney...    [John Lasseter]
1   8844  [Robin Williams, Jonathan Hyde, Kirsten Dunst,...     [Joe Johnston]
2  15602  [Walter Matthau, Jack Lemmon, Ann-Margret, Sop...    [Howard Deutch]
3  31357  [Whitney Houston, Angela Bassett, Loretta Devi...  [Forest Whitaker]
4  11862  [Steve Martin, Diane Keaton, Martin Short, Kim...    [Charles Shyer]


Se finaliza el proceso de transformaciones necesarias, se guarda el nuevo data set

In [14]:
credits_df.to_csv('credits_ok.csv', index=False)

Realizamos la unión de ambos data set limpios para disponer de toda la información junta para las consultas de la API

In [15]:
# Cargamos el nuevo dataset de movies 
movies_ok = pd.read_csv('movies_ok.csv')  

#Cargamos el nuevo dataset de credits
credits_ok = pd.read_csv('credits_ok.csv')

# Verifica que se haya cargado correctamente
print(movies_ok.head())
print(credits_ok.head())


     budget     id                                           overview  popularity release_date      revenue                        title  vote_average  vote_count                 collection_name                         genre_names                                    companies_names               countries_names    spoken_language_names  release_year     return
0  30000000    862  led by woody, andy's toys live happily in his ...   21.946943   1995-10-30  373554033.0                    toy story           7.7      5415.0            toy story collection   ['animation', 'comedy', 'family']                        ['pixar animation studios']  ['united states of america']              ['english']        1995.0  12.451801
1  65000000   8844  when siblings judy and peter discover an encha...   17.015539   1995-12-15  262797249.0                      jumanji           6.9      2413.0                   sin colección  ['adventure', 'fantasy', 'family']  ['tristar pictures', 'teitler film', 'inters

Chequeamos el tamaño del dataframe antes de hacer la unión.

In [16]:
print(credits_ok.shape) #chequeamos tamaño de df antes de filtrarlo

(42668, 3)


In [17]:
# Unir el DataFrame de credits con el de movies
merged_df = movies_ok.merge(credits_ok, on='id', how='left')

# Verificamos el nuevo DataFrame
print(merged_df.head())


     budget     id                                           overview  popularity release_date      revenue                        title  vote_average  vote_count                 collection_name                         genre_names                                    companies_names               countries_names    spoken_language_names  release_year     return                                        actor_names       director_names
0  30000000    862  led by woody, andy's toys live happily in his ...   21.946943   1995-10-30  373554033.0                    toy story           7.7      5415.0            toy story collection   ['animation', 'comedy', 'family']                        ['pixar animation studios']  ['united states of america']              ['english']        1995.0  12.451801  ['Tom Hanks', 'Tim Allen', 'Don Rickles', 'Jim...    ['John Lasseter']
1  65000000   8844  when siblings judy and peter discover an encha...   17.015539   1995-12-15  262797249.0                      jum

Revisamos que efectivamente se redujo el tamaño del dataframe

In [18]:
print(merged_df.shape) #chequeamos tamaño de df antes de filtrarlo

(21392, 18)


Para realizar el modelo de recomendación, necesitaremos unificar en una sola columna, las columnas de género, director y país. 

In [19]:
# Hacemos una copia del DataFrame, es buena practica.
merged_df = merged_df.copy()

# Convertimos cada lista en una cadena de texto
merged_df['genre_names'] = merged_df['genre_names'].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)
merged_df['director_names'] = merged_df['director_names'].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)
merged_df['countries_names'] = merged_df['countries_names'].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)

#Creamos la columna combinada 
merged_df['features'] = merged_df['genre_names'] + " " + merged_df['director_names'] + " " + merged_df['countries_names']

#Manejo de Nan
merged_df['features'] = merged_df['features'].fillna('')   




Para finalizar, convertimos el archivo a parquet porque es una excelente idea para optimizar el almacenamiento y lectura de datos en la API y guardamos el nuevo dataset.

In [20]:

# Guardamos el nuevo dataset 
merged_df.to_parquet("dataset_ok.parquet", index=False)

merged_df.to_csv("dataset_ok.csv", index=False)

In [22]:
merged_df   #Imprimimos para chequear que esté todo aplicado correctamente

Unnamed: 0,budget,id,overview,popularity,release_date,revenue,title,vote_average,vote_count,collection_name,genre_names,companies_names,countries_names,spoken_language_names,release_year,return,actor_names,director_names,features
0,30000000,862,"led by woody, andy's toys live happily in his ...",21.946943,1995-10-30,373554033.0,toy story,7.7,5415.0,toy story collection,"['animation', 'comedy', 'family']",['pixar animation studios'],['united states of america'],['english'],1995.0,12.451801,"['Tom Hanks', 'Tim Allen', 'Don Rickles', 'Jim...",['John Lasseter'],"['animation', 'comedy', 'family'] ['John Lasse..."
1,65000000,8844,when siblings judy and peter discover an encha...,17.015539,1995-12-15,262797249.0,jumanji,6.9,2413.0,sin colección,"['adventure', 'fantasy', 'family']","['tristar pictures', 'teitler film', 'intersco...",['united states of america'],"['english', 'français']",1995.0,4.043035,"['Robin Williams', 'Jonathan Hyde', 'Kirsten D...",['Joe Johnston'],"['adventure', 'fantasy', 'family'] ['Joe Johns..."
2,0,15602,a family wedding reignites the ancient feud be...,11.712900,1995-12-22,0.0,grumpier old men,6.5,92.0,grumpy old men collection,"['romance', 'comedy']","['warner bros.', 'lancaster gate']",['united states of america'],['english'],1995.0,0.000000,"['Walter Matthau', 'Jack Lemmon', 'Ann-Margret...",['Howard Deutch'],"['romance', 'comedy'] ['Howard Deutch'] ['unit..."
3,16000000,31357,"cheated on, mistreated and stepped on, the wom...",3.859495,1995-12-22,81452156.0,waiting to exhale,6.1,34.0,sin colección,"['comedy', 'drama', 'romance']",['twentieth century fox film corporation'],['united states of america'],['english'],1995.0,5.090760,"['Whitney Houston', 'Angela Bassett', 'Loretta...",['Forest Whitaker'],"['comedy', 'drama', 'romance'] ['Forest Whitak..."
4,0,11862,just when george banks has recovered from his ...,8.387519,1995-02-10,76578911.0,father of the bride part ii,5.7,173.0,father of the bride collection,['comedy'],"['sandollar productions', 'touchstone pictures']",['united states of america'],['english'],1995.0,0.000000,"['Steve Martin', 'Diane Keaton', 'Martin Short...",['Charles Shyer'],['comedy'] ['Charles Shyer'] ['united states o...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21387,0,289923,a film archivist revisits the story of rustin ...,0.386450,2000-10-03,0.0,the burkittsville 7,7.0,1.0,sin colección,['horror'],"['neptune salad entertainment', 'pirie product...",['united states of america'],['english'],2000.0,0.000000,"['Monty Bane', 'Lucy Butler', 'David Grammer',...",['Ben Rock'],['horror'] ['Ben Rock'] ['united states of ame...
21388,0,222848,it's the year 3000 ad. the world's most danger...,0.661558,1995-01-01,0.0,caged heat 3000,3.5,1.0,sin colección,['science fiction'],['concorde-new horizons'],['united states of america'],['english'],1995.0,0.000000,"['Lisa Boyle', 'Kena Land', 'Zaneta Polard', '...",['Aaron Osborne'],['science fiction'] ['Aaron Osborne'] ['united...
21389,0,30840,"yet another version of the classic epic, with ...",5.683753,1991-05-13,0.0,robin hood,5.7,26.0,sin colección,"['drama', 'action', 'romance']","['westdeutscher rundfunk (wdr)', 'working titl...","['canada', 'germany', 'united kingdom', 'unite...",['english'],1991.0,0.000000,"['Patrick Bergin', 'Uma Thurman', 'David Morri...",['John Irvin'],"['drama', 'action', 'romance'] ['John Irvin'] ..."
21390,0,67758,"when one of her hits goes wrong, a professiona...",0.903007,2003-08-01,0.0,betrayal,3.8,6.0,sin colección,"['action', 'drama', 'thriller']",['american world pictures'],['united states of america'],['english'],2003.0,0.000000,"['Erika Eleniak', 'Adam Baldwin', 'Julie du Pa...",['Mark L. Lester'],"['action', 'drama', 'thriller'] ['Mark L. Lest..."
