# ETL - Fase 3: Desanidar y normalizar credits

En esta seccion, desinadamos y normalizamos la informacion contenida en `credits.csv`

In [3]:
import os
import numpy as np
import pandas as pd
import json
from ast import literal_eval

In [4]:
# cargamos e inspeccionamos el dataset
data_credits = pd.read_csv(os.path.join('1_data','credits.zip'),compression='zip',low_memory=False).convert_dtypes()
data_credits.rename(columns = {'id':'pelicula_id'}, inplace = True)
data_credits.head()

Unnamed: 0,cast,crew,pelicula_id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [5]:
# nos concentramos en la primera columna
data_credits_cast=pd.DataFrame(data_credits[['cast','pelicula_id']])
data_credits_cast.head()

Unnamed: 0,cast,pelicula_id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...",11862


In [6]:
# problema: Si pasamos la columna a json_normalize nos da un DF vacio:
# esto es por que hay errores de syntaxis en la columna
data_credits_cast_error = pd.json_normalize(data_credits_cast['cast'])
data_credits_cast_error.head()

0
1
2
3
4


In [7]:
# pandas.explode() tampoco la entiende: no le hace ninguna transformacion,
data_credits_cast_error2=data_credits_cast.set_index('pelicula_id')
data_credits_cast_error2 = data_credits_cast_error2.explode('cast')
data_credits_cast_error2.head()

Unnamed: 0_level_0,cast
pelicula_id,Unnamed: 1_level_1
862,"[{'cast_id': 14, 'character': 'Woody (voice)',..."
8844,"[{'cast_id': 1, 'character': 'Alan Parrish', '..."
15602,"[{'cast_id': 2, 'character': 'Max Goldman', 'c..."
31357,"[{'cast_id': 1, 'character': ""Savannah 'Vannah..."
11862,"[{'cast_id': 1, 'character': 'George Banks', '..."


In [8]:
# tenemos que corregir la syntaxis de la columna
# primero evaluamos la sintaxis de la columna (tengo entendido que de esta manera corrige cualquier error)

# Esta alternativa no funciona
#data_credits_cast['cast']=data_credits_cast['cast'].str.replace("'", '"', regex=True)
#data_credits_cast['cast']=data_credits_cast['cast'].replace("None", "null")

# la mejor alternativa es primero usar literal_eval, con el fin de asignar la etiqueta de JSON object a la columna
data_credits_cast['cast']=data_credits_cast['cast'].apply(literal_eval)
data_credits_cast.head()

Unnamed: 0,cast,pelicula_id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...",15602
3,"[{'cast_id': 1, 'character': 'Savannah 'Vannah...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...",11862


In [9]:
# a pesar de la evaluacion json_normalizar no la entiende
data_credits_cast_error3=data_credits_cast.set_index('pelicula_id')
data_credits_cast_error3_unnested = pd.json_normalize(data_credits_cast_error3['cast']).set_index(data_credits_cast_error3.index)
data_credits_cast_error3_unnested.head(2)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,303,304,305,306,307,308,309,310,311,312
pelicula_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
862,"{'cast_id': 14, 'character': 'Woody (voice)', ...","{'cast_id': 15, 'character': 'Buzz Lightyear (...","{'cast_id': 16, 'character': 'Mr. Potato Head ...","{'cast_id': 17, 'character': 'Slinky Dog (voic...","{'cast_id': 18, 'character': 'Rex (voice)', 'c...","{'cast_id': 19, 'character': 'Hamm (voice)', '...","{'cast_id': 20, 'character': 'Bo Peep (voice)'...","{'cast_id': 26, 'character': 'Andy (voice)', '...","{'cast_id': 22, 'character': 'Sid (voice)', 'c...","{'cast_id': 23, 'character': 'Mrs. Davis (voic...",...,,,,,,,,,,
8844,"{'cast_id': 1, 'character': 'Alan Parrish', 'c...","{'cast_id': 8, 'character': 'Samuel Alan Parri...","{'cast_id': 2, 'character': 'Judy Sheperd', 'c...","{'cast_id': 24, 'character': 'Peter Shepherd',...","{'cast_id': 10, 'character': 'Sarah Whittle', ...","{'cast_id': 25, 'character': 'Nora Shepherd', ...","{'cast_id': 26, 'character': 'Carl Bentley', '...","{'cast_id': 11, 'character': 'Carol Anne Parri...","{'cast_id': 14, 'character': 'Alan Parrish (yo...","{'cast_id': 13, 'character': 'Sarah Whittle (y...",...,,,,,,,,,,


In [10]:
# solucion error3: expandimos la columna para que queden diccionarios anidados (deja de ser una lista de diccionarios)
data_credits_cast_exploded = data_credits_cast.explode('cast', ignore_index=True)
data_credits_cast_exploded.head()

Unnamed: 0,cast,pelicula_id
0,"{'cast_id': 14, 'character': 'Woody (voice)', ...",862
1,"{'cast_id': 15, 'character': 'Buzz Lightyear (...",862
2,"{'cast_id': 16, 'character': 'Mr. Potato Head ...",862
3,"{'cast_id': 17, 'character': 'Slinky Dog (voic...",862
4,"{'cast_id': 18, 'character': 'Rex (voice)', 'c...",862


In [11]:
# ahora si se normaliza la columna
data_credits_cast_exploded=data_credits_cast_exploded.set_index('pelicula_id')
data_credits_cast_unnested = pd.json_normalize(data_credits_cast_exploded['cast']).set_index(data_credits_cast_exploded.index)
data_credits_cast_unnested.head()

Unnamed: 0_level_0,cast_id,character,credit_id,gender,id,name,order,profile_path
pelicula_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
862,14.0,Woody (voice),52fe4284c3a36847f8024f95,2.0,31.0,Tom Hanks,0.0,/pQFoyx7rp09CJTAb932F2g8Nlho.jpg
862,15.0,Buzz Lightyear (voice),52fe4284c3a36847f8024f99,2.0,12898.0,Tim Allen,1.0,/uX2xVf6pMmPepxnvFWyBtjexzgY.jpg
862,16.0,Mr. Potato Head (voice),52fe4284c3a36847f8024f9d,2.0,7167.0,Don Rickles,2.0,/h5BcaDMPRVLHLDzbQavec4xfSdt.jpg
862,17.0,Slinky Dog (voice),52fe4284c3a36847f8024fa1,2.0,12899.0,Jim Varney,3.0,/eIo2jVVXYgjDtaHoF19Ll9vtW7h.jpg
862,18.0,Rex (voice),52fe4284c3a36847f8024fa5,2.0,12900.0,Wallace Shawn,4.0,/oGE6JqPP2xH4tNORKNqxbNPYi7u.jpg


In [12]:
# inspeccionamos la diferencia entre cast_id y id
print(data_credits_cast_exploded.iloc[0,0])
print(data_credits_cast_exploded.iloc[100,0])
print(data_credits_cast_exploded.iloc[299,0])

print(data_credits_cast_unnested['cast_id'].min(),data_credits_cast_unnested['cast_id'].max())
print(data_credits_cast_unnested['id'].min(),data_credits_cast_unnested['id'].max())
    

{'cast_id': 14, 'character': 'Woody (voice)', 'credit_id': '52fe4284c3a36847f8024f95', 'gender': 2, 'id': 31, 'name': 'Tom Hanks', 'order': 0, 'profile_path': '/pQFoyx7rp09CJTAb932F2g8Nlho.jpg'}
{'cast_id': 97, 'character': 'Claudia', 'credit_id': '56be7769c3a36817f200506f', 'gender': 1, 'id': 167501, 'name': 'Farrah Forke', 'order': 32, 'profile_path': '/gmKm7TiZduyyaLlNgWnQEZHTc5Q.jpg'}
{'cast_id': 1, 'character': 'Morgan Adams', 'credit_id': '52fe42f4c3a36847f802f65f', 'gender': 1, 'id': 16935, 'name': 'Geena Davis', 'order': 0, 'profile_path': '/6b8cRJOItz7yNpYuLCPkl5kv4m2.jpg'}
0.0 1119.0
1.0 1908262.0


In [13]:
# con base en el output, concluimos lo siguiente:
# cast_id: corresponde al id unico para el cast por cada pelicula
# id: corresponde al id unico del cast en toda la base de datos.
# Renombramos
data_credits_cast_unnested.rename(columns = {'id':'unique_id'}, inplace = True)
data_credits_cast_unnested.info()

<class 'pandas.core.frame.DataFrame'>
Index: 564892 entries, 862 to 461257
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   cast_id       562474 non-null  float64
 1   character     562474 non-null  object 
 2   credit_id     562474 non-null  object 
 3   gender        562474 non-null  float64
 4   unique_id     562474 non-null  float64
 5   name          562474 non-null  object 
 6   order         562474 non-null  float64
 7   profile_path  388618 non-null  object 
dtypes: float64(4), object(4)
memory usage: 39.3+ MB


In [14]:
# si se prefiere el indice como columna
data_credits_cast_unnested.reset_index(inplace=True) # mueve le indice a una columna
data_credits_cast_unnested.head()

Unnamed: 0,pelicula_id,cast_id,character,credit_id,gender,unique_id,name,order,profile_path
0,862,14.0,Woody (voice),52fe4284c3a36847f8024f95,2.0,31.0,Tom Hanks,0.0,/pQFoyx7rp09CJTAb932F2g8Nlho.jpg
1,862,15.0,Buzz Lightyear (voice),52fe4284c3a36847f8024f99,2.0,12898.0,Tim Allen,1.0,/uX2xVf6pMmPepxnvFWyBtjexzgY.jpg
2,862,16.0,Mr. Potato Head (voice),52fe4284c3a36847f8024f9d,2.0,7167.0,Don Rickles,2.0,/h5BcaDMPRVLHLDzbQavec4xfSdt.jpg
3,862,17.0,Slinky Dog (voice),52fe4284c3a36847f8024fa1,2.0,12899.0,Jim Varney,3.0,/eIo2jVVXYgjDtaHoF19Ll9vtW7h.jpg
4,862,18.0,Rex (voice),52fe4284c3a36847f8024fa5,2.0,12900.0,Wallace Shawn,4.0,/oGE6JqPP2xH4tNORKNqxbNPYi7u.jpg


In [15]:
# replicamos los pasos anteriores con columna crew
data_credits_crew=pd.DataFrame(data_credits[['crew','pelicula_id']])
data_credits_crew['crew']=data_credits_crew['crew'].apply(literal_eval)
data_credits_crew_exploded = data_credits_crew.explode('crew', ignore_index=True)
data_credits_crew_exploded=data_credits_crew_exploded.set_index('pelicula_id')
data_credits_crew_unnested = pd.json_normalize(data_credits_crew_exploded['crew']).set_index(data_credits_crew_exploded.index)
data_credits_crew_unnested.rename(columns = {'id':'unique_id'}, inplace = True)
data_credits_crew_unnested.reset_index(inplace=True)
data_credits_crew_unnested.head()

Unnamed: 0,pelicula_id,credit_id,department,gender,unique_id,job,name,profile_path
0,862,52fe4284c3a36847f8024f49,Directing,2.0,7879.0,Director,John Lasseter,/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg
1,862,52fe4284c3a36847f8024f4f,Writing,2.0,12891.0,Screenplay,Joss Whedon,/dTiVsuaTVTeGmvkhcyJvKp2A5kr.jpg
2,862,52fe4284c3a36847f8024f55,Writing,2.0,7.0,Screenplay,Andrew Stanton,/pvQWsu0qc8JFQhMVJkTHuexUAa1.jpg
3,862,52fe4284c3a36847f8024f5b,Writing,2.0,12892.0,Screenplay,Joel Cohen,/dAubAiZcvKFbboWlj7oXOkZnTSu.jpg
4,862,52fe4284c3a36847f8024f61,Writing,0.0,12893.0,Screenplay,Alec Sokolow,/v79vlRYi94BZUQnkkyznbGUZLjT.jpg


In [16]:
# renombramos las columnas de data_credits_crew_unnested: anexamos un prefijo
columns_cast=['character','credit_id','gender','unique_id','name','order','profile_path']
data_credits_cast_unnested.rename(
    columns={c: 'cast_'+c for c in data_credits_cast_unnested.columns if c in columns_cast},
    inplace=True)
data_credits_cast_unnested.head()

Unnamed: 0,pelicula_id,cast_id,cast_character,cast_credit_id,cast_gender,cast_unique_id,cast_name,cast_order,cast_profile_path
0,862,14.0,Woody (voice),52fe4284c3a36847f8024f95,2.0,31.0,Tom Hanks,0.0,/pQFoyx7rp09CJTAb932F2g8Nlho.jpg
1,862,15.0,Buzz Lightyear (voice),52fe4284c3a36847f8024f99,2.0,12898.0,Tim Allen,1.0,/uX2xVf6pMmPepxnvFWyBtjexzgY.jpg
2,862,16.0,Mr. Potato Head (voice),52fe4284c3a36847f8024f9d,2.0,7167.0,Don Rickles,2.0,/h5BcaDMPRVLHLDzbQavec4xfSdt.jpg
3,862,17.0,Slinky Dog (voice),52fe4284c3a36847f8024fa1,2.0,12899.0,Jim Varney,3.0,/eIo2jVVXYgjDtaHoF19Ll9vtW7h.jpg
4,862,18.0,Rex (voice),52fe4284c3a36847f8024fa5,2.0,12900.0,Wallace Shawn,4.0,/oGE6JqPP2xH4tNORKNqxbNPYi7u.jpg


In [17]:
# renombramos las columnas de data_credits_crew_unnested
columns_crew=['credit_id','department','gender','unique_id','job','name','profile_path']
data_credits_crew_unnested.rename(
    columns={c: 'crew_'+c for c in data_credits_crew_unnested.columns if c in columns_crew},
    inplace=True)
data_credits_crew_unnested.head()

Unnamed: 0,pelicula_id,crew_credit_id,crew_department,crew_gender,crew_unique_id,crew_job,crew_name,crew_profile_path
0,862,52fe4284c3a36847f8024f49,Directing,2.0,7879.0,Director,John Lasseter,/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg
1,862,52fe4284c3a36847f8024f4f,Writing,2.0,12891.0,Screenplay,Joss Whedon,/dTiVsuaTVTeGmvkhcyJvKp2A5kr.jpg
2,862,52fe4284c3a36847f8024f55,Writing,2.0,7.0,Screenplay,Andrew Stanton,/pvQWsu0qc8JFQhMVJkTHuexUAa1.jpg
3,862,52fe4284c3a36847f8024f5b,Writing,2.0,12892.0,Screenplay,Joel Cohen,/dAubAiZcvKFbboWlj7oXOkZnTSu.jpg
4,862,52fe4284c3a36847f8024f61,Writing,0.0,12893.0,Screenplay,Alec Sokolow,/v79vlRYi94BZUQnkkyznbGUZLjT.jpg


In [18]:
print("Numero de peliuclas en credits_cast:",data_credits_cast_unnested['pelicula_id'].nunique())
print("Numero de peliuclas en credits_crew:",data_credits_crew_unnested['pelicula_id'].nunique())

Numero de peliuclas en credits_cast: 45432
Numero de peliuclas en credits_crew: 45432


In [19]:
# unimos los dataframes; como la disponibilidad de datos es la misma, hacemos un inner join
data_credits_unnested = pd.merge(
    data_credits_cast_unnested,
    data_credits_crew_unnested,
    on=['pelicula_id']
    )
data_credits_unnested.head()

Unnamed: 0,pelicula_id,cast_id,cast_character,cast_credit_id,cast_gender,cast_unique_id,cast_name,cast_order,cast_profile_path,crew_credit_id,crew_department,crew_gender,crew_unique_id,crew_job,crew_name,crew_profile_path
0,862,14.0,Woody (voice),52fe4284c3a36847f8024f95,2.0,31.0,Tom Hanks,0.0,/pQFoyx7rp09CJTAb932F2g8Nlho.jpg,52fe4284c3a36847f8024f49,Directing,2.0,7879.0,Director,John Lasseter,/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg
1,862,14.0,Woody (voice),52fe4284c3a36847f8024f95,2.0,31.0,Tom Hanks,0.0,/pQFoyx7rp09CJTAb932F2g8Nlho.jpg,52fe4284c3a36847f8024f4f,Writing,2.0,12891.0,Screenplay,Joss Whedon,/dTiVsuaTVTeGmvkhcyJvKp2A5kr.jpg
2,862,14.0,Woody (voice),52fe4284c3a36847f8024f95,2.0,31.0,Tom Hanks,0.0,/pQFoyx7rp09CJTAb932F2g8Nlho.jpg,52fe4284c3a36847f8024f55,Writing,2.0,7.0,Screenplay,Andrew Stanton,/pvQWsu0qc8JFQhMVJkTHuexUAa1.jpg
3,862,14.0,Woody (voice),52fe4284c3a36847f8024f95,2.0,31.0,Tom Hanks,0.0,/pQFoyx7rp09CJTAb932F2g8Nlho.jpg,52fe4284c3a36847f8024f5b,Writing,2.0,12892.0,Screenplay,Joel Cohen,/dAubAiZcvKFbboWlj7oXOkZnTSu.jpg
4,862,14.0,Woody (voice),52fe4284c3a36847f8024f95,2.0,31.0,Tom Hanks,0.0,/pQFoyx7rp09CJTAb932F2g8Nlho.jpg,52fe4284c3a36847f8024f61,Writing,0.0,12893.0,Screenplay,Alec Sokolow,/v79vlRYi94BZUQnkkyznbGUZLjT.jpg


In [20]:
# inspeccionamos las columnas del df final
data_credits_unnested.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9495072 entries, 0 to 9495071
Data columns (total 16 columns):
 #   Column             Dtype  
---  ------             -----  
 0   pelicula_id        Int64  
 1   cast_id            float64
 2   cast_character     object 
 3   cast_credit_id     object 
 4   cast_gender        float64
 5   cast_unique_id     float64
 6   cast_name          object 
 7   cast_order         float64
 8   cast_profile_path  object 
 9   crew_credit_id     object 
 10  crew_department    object 
 11  crew_gender        float64
 12  crew_unique_id     float64
 13  crew_job           object 
 14  crew_name          object 
 15  crew_profile_path  object 
dtypes: Int64(1), float64(6), object(9)
memory usage: 1.1+ GB


In [21]:
# seleccionamos unicamente las columnas de interes
columnas_credits = ['pelicula_id','cast_gender','cast_name','cast_order','crew_job','crew_name']
data_credits_unnested_selected=data_credits_unnested[columnas_credits]
data_credits_unnested_selected.info()
data_credits_unnested_selected.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9495072 entries, 0 to 9495071
Data columns (total 6 columns):
 #   Column       Dtype  
---  ------       -----  
 0   pelicula_id  Int64  
 1   cast_gender  float64
 2   cast_name    object 
 3   cast_order   float64
 4   crew_job     object 
 5   crew_name    object 
dtypes: Int64(1), float64(2), object(3)
memory usage: 443.7+ MB


Unnamed: 0,pelicula_id,cast_gender,cast_name,cast_order,crew_job,crew_name
0,862,2.0,Tom Hanks,0.0,Director,John Lasseter
1,862,2.0,Tom Hanks,0.0,Screenplay,Joss Whedon
2,862,2.0,Tom Hanks,0.0,Screenplay,Andrew Stanton
3,862,2.0,Tom Hanks,0.0,Screenplay,Joel Cohen
4,862,2.0,Tom Hanks,0.0,Screenplay,Alec Sokolow


In [22]:
# filtramos los valores de interes: primer protagonista y segundo protagonista
# suponemos que cast_order corresponde al orden en que se presenta el elenco de la pelicula
# data_credits_unnested_filtrada["cast_order"].unique()
# con base en lo anterior, suponemos que primer protagonista = 0, y segundo protagonista = 1
# filtramos la base de datos con estos valores
data_credits_unnested_selected_protagonsita_0=data_credits_unnested_selected.loc[
    data_credits_unnested_selected["cast_order"]==0]
data_credits_unnested_selected_protagonsita_1=data_credits_unnested_selected.loc[
    data_credits_unnested_selected["cast_order"]==1]

# confirmamos nuestra suposicion:
print("Cada pelicula_id tiene primer protagonista:",
      all(data_credits_unnested_selected_protagonsita_0.groupby("pelicula_id")['cast_order'].nunique() == 1))
print("Cada pelicula_id tiene segundo protagonista:",
      all(data_credits_unnested_selected_protagonsita_1.groupby("pelicula_id")['cast_order'].nunique() == 1))

Cada pelicula_id tiene primer protagonista: True
Cada pelicula_id tiene segundo protagonista: True


In [23]:
# filtramos los valores de interes: equipos de direccion y produccion
#data_credits_unnested_filtrada["crew_department"].unique()
crew_department_filter=['Directing', 'Production'] # seleccion con base en el output anterior

#data_credits_unnested_filtrada["crew_job"].unique()
crew_job_filter=['Director', 'Executive Producer'] # seleccion con base en el output anterior

In [24]:
data_credits_unnested_selected_filtrada=data_credits_unnested_selected.loc[
    (data_credits_unnested_selected["cast_order"].isin([0,1])) &
    #(data_credits_unnested_selected["crew_department"].isin(crew_department_filter)) &
    (data_credits_unnested_selected["crew_job"].isin(crew_job_filter))
]#.dropna()

In [25]:
# renonbramos el data_credits despues de la primera fase de ETL
data_credits_mvp=data_credits_unnested_selected_filtrada # renombramos el df para facilitar la lectura
del data_credits_unnested_selected_filtrada # eliminamos el df previo para liberar espacio en memoria
data_credits_mvp.info()

<class 'pandas.core.frame.DataFrame'>
Index: 120239 entries, 0 to 9495063
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   pelicula_id  120239 non-null  Int64  
 1   cast_gender  120239 non-null  float64
 2   cast_name    120239 non-null  object 
 3   cast_order   120239 non-null  float64
 4   crew_job     120239 non-null  object 
 5   crew_name    120239 non-null  object 
dtypes: Int64(1), float64(2), object(3)
memory usage: 6.5+ MB


In [26]:
# Uno de los principales criterios de nuestro sistema de recomendacion el protagonista
# para facilitar la lectura de la columnas, renombramos "prtgnst_" con "prtgnst_"
data_credits_mvp.columns=data_credits_mvp.columns.str.replace('cast_','prtgnst_')
data_credits_mvp.info()

<class 'pandas.core.frame.DataFrame'>
Index: 120239 entries, 0 to 9495063
Data columns (total 6 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   pelicula_id     120239 non-null  Int64  
 1   prtgnst_gender  120239 non-null  float64
 2   prtgnst_name    120239 non-null  object 
 3   prtgnst_order   120239 non-null  float64
 4   crew_job        120239 non-null  object 
 5   crew_name       120239 non-null  object 
dtypes: Int64(1), float64(2), object(3)
memory usage: 6.5+ MB


In [27]:
# suponemos que el genero del protagonista esta en formato binario donde 0 = hombre y mujer = 1.
# Confirmamos nuestra suposicion:
print(
    data_credits_mvp.loc[:,['prtgnst_gender','prtgnst_name']][
        data_credits_mvp['prtgnst_gender']==0].head(4)) # hombre: 0
print(data_credits_mvp.loc[:,['prtgnst_gender','prtgnst_name']][
    data_credits_mvp['prtgnst_gender']==1].head(4)) # mujer: 1

       prtgnst_gender        prtgnst_name
17510             0.0     Rosie O'Donnell
17513             0.0     Rosie O'Donnell
33934             0.0       Peter Reznick
34012             0.0  Alicia Silverstone
      prtgnst_gender     prtgnst_name
1822             1.0  Whitney Houston
1828             1.0  Whitney Houston
1832             1.0   Angela Bassett
1838             1.0   Angela Bassett


In [28]:
#recodificamos la variable prtgnst_gender: asignamos strings para facilitar la lectura.
#preferimos este formato; en dado caso, mas adelante podemos convertir a dummies.
pd.options.mode.chained_assignment = None
data_credits_mvp['prtgnst_gender_strng']= np.where(
    data_credits_mvp.loc[:,['prtgnst_gender']]==1, 'mujer', 'hombre')
data_credits_mvp.drop(columns=['prtgnst_gender'], inplace=True)

In [29]:
# recodificamos la variable prtgnst_order para facilitar lectura: primer protagonsita (1), segundo protagonista (2).
# data_mvp['prtgnst_order'].unique() # array([0., 1.])
data_credits_mvp['prtgnst_nivel']= np.where(
    data_credits_mvp.loc[:,['prtgnst_order']]==0, 1, 2)
data_credits_mvp.drop(columns=['prtgnst_order'], inplace=True)
data_credits_mvp.info()

<class 'pandas.core.frame.DataFrame'>
Index: 120239 entries, 0 to 9495063
Data columns (total 6 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   pelicula_id           120239 non-null  Int64 
 1   prtgnst_name          120239 non-null  object
 2   crew_job              120239 non-null  object
 3   crew_name             120239 non-null  object
 4   prtgnst_gender_strng  120239 non-null  object
 5   prtgnst_nivel         120239 non-null  int64 
dtypes: Int64(1), int64(1), object(4)
memory usage: 6.5+ MB


In [30]:
# Asumimos que cada pelicula tenga un solo Director y productor
# Verificamos nuestra suposicion:
data_credits_mvp_director=data_credits_mvp.loc[
    data_credits_mvp['crew_job'].isin(['Director'])].groupby('pelicula_id').nunique('crew_name')
data_credits_mvp_execproducer=data_credits_mvp.loc[
    data_credits_mvp['crew_job'].isin(['Executive Producer'])].groupby('pelicula_id').nunique('crew_name')
print("Cada pelicula_id tiene un unico Director:",
      all(data_credits_mvp_director['crew_name'] == 1)
      )
print("Cada pelicula_id tiene un unico Productor:",
      all(data_credits_mvp_execproducer['crew_name'] == 1)
      )

Cada pelicula_id tiene un unico Director: False
Cada pelicula_id tiene un unico Productor: False


In [31]:
# Basado en lo anterior, visualizamos un ejemplo de peliculas con multiples directores.
data_credits_mvp[['pelicula_id','crew_job','crew_name']].loc[
    (data_credits_mvp['pelicula_id']==5) &
    (data_credits_mvp['crew_job'].str.contains('Director',na=False))
    ].drop_duplicates()

Unnamed: 0,pelicula_id,crew_job,crew_name
12390,5,Director,Allison Anders
12391,5,Director,Alexandre Rockwell
12392,5,Director,Robert Rodriguez
12393,5,Director,Quentin Tarantino


In [32]:
# nos concentramos en los valores == Director
data_credits_mvp_2pivot = data_credits_mvp[['pelicula_id','crew_job','crew_name']]
data_credits_mvp_2pivot_director = data_credits_mvp[data_credits_mvp['crew_job'].isin(['Director'])]
data_credits_mvp_director_pivoted=data_credits_mvp_2pivot_director.pivot_table(
    index=['pelicula_id'],
    columns="crew_job",
    values="crew_name",
    #aggfunc='first').reset_index(level='pelicula_id').rename_axis('Index',axis='columns') # toma solo el primer director
    aggfunc=list).reset_index(level='pelicula_id').rename_axis('Index',axis='columns').explode(['Director'], ignore_index=True).drop_duplicates()
data_credits_mvp_director_pivoted.columns=map(str.lower, data_credits_mvp_director_pivoted.columns)
data_credits_mvp_director_pivoted.info()
data_credits_mvp_director_pivoted.head()

<class 'pandas.core.frame.DataFrame'>
Index: 46316 entries, 0 to 84890
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   pelicula_id  46316 non-null  Int64 
 1   director     46316 non-null  object
dtypes: Int64(1), object(1)
memory usage: 1.1+ MB


Unnamed: 0,pelicula_id,director
0,2,Aki Kaurismäki
2,3,Aki Kaurismäki
4,5,Allison Anders
5,5,Alexandre Rockwell
6,5,Robert Rodriguez


In [33]:
# nos concentramos en los valores == Executive Producer
data_credits_mvp_2pivot = data_credits_mvp[['pelicula_id','crew_job','crew_name']]
data_credits_mvp_2pivot_producer = data_credits_mvp[data_credits_mvp['crew_job'].isin(['Executive Producer'])]
data_credits_mvp_producer_pivoted=data_credits_mvp_2pivot_producer.pivot_table(
    index=['pelicula_id'],
    columns="crew_job",
    values="crew_name",
    #aggfunc='first').reset_index(level='pelicula_id').rename_axis('Index',axis='columns') # toma solo el primer productor
    aggfunc=list).reset_index(level='pelicula_id').rename_axis('Index',axis='columns').explode(['Executive Producer'], ignore_index=True).drop_duplicates()
data_credits_mvp_producer_pivoted.columns=map(str.lower, data_credits_mvp_producer_pivoted.columns.str.replace(' ','_'))
data_credits_mvp_producer_pivoted.info()
data_credits_mvp_producer_pivoted.head()

<class 'pandas.core.frame.DataFrame'>
Index: 18621 entries, 0 to 35347
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   pelicula_id         18621 non-null  Int64 
 1   executive_producer  18621 non-null  object
dtypes: Int64(1), object(1)
memory usage: 454.6+ KB


Unnamed: 0,pelicula_id,executive_producer
0,5,Quentin Tarantino
1,5,Alexandre Rockwell
4,6,Lloyd Segan
5,6,Marilyn Vance
8,11,George Lucas


In [34]:
# Evaluamos cual de las dos dataframes contiene el mayor numero de registros;
# El df con el mayor numero de registros sera nuestra df de referencia en el merge
print("Disponibilidad de registros para director:",data_credits_mvp_director_pivoted['pelicula_id'].nunique())
print("Disponibilidad de registros para productor:",data_credits_mvp_producer_pivoted['pelicula_id'].nunique())

# unimos los dfs
data_credits_mvp_crew_pivoted=pd.merge(
    data_credits_mvp_director_pivoted, # tomamos como referencia la base de datos con mayor numero de registros
    data_credits_mvp_producer_pivoted,
    how='left', # tomamos como referencia la base de datos con mayor numero de registros
    on='pelicula_id')
data_credits_mvp_crew_pivoted.info()
data_credits_mvp_crew_pivoted.head()

Disponibilidad de registros para director: 42295
Disponibilidad de registros para productor: 8258
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57704 entries, 0 to 57703
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   pelicula_id         57704 non-null  Int64 
 1   director            57704 non-null  object
 2   executive_producer  20355 non-null  object
dtypes: Int64(1), object(2)
memory usage: 1.4+ MB


Unnamed: 0,pelicula_id,director,executive_producer
0,2,Aki Kaurismäki,
1,3,Aki Kaurismäki,
2,5,Allison Anders,Quentin Tarantino
3,5,Allison Anders,Alexandre Rockwell
4,5,Alexandre Rockwell,Quentin Tarantino


In [35]:
# Confirmamos que df con el mayor numero de registros es nuestra df principal: data_credits_mvp
print("Disponibilidad de registros por peliculas en df principal:",data_credits_mvp['pelicula_id'].nunique()) # df principal
print("Disponibilidad de registros por peliculas en df crew normalizada:",data_credits_mvp_crew_pivoted['pelicula_id'].nunique())

# primero, removemos las columnas en el df principal para evitar redundancia
data_credits_mvp_2joincrew=data_credits_mvp.drop(columns=['crew_job','crew_name']).drop_duplicates()

# segundo, unimos las df con el df global
data_credits_mvp_final=pd.merge(
    data_credits_mvp_2joincrew, # tomamos como punto de referencia nuestra df principal: data_credits_mvp
    data_credits_mvp_crew_pivoted,
    how='left',
    on='pelicula_id')
data_credits_mvp_final.to_csv(os.path.join("2_pipeline","credits_normalizada.csv"))
data_credits_mvp_final.info()
data_credits_mvp_final.head(2)

Disponibilidad de registros por peliculas en df principal: 42317
Disponibilidad de registros por peliculas en df crew normalizada: 42295
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106173 entries, 0 to 106172
Data columns (total 6 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   pelicula_id           106173 non-null  Int64 
 1   prtgnst_name          106173 non-null  object
 2   prtgnst_gender_strng  106173 non-null  object
 3   prtgnst_nivel         106173 non-null  int64 
 4   director              106138 non-null  object
 5   executive_producer    38468 non-null   object
dtypes: Int64(1), int64(1), object(4)
memory usage: 5.0+ MB


Unnamed: 0,pelicula_id,prtgnst_name,prtgnst_gender_strng,prtgnst_nivel,director,executive_producer
0,862,Tom Hanks,hombre,1,John Lasseter,Ed Catmull
1,862,Tom Hanks,hombre,1,John Lasseter,Steve Jobs
