# ETL - Fase 2: Desanidar y normalizar movies

En esta seccion, desanidamos y normalizamos las columnas de `movies_dataset.csv` que lo requieren

In [1]:
import os
import zipfile
import pandas as pd
import json
from ast import literal_eval

In [2]:
data_movies = pd.read_csv(os.path.join('1_data','movies_dataset.zip'),compression='zip').convert_dtypes()
data_movies.rename(columns = {'id':'pelicula_id'}, inplace = True)
data_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  string 
 1   belongs_to_collection  4494 non-null   string 
 2   budget                 45466 non-null  string 
 3   genres                 45466 non-null  string 
 4   homepage               7782 non-null   string 
 5   pelicula_id            45466 non-null  string 
 6   imdb_id                45449 non-null  string 
 7   original_language      45455 non-null  string 
 8   original_title         45466 non-null  string 
 9   overview               44512 non-null  string 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  string 
 12  production_companies   45463 non-null  string 
 13  production_countries   45463 non-null  string 
 14  release_date           45379 non-null  string 
 15  re

  data_movies = pd.read_csv(os.path.join('1_data','movies_dataset.zip'),compression='zip').convert_dtypes()


In [3]:
# inspeccionamos visualmente cuales solumnas requieres ser desanidadas y normalizadas
pd.set_option('display.max_columns', None) # para poder ver todas las columnas
data_movies.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,pelicula_id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033,81,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249,104,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413


In [4]:
# nos concentramos en: belongs_to_collection
data_movies_collection=pd.DataFrame(data_movies[['belongs_to_collection','pelicula_id']]).dropna()
data_movies_collection.head(2)

Unnamed: 0,belongs_to_collection,pelicula_id
0,"{'id': 10194, 'name': 'Toy Story Collection', ...",862
2,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",15602


In [5]:
data_movies_collection['belongs_to_collection']=data_movies_collection['belongs_to_collection'].apply(literal_eval)
data_movies_collection.head(2)

Unnamed: 0,belongs_to_collection,pelicula_id
0,"{'id': 10194, 'name': 'Toy Story Collection', ...",862
2,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",15602


In [6]:
# tercero se normaliza la columna
data_movies_collection=data_movies_collection.set_index('pelicula_id')
data_movies_collection_unnested = pd.json_normalize(data_movies_collection['belongs_to_collection']).set_index(data_movies_collection.index)
data_movies_collection_unnested.head(2)

Unnamed: 0_level_0,id,name,poster_path,backdrop_path
pelicula_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
862,10194.0,Toy Story Collection,/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg,/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg
15602,119050.0,Grumpy Old Men Collection,/nLvUdqgPgm3F85NMCii9gVFUcet.jpg,/hypTnLot2z8wpFS7qwsQHW1uV8u.jpg


In [7]:
# finalizamos el df
data_movies_collection_unnested.rename(
    columns = {'id':'franquicia_id','name':'franquicia'},
    inplace = True)
data_movies_collection_unnested.reset_index(inplace=True) # mueve le indice a una columna
data_movies_collection_unnested.head(2)

Unnamed: 0,pelicula_id,franquicia_id,franquicia,poster_path,backdrop_path
0,862,10194.0,Toy Story Collection,/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg,/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg
1,15602,119050.0,Grumpy Old Men Collection,/nLvUdqgPgm3F85NMCii9gVFUcet.jpg,/hypTnLot2z8wpFS7qwsQHW1uV8u.jpg


In [8]:
# nos concentramos en: production_companies
data_movies_companies=pd.DataFrame(data_movies[['production_companies','pelicula_id']]).dropna()
data_movies_companies.head(2)

Unnamed: 0,production_companies,pelicula_id
0,"[{'name': 'Pixar Animation Studios', 'id': 3}]",862
1,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...",8844


In [9]:
data_movies_companies['production_companies']=data_movies_companies['production_companies'].apply(literal_eval)
data_movies_companies.head(2)

Unnamed: 0,production_companies,pelicula_id
0,"[{'name': 'Pixar Animation Studios', 'id': 3}]",862
1,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...",8844


In [10]:
# segundo, expandimos la columna para que queden diccionarios anidados (deja de ser una lista de diccionarios)
data_movies_companies_exploded = data_movies_companies.explode('production_companies', ignore_index=True)
data_movies_companies_exploded.head(2)

Unnamed: 0,production_companies,pelicula_id
0,"{'name': 'Pixar Animation Studios', 'id': 3}",862
1,"{'name': 'TriStar Pictures', 'id': 559}",8844


In [11]:
# tercero se normaliza la columna
data_movies_companies_exploded=data_movies_companies_exploded.set_index('pelicula_id')
data_movies_companies_exploded_unnested = pd.json_normalize(data_movies_companies_exploded['production_companies']).set_index(data_movies_companies_exploded.index)
data_movies_companies_exploded_unnested.head(2)

Unnamed: 0_level_0,name,id
pelicula_id,Unnamed: 1_level_1,Unnamed: 2_level_1
862,Pixar Animation Studios,3.0
8844,TriStar Pictures,559.0


In [12]:
# se finaliza el df
data_movies_companies_exploded_unnested.rename(
    columns = {'id':'productora_id','name':'productora'},
    inplace = True)
data_movies_companies_exploded_unnested.reset_index(inplace=True) # mueve le indice a una columna
data_movies_companies_exploded_unnested.head(2)

Unnamed: 0,pelicula_id,productora,productora_id
0,862,Pixar Animation Studios,3.0
1,8844,TriStar Pictures,559.0


In [13]:
# nos concentramos en:  genres 
data_movies_genres=pd.DataFrame(data_movies[['genres','pelicula_id']]).dropna()
data_movies_genres.head(2)

Unnamed: 0,genres,pelicula_id
0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862
1,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844


In [14]:
# replicamos los pasos para el caso de  lista de diccionario
data_movies_genres['genres']=data_movies_genres['genres'].apply(literal_eval)
data_movies_genres_exploded = data_movies_genres.explode('genres', ignore_index=True)
data_movies_genres_exploded=data_movies_genres_exploded.set_index('pelicula_id')
data_movies_genres_exploded_unnested = pd.json_normalize(data_movies_genres_exploded['genres']).set_index(data_movies_genres_exploded.index)
data_movies_genres_exploded_unnested.rename(
    columns = {'id':'genres_id','name':'genres'},
    inplace = True)
data_movies_genres_exploded_unnested.reset_index(inplace=True) # mueve le indice a una columna
data_movies_genres_exploded_unnested.head(2)

Unnamed: 0,pelicula_id,genres_id,genres
0,862,16.0,Animation
1,862,35.0,Comedy


In [15]:
data_movies_countries = pd.DataFrame(data_movies[['production_countries', 'pelicula_id']]).dropna()
data_movies_countries['production_countries'] = data_movies_countries['production_countries'].apply(literal_eval)
data_movies_countries_exploded = data_movies_countries.explode('production_countries')
data_movies_countries_exploded_unnested = pd.json_normalize(data_movies_countries_exploded['production_countries']).set_index(data_movies_countries_exploded.pelicula_id)
data_movies_countries_exploded_unnested.reset_index(inplace=True)
data_movies_countries_exploded_unnested.rename(
    columns={'iso_3166_1':'pais_isocode','name' : 'pais_name'},
    inplace=True
    )
data_movies_countries_exploded_unnested.head(2)

Unnamed: 0,pelicula_id,pais_isocode,pais_name
0,862,US,United States of America
1,8844,US,United States of America


In [16]:
data_movies_languages = pd.DataFrame(data_movies[['spoken_languages', 'pelicula_id']]).dropna()
data_movies_languages['spoken_languages']= data_movies_languages['spoken_languages'].apply(literal_eval)
data_movies_languages_exploded = data_movies_languages.explode('spoken_languages')
data_movies_languages_exploded_unnested = pd.json_normalize(data_movies_languages_exploded['spoken_languages']).set_index(data_movies_languages_exploded.pelicula_id)
data_movies_languages_exploded_unnested.reset_index(inplace=True)
data_movies_languages_exploded_unnested.rename(
    columns={'iso_639_1':'spoken_languages_isocode','name' : 'spoken_languages_name'},
    inplace=True
    )
data_movies_languages_exploded_unnested.head(2)

Unnamed: 0,pelicula_id,spoken_languages_isocode,spoken_languages_name
0,862,en,English
1,8844,en,English


In [17]:
# unir los dataframes
data_movies_normalized = pd.merge(
    pd.merge(pd.merge(pd.merge(data_movies_collection_unnested,
    data_movies_companies_exploded_unnested,
    on=['pelicula_id']),
    data_movies_genres_exploded_unnested,
    on=['pelicula_id']),
    data_movies_countries_exploded_unnested,
    on=['pelicula_id']),
    data_movies_languages_exploded_unnested,
    on=['pelicula_id']
    )
data_movies_normalized.head(2)

Unnamed: 0,pelicula_id,franquicia_id,franquicia,poster_path,backdrop_path,productora,productora_id,genres_id,genres,pais_isocode,pais_name,spoken_languages_isocode,spoken_languages_name
0,862,10194.0,Toy Story Collection,/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg,/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg,Pixar Animation Studios,3.0,16.0,Animation,US,United States of America,en,English
1,862,10194.0,Toy Story Collection,/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg,/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg,Pixar Animation Studios,3.0,35.0,Comedy,US,United States of America,en,English


In [18]:
data_movies_final=data_movies_normalized[
    data_movies_normalized.columns[~data_movies_normalized.columns.str.contains("_path")] # quitamos las NO contienen "_path"
    ]
data_movies_final.info()
data_movies_final.to_csv(os.path.join("2_pipeline","data_movies_normalizada.csv"))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55897 entries, 0 to 55896
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   pelicula_id               55897 non-null  string 
 1   franquicia_id             55897 non-null  float64
 2   franquicia                55897 non-null  object 
 3   productora                53879 non-null  object 
 4   productora_id             53879 non-null  float64
 5   genres_id                 55754 non-null  float64
 6   genres                    55754 non-null  object 
 7   pais_isocode              55285 non-null  object 
 8   pais_name                 55285 non-null  object 
 9   spoken_languages_isocode  55728 non-null  object 
 10  spoken_languages_name     55728 non-null  object 
dtypes: float64(3), object(7), string(1)
memory usage: 4.7+ MB
