In [21]:
import pandas as pd
import numpy as np

# Read the five IMDb TSV dumps this notebook works with. The dtype overrides
# stop pandas from inferring a type for the listed columns.
# NOTE(review): no na_values is passed, so missing fields stay as the literal
# string '\N'; the filtering cells below compare against that string.
title_akas = pd.read_csv(
    '../downloads/title.akas.tsv',
    sep='\t',
    dtype={'isOriginalTitle': object},
)
title_basics = pd.read_csv(
    '../downloads/title.basics.tsv',
    sep='\t',
    dtype={'isAdult': object},
)
title_crew = pd.read_csv(
    '../downloads/title.crew.tsv',
    sep='\t',
    dtype={'directors': str},
)
title_ratings = pd.read_csv('../downloads/title.ratings.tsv', sep='\t')
name_basics = pd.read_csv(
    '../downloads/name.basics.tsv',
    sep='\t',
    dtype={'primaryName': str},
)

In [22]:
# Keep only the title_basics rows whose titleType is 'movie' or 'tvMovie',
# then renumber the rows from 0.
title_basics = title_basics[
    title_basics['titleType'].isin(['movie', 'tvMovie'])
].reset_index(drop=True)

# Keep only the title_akas rows whose isOriginalTitle is '1' or the '\N'
# placeholder, then renumber the rows from 0.
title_akas = title_akas[
    title_akas['isOriginalTitle'].isin(['1', '\\N'])
].reset_index(drop=True)


In [23]:
# Among the rows that repeat an already-seen titleId, select the ones whose
# 'isOriginalTitle' is the '\N' placeholder or whose 'attributes' is anything
# other than '\N', and remove exactly those rows from title_akas.
duplicates = title_akas.loc[title_akas.duplicated(subset='titleId')]
duplicates = duplicates.loc[
    duplicates['isOriginalTitle'].eq('\\N') | duplicates['attributes'].ne('\\N')
]

# The index was renumbered by the previous cell, so the labels are unique and
# dropping by label removes only the selected rows.
title_akas = title_akas.drop(index=duplicates.index).reset_index(drop=True)

In [24]:
# Any titleId that still occurs more than once is reduced to its first row:
# duplicated() flags every occurrence after the first, and those are dropped.
duplicates = title_akas.loc[title_akas.duplicated(subset='titleId')]

title_akas = title_akas.drop(index=duplicates.index).reset_index(drop=True)

In [25]:
# Discard the columns that the rest of this notebook does not use.

title_akas = title_akas.drop(
    columns=['ordering', 'types', 'attributes', 'title', 'isOriginalTitle']
)
title_basics = title_basics.drop(columns=['endYear', 'titleType'])
title_crew = title_crew.drop(columns=['writers'])
title_ratings = title_ratings.drop(columns=['numVotes'])
name_basics = name_basics.drop(
    columns=['birthYear', 'deathYear', 'primaryProfession', 'knownForTitles']
)

In [26]:
# How often each value occurs in the 'language' column, most frequent first.
language_counts = title_akas['language'].value_counts()
language_counts

\N     1790606
en           4
tr           1
yue          1
da           1
ca           1
Name: language, dtype: int64

:warning: Almost every value in the `language` column is null (`\N`), so the column has little to no importance; hence this dataframe is not used.

In [27]:
# If movies released before a certain year are to be removed,
# filter them out of title_basics here.


#REVIEW
# I don't think we should remove them 
# Bia

In [28]:
# Attach crew and rating columns to every title_basics row. Left joins keep
# all title_basics rows even when no crew or rating record exists for them.
df = (
    title_basics
    .merge(title_crew, how='left', on='tconst')
    .merge(title_ratings, how='left', on='tconst')
)

df

Unnamed: 0,tconst,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres,directors,averageRating
0,tt0000009,Miss Jerry,Miss Jerry,0,1894,45,Romance,nm0085156,5.2
1,tt0000502,Bohemios,Bohemios,0,1905,100,\N,nm0063413,3.7
2,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,70,"Action,Adventure,Biography",nm0846879,6.0
3,tt0000591,The Prodigal Son,L'enfant prodigue,0,1907,90,Drama,nm0141150,4.0
4,tt0000615,Robbery Under Arms,Robbery Under Arms,0,1907,\N,Drama,nm0533958,4.1
...,...,...,...,...,...,...,...,...,...
760391,tt9916680,De la ilusión al desconcierto: cine colombiano...,De la ilusión al desconcierto: cine colombiano...,0,2007,100,Documentary,nm0652213,
760392,tt9916692,Teatroteka: Czlowiek bez twarzy,Teatroteka: Czlowiek bez twarzy,0,2015,66,Drama,nm10538592,
760393,tt9916706,Dankyavar Danka,Dankyavar Danka,0,2013,\N,Comedy,nm7764440,
760394,tt9916730,6 Gunn,6 Gunn,0,2017,116,\N,nm10538612,8.4


In [29]:
# Turn the comma-separated director codes into one row per (movie, director),
# look each code up in name_basics, and keep only the resolved name under the
# column name 'directors'.
df = (
    df.assign(directors=df['directors'].str.split(','))
      .explode('directors')
      # left join: rows whose code has no match in name_basics are kept
      .merge(name_basics, how='left', left_on='directors', right_on='nconst')
      # the code columns are no longer needed once the name is attached
      .drop(columns=['directors', 'nconst'])
      .rename(columns={'primaryName': 'directors'})
)

df

Unnamed: 0,tconst,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres,averageRating,directors
0,tt0000009,Miss Jerry,Miss Jerry,0,1894,45,Romance,5.2,Alexander Black
1,tt0000502,Bohemios,Bohemios,0,1905,100,\N,3.7,Ricardo de Baños
2,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,70,"Action,Adventure,Biography",6.0,Charles Tait
3,tt0000591,The Prodigal Son,L'enfant prodigue,0,1907,90,Drama,4.0,Michel Carré
4,tt0000615,Robbery Under Arms,Robbery Under Arms,0,1907,\N,Drama,4.1,Charles MacMahon
...,...,...,...,...,...,...,...,...,...
841135,tt9916692,Teatroteka: Czlowiek bez twarzy,Teatroteka: Czlowiek bez twarzy,0,2015,66,Drama,,Andrzej Bartnikowski
841136,tt9916706,Dankyavar Danka,Dankyavar Danka,0,2013,\N,Comedy,,Kanchan Nayak
841137,tt9916730,6 Gunn,6 Gunn,0,2017,116,\N,8.4,Kiran Gawade
841138,tt9916754,Chico Albuquerque - Revelações,Chico Albuquerque - Revelações,0,2013,49,Documentary,,Vinicius Augusto Bozzo


In [30]:
# Collapse the one-row-per-director table back to one row per movie, with all
# director names of a movie joined into a single ', '-separated string.

# ', '.join needs strings, so the column is stringified first.
# NOTE(review): this turns a missing director name into the literal text 'nan'
# in the output -- confirm whether that is wanted downstream.
df['directors'] = df['directors'].astype(str)

# Joined names keyed by tconst; names keep their row order within each movie.
directors_by_title = df.groupby('tconst', sort=False)['directors'].agg(', '.join)

# Keep the first row of every movie and renumber the rows from 0.
df = df.drop_duplicates(subset='tconst').reset_index(drop=True)

# Look the joined names up by tconst rather than assigning them by position.
# A positional assignment is only correct while df happens to be sorted by
# tconst (groupby sorts its keys by default, drop_duplicates keeps row order);
# otherwise names would silently end up on the wrong movies.
directors_column = df['tconst'].map(directors_by_title).values
df['directors'] = directors_column

df

Unnamed: 0,tconst,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres,averageRating,directors
0,tt0000009,Miss Jerry,Miss Jerry,0,1894,45,Romance,5.2,Alexander Black
1,tt0000502,Bohemios,Bohemios,0,1905,100,\N,3.7,Ricardo de Baños
2,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,70,"Action,Adventure,Biography",6.0,Charles Tait
3,tt0000591,The Prodigal Son,L'enfant prodigue,0,1907,90,Drama,4.0,Michel Carré
4,tt0000615,Robbery Under Arms,Robbery Under Arms,0,1907,\N,Drama,4.1,Charles MacMahon
...,...,...,...,...,...,...,...,...,...
760391,tt9916680,De la ilusión al desconcierto: cine colombiano...,De la ilusión al desconcierto: cine colombiano...,0,2007,100,Documentary,,Luis Ospina
760392,tt9916692,Teatroteka: Czlowiek bez twarzy,Teatroteka: Czlowiek bez twarzy,0,2015,66,Drama,,Andrzej Bartnikowski
760393,tt9916706,Dankyavar Danka,Dankyavar Danka,0,2013,\N,Comedy,,Kanchan Nayak
760394,tt9916730,6 Gunn,6 Gunn,0,2017,116,\N,8.4,Kiran Gawade


In [31]:
# Write the assembled table as a semicolon-separated file, without the row index.
df.to_csv('../data/data.csv', sep=';', index=False)

# TODO:
- remover filmes mais antigos que x data
- filtrar filmes apenas em en 
  - not enough values
- remover filmes sem sinopse (isto é quando formos fazer o web scraping ig)
