In [113]:
import pandas as pd
import numpy as np

title_akas = pd.read_csv('../downloads/title.akas.tsv', delimiter='\t', dtype={'isOriginalTitle': object})
title_basics = pd.read_csv('../downloads/title.basics.tsv', delimiter='\t', dtype={'isAdult': object})
title_crew = pd.read_csv('../downloads/title.crew.tsv', delimiter='\t', dtype={'directors': str})
title_ratings = pd.read_csv('../downloads/title.ratings.tsv', delimiter='\t')
name_basics = pd.read_csv('../downloads/name.basics.tsv', delimiter='\t', dtype={'primaryName': str})

In [114]:
# filter by titleType

title_basics = title_basics[(title_basics['titleType'] == 'movie') | (title_basics['titleType'] == 'tvMovie')]
title_basics.reset_index(inplace=True, drop=True)

# filter by isOriginalTitle

title_akas = title_akas[(title_akas['isOriginalTitle'] == '1') | (title_akas['isOriginalTitle'] == '\\N')]
title_akas.reset_index(inplace=True, drop=True)


In [115]:
# remove duplicate values with null value in 'isOriginalTitle' column

duplicates = title_akas[title_akas.duplicated(['titleId'])]
duplicates = duplicates[(duplicates['isOriginalTitle'] == '\\N') | (duplicates['attributes'] != '\\N')]

title_akas.drop(labels=duplicates.index, axis=0, inplace=True)
title_akas.reset_index(inplace=True, drop=True)

In [116]:
duplicates = title_akas[title_akas.duplicated(['titleId'])]

title_akas.drop(labels=duplicates.index, axis=0, inplace=True)
title_akas.reset_index(inplace=True, drop=True)

In [117]:
# remove movies before a certain year

title_basics['startYear'] = pd.to_numeric(title_basics['startYear'], errors='coerce')
title_basics = title_basics[title_basics['startYear'] > 1990]
title_basics.reset_index(drop=True, inplace=True)

In [118]:
# drop useless columns 

title_akas.drop(labels=['ordering', 'types', 'attributes', 'title', 'isOriginalTitle'], axis=1, inplace=True)
title_basics.drop(labels=['endYear', 'titleType'], axis=1, inplace=True)
title_crew.drop(labels=['writers'], axis=1, inplace=True)
title_ratings.drop(labels=['numVotes'], axis=1, inplace=True)
name_basics.drop(labels=['birthYear', 'deathYear', 'primaryProfession', 'knownForTitles'], axis=1, inplace=True)

In [119]:
title_akas['language'].value_counts()

\N     1790606
en           4
tr           1
yue          1
da           1
ca           1
Name: language, dtype: int64

:warning:  language column values have little to none importance, hence not using this dataframe

In [120]:
df = title_basics.merge(
  title_crew, on='tconst', how='left').merge(
    title_ratings, on='tconst', how='left')

df

Unnamed: 0,tconst,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres,directors,averageRating
0,tt0011801,Tötet nicht mehr,Tötet nicht mehr,0,2019.0,\N,"Action,Crime",nm0681726,
1,tt0013274,Istoriya grazhdanskoy voyny,Istoriya grazhdanskoy voyny,0,2021.0,133,Documentary,"nm0412842,nm0895048",6.4
2,tt0015414,La tierra de los toros,La tierra de los toros,0,2000.0,60,\N,nm0615736,5.3
3,tt0015724,Dama de noche,Dama de noche,0,1993.0,102,"Drama,Mystery,Romance",nm0529960,6.1
4,tt0035423,Kate & Leopold,Kate & Leopold,0,2001.0,118,"Comedy,Fantasy,Romance",nm0003506,6.4
...,...,...,...,...,...,...,...,...,...
414115,tt9916680,De la ilusión al desconcierto: cine colombiano...,De la ilusión al desconcierto: cine colombiano...,0,2007.0,100,Documentary,nm0652213,
414116,tt9916692,Teatroteka: Czlowiek bez twarzy,Teatroteka: Czlowiek bez twarzy,0,2015.0,66,Drama,nm10538592,
414117,tt9916706,Dankyavar Danka,Dankyavar Danka,0,2013.0,\N,Comedy,nm7764440,
414118,tt9916730,6 Gunn,6 Gunn,0,2017.0,116,\N,nm10538612,8.4


In [121]:
df = df[(not df['averageRating'].isnull()) & (df['averageRating'] >= 7)]
df.reset_index(inplace=True, drop=True)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [None]:
# explode movie rows with more than one director into multiple rows, each one with only one director code

df = df.assign(directors=df['directors'].str.split(',')).explode('directors')
df = df.merge(name_basics, left_on='directors', right_on='nconst', how='left')

# drop useless columns

df.drop(labels=['directors', 'nconst'], axis=1, inplace=True)
df.rename(columns={'primaryName' : 'directors'}, inplace=True)

df

Unnamed: 0,tconst,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres,averageRating,directors
0,tt0038086,Shiva und die Galgenblume,Shiva und die Galgenblume,0,1993.0,\N,Thriller,7.0,Hans Steinhoff
1,tt0079644,November 1828,November 1828,0,2001.0,140,"Drama,War",7.3,Teguh Karya
2,tt0084015,Goodbye Paradise,Goodbye Paradise,0,1991.0,101,Drama,7.2,Tim Savage
3,tt0084015,Goodbye Paradise,Goodbye Paradise,0,1991.0,101,Drama,7.2,Dennis Christianson
4,tt0084660,Sentinel,Sentinel,0,1993.0,26,"Action,Adventure",7.1,Mark Roper
...,...,...,...,...,...,...,...,...,...
84932,tt9916170,The Rehearsal,O Ensaio,0,2019.0,51,Drama,7.0,Tamar Guimaraes
84933,tt9916192,Danielle Darrieux: Il est poli d'être gai!,Danielle Darrieux: Il est poli d'être gai!,0,2019.0,53,Biography,7.6,Pierre-Henri Gibert
84934,tt9916460,Pink Taxi,Pink Taxi,0,2019.0,\N,Comedy,9.4,Gabriel Athanasiou
84935,tt9916538,Kuambil Lagi Hatiku,Kuambil Lagi Hatiku,0,2019.0,123,Drama,8.3,Azhar Kinoi Lubis


In [None]:
# remove duplicate rows while joining director names

df['directors'] = df['directors'].astype(str)
directors_column = df.groupby(['tconst']).agg({'directors': ', '.join})['directors'].values

df.drop_duplicates(subset='tconst', inplace=True)
df['directors'] = directors_column

df

Unnamed: 0,tconst,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres,averageRating,directors
0,tt0038086,Shiva und die Galgenblume,Shiva und die Galgenblume,0,1993.0,\N,Thriller,7.0,Hans Steinhoff
1,tt0079644,November 1828,November 1828,0,2001.0,140,"Drama,War",7.3,Teguh Karya
2,tt0084015,Goodbye Paradise,Goodbye Paradise,0,1991.0,101,Drama,7.2,"Tim Savage, Dennis Christianson"
4,tt0084660,Sentinel,Sentinel,0,1993.0,26,"Action,Adventure",7.1,Mark Roper
5,tt0084870,Memories and Confessions,Visita ou Memórias e Confissões,0,1993.0,73,"Biography,History",7.1,Manoel de Oliveira
...,...,...,...,...,...,...,...,...,...
84932,tt9916170,The Rehearsal,O Ensaio,0,2019.0,51,Drama,7.0,Tamar Guimaraes
84933,tt9916192,Danielle Darrieux: Il est poli d'être gai!,Danielle Darrieux: Il est poli d'être gai!,0,2019.0,53,Biography,7.6,Pierre-Henri Gibert
84934,tt9916460,Pink Taxi,Pink Taxi,0,2019.0,\N,Comedy,9.4,Gabriel Athanasiou
84935,tt9916538,Kuambil Lagi Hatiku,Kuambil Lagi Hatiku,0,2019.0,123,Drama,8.3,Azhar Kinoi Lubis


In [None]:
#df.to_csv('../data/data.csv', index=False, sep=';')

# TODO:
- remover filmes mais antigos que x data
- filtrar filmes apenas em en 
  - not enough values
- remover filmes sem sinopse (isto é quando formos fazer o webscrapping ig)
