In [1]:
import pandas as pd
import numpy as np

title_akas = pd.read_csv('../downloads/title.akas.tsv', delimiter='\t', dtype={'isOriginalTitle': object})
title_basics = pd.read_csv('../downloads/title.basics.tsv', delimiter='\t', dtype={'isAdult': object})
title_crew = pd.read_csv('../downloads/title.crew.tsv', delimiter='\t', dtype={'directors': str})
title_ratings = pd.read_csv('../downloads/title.ratings.tsv', delimiter='\t')
name_basics = pd.read_csv('../downloads/name.basics.tsv', delimiter='\t', dtype={'primaryName': str})

In [2]:
# filter by titleType

title_basics = title_basics[(title_basics['titleType'] == 'movie') | (title_basics['titleType'] == 'tvMovie')]
title_basics.reset_index(inplace=True, drop=True)

# filter by isOriginalTitle

title_akas = title_akas[(title_akas['isOriginalTitle'] == '1') | (title_akas['isOriginalTitle'] == '\\N')]
title_akas.reset_index(inplace=True, drop=True)


In [3]:
# remove duplicate values with null value in 'isOriginalTitle' column

duplicates = title_akas[title_akas.duplicated(['titleId'])]
duplicates = duplicates[(duplicates['isOriginalTitle'] == '\\N') | (duplicates['attributes'] != '\\N')]

title_akas.drop(labels=duplicates.index, axis=0, inplace=True)
title_akas.reset_index(inplace=True, drop=True)

In [4]:
duplicates = title_akas[title_akas.duplicated(['titleId'])]

title_akas.drop(labels=duplicates.index, axis=0, inplace=True)
title_akas.reset_index(inplace=True, drop=True)

In [5]:
# remove movies before a certain year

title_basics['startYear'] = pd.to_numeric(title_basics['startYear'], errors='coerce')
to_remove = title_basics[(title_basics['startYear'] <= 2000) | (title_basics['startYear'] >= 2010)].index
title_basics.drop(to_remove, inplace=True)
title_basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
24415,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,\N,118,"Comedy,Fantasy,Romance"
45084,tt0061366,movie,Around the World,Around the World,0,,\N,178,"Comedy,Romance"
50120,tt0067683,movie,Workers '71: Nothing About Us Without Us,Robotnicy 1971 - Nic o nas bez nas,0,2006.0,\N,47,Documentary
59666,tt0079644,movie,November 1828,November 1828,0,2001.0,\N,140,"Drama,War"
60113,tt0080155,movie,The Wonderful Years,Die wunderbaren Jahre,0,,\N,104,Drama
...,...,...,...,...,...,...,...,...,...
760662,tt9916134,movie,The Thing in the Bag,The Thing in the Bag,0,,\N,78,\N
760666,tt9916178,movie,Yesterday's Dreams,Yesterday's Dreams,0,,\N,\N,\N
760668,tt9916188,movie,Minotaur,Minotaur,0,,\N,\N,Thriller
760678,tt9916620,movie,The Copeland Case,The Copeland Case,0,,\N,\N,Drama


In [6]:
# drop useless columns 

title_akas.drop(labels=['ordering', 'types', 'attributes', 'title', 'isOriginalTitle'], axis=1, inplace=True)
title_basics.drop(labels=['endYear', 'titleType'], axis=1, inplace=True)
title_crew.drop(labels=['writers'], axis=1, inplace=True)
title_ratings.drop(labels=['numVotes'], axis=1, inplace=True)
name_basics.drop(labels=['birthYear', 'deathYear', 'primaryProfession', 'knownForTitles'], axis=1, inplace=True)

In [7]:
title_akas['language'].value_counts()

\N     1790881
en           4
tr           1
yue          1
da           1
ca           1
Name: language, dtype: int64

:warning:  language column values have little to none importance, hence not using this dataframe

In [8]:
df = title_basics.merge(
  title_crew, on='tconst', how='left').merge(
    title_ratings, on='tconst', how='left')

df

Unnamed: 0,tconst,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres,directors,averageRating
0,tt0035423,Kate & Leopold,Kate & Leopold,0,2001.0,118,"Comedy,Fantasy,Romance",nm0003506,6.4
1,tt0061366,Around the World,Around the World,0,,178,"Comedy,Romance",nm1487785,
2,tt0067683,Workers '71: Nothing About Us Without Us,Robotnicy 1971 - Nic o nas bez nas,0,2006.0,47,Documentary,"nm0001425,nm0959099",6.3
3,tt0079644,November 1828,November 1828,0,2001.0,140,"Drama,War",nm0440323,7.3
4,tt0080155,The Wonderful Years,Die wunderbaren Jahre,0,,104,Drama,nm0475360,
...,...,...,...,...,...,...,...,...,...
184897,tt9916134,The Thing in the Bag,The Thing in the Bag,0,,78,\N,nm10538318,
184898,tt9916178,Yesterday's Dreams,Yesterday's Dreams,0,,\N,\N,\N,
184899,tt9916188,Minotaur,Minotaur,0,,\N,Thriller,nm2410311,
184900,tt9916620,The Copeland Case,The Copeland Case,0,,\N,Drama,\N,


In [9]:
# explode movie rows with more than one director into multiple rows, each one with only one director code

df = df.assign(directors=df['directors'].str.split(',')).explode('directors')
df = df.merge(name_basics, left_on='directors', right_on='nconst', how='left')

# drop useless columns

df.drop(labels=['directors', 'nconst'], axis=1, inplace=True)
df.rename(columns={'primaryName' : 'directors'}, inplace=True)

df

Unnamed: 0,tconst,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres,averageRating,directors
0,tt0035423,Kate & Leopold,Kate & Leopold,0,2001.0,118,"Comedy,Fantasy,Romance",6.4,James Mangold
1,tt0061366,Around the World,Around the World,0,,178,"Comedy,Romance",,Pachhi
2,tt0067683,Workers '71: Nothing About Us Without Us,Robotnicy 1971 - Nic o nas bez nas,0,2006.0,47,Documentary,6.3,Krzysztof Kieslowski
3,tt0067683,Workers '71: Nothing About Us Without Us,Robotnicy 1971 - Nic o nas bez nas,0,2006.0,47,Documentary,6.3,Tomasz Zygadlo
4,tt0079644,November 1828,November 1828,0,2001.0,140,"Drama,War",7.3,Teguh Karya
...,...,...,...,...,...,...,...,...,...
201520,tt9916134,The Thing in the Bag,The Thing in the Bag,0,,78,\N,,Joseph Graves
201521,tt9916178,Yesterday's Dreams,Yesterday's Dreams,0,,\N,\N,,
201522,tt9916188,Minotaur,Minotaur,0,,\N,Thriller,,Dean Israelite
201523,tt9916620,The Copeland Case,The Copeland Case,0,,\N,Drama,,


In [10]:
# remove duplicate rows while joining director names

df['directors'] = df['directors'].astype(str)
directors_column = df.groupby(['tconst']).agg({'directors': ', '.join})['directors'].values

df.drop_duplicates(subset='tconst', inplace=True)
df['directors'] = directors_column

df

Unnamed: 0,tconst,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres,averageRating,directors
0,tt0035423,Kate & Leopold,Kate & Leopold,0,2001.0,118,"Comedy,Fantasy,Romance",6.4,James Mangold
1,tt0061366,Around the World,Around the World,0,,178,"Comedy,Romance",,Pachhi
2,tt0067683,Workers '71: Nothing About Us Without Us,Robotnicy 1971 - Nic o nas bez nas,0,2006.0,47,Documentary,6.3,"Krzysztof Kieslowski, Tomasz Zygadlo"
4,tt0079644,November 1828,November 1828,0,2001.0,140,"Drama,War",7.3,Teguh Karya
5,tt0080155,The Wonderful Years,Die wunderbaren Jahre,0,,104,Drama,,Reiner Kunze
...,...,...,...,...,...,...,...,...,...
201520,tt9916134,The Thing in the Bag,The Thing in the Bag,0,,78,\N,,Joseph Graves
201521,tt9916178,Yesterday's Dreams,Yesterday's Dreams,0,,\N,\N,,
201522,tt9916188,Minotaur,Minotaur,0,,\N,Thriller,,Dean Israelite
201523,tt9916620,The Copeland Case,The Copeland Case,0,,\N,Drama,,


In [11]:
df.to_csv('../data/data.csv', index=False, sep=';')

# TODO:
- remover filmes mais antigos que x data
- filtrar filmes apenas em en 
  - not enough values
- remover filmes sem sinopse (isto é quando formos fazer o webscrapping ig)
