# Informações iniciais

Informações sobre os datasets https://www.imdb.com/interfaces/

Baixar os arquivos `.tsv.gz` no link https://datasets.imdbws.com/, descompactá-los e colocar os arquivos `.tsv` na mesma pasta do arquivo ipynb

In [73]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Switch matplotlib to its built-in 'ggplot' style sheet.
plt.style.use('ggplot')

In [21]:
# Load the people table (name.basics); only the columns listed below are read.
name_basics = "name.basics.tsv"
nb_columns = ['nconst', 'primaryName', 'birthYear', 'deathYear', 'primaryProfession','knownForTitles']

nb_df = pd.read_csv(name_basics,sep='\t',header=0,usecols=nb_columns)
# No dropna() call here: DataFrame.dropna() returns a new frame, and the old
# bare call threw that result away, so it never removed a row. Missing values
# show up in the preview as the literal string '\N' and are left untouched.
nb_df.head()

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0053137,tt0050419,tt0031983,tt0072308"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack","tt0071877,tt0038355,tt0117057,tt0037382"
2,nm0000003,Brigitte Bardot,1934,\N,"actress,soundtrack,music_department","tt0057345,tt0049189,tt0054452,tt0056404"
3,nm0000004,John Belushi,1949,1982,"actor,soundtrack,writer","tt0078723,tt0072562,tt0077975,tt0080455"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0050986,tt0083922,tt0069467,tt0060827"


In [159]:
# Load the alternative-titles table (title.akas) with an explicit dtype per column.
title_akas = "title.akas.tsv"
ta_columns = ['titleId', 'ordering', 'title', 'region', 'language', 'types', 'attributes', 'isOriginalTitle']
convert_dict = {'titleId': str, 'ordering': int, 'title': str, 'region': str, 'language': str, 'types': str, 'attributes': str, 'isOriginalTitle': str}

ta_df = pd.read_csv(title_akas,sep='\t',header=0,usecols=ta_columns,dtype=convert_dict)
# Keep only the rows flagged as the original title ('isOriginalTitle' is read
# as text, hence the comparison with '1') and rename the key to 'tconst', the
# column name the merge with the title basics table joins on.
# Done as one chained expression instead of an in-place rename on a filtered
# slice; the discarded `dropna()` call that followed was a no-op and is gone.
ta_df = ta_df[ta_df['isOriginalTitle'] == '1'].rename(columns={'titleId':'tconst'})
ta_df.head()

Unnamed: 0,tconst,ordering,title,region,language,types,attributes,isOriginalTitle
6,tt0000001,7,Carmencita,\N,\N,original,\N,1
8,tt0000002,1,Le clown et ses chiens,\N,\N,original,\N,1
21,tt0000003,6,Pauvre Pierrot,\N,\N,original,\N,1
25,tt0000004,1,Un bon bock,\N,\N,original,\N,1
34,tt0000005,11,Blacksmith Scene,\N,\N,original,\N,1


In [160]:
# Load the core title table (title.basics); every column is read as text.
title_basics = "title.basics.tsv"
tb_columns = ['tconst', 'titleType', 'primaryTitle', 'originalTitle', 'startYear', 'endYear', 'runtimeMinutes', 'genres']
convert_dict = {'tconst': str, 'titleType': str, 'primaryTitle': str, 'originalTitle': str, 'startYear': str, 'endYear': str, 'runtimeMinutes': str, 'genres': object}

tb_df = pd.read_csv(title_basics,sep='\t',header=0,usecols=tb_columns,dtype=convert_dict)
# No dropna() call here: its result was never assigned, so it filtered nothing.
# Unknown values stay in the frame as the literal string '\N' (see 'endYear'
# in the preview).
tb_df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,1893,\N,1,"Comedy,Short"


In [23]:
# Load the crew table (title.crew): director and writer ids per title.
title_crew = "title.crew.tsv"
tc_columns = ['tconst', 'directors', 'writers']
tc_df = pd.read_csv(title_crew,sep='\t',header=0,usecols=tc_columns)
# No dropna() call here: its result was never assigned, so it filtered nothing.
# Missing entries remain as the literal string '\N' (see 'writers' in the preview).
tc_df.head()

Unnamed: 0,tconst,directors,writers
0,tt0000001,nm0005690,\N
1,tt0000002,nm0721526,\N
2,tt0000003,nm0721526,\N
3,tt0000004,nm0721526,\N
4,tt0000005,nm0005690,\N


In [24]:
# Load the episode table (title.episode): episode id, parent title id,
# season number and episode number.
title_episode = "title.episode.tsv"
te_columns = ['tconst', 'parentTconst', 'seasonNumber', 'episodeNumber']

te_df = pd.read_csv(title_episode,sep='\t',header=0,usecols=te_columns)
# No dropna() call here: DataFrame.dropna() returns a new frame and the old
# bare call discarded it, so no row was ever removed.
te_df.head()

Unnamed: 0,tconst,parentTconst,seasonNumber,episodeNumber
0,tt0020666,tt15180956,1,2
1,tt0020829,tt15180956,1,1
2,tt0021166,tt15180956,1,3
3,tt0021612,tt15180956,2,2
4,tt0021655,tt15180956,2,5


In [25]:
# Load the principals table (title.principals): one row per (title, person)
# credit with its category, job and characters.
title_principals = "title.principals.tsv"
tp_columns = ['tconst', 'ordering', 'nconst', 'category', 'job', 'characters']

tp_df = pd.read_csv(title_principals,sep='\t',header=0,usecols=tp_columns)
# No dropna() call here: its result was never assigned, so it filtered nothing.
# Missing entries remain as the literal string '\N' (see 'job' in the preview).
tp_df.head()

Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0000001,1,nm1588970,self,\N,"[""Self""]"
1,tt0000001,2,nm0005690,director,\N,\N
2,tt0000001,3,nm0374658,cinematographer,director of photography,\N
3,tt0000002,1,nm0721526,director,\N,\N
4,tt0000002,2,nm1335271,composer,\N,\N


In [26]:
# Load the ratings table (title.ratings): average rating and vote count per title.
title_ratings = "title.ratings.tsv"
tr_columns = ['tconst', 'averageRating', 'numVotes']

tr_df = pd.read_csv(title_ratings,sep='\t',header=0,usecols=tr_columns)
# No dropna() call here: DataFrame.dropna() returns a new frame and the old
# bare call discarded it, so no row was ever removed.
tr_df.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1914
1,tt0000002,5.8,259
2,tt0000003,6.5,1720
3,tt0000004,5.6,172
4,tt0000005,6.2,2537


In [227]:
# Outer-join the original-title rows with the title basics on 'tconst';
# titles that appear in only one of the two tables are kept.
combine = ta_df.merge(tb_df, on='tconst', how='outer')

In [157]:
# Preview the first five rows of the merged table.
combine.head(n=5)

Unnamed: 0,tconst,ordering,title,region,language,types,attributes,isOriginalTitle,titleType,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,genres
8,tt0000009,1.0,Miss Jerry,\N,\N,original,\N,1,movie,Miss Jerry,Miss Jerry,1894,\N,45,Romance
497,tt0000502,1.0,Bohemios,\N,\N,original,\N,1,movie,Bohemios,Bohemios,1905,\N,100,\N
569,tt0000574,5.0,The Story of the Kelly Gang,\N,\N,original,\N,1,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,\N,70,"Action,Adventure,Biography"
586,tt0000591,2.0,L'enfant prodigue,\N,\N,original,\N,1,movie,The Prodigal Son,L'enfant prodigue,1907,\N,90,Drama
609,tt0000615,1.0,Robbery Under Arms,\N,\N,original,\N,1,movie,Robbery Under Arms,Robbery Under Arms,1907,\N,\N,Drama


# Perguntas

1- Quais são os gêneros de filmes mais produzidos em cada década?

In [228]:
# Count how many movies were released in each decade.
combine = combine[combine['titleType'] == 'movie']

# Rows without a usable start year are dropped. The check looks for a literal
# "N" in the text, which matches the '\N' placeholder used for unknown values.
# na=True also flags real NaN entries; without it, str.contains() returns NaN
# for them and the `~` negation below raises a TypeError.
year_missing = combine['startYear'].str.contains("N", regex=False, na=True)
combine = combine[~year_missing]

# Floor each year to the first year of its decade (1894 -> 1890).
decade_start = combine['startYear'].astype(int) // 10 * 10
decades = combine.groupby(decade_start).size()
decades.head(14)

startYear
1890        18
1900       191
1910     13081
1920     22012
1930     20369
1940     14593
1950     22715
1960     30863
1970     39735
1980     43211
1990     44743
2000     74671
2010    161474
2020     52090
dtype: int64