In [85]:
import pandas as pd
import numpy as np

### Title Basics

In [86]:
basics = pd.read_csv('title.basics.tsv', sep='\t', dtype={'startYear': str})
basics = basics.drop(columns=['originalTitle', 'endYear', 'genres'])
len(basics)

6564302

#### Filter out entries that aren't movies

In [87]:
is_movie = basics['titleType'] == 'movie'
basics = basics[is_movie]
len(basics)

544128

#### Filter out adult titles

In [88]:
not_adult = basics['isAdult'] == 0
basics = basics[not_adult]
basics = basics.drop(columns=['isAdult'])
len(basics)

535418

#### Filter out movies that do not have a start year

In [89]:
has_start_year = basics['startYear'] != '\\N'
basics = basics[has_start_year]
len(basics)

473339

In [90]:
#basics

### Principals

In [91]:
principals = pd.read_csv('title.principals.tsv', sep='\t', dtype={'types': str})
# principals = principals.drop(columns=['job', 'ordering'])
len(principals)

37930494

#### Only actors or actresses, not director, composer, etc.

In [92]:
is_actor = (principals['category'] == 'actor') | (principals['category'] == 'actress')
principals = principals[is_actor]
principals = principals.drop(columns=['category'])
len(principals)

15388567

In [93]:
# principals

### Name Basics

In [94]:
names = pd.read_csv('name.basics.tsv', sep='\t')
names = names.drop(columns=['primaryProfession', 'knownForTitles'])

In [95]:
 names

Unnamed: 0,nconst,primaryName,birthYear,deathYear
0,nm0000001,Fred Astaire,1899,1987
1,nm0000002,Lauren Bacall,1924,2014
2,nm0000003,Brigitte Bardot,1934,\N
3,nm0000004,John Belushi,1949,1982
4,nm0000005,Ingmar Bergman,1918,2007
...,...,...,...,...
9901911,nm9993714,Romeo del Rosario,\N,\N
9901912,nm9993716,Essias Loberg,\N,\N
9901913,nm9993717,Harikrishnan Rajan,\N,\N
9901914,nm9993718,Aayush Nair,\N,\N


### Results

In [96]:
merged = basics.merge(principals, on=['tconst'])
merged = merged.merge(names, on=['nconst'])
merged.columns

Index(['tconst', 'titleType', 'primaryTitle', 'startYear', 'runtimeMinutes',
       'ordering', 'nconst', 'job', 'characters', 'primaryName', 'birthYear',
       'deathYear'],
      dtype='object')

In [97]:
results = pd.DataFrame(
    {'TitleId': merged['tconst'],
     'NameId': merged['nconst'],
     'Movie Title': merged['primaryTitle'],
     'Year': merged['startYear'],
     'Actor': merged['primaryName'],
     'Characters': merged['characters']})
len(results)

1601420

### Things To Do With The Data

In [98]:
avengers_movie = (results['Movie Title'] == 'The Avengers') & (results['Year'] == '2012')
avengers_principal_actors = results[avengers_movie]
avengers_principal_actors

Unnamed: 0,TitleId,NameId,Movie Title,Year,Actor,Characters
601711,tt0848228,nm0000375,The Avengers,2012,Robert Downey Jr.,"[""Tony Stark"",""Iron Man""]"
695586,tt0848228,nm0424060,The Avengers,2012,Scarlett Johansson,"[""Natasha Romanoff"",""Black Widow""]"
707615,tt0848228,nm0262635,The Avengers,2012,Chris Evans,"[""Steve Rogers"",""Captain America""]"
938561,tt0848228,nm0719637,The Avengers,2012,Jeremy Renner,"[""Clint Barton"",""Hawkeye""]"


In [99]:
johnny_depp = results['Actor'] == 'Johnny Depp'
johnny_depp_movies = results[johnny_depp]
johnny_depp_movies = johnny_depp_movies.sort_values(by=['Year'])
johnny_depp_movies

Unnamed: 0,TitleId,NameId,Movie Title,Year,Actor,Characters
567730,tt0087800,nm0000136,A Nightmare on Elm Street,1984,Johnny Depp,"[""Glen Lantz""]"
567731,tt0089839,nm0000136,Private Resort,1985,Johnny Depp,"[""Jack""]"
567732,tt0099329,nm0000136,Cry-Baby,1990,Johnny Depp,"[""Cry-Baby""]"
567733,tt0099487,nm0000136,Edward Scissorhands,1990,Johnny Depp,"[""Edward Scissorhands""]"
567734,tt0106307,nm0000136,Arizona Dream,1993,Johnny Depp,"[""Axel Blackmar""]"
567735,tt0106387,nm0000136,Benny & Joon,1993,Johnny Depp,"[""Sam""]"
567736,tt0108550,nm0000136,What's Eating Gilbert Grape,1993,Johnny Depp,"[""Gilbert Grape""]"
567737,tt0109707,nm0000136,Ed Wood,1994,Johnny Depp,"[""Ed Wood""]"
567739,tt0112883,nm0000136,Don Juan DeMarco,1994,Johnny Depp,"[""Don Juan DeMarco""]"
567738,tt0112817,nm0000136,Dead Man,1995,Johnny Depp,"[""William Blake""]"
