In [88]:
import pandas as pd
import numpy as np

### Title Basics

In [89]:
# Load the title metadata table.
# Columns the rest of the notebook never uses are skipped at parse time via
# `usecols` instead of being loaded and then dropped, so they are never
# materialised for the millions of rows in this file.
# `startYear` is forced to str so the '\N' placeholder and real years share
# a single dtype.
unused_basics_columns = {'originalTitle', 'endYear', 'genres'}
basics = pd.read_csv(
    'title.basics.tsv',
    sep='\t',
    dtype={'startYear': str},
    usecols=lambda column: column not in unused_basics_columns,
)
len(basics)

6564302

#### Filter out entries that aren't movies

In [90]:
# Boolean mask: True for rows whose titleType is exactly 'movie'.
# All other title types are discarded.
is_movie = basics['titleType'].eq('movie')
basics = basics.loc[is_movie]
len(basics)

544128

#### Filter out adult titles

In [91]:
# Keep rows whose isAdult flag equals 0, then discard the flag column since
# every remaining row carries the same value.
# Note: because 'isAdult' is dropped here, re-running this cell on the
# already-filtered frame raises a KeyError.
not_adult = basics['isAdult'].eq(0)
basics = basics.loc[not_adult].drop(columns=['isAdult'])
len(basics)

535418

#### Filter out movies that do not have a start year

In [92]:
# Rows whose startYear is the literal '\N' placeholder are treated as having
# no start year and are removed.
has_start_year = basics['startYear'].ne('\\N')
basics = basics.loc[has_start_year]
len(basics)

473339

In [None]:
# Debug aid: uncomment the next line to display the filtered titles frame.
#basics

### Principals

In [93]:
# Load the principals table (the per-title credits that are filtered below).
# NOTE(review): no 'types' column appears in the merged frame's columns later
# in this notebook, so this dtype override may be a no-op — confirm against
# the file's header.
principals = pd.read_csv(
    'title.principals.tsv',
    sep='\t',
    dtype={'types': str},
)
len(principals)

37930494

#### Keep only actors and actresses (not directors, composers, etc.)

In [94]:
# Restrict to acting credits: rows whose category is 'actor' or 'actress'.
# The category column is dropped afterwards since it is no longer needed.
# Note: because 'category' is removed here, this cell cannot be re-run as-is.
is_actor = principals['category'].isin(['actor', 'actress'])
principals = principals.loc[is_actor].drop(columns=['category'])
len(principals)

15388567

#### Remove credits that have no listed characters

In [95]:
 # Credits whose 'characters' field is the literal '\N' placeholder are removed.
 has_characters = principals['characters'].ne('\\N')
 principals = principals.loc[has_characters]
 len(principals)

12601274

### Name Basics

In [96]:
# Load the name records; the file is tab-separated.
names = pd.read_csv(
    'name.basics.tsv',
    sep='\t',
)

In [97]:
# Row count of the names table before any filtering.
len(names.index)

9901916

#### Remove names without a birth year

In [98]:
# People whose birthYear is the literal '\N' placeholder are removed.
has_birth_year = names['birthYear'].ne('\\N')
names = names.loc[has_birth_year]
len(names)

496535

#### Remove names without a primary profession

In [99]:
# Unlike the neighbouring filters, this one tests for NaN rather than the
# '\N' placeholder: rows with a missing primaryProfession are removed.
has_profession = names['primaryProfession'].notna()
names = names.loc[has_profession]
len(names)

438537

#### Remove names without any known-for titles

In [100]:
# People whose knownForTitles is the literal '\N' placeholder are removed.
has_known_for_titles = names['knownForTitles'].ne('\\N')
names = names.loc[has_known_for_titles]
len(names)

436811

#### Drop duplicate names
This erroneously removes distinct people who happen to share a name. For example, if there is a Ben Smith born in 1928 who starred in 2 movies and another Ben Smith born in 1945 who starred in 6 movies, only the first Ben Smith encountered in the table is kept; the other is removed from the data set.

This is good enough for now.

In [101]:
# De-duplicate on the display name alone. When several rows share a
# primaryName, only the first one in the current row order is retained.
names = names.drop_duplicates(subset='primaryName', keep='first')
len(names)

425575

### Results

In [102]:
# Inner-join titles -> acting credits (on tconst) -> people (on nconst), so
# only rows that survive all three filtered tables remain.
merged = (
    basics
    .merge(principals, how='inner', on='tconst')
    .merge(names, how='inner', on='nconst')
)
merged.columns

Index(['tconst', 'titleType', 'primaryTitle', 'startYear', 'runtimeMinutes',
       'ordering', 'nconst', 'job', 'characters', 'primaryName', 'birthYear',
       'deathYear', 'primaryProfession', 'knownForTitles'],
      dtype='object')

In [103]:
# Mapping from source column name to the label used in the final table.
# The dict order also fixes the column order of `results`.
column_labels = {
    'tconst': 'TitleId',
    'nconst': 'NameId',
    'primaryTitle': 'Movie Title',
    'startYear': 'Year',
    'primaryName': 'Actor',
    'characters': 'Characters',
}
results = merged[list(column_labels)].rename(columns=column_labels)
len(results)

644489

In [104]:
# Number of distinct actor names; dropna=False keeps a missing name counted
# as one value.
results['Actor'].nunique(dropna=False)

114738

### Things To Do With The Data

In [105]:
# Select by title AND year. Year is compared against the string '2012'
# because the year column was loaded as str.
title_matches = results['Movie Title'].eq('The Avengers')
year_matches = results['Year'].eq('2012')
avengers_movie = title_matches & year_matches
avengers_principal_actors = results.loc[avengers_movie]
avengers_principal_actors

Unnamed: 0,TitleId,NameId,Movie Title,Year,Actor,Characters
387153,tt0848228,nm0000375,The Avengers,2012,Robert Downey Jr.,"[""Tony Stark"",""Iron Man""]"
442975,tt0848228,nm0424060,The Avengers,2012,Scarlett Johansson,"[""Natasha Romanoff"",""Black Widow""]"
518970,tt0848228,nm0719637,The Avengers,2012,Jeremy Renner,"[""Clint Barton"",""Hawkeye""]"


In [106]:
# All credits for one actor, ordered by year. Year is a string column, so the
# sort is lexicographic, which matches chronological order for four-digit years.
johnny_depp = results['Actor'].eq('Johnny Depp')
johnny_depp_movies = results.loc[johnny_depp].sort_values(by=['Year'])
johnny_depp_movies

Unnamed: 0,TitleId,NameId,Movie Title,Year,Actor,Characters
367960,tt0087800,nm0000136,A Nightmare on Elm Street,1984,Johnny Depp,"[""Glen Lantz""]"
367961,tt0089839,nm0000136,Private Resort,1985,Johnny Depp,"[""Jack""]"
367962,tt0099329,nm0000136,Cry-Baby,1990,Johnny Depp,"[""Cry-Baby""]"
367963,tt0099487,nm0000136,Edward Scissorhands,1990,Johnny Depp,"[""Edward Scissorhands""]"
367964,tt0106307,nm0000136,Arizona Dream,1993,Johnny Depp,"[""Axel Blackmar""]"
367965,tt0106387,nm0000136,Benny & Joon,1993,Johnny Depp,"[""Sam""]"
367966,tt0108550,nm0000136,What's Eating Gilbert Grape,1993,Johnny Depp,"[""Gilbert Grape""]"
367967,tt0109707,nm0000136,Ed Wood,1994,Johnny Depp,"[""Ed Wood""]"
367969,tt0112883,nm0000136,Don Juan DeMarco,1994,Johnny Depp,"[""Don Juan DeMarco""]"
367968,tt0112817,nm0000136,Dead Man,1995,Johnny Depp,"[""William Blake""]"


In [107]:
#### Write the combined results to a file

In [108]:
# Persist the combined table. With to_csv's defaults the row index is written
# too, as an unnamed first column.
output_csv = 'movies_and_actors.csv'
results.to_csv(output_csv)