In [1]:
%matplotlib inline

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Use seaborn's "whitegrid" style for any figures drawn later in the notebook.
sns.set_style("whitegrid")

# imdb.title.principals

In [2]:
# Load the per-title principal credits (one row per person credited on a title)
# and preview the first rows.
principals_path = "data/imdb.title.principals.csv.gz"
title_principals = pd.read_csv(principals_path)
title_principals.head()

Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0111414,1,nm0246005,actor,,"[""The Man""]"
1,tt0111414,2,nm0398271,director,,
2,tt0111414,3,nm3739909,producer,producer,
3,tt0323808,10,nm0059247,editor,,
4,tt0323808,1,nm3579312,actress,,"[""Beth Boothby""]"


In [3]:
# Summarize column dtypes and non-null counts.
title_principals.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1028186 entries, 0 to 1028185
Data columns (total 6 columns):
tconst        1028186 non-null object
ordering      1028186 non-null int64
nconst        1028186 non-null object
category      1028186 non-null object
job           177684 non-null object
characters    393360 non-null object
dtypes: int64(1), object(5)
memory usage: 47.1+ MB


In [4]:
# Percentage of rows with a missing value, printed for `job` then `characters`.
n_rows = len(title_principals)
for column in ("job", "characters"):
    n_missing = title_principals[column].isna().sum()
    print(n_missing / n_rows * 100)

82.71869097614633
61.74233066779746


Both columns are mostly empty (about 83% of `job` and 62% of `characters` values are missing), so we drop them.

In [5]:
# Drop the two sparsely populated columns.
# Reassigning (instead of `inplace=True`) together with `errors="ignore"` keeps
# this cell idempotent: re-running it after the columns are already gone no
# longer raises a KeyError.
title_principals = title_principals.drop(columns=["job", "characters"], errors="ignore")
title_principals.head()

Unnamed: 0,tconst,ordering,nconst,category
0,tt0111414,1,nm0246005,actor
1,tt0111414,2,nm0398271,director
2,tt0111414,3,nm3739909,producer
3,tt0323808,10,nm0059247,editor
4,tt0323808,1,nm3579312,actress


# imdb.title.ratings

In [6]:
# Load the per-title rating summary (average rating and vote count) and
# preview the first rows.
ratings_path = "data/imdb.title.ratings.csv.gz"
title_ratings = pd.read_csv(ratings_path)
title_ratings.head()

Unnamed: 0,tconst,averagerating,numvotes
0,tt10356526,8.3,31
1,tt10384606,8.9,559
2,tt1042974,6.4,20
3,tt1043726,4.2,50352
4,tt1060240,6.5,21


In [7]:
# Summarize column dtypes and non-null counts.
title_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73856 entries, 0 to 73855
Data columns (total 3 columns):
tconst           73856 non-null object
averagerating    73856 non-null float64
numvotes         73856 non-null int64
dtypes: float64(1), int64(1), object(1)
memory usage: 1.7+ MB


# Merge principals with ratings

In [8]:
# Attach rating data to every credit. The inner join keeps only credits whose
# title has an entry in the ratings table.
df = pd.merge(title_principals, title_ratings, how="inner", on="tconst")
df.head()

Unnamed: 0,tconst,ordering,nconst,category,averagerating,numvotes
0,tt0323808,10,nm0059247,editor,3.9,2328
1,tt0323808,1,nm3579312,actress,3.9,2328
2,tt0323808,2,nm2694680,actor,3.9,2328
3,tt0323808,3,nm0574615,actor,3.9,2328
4,tt0323808,4,nm0502652,actress,3.9,2328


In [9]:
# Summarize the merged frame: row count, dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 629755 entries, 0 to 629754
Data columns (total 6 columns):
tconst           629755 non-null object
ordering         629755 non-null int64
nconst           629755 non-null object
category         629755 non-null object
averagerating    629755 non-null float64
numvotes         629755 non-null int64
dtypes: float64(1), int64(2), object(3)
memory usage: 33.6+ MB


In [10]:
# Show rows that are exact duplicates of an earlier row (empty if none exist).
df.loc[df.duplicated()]

Unnamed: 0,tconst,ordering,nconst,category,averagerating,numvotes


# imdb.name.basics

In [11]:
# Load the per-person table (name, birth/death year, professions, known titles)
# and preview the first rows.
names_path = "data/imdb.name.basics.csv.gz"
name_basics = pd.read_csv(names_path)
name_basics.head()

Unnamed: 0,nconst,primary_name,birth_year,death_year,primary_profession,known_for_titles
0,nm0061671,Mary Ellen Bauder,,,"miscellaneous,production_manager,producer","tt0837562,tt2398241,tt0844471,tt0118553"
1,nm0061865,Joseph Bauer,,,"composer,music_department,sound_department","tt0896534,tt6791238,tt0287072,tt1682940"
2,nm0062070,Bruce Baum,,,"miscellaneous,actor,writer","tt1470654,tt0363631,tt0104030,tt0102898"
3,nm0062195,Axel Baumann,,,"camera_department,cinematographer,art_department","tt0114371,tt2004304,tt1618448,tt1224387"
4,nm0062798,Pete Baxter,,,"production_designer,art_department,set_decorator","tt0452644,tt0452692,tt3458030,tt2178256"


In [12]:
# Summarize column dtypes and non-null counts.
name_basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 606648 entries, 0 to 606647
Data columns (total 6 columns):
nconst                606648 non-null object
primary_name          606648 non-null object
birth_year            82736 non-null float64
death_year            6783 non-null float64
primary_profession    555308 non-null object
known_for_titles      576444 non-null object
dtypes: float64(2), object(4)
memory usage: 27.8+ MB


In [13]:
# List the distinct values in `death_year` (NaN marks rows with no value recorded).
name_basics.death_year.unique()

array([  nan, 2013., 2004., 2017., 1965., 2003., 2018., 2012., 1937.,
       1976., 2019., 1971., 1994., 2008., 2009., 1986., 2011., 2010.,
       1890., 2015., 1918., 2016., 1916., 1995., 1985., 2014., 1779.,
       1997., 1961., 1840., 2002., 1900., 1688., 1707., 1969., 1948.,
       1990., 1870., 1960., 1991., 2005., 1981., 1968., 1989., 2006.,
       1951., 1967., 1938., 1926., 1944., 1975., 1898., 1970., 1883.,
       1972., 1974., 1998., 1993., 1959., 1979., 1999., 1987., 2000.,
       1925., 1992., 1978., 1878., 1902., 1942., 1954., 1935., 2001.,
       1940., 1996., 1815., 2007., 1982., 1893., 1933., 1983., 1946.,
       1988., 1803., 1939., 1980., 1984., 1872., 1880., 1924., 1934.,
       1932., 1966., 1949., 1947., 1774., 1957., 1873., 1593., 1956.,
       1952., 1855., 1912., 1963., 1919., 1915., 1950., 1962., 1955.,
       1894., 1892., 1929., 1977., 1843., 1876., 1812., 1901., 1817.,
       1828., 1964., 1031., 1851., 1864., 1973., 1943., 1831., 1904.,
       1838., 1931.,

In [14]:
# Percentage of people with no recorded death year.
name_basics.death_year.isna().sum() / len(name_basics) * 100

98.88188867349764

In [15]:
# Absolute number of people with no recorded death year.
name_basics.death_year.isna().sum()

599865

We only want people who are still alive, since we can't hire anyone who has already passed away. A missing `death_year` is treated as "still alive".

In [16]:
# Keep only the people with no recorded death year, then report how many remain.
has_no_death_year = name_basics["death_year"].isna()
name_basics = name_basics[has_no_death_year]
len(name_basics)

599865

In [17]:
# Keep only the person identifier and display name.
# Selecting the wanted columns returns a new frame. That avoids calling an
# in-place `drop` on the filtered slice built in the previous cell (which can
# trigger a SettingWithCopyWarning) and lets this cell be re-run without a
# KeyError for already-removed columns.
name_basics = name_basics[["nconst", "primary_name"]]
name_basics.head()

Unnamed: 0,nconst,primary_name
0,nm0061671,Mary Ellen Bauder
1,nm0061865,Joseph Bauer
2,nm0062070,Bruce Baum
3,nm0062195,Axel Baumann
4,nm0062798,Pete Baxter


# Merge with name basics

In [18]:
# Attach each person's display name to their rated credits. The inner join
# drops credits for people who are not in `name_basics`.
# Because the result is assigned back to `df`, a `primary_name` column left
# over from a previous run of this cell is discarded first. Otherwise a re-run
# would produce `primary_name_x` / `primary_name_y` suffix columns.
rated_credits = df.drop(columns=["primary_name"], errors="ignore")
df = rated_credits.merge(name_basics, on="nconst", how="inner")
df.head()

Unnamed: 0,tconst,ordering,nconst,category,averagerating,numvotes,primary_name
0,tt0323808,10,nm0059247,editor,3.9,2328,Sean Barton
1,tt2081348,10,nm0059247,editor,4.1,48,Sean Barton
2,tt1414378,10,nm0059247,editor,6.5,3460,Sean Barton
3,tt2712990,10,nm0059247,editor,5.6,329,Sean Barton
4,tt2395207,9,nm0059247,editor,5.5,22,Sean Barton


In [19]:
# Summarize the merged frame after names were attached.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 621366 entries, 0 to 621365
Data columns (total 7 columns):
tconst           621366 non-null object
ordering         621366 non-null int64
nconst           621366 non-null object
category         621366 non-null object
averagerating    621366 non-null float64
numvotes         621366 non-null int64
primary_name     621366 non-null object
dtypes: float64(1), int64(2), object(4)
memory usage: 37.9+ MB


In [20]:
# Show rows that are exact duplicates of an earlier row (empty if none exist).
df.loc[df.duplicated()]

Unnamed: 0,tconst,ordering,nconst,category,averagerating,numvotes,primary_name


## good movies

In [21]:
# Credits on titles whose average rating is 8 or above.
is_highly_rated = df["averagerating"] >= 8
df_good_rating = df[is_highly_rated]
df_good_rating.head()

Unnamed: 0,tconst,ordering,nconst,category,averagerating,numvotes,primary_name
15,tt2379402,8,nm0676104,cinematographer,8.1,10,Jan Pester
39,tt7769118,3,nm0532721,actor,9.0,22,Luis Machín
61,tt4551544,5,nm1145057,director,8.1,14,Alejandro Chomski
62,tt6622064,1,nm1145057,self,8.5,8,Alejandro Chomski
73,tt2139847,1,nm0036109,self,8.0,9,Lucie Arnaz


In [22]:
# Number of credits attached to highly rated titles.
len(df_good_rating)

69665

In [23]:
# Group the highly rated credits by role (the `category` column) so each role
# can be pulled out with `get_group`.
df_good_rating_cat = df_good_rating.groupby("category")

In [24]:
# Unique names credited as "actor" in the highly rated credits.
# The size of the set and of the final list are both printed as a sanity check.
actor_names = df_good_rating_cat.get_group("actor").primary_name
unique_actor_names = set(list(actor_names))
print(len(unique_actor_names))
good_actors = list(unique_actor_names)
print(len(good_actors))

13397
13397


In [25]:
# Unique names credited as "actress" in the highly rated credits.
# The size of the set and of the final list are both printed as a sanity check.
actress_names = df_good_rating_cat.get_group("actress").primary_name
unique_actress_names = set(list(actress_names))
print(len(unique_actress_names))
good_actresses = list(unique_actress_names)
print(len(good_actresses))

6690
6690


In [26]:
# Unique names credited as "director" in the highly rated credits.
# The size of the set and of the final list are both printed as a sanity check.
director_names = df_good_rating_cat.get_group("director").primary_name
unique_director_names = set(list(director_names))
print(len(unique_director_names))
good_directors = list(unique_director_names)
print(len(good_directors))

8967
8967


In [27]:
# Unique names credited as "writer" in the highly rated credits.
# The size of the set and of the final list are both printed as a sanity check.
writer_names = df_good_rating_cat.get_group("writer").primary_name
unique_writer_names = set(list(writer_names))
print(len(unique_writer_names))
good_writers = list(unique_writer_names)
print(len(good_writers))

3609
3609


## pick the best ones

In [28]:
# Group every rated credit (all roles) by the person's display name.
# NOTE(review): the key is `primary_name`, not `nconst`, so different people who
# share a name would land in the same group — confirm this is acceptable.
df_people = df.groupby("primary_name")

In [29]:
# Keep the actors with a consistently strong record: more than 5 rated credits
# in `df` (counted across all roles) and a mean rating of at least 8.
# NOTE(review): `df_people` is keyed by `primary_name`, so namesakes would be
# pooled together — confirm, or switch the key to `nconst`.
good_actors_new = []
for person in good_actors:
    # Look the group up once per person instead of once per condition.
    person_credits = df_people.get_group(person)
    if len(person_credits) > 5 and person_credits.averagerating.mean() >= 8:
        good_actors_new.append(person)
print(len(good_actors_new))
# Sanity check: should equal the count printed above if there are no repeats.
len(set(good_actors_new))

18


18

In [30]:
# Keep the actresses with a consistently strong record: more than 5 rated
# credits in `df` (counted across all roles) and a mean rating of at least 8.
# NOTE(review): `df_people` is keyed by `primary_name`, so namesakes would be
# pooled together — confirm, or switch the key to `nconst`.
good_actresses_new = []
for person in good_actresses:
    # Look the group up once per person instead of once per condition.
    person_credits = df_people.get_group(person)
    if len(person_credits) > 5 and person_credits.averagerating.mean() >= 8:
        good_actresses_new.append(person)
print(len(good_actresses_new))
# Sanity check: should equal the count printed above if there are no repeats.
len(set(good_actresses_new))

7


7

In [31]:
# Keep the directors with a consistently strong record: more than 5 rated
# credits in `df` (counted across all roles) and a mean rating of at least 8.
# NOTE(review): `df_people` is keyed by `primary_name`, so namesakes would be
# pooled together — confirm, or switch the key to `nconst`.
good_directors_new = []
for person in good_directors:
    # Look the group up once per person instead of once per condition.
    person_credits = df_people.get_group(person)
    if len(person_credits) > 5 and person_credits.averagerating.mean() >= 8:
        good_directors_new.append(person)
print(len(good_directors_new))
# Sanity check: should equal the count printed above if there are no repeats.
len(set(good_directors_new))

28


28

In [32]:
# Keep the writers with a consistently strong record: more than 5 rated credits
# in `df` (counted across all roles) and a mean rating of at least 8.
# NOTE(review): `df_people` is keyed by `primary_name`, so namesakes would be
# pooled together — confirm, or switch the key to `nconst`.
good_writers_new = []
for person in good_writers:
    # Look the group up once per person instead of once per condition.
    person_credits = df_people.get_group(person)
    if len(person_credits) > 5 and person_credits.averagerating.mean() >= 8:
        good_writers_new.append(person)
print(len(good_writers_new))
# Sanity check: should equal the count printed above if there are no repeats.
len(set(good_writers_new))

7


7

In [33]:
# Preview up to ten of the shortlisted actors.
good_actors_new[:10]

['Darren Wilson',
 'Thom Keller',
 'Pauli Janhunen Calderón',
 'Steve Ravic',
 'Timothy J. Cox',
 'Elton John',
 'Kalyan Chatterjee',
 'Joshua Bell',
 'Alex Honnold',
 'David Anghel']

In [34]:
# Preview up to ten of the shortlisted actresses.
good_actresses_new[:10]

['Sargun Mehta',
 'Lauren Lopez',
 'Sylvia Earle',
 'Lelani Mitchem',
 'Jayani Senanayake',
 'Susan Brown',
 'CC King']

In [35]:
# Preview up to ten of the shortlisted directors.
good_directors_new[:10]

['Darren Wilson',
 'Thom Zimny',
 'Brian Holden',
 'Pauli Janhunen Calderón',
 'Branko Istvancic',
 'Carlos Nader',
 'Christopher Nolan',
 'Steve Ravic',
 'Christopher Kenneally',
 'Joe L. Roberts']

In [36]:
# Preview up to ten of the shortlisted writers.
good_writers_new[:10]

['Matt Lang',
 'Darren Wilson',
 'Nick Rosen',
 'Brian Holden',
 'Eric Frith',
 'Alvaro Calderón',
 'David Anghel']

In [37]:
# Build a frame with one row per role; shorter lists are padded with missing
# values so all rows have the same width.
hire_rows = [good_actors_new, good_actresses_new, good_directors_new, good_writers_new]
df_hire = pd.DataFrame(hire_rows)

In [38]:
# Flip rows and columns so each role becomes a column.
# Caution: this reassigns `df_hire`, so running the cell twice flips it back.
df_hire = df_hire.transpose()

In [39]:
# Label the columns, in the same order the role lists were stacked.
role_labels = ["Actors", "Actresses", "Directors", "Writers"]
df_hire.columns = role_labels

In [40]:
# Display the final shortlist table (one column per role).
df_hire

Unnamed: 0,Actors,Actresses,Directors,Writers
0,Darren Wilson,Sargun Mehta,Darren Wilson,Matt Lang
1,Thom Keller,Lauren Lopez,Thom Zimny,Darren Wilson
2,Pauli Janhunen Calderón,Sylvia Earle,Brian Holden,Nick Rosen
3,Steve Ravic,Lelani Mitchem,Pauli Janhunen Calderón,Brian Holden
4,Timothy J. Cox,Jayani Senanayake,Branko Istvancic,Eric Frith
5,Elton John,Susan Brown,Carlos Nader,Alvaro Calderón
6,Kalyan Chatterjee,CC King,Christopher Nolan,David Anghel
7,Joshua Bell,,Steve Ravic,
8,Alex Honnold,,Christopher Kenneally,
9,David Anghel,,Joe L. Roberts,
