In [1]:
# General libraries
import numpy as np
import pandas as pd
import csv

#### Revenue and budget merging <a class="tocSkip">

In [63]:
# Import dataset with revenue and budget info
df_revenue = pd.read_csv("final_dataset_v3.csv")
df_revenue.shape

(10112, 26)

In [61]:
# Drop columns with NA values that will not be included in the analysis
df_revenue_nonan = df_revenue.dropna()
df_revenue_nonan.shape

(8331, 26)

There are still some values of the column "runtimeMinutes" with null values that should be dropped.

In [65]:
df_revenue_nonan.groupby("runtimeMinutes").count()

Unnamed: 0_level_0,Unnamed: 0,tconst,primaryTitle,originalTitle,isAdult,startYear,Adult,Animation,Fantasy,Game-Show,...,Sci-Fi,Short,Sport,Talk-Show,War,Western,averageRating,numVotes,budget,revenue
runtimeMinutes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100,213,213,213,213,213,213,213,213,213,213,...,213,213,213,213,213,213,213,213,213,213
101,158,158,158,158,158,158,158,158,158,158,...,158,158,158,158,158,158,158,158,158,158
102,136,136,136,136,136,136,136,136,136,136,...,136,136,136,136,136,136,136,136,136,136
103,124,124,124,124,124,124,124,124,124,124,...,124,124,124,124,124,124,124,124,124,124
104,157,157,157,157,157,157,157,157,157,157,...,157,157,157,157,157,157,157,157,157,157
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,171,171,171,171,171,171,171,171,171,171,...,171,171,171,171,171,171,171,171,171,171
97,195,195,195,195,195,195,195,195,195,195,...,195,195,195,195,195,195,195,195,195,195
98,199,199,199,199,199,199,199,199,199,199,...,199,199,199,199,199,199,199,199,199,199
99,175,175,175,175,175,175,175,175,175,175,...,175,175,175,175,175,175,175,175,175,175


In [68]:
df_revenue_filtered = df_revenue_nonan[df_revenue_nonan['runtimeMinutes'].str.isnumeric()]
df_revenue_filtered.shape

(8060, 26)

#### Actors and directors merging <a class="tocSkip">

The goal of this section is identify if the titles in the dataset include top directors or top actors. For that purpose, the most famous 100 actors and most famost 100 directors will be considered, and two columns will be added to each title indicating whether it contains at least one famous actors and whether it contains at least one famous director.

In [71]:
# Dataset with actors and directors names of the movies
actors = pd.read_csv("name_basics.tsv", sep='\t')
actors.head()

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0053137,tt0031983,tt0072308,tt0050419"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack","tt0117057,tt0071877,tt0038355,tt0037382"
2,nm0000003,Brigitte Bardot,1934,\N,"actress,soundtrack,music_department","tt0049189,tt0057345,tt0054452,tt0056404"
3,nm0000004,John Belushi,1949,1982,"actor,soundtrack,writer","tt0077975,tt0080455,tt0072562,tt0078723"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0050976,tt0060827,tt0050986,tt0083922"


In [72]:
# Dataset indicating which actors and directors appear in each of the movies 
principals = pd.read_csv("title_principals.tsv", sep='\t')
principals.head()

Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0000001,1,nm1588970,self,\N,"[""Self""]"
1,tt0000001,2,nm0005690,director,\N,\N
2,tt0000001,3,nm0374658,cinematographer,director of photography,\N
3,tt0000002,1,nm0721526,director,\N,\N
4,tt0000002,2,nm1335271,composer,\N,\N


In [73]:
# Dataset containing the name of the most famous 100 actors
best_actors = pd.read_csv("best_actors.csv")
best_actors.head()

Unnamed: 0,# actor_name
0,Jack Nicholson
1,Marlon Brando
2,Robert De Niro
3,Al Pacino
4,Daniel Day-Lewis


In [74]:
# Dataset containing the name of the most famous 100 directors
best_directors = pd.read_csv("best_directors.csv")
best_directors.head()

Unnamed: 0,# director_name
0,Steven Spielberg
1,Martin Scorsese
2,Alfred Hitchcock
3,Stanley Kubrick
4,Quentin Tarantino


Let's identify the rows of the datasets regarding actors that are included in the dataset filtered with the revenue information.

In [75]:
# Array of the id of the movies included in the filtered movie dataset
unique_movies_ids = pd.unique(df_revenue_filtered["tconst"])
unique_movies_ids

array(['tt0035423', 'tt0116391', 'tt0118589', ..., 'tt9883996',
       'tt9892546', 'tt9904802'], dtype=object)

In [76]:
# Filtered dataset of actors contained in the filtered dataset of movies
filtered_actors = principals[principals["tconst"].isin(unique_movies_ids)]
filtered_actors.head()

Unnamed: 0,tconst,ordering,nconst,category,job,characters
282230,tt0035423,10,nm0107463,editor,\N,\N
282231,tt0035423,1,nm0000212,actress,\N,"[""Kate McKay""]"
282232,tt0035423,2,nm0413168,actor,\N,"[""Leopold""]"
282233,tt0035423,3,nm0000630,actor,\N,"[""Stuart Besser""]"
282234,tt0035423,4,nm0005227,actor,\N,"[""Charlie McKay""]"


In [78]:
# Join the actors 
actors_names = filtered_actors.join(actors.set_index('nconst'), on='nconst')
actors_names.head()

Unnamed: 0,tconst,ordering,nconst,category,job,characters,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
282230,tt0035423,10,nm0107463,editor,\N,\N,David Brenner,\N,\N,"editor,editorial_department","tt0319262,tt0116629,tt0096969,tt1190080"
282231,tt0035423,1,nm0000212,actress,\N,"[""Kate McKay""]",Meg Ryan,1961,\N,"actress,producer,soundtrack","tt0120632,tt0108160,tt0098635,tt0128853"
282232,tt0035423,2,nm0413168,actor,\N,"[""Leopold""]",Hugh Jackman,1968,\N,"actor,soundtrack,producer","tt3315342,tt0120903,tt1707386,tt0458525"
282233,tt0035423,3,nm0000630,actor,\N,"[""Stuart Besser""]",Liev Schreiber,1967,\N,"actor,producer,miscellaneous","tt0458525,tt0368008,tt1895587,tt0404030"
282234,tt0035423,4,nm0005227,actor,\N,"[""Charlie McKay""]",Breckin Meyer,1974,\N,"actor,writer,producer","tt0250687,tt0101917,tt0112697,tt0215129"


In [79]:
# Create a list of the most 100 famous actors (top actors)
best_actors_list = list(best_actors["# actor_name"])

In [80]:
# Identify whether each of the titles contains top actors 
actors_names["isTopActor"] = actors_names["primaryName"].isin(best_actors_list)

In [81]:
# Number of movies with top actors
actors_names["isTopActor"].sum()

1084

In [82]:
# Create a list of the most 100 famous directors (top directors)
best_directors_list = list(best_directors["# director_name"])

In [83]:
# Identify whether each of the titles contains top directors 
actors_names["isTopDirector"] = actors_names["primaryName"].isin(best_directors_list)

In [84]:
# Number of movies with top directors
actors_names["isTopDirector"].sum()

857

In [85]:
# Get the columns of isTopActor and isTopDirector
movies_tops = actors_names.groupby("tconst").sum()
movies_tops = movies_tops[["isTopActor", "isTopDirector"]]
movies_tops

Unnamed: 0_level_0,isTopActor,isTopDirector
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1
tt0035423,1,0
tt0116391,0,0
tt0118589,0,0
tt0120166,0,0
tt0120467,0,0
...,...,...
tt9872052,0,0
tt9875554,0,0
tt9883996,0,0
tt9892546,0,0


In [86]:
# Filter those movies with at least one top actor and one top director
movies_tops["isTopActor"] = movies_tops["isTopActor"] > 0
movies_tops["isTopDirector"] = movies_tops["isTopDirector"] > 0

In [87]:
# Merge the top actors and directors information with the previous dataset
final_df = df_revenue_filtered.join(movies_tops, on="tconst")

In [90]:
final_df.head()

Unnamed: 0.1,Unnamed: 0,tconst,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,Adult,Animation,Fantasy,...,Sport,Talk-Show,War,Western,averageRating,numVotes,budget,revenue,isTopActor,isTopDirector
0,0,tt0035423,Kate & Leopold,Kate & Leopold,0,2001,118,0,0,1,...,0,0,0,0,6.4,81936.0,48000000,76019048,True,False
2,5062,tt0116391,Gang,Gang,0,2000,152,0,0,0,...,0,0,0,0,6.2,236.0,30000000,41480851,False,False
3,5778,tt0118589,Glitter,Glitter,0,2001,104,0,0,0,...,0,0,0,0,2.3,23292.0,22000000,5271666,False,False
4,6540,tt0120166,The Sorcerer's Apprentice,The Sorcerer's Apprentice,0,2001,86,0,0,1,...,0,0,0,0,4.5,565.0,150000000,215283742,False,False
5,6682,tt0120467,Vulgar,Vulgar,0,2000,87,0,0,0,...,0,0,0,0,5.2,4078.0,120000,14904,False,False


In [91]:
final_df.shape

(8060, 28)

In [93]:
# Exporting filtered dataset to CSV
final_df.to_csv('revenue_actors.csv', encoding='utf-8', index=False)