In [1]:
#imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import tarfile
import math
from scipy import stats
import statsmodels.formula.api as smf

In [7]:
#import movie dataset
movie = pd.read_csv('../data/MovieSummaries/movie.metadata.tsv', sep='\t')

In [8]:
#import character dataset
character = pd.read_csv('../data/MovieSummaries/'+'character.metadata.tsv', sep='\t')

# Data cleaning

In [9]:
movie

Unnamed: 0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science Fiction"", ""/m/03npn"": ""Horror"", ""/m/03k9fj"": ""Adventure"", ""/m/0fdjb"": ""Supernatural"", ""/m/02kdv5l"": ""Action"", ""/m/09zvmj"": ""Space western""}"
0,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."
1,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D..."
2,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic..."
3,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}"
4,13696889,/m/03cfc81,The Gangsters,1913-05-29,,35.0,"{""/m/06ppq"": ""Silent film"", ""/m/02h40lc"": ""Eng...","{""/m/09c7w0"": ""United States of America""}","{""/m/02hmvc"": ""Short Film"", ""/m/06ppq"": ""Silen..."
...,...,...,...,...,...,...,...,...,...
81735,35228177,/m/0j7hxnt,Mermaids: The Body Found,2011-03-19,,120.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/07s9rl0"": ""Drama""}"
81736,34980460,/m/0g4pl34,Knuckle,2011-01-21,,96.0,"{""/m/02h40lc"": ""English Language""}","{""/m/03rt9"": ""Ireland"", ""/m/07ssc"": ""United Ki...","{""/m/03bxz7"": ""Biographical film"", ""/m/07s9rl0..."
81737,9971909,/m/02pygw1,Another Nice Mess,1972-09-22,,66.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/06nbt"": ""Satire"", ""/m/01z4y"": ""Comedy""}"
81738,913762,/m/03pcrp,The Super Dimension Fortress Macross II: Lover...,1992-05-21,,150.0,"{""/m/03_9r"": ""Japanese Language""}","{""/m/03_3d"": ""Japan""}","{""/m/06n90"": ""Science Fiction"", ""/m/0gw5n2f"": ..."


In [5]:
#move release dates and actor birth dates to datetime format and keep only release year
character.Movie_release_date = pd.to_datetime(character['Movie_release_date'],
                                              errors='coerce').dt.year

movie.Movie_release_date = pd.to_datetime(movie['Movie_release_date'],
                                              errors='coerce').dt.year

KeyError: 'Movie_release_date'

In [None]:
#sort dataframes by ascending release year
character.sort_values(by=['Movie_release_date'], ascending=True, inplace=True)

movie.sort_values(by=['Movie_release_date'], ascending=True, inplace=True)

In [None]:
#merge dataframes
ds = pd.merge(movie,character)

In [None]:
#find movies without box office and filter them out
bad_ids = ds.loc[ds['Movie_box_office_revenue'].isnull()].index
ds_filt = ds.drop(bad_ids)

## Preliminary results on the impact of one actor

In [None]:
#slice the character dataframe to get only Leonardo Di Caprio movies
nc = pd.DataFrame(ds_filt.query("Actor_name == 'Leonardo DiCaprio'")).reset_index(drop=True)

In [None]:
#number of movies Leonardo Di Caprio starred in
print('Leonardo Di Caprio starred in ' + str(len(nc)) + ' movies')

In [None]:
nc.head(3)

As a preliminary analysis, we plot the distribution of box office revenues for Leonardo Di Caprio movies to see the type of distribution. From the histogram below, we observe that most of the movies have a low box office revenue (skewed distribution); the goal is to verify if there is a correlation between the 'experience' of the actor (number of times he has starred in the movie) and the revenue. 

From the box plot below it is possible to distinguish the median and the quartiles of the distribution. A couple of outliers make it difficult to visualize the distribution properly. To correctly read the data, we proceed to remove them.

In [None]:
#plot distribution
fig, ax = plt.subplots(1,2, figsize=(8,3), sharey = True)

sbplt = ax[0]
sbplt.hist(nc.Movie_box_office_revenue, bins=15, orientation='horizontal')
sbplt.set_title('Histogram')

sbplt = ax[1]
sbplt.boxplot(nc.Movie_box_office_revenue)
sbplt.set_title('Box plot')
    
fig.tight_layout()
fig.text(0,0.3, "Box office revenue [$]", rotation = 90)
plt.show()

In [None]:
#remove outliers
nc.query('Movie_box_office_revenue < 0.5e9', inplace=True)
nc.reset_index(inplace=True, drop=True)
print('The new number of movies considered is ' + str(len(nc)))

In this case, the distribution is way more readable. Most of the movies are in the low box office part

In [None]:
#some stats
print('The median is ' + str(np.median(nc.Movie_box_office_revenue)/1e6) + ' M$')
print('The first quartile is ' + str(np.quantile(nc.Movie_box_office_revenue,0.25)/1e6) + ' M$')
print('The third quartile is ' + str(np.quantile(nc.Movie_box_office_revenue,0.75)/1e6) + ' M$')

In [None]:
#plot again

fig, ax = plt.subplots(1,2, figsize=(8,3), sharey = True)

sbplt = ax[0]
sbplt.hist(nc.Movie_box_office_revenue, bins=10, orientation='horizontal')
sbplt.set_title('Histogram')

sbplt = ax[1]
sbplt.boxplot(nc.Movie_box_office_revenue)
sbplt.set_title('Box plot')
    
fig.tight_layout()
fig.text(0,0.3, "Box office revenue", rotation = 90)
plt.show()

### Actor's experience

In [None]:
#series with number of times the actor starred before (related to dataframe)
#in reality there are more movies
exp = np.zeros(len(nc)) 
t = 1
for i in range(len(nc)):
    exp[i] = t
    t += 1

In [None]:
nc['experience'] = exp #add to dataframe

In [None]:
#is there a correlation between experience and box office revenue??
stats.pearsonr(nc['experience'],nc['Movie_box_office_revenue']) #pearson correlation

In [None]:
stats.spearmanr(nc['experience'],nc['Movie_box_office_revenue'])

Applying the Pearson and Spearman relation, we obtain a positive correlation of **0.59** and **0.63**, respectively. This seems to lead to a link between experience and box office revenue.

In [None]:
#plot linear correlation
sns.lmplot(x='experience',y='Movie_box_office_revenue', data=nc)
plt.show()

### Regression analysis

To understand a bit better the influence of the other parameters on the box office revenue, a model of the revenue based on multiple factors can be tested. The factors taken into consideration are:
- Movie runtime;
- Experience.

In [None]:
#copy dataframe
nc_reg = nc.copy()

In [None]:
#logarithmic scaling of continuous variables
nc_reg['Movie_box_office_revenue'] = np.log(nc_reg['Movie_box_office_revenue'])
#nc_reg['Movie_runtime'] = np.log(nc_reg['Movie_runtime'])
#nc_reg['experience'] = np.log(nc_reg['experience'])

In [None]:
#standardize continuous variables
#nc_reg['Movie_box_office_revenue'] = (nc_reg['Movie_box_office_revenue'] - nc_reg['Movie_box_office_revenue'].mean())/nc_reg['Movie_box_office_revenue'].std()
#nc['Movie_release_date'] = (nc['Movie_release_date'] - nc['Movie_release_date'].mean())/nc['Movie_release_date'].std()
#nc['Actor_age_at_movie_release'] = (nc['Actor_age_at_movie_release'] - nc['Actor_age_at_movie_release'].mean())/nc['Actor_age_at_movie_release'].std()
#nc['experience'] = (nc['experience'] - nc['experience'].mean())/nc['experience'].std()

In [None]:
mod = smf.ols(formula='Movie_box_office_revenue ~  Movie_runtime +  experience', data=nc_reg)

res = mod.fit()
print(res.summary())

From the results, it is possible to get interesting information. The R squared is **0.52**, meaning more than half of the variance is explained. The movie runtime (**C=0.03**) and the experience of the actor (**C=0.174**) positively influence the revenue, even if only the latter is statistically significant (**P < 5%**).
A confounder that could naively lead us to overestimate the statistical significance of the actor's experience is inflation. Indeed, over the years the value of money changes; for a proper evaluation, an inflation correction has to be done.

In [None]:
#take inflation into account
#https://liberalarts.oregonstate.edu/spp/polisci/faculty-staff/robert-sahr/inflation-conversion-factors-years-1774-estimated-2024-dollars-recent-years/individual-year-conversion-factor-table-0
cf = pd.read_csv('../data/inflation_correction.csv')
cf.query('Year > 1969', inplace=True)
cf.reset_index(drop=True, inplace=True)
#cf['Year'] = pd.to_datetime(pd.Series(cf.Year)).dt.year #convert to datetime format

In [None]:
cf.head(3)

In [None]:
#function to compute inflation for a certain year
def infl(year):
    inf = cf['CF'][cf['Year'] == year]
    return inf

In [None]:
#compute inflation for dataframe rows and add column
infl_corr = np.zeros(len(nc))
for i in range(len(nc)):
    infl_corr[i] = infl(nc['Movie_release_date'][i])
nc['CF'] = infl_corr

In [None]:
#add inflation corrected box office revenue
nc['inflation_revenue'] = nc['Movie_box_office_revenue']/nc['CF']

In [None]:
nc.head(3)

In [None]:
#compare the new revenues
plt.plot(nc.Movie_release_date, nc.inflation_revenue/1e6, label='Inflation correction')
plt.plot(nc.Movie_release_date, nc.Movie_box_office_revenue/1e6, label='Real revenue')
plt.legend()
plt.xlabel('Year')
plt.ylabel('Boc office revenue [M$]')
plt.show()

In [None]:
#run the least mean squares again
nc_reg2 = nc.copy()
#logarithmic scaling of continuous variables
nc_reg2['inflation_revenue'] = np.log(nc_reg2['inflation_revenue'])
#nc_reg2['Movie_runtime'] = np.log(nc_reg2['Movie_runtime'])
#nc_reg2['Movie_release_date'] = np.log(nc_reg2['Movie_release_date'])
#nc_reg2['experience'] = np.log(nc_reg2['experience'])

In [None]:
mod = smf.ols(formula='inflation_revenue ~ Movie_runtime + experience', data=nc_reg2)

res = mod.fit()
print(res.summary())

From the new results above, it is possible to confirm the statistical significance of the experience on the box office (**P < 5%**), even if its influence has decreased (**C=0.148**), due to inflation.