In [1]:
#imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import tarfile
import math
from scipy import stats
import statsmodels.formula.api as smf

In [3]:
filename = "MovieSummaries.tar.gz"
# Open the archive a single time and let the context manager close it; the
# previous version opened it twice and never released either handle.
# NOTE(review): extractall() trusts the member paths stored in the archive, so
# only run this on an archive from a trusted source (recent Python versions
# offer an extraction `filter` argument to harden this step).
with tarfile.open(filename) as tf:
    print(tf.getnames())
    tf.extractall('MS_decomp')

['MovieSummaries', 'MovieSummaries/tvtropes.clusters.txt', 'MovieSummaries/name.clusters.txt', 'MovieSummaries/plot_summaries.txt', 'MovieSummaries/README.txt', 'MovieSummaries/movie.metadata.tsv', 'MovieSummaries/character.metadata.tsv']


In [None]:
# Column labels for the headerless, tab-separated movie metadata file, in file order.
movie_columns = [
    'Wikipedia_movie_ID',
    'Freebase_movie_ID',
    'Movie_name',
    'Movie_release_date',
    'Movie_box_office_revenue',
    'Movie_runtime',
    'Movie_languages',
    'Movie_countries',
    'Movie_genres',
]
movie = pd.read_csv(
    './MS_decomp/MovieSummaries/'+'movie.metadata.tsv',
    header=None,
    names=movie_columns,
    sep='\t',
)

In [None]:
# Column labels for the headerless, tab-separated character metadata file, in file order.
character_columns = [
    'Wikipedia_movie_ID',
    'Freebase_movie_ID',
    'Movie_release_date',
    'Character_name',
    'Actor_date_of_birth',
    'Actor_gender',
    'Actor_height',
    'Actor_ethnicity',
    'Actor_name',
    'Actor_age_at_movie_release',
    'Freebase_character/actor_map_ID',
    'Freebase_character_ID',
    'Freebase_actor_ID',
]
character = pd.read_csv(
    './MS_decomp/MovieSummaries/'+'character.metadata.tsv',
    header=None,
    names=character_columns,
    sep='\t',
)

In [None]:
# Plot summaries: one row per movie, keyed by the Wikipedia movie ID.
plots_columns = ['Wikipedia_movie_ID', 'Summary']
plots = pd.read_csv(
    './MS_decomp/MovieSummaries/'+'plot_summaries.txt',
    header=None,
    names=plots_columns,
    sep='\t',
)

In [None]:
# Preview the first three rows of the movie metadata.
movie.head(n=3)

In [None]:
# Preview the first three rows of the character metadata.
character.head(n=3)

In [None]:
# Preview the first three rows of the plot summaries.
plots.head(n=3)

# Data cleaning

In [None]:
#reduce the release dates of both tables to the release year
def _release_year(dates):
    """Return the release year of every entry in ``dates`` (NaN when unparseable).

    Only the leading four characters of each value are parsed, so the result
    does not depend on every row sharing one date format.  Parsing the whole
    string with ``errors='coerce'`` can silently turn rows into NaT when the
    parser settles on a single format for the column (pandas >= 2.0 behaviour).
    NOTE(review): presumably the raw column mixes full dates and year-only
    strings -- confirm against the data.

    Re-running the cell is safe: an already-converted year such as ``1997.0``
    is rendered as ``'1997.0'`` and parsed back to 1997.
    """
    year_text = dates.astype(str).str.slice(0, 4)
    return pd.to_datetime(year_text, format='%Y', errors='coerce').dt.year


character['Movie_release_date'] = _release_year(character['Movie_release_date'])

movie['Movie_release_date'] = _release_year(movie['Movie_release_date'])

In [None]:
#order both tables chronologically (ascending release year), in place
for frame in (character, movie):
    frame.sort_values(by='Movie_release_date', ascending=True, inplace=True)

In [None]:
#inner-join the movies with their cast on the three columns both tables share
shared_keys = ['Wikipedia_movie_ID', 'Freebase_movie_ID', 'Movie_release_date']
ds = movie.merge(character, how='inner', on=shared_keys)

In [None]:
#drop every row whose movie has no recorded box office revenue
revenue_missing = ds['Movie_box_office_revenue'].isna()
bad_ids = ds.index[revenue_missing]
ds_filt = ds.drop(index=bad_ids)

In [None]:
#sort dataframe by movie release date
#ds_filt.sort_values(by=['Movie_release_date'], ascending=True, inplace=True)

In [None]:
# Preview the merged table after the revenue filter.
ds_filt.head(n=3)

## Preliminary results on the impact of one actor

In [None]:
#keep only the rows of the merged, revenue-filtered table credited to Leonardo DiCaprio
dicaprio_rows = ds_filt.query("Actor_name == 'Leonardo DiCaprio'")
nc = dicaprio_rows.reset_index(drop=True)

In [None]:
#number of rows (movies with a known revenue) found for Leonardo DiCaprio
n_movies = len(nc)
print('Leonardo Di Caprio starred in ' + str(n_movies) + ' movies')

In [None]:
# Preview the first three rows of the actor's table.
nc.head(n=3)

As a preliminary analysis, we plot the distribution of box office revenues for Leonardo DiCaprio's movies to see what kind of distribution they follow. From the histogram below, we observe that most of the movies have a low box office revenue (skewed distribution); the goal is to verify whether there is a correlation between the 'experience' of the actor (the number of movies he has starred in so far) and the revenue.

From the box plot below it is possible to distinguish the median and the quartiles of the distribution. A couple of outliers make it difficult to visualize the distribution properly. To correctly read the data, we proceed to remove them.

In [None]:
#histogram and box plot of the revenue, side by side on a shared revenue axis
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(8, 3), sharey=True)
hist_ax, box_ax = ax
box_office = nc.Movie_box_office_revenue

hist_ax.hist(box_office, bins=15, orientation='horizontal')
hist_ax.set_title('Histogram')

box_ax.boxplot(box_office)
box_ax.set_title('Box plot')

fig.tight_layout()
# one rotated, figure-level caption labels the axis shared by both panels
fig.text(0, 0.3, "Box office revenue", rotation=90)
plt.show()

In [None]:
#discard the movies that grossed 0.5e9 or more and renumber the remaining rows
nc = nc.query('Movie_box_office_revenue < 0.5e9').reset_index(drop=True)
n_kept = len(nc)
print('The new number of movies considered is ' + str(n_kept))

With the outliers removed, the distribution is much more readable. Most of the movies fall in the low box office range.

In [None]:
#median and quartiles of the revenue, reported in millions of dollars
box_office = nc.Movie_box_office_revenue
quartile_summary = [
    ('median', np.median(box_office)),
    ('first quartile', np.quantile(box_office, 0.25)),
    ('third quartile', np.quantile(box_office, 0.75)),
]
for label, value in quartile_summary:
    # divide after computing the statistic so the printed digits are unchanged
    print('The ' + label + ' is ' + str(value/1e6) + ' M$')

In [None]:
#same two panels as before, now for the table without the outliers
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(8, 3), sharey=True)
hist_ax, box_ax = ax
box_office = nc.Movie_box_office_revenue

hist_ax.hist(box_office, bins=10, orientation='horizontal')
hist_ax.set_title('Histogram')

box_ax.boxplot(box_office)
box_ax.set_title('Box plot')

fig.tight_layout()
# one rotated, figure-level caption labels the axis shared by both panels
fig.text(0, 0.3, "Box office revenue", rotation=90)
plt.show()

### Actor's experience

In [None]:
#experience counter: 1.0 for the first row of the table, 2.0 for the second, ...
#it only counts the rows present in the dataframe; in reality there are more movies
# NOTE(review): this is meaningful only if the rows are in chronological order -- confirm.
exp = np.arange(1, len(nc) + 1, dtype=float)

In [None]:
# Attach the experience counter to the table as a new column.
nc = nc.assign(experience=exp)

In [None]:
#Pearson (linear) correlation between experience and box office revenue
experience_col = nc['experience']
revenue_col = nc['Movie_box_office_revenue']
stats.pearsonr(experience_col, revenue_col)

In [None]:
# Spearman (rank) correlation between experience and box office revenue.
experience_col = nc['experience']
revenue_col = nc['Movie_box_office_revenue']
stats.spearmanr(experience_col, revenue_col)

Computing the Pearson and Spearman correlation coefficients, we obtain positive correlations of **0.59** and **0.63**, respectively. This suggests a link between experience and box office revenue.

In [None]:
#scatter plot of revenue against experience with a fitted regression line
sns.lmplot(data=nc, x='experience', y='Movie_box_office_revenue')
plt.show()

### Regression analysis

To understand a bit better the influence of the other parameters on the box office revenue, a model of the revenue based on multiple factors can be tested. The factors taken into consideration are:
- Movie release date;
- Movie runtime;
- Experience.

In [None]:
#work on an independent copy so that the regression transforms leave `nc` untouched
nc_reg = nc.copy(deep=True)

In [None]:
#logarithmic scaling of continuous variables
# Every transform reads from the untouched `nc` columns, so re-running this
# cell cannot take the logarithm of values in `nc_reg` that were already logged.
for column in ('Movie_box_office_revenue', 'Movie_runtime',
               'Movie_release_date', 'experience'):
    nc_reg[column] = np.log(nc[column])

In [None]:
#standardize continuous variables
#nc['Movie_runtime'] = (nc['Movie_runtime'] - nc['Movie_runtime'].mean())/nc['Movie_runtime'].std()
#nc['Movie_release_date'] = (nc['Movie_release_date'] - nc['Movie_release_date'].mean())/nc['Movie_release_date'].std()
#nc['Actor_age_at_movie_release'] = (nc['Actor_age_at_movie_release'] - nc['Actor_age_at_movie_release'].mean())/nc['Actor_age_at_movie_release'].std()
#nc['experience'] = (nc['experience'] - nc['experience'].mean())/nc['experience'].std()

In [None]:
# Ordinary least squares fit of the (log) revenue on the three (log) predictors.
# The formula is now a single clean literal: the earlier backslash continuation
# sat inside the string, embedding a run of spaces and a redundant second '+'
# in front of `experience`.
mod = smf.ols(
    formula='Movie_box_office_revenue ~ Movie_runtime + Movie_release_date + experience',
    data=nc_reg,
)

res = mod.fit()
print(res.summary())

From the results, it is possible to extract interesting information. The R-squared is **0.53**, meaning that more than half of the variance is explained; the movie release date significantly influences the revenue (C=**155.4**), and both the movie runtime and the experience of the actor positively affect the revenue, even if their effect on the outcome is smaller.
Of course, inflation is not taken into account here, so the strong statistical influence of the release date on box office revenue could be biased by this confounder.

In [None]:
#take inflation into account
#https://liberalarts.oregonstate.edu/spp/polisci/faculty-staff/robert-sahr/inflation-conversion-factors-years-1774-estimated-2024-dollars-recent-years/individual-year-conversion-factor-table-0
# Load the conversion-factor table and keep the years after 1969, renumbered from 0.
cf = (
    pd.read_csv('inflation_correction.csv')
    .query('Year > 1969')
    .reset_index(drop=True)
)
#cf['Year'] = pd.to_datetime(pd.Series(cf.Year)).dt.year #convert to datetime format

In [None]:
# Preview the first three rows of the conversion-factor table.
cf.head(n=3)

In [None]:
#function to look up the inflation conversion factor of a given year
def infl(year, table=None):
    """Return the conversion factor (`CF`) recorded for ``year`` as a float.

    Parameters
    ----------
    year : number
        Year to look up; compared with ``==`` against the ``Year`` column.
    table : DataFrame, optional
        Table with ``Year`` and ``CF`` columns.  Defaults to the notebook-level
        ``cf`` table, which keeps existing ``infl(year)`` calls working.

    Returns
    -------
    float
        The factor of the first matching row, or NaN when the year is absent.
        The previous version returned the filtered Series itself, which is
        empty for an unknown year and cannot be stored as a single number.
    """
    if table is None:
        table = cf
    matches = table.loc[table['Year'] == year, 'CF']
    if matches.empty:
        return float('nan')
    return float(matches.iloc[0])

In [None]:
#look up the conversion factor of every row's release year and add it as column CF
# A vectorised year -> CF lookup replaces the row-by-row loop.  A release year
# that is absent from `cf` now yields NaN instead of aborting the cell, and no
# one-element Series has to be squeezed into a float array element any more.
# Both sides are cast to float so integer and float years compare equal.
year_to_cf = (
    cf.drop_duplicates(subset='Year')
      .assign(Year=lambda frame: frame['Year'].astype(float))
      .set_index('Year')['CF']
)
infl_corr = nc['Movie_release_date'].astype(float).map(year_to_cf).to_numpy()
nc['CF'] = infl_corr

In [None]:
#inflation-corrected revenue: the recorded revenue divided by the year's conversion factor
nc['inflation_revenue'] = nc['Movie_box_office_revenue'].div(nc['CF'])

In [None]:
# Preview the table with the new CF and inflation_revenue columns.
nc.head(n=3)

In [None]:
#compare the recorded and the inflation-corrected revenues over time (both in M$)
plt.plot(nc.Movie_release_date, nc.inflation_revenue/1e6, label='Inflation correction')
# NOTE(review): 'Real revenue' labels the uncorrected figures here; in economics
# "real" usually means inflation-adjusted, so consider renaming this label.
plt.plot(nc.Movie_release_date, nc.Movie_box_office_revenue/1e6, label='Real revenue')
plt.legend()
plt.xlabel('Year')
plt.ylabel('Box office revenue [M$]')  # typo fixed: the label used to read 'Boc office'
plt.show()

In [None]:
#prepare the data for the second least-squares fit (inflation-corrected revenue)
nc_reg2 = nc.copy()
#logarithmic scaling of continuous variables
for column in ('inflation_revenue', 'Movie_runtime',
               'Movie_release_date', 'experience'):
    nc_reg2[column] = np.log(nc_reg2[column])

In [None]:
# Ordinary least squares fit of the (log) inflation-corrected revenue on the
# three (log) predictors.  The formula is now a single clean literal: the
# earlier backslash continuation sat inside the string, embedding a run of
# spaces and a redundant second '+' in front of `experience`.
mod = smf.ols(
    formula='inflation_revenue ~ Movie_runtime + Movie_release_date + experience',
    data=nc_reg2,
)

res = mod.fit()
print(res.summary())

From the results above, a significant decrease of **1/3** in the influence of the movie release date is observed.

We can go further in the analysis, by investigating the gender equality/inequality in the movies to see if they are related to revenue as well. The eventual goal would be to establish the benefit of choosing an actor over another one, and so a gender study is fundamental for this purpose.

To obtain some preliminary results on a single sample, the analysis will focus on one of the previously studied Leonardo DiCaprio movies.

In [None]:
# Attach the full cast rows of `ds_filt` to each of the actor's movies, matching on the title.
# NOTE(review): titles may not be unique across movies (e.g. remakes) -- consider
# joining on Wikipedia_movie_ID instead; confirm before relying on `ex`.
ex = ds_filt.merge(nc, how='right', on='Movie_name')

In [None]:
# Show the merged table after the rows without box office revenue were dropped.
ds_filt

In [None]:
# Number of non-null actor names per movie title, as a one-column frame.
# NOTE(review): grouping by title pools different movies that share a name -- confirm this is intended.
cast_counts = ds.groupby('Movie_name')['Actor_name'].count()
dt = cast_counts.to_frame()

In [None]:
# Preview the cast counts of the first fifty titles.
dt.head(n=50)