## Final Project Submission

Please fill out:
* Student name: Magali Solimano
* Student pace: self paced 
* Scheduled project review date/time: 
* Instructor name: Jeff Herman
* Blog post URL:


In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
## Load the data (11 files)
def _read_gz(path, **read_options):
    """Read one gzip-compressed delimited text file into a DataFrame."""
    return pd.read_csv(path, compression='gzip', **read_options)

# Comma-separated files
df1 = _read_gz('zippedData/bom.movie_gross.csv.gz')
df2 = _read_gz('zippedData/imdb.name.basics.csv.gz')
df3 = _read_gz('zippedData/imdb.title.akas.csv.gz')
df4 = _read_gz('zippedData/imdb.title.basics.csv.gz')
df5 = _read_gz('zippedData/imdb.title.crew.csv.gz')
df6 = _read_gz('zippedData/imdb.title.principals.csv.gz')
df7 = _read_gz('zippedData/imdb.title.ratings.csv.gz')
# Tab-separated files need an explicit delimiter; the reviews file is also
# read with an explicit ISO-8859-1 encoding
df8 = _read_gz('zippedData/rt.movie_info.tsv.gz', delimiter='\t')
df9 = _read_gz('zippedData/rt.reviews.tsv.gz', delimiter='\t', encoding='iso-8859-1')
# Comma-separated files
df10 = _read_gz('zippedData/tmdb.movies.csv.gz')
df11 = _read_gz('zippedData/tn.movie_budgets.csv.gz')

In [None]:
# Explore the dataframes and data types
print(df1.info())
df1.head()

In [None]:
print(df2.info())
df2.head()

In [None]:
print(df3.info())
df3.head()

In [None]:
print(df4.info())
df4.head()

In [None]:
print(df5.info())
df5.head()

In [None]:
print(df6.info())
df6.head()

In [None]:
print(df7.info())
df7.head()

In [None]:
print(df8.info())
df8.head()

In [None]:
print(df9.info())
df9.head()

In [None]:
print(df10.info())
df10.head()

In [None]:
print(df11.info())
df11.head()

In [None]:
## QUESTIONS - PART 1: WHAT MOVIES ARE TOP REVENUE AND ROI GENERATORS? WHAT ARE THE TOP STUDIOS?

In [None]:
#df1 and #df11 have revenue cols -- compare these results. Can the dfs be joined?

In [None]:
# Convert budget and revenue cols to float dtype; release_date to datetime. Remove '$' and ','.
df11['production_budget'] = df11['production_budget'].map(lambda x: float(x.replace("$", "").replace(',','')))
df11['domestic_gross'] = df11['domestic_gross'].map(lambda x: float(x.replace("$", "").replace(',','')))
df11['worldwide_gross'] = df11['worldwide_gross'].map(lambda x: float(x.replace("$", "").replace(',','')))
df11['release_date'] = pd.to_datetime(df11['release_date'])
df11['year'] = pd.DatetimeIndex(df11['release_date']).year

# Confirm that dtype conversions worked and that null values did not increase
display(df11.info())

In [None]:
df11 = df11.rename(columns={"movie": "title"})
df11.head()

In [None]:
# Compare top gross: df1 vs df11

In [None]:
# Top 20 rows of each source by domestic gross, shown one after the other
display(df1.sort_values('domestic_gross', ascending=False).iloc[:20])
df11.sort_values('domestic_gross', ascending=False).iloc[:20]

In [None]:
# There is overlap between df1 & df11. But, df11 has more movie entries (incl. several top revenue generators) that are not in df1.
# df1 might not have a particular studio's movies.

In [None]:
# Set index to title.
# Join df11 and df1--left join to bring in df1's studio info. 

In [None]:
df1.set_index('title', inplace = True)
df11.set_index('title', inplace = True)

In [None]:
df_financials = df11.join(df1, how='left', rsuffix='_right')
display(df_financials.info())
df_financials.head()

In [None]:
# Drop df1 revenue data since they seem to round, while df11's revenue data is more precise.
# Drop foreign_gross, year_right.

In [None]:
df_financials = df_financials.drop(columns=['domestic_gross_right', 'foreign_gross', 'year_right'])

In [None]:
# Convert "0" values to NaN
df_financials['production_budget'] = df_financials['production_budget'].replace(0.0, np.nan)
df_financials['domestic_gross'] = df_financials['domestic_gross'].replace(0.0, np.nan)
df_financials['worldwide_gross'] = df_financials['worldwide_gross'].replace(0.0, np.nan)


# Convert values to millions
df_financials['production_budget_m'] = (df_financials['production_budget'] / 1000000).round(1)
df_financials['domestic_gross_m'] = (df_financials['domestic_gross'] / 1000000).round(1)
df_financials['worldwide_gross_m'] = (df_financials['worldwide_gross'] / 1000000).round(1)

df_financials.head(10)

In [None]:
# Create new cols for foreign_gross, net_revenue, ROI
df_financials['foreign_gross_m'] = (df_financials['worldwide_gross_m'] - df_financials['domestic_gross_m']).round(1)
df_financials['net_revenue_m'] = (df_financials['worldwide_gross_m'] - df_financials['production_budget_m']).round(1)
df_financials['ROI'] = ((df_financials['worldwide_gross'] - df_financials['production_budget']) / df_financials['production_budget']).round(2)
df_financials['ROIpct'] = (((df_financials['worldwide_gross'] - df_financials['production_budget']) / df_financials['production_budget'])*100).round(2)
df_financials.head()

In [None]:
# reset index
df_financials.reset_index(inplace = True)

In [None]:
# Check for duplicates. Explore if need to remove or can keep.
df_financials['title'].value_counts()

In [None]:
df_financials[df_financials['title'].duplicated() == True]

In [None]:
# view selection of rows with duplicate titles
df_financials.sort_values('title').iloc[231:250, :]

In [None]:
# Duplicate movie titles are due to movie being re-released years later. 
# Keep all rows and create new col 'title_yr'. 
df_financials['title_yr'] = df_financials['title'].str.cat(df_financials.year.astype(str), sep = '-')
df_financials.head()

In [None]:
df_financials[df_financials['title_yr'].duplicated() == True]

In [None]:
df_financials.sort_values('title').iloc[2017:2022, :]

In [None]:
# Home movie - Double entry for 2009 release--unable to determine which one has accurate data, both appear to have notable outliers. Remove two 2009 entries.
df_financials = df_financials.drop([2019, 2020])

In [None]:
df_financials[df_financials['title_yr'].duplicated() == True]

In [None]:
df_financials.sort_values('title').iloc[2017:2022, :]

In [None]:
# Release month as a number and as a three-letter abbreviation
release_dates = df_financials['release_date']
df_financials['month_num'] = release_dates.dt.month
df_financials['month'] = release_dates.dt.month_name().str[:3]
df_financials.head()

In [None]:
# Examine distribution of data

In [None]:
# info() prints its own report and returns None, so it is not wrapped in display().
df_financials.info()
display(df_financials[["production_budget_m", "domestic_gross_m", "worldwide_gross_m", "foreign_gross_m", "net_revenue_m", "ROI"]].apply(['mean', 'median', 'std']))
# Upper-tail quantiles of the numeric columns. numeric_only=True is passed
# explicitly: since pandas 2.0 the default is False, and quantile() then raises
# on the text columns (title, studio, ...) of this frame.
for q in (0.75, 0.90, 0.95, 0.99):
    display(df_financials.quantile(q=q, numeric_only=True))

In [None]:
# Domestic and worldwide gross revenue, net revenue, ROI are positively skewed--mean is larger than median, with large outliers distorting the mean.

In [None]:
df_financials.hist(column=['worldwide_gross_m'], bins=50)

In [None]:
df_financials.boxplot(column=['worldwide_gross_m'])

In [None]:
df_financials.boxplot(column=['ROI'])

In [None]:
# Number of movies in 75th, 90th, 99th percentiles - By worldwide gross revenue
print(f"Number of movies in 75th percentile: {len(df_financials[df_financials['worldwide_gross_m']>1.044750e+02])}")    
print(f"Number of movies in 90th percentile: {len(df_financials[df_financials['worldwide_gross_m']>2.596100e+02])}")
print(f"Number of movies in 95th percentile: {len(df_financials[df_financials['worldwide_gross_m']>4.103150e+02])}")
print(f"Number of movies in 99th percentile: {len(df_financials[df_financials['worldwide_gross_m']>8.938990e+02])}")

In [None]:
# Q1: What movies are the top gross revenue generators? (including outliers)

# Create new df - movies with gross revenue above 90th percentile
df_grossrev_99p = df_financials[df_financials.worldwide_gross_m >= 8.938658e+02].sort_values('worldwide_gross_m', ascending = False)
display(df_grossrev_99p.shape)

# top 25
df_grossrev_top25 = df_financials.sort_values('worldwide_gross_m', ascending = False).head(25)
display(df_grossrev_top25.shape)

In [None]:
# Graph gross revenue movies

# Two horizontal bar charts for the top-grossing movies: gross revenue, then ROI.
# Options common to both charts are collected once.
shared_bar_options = dict(x='title', color='green', stacked=False, rot=0, label='_nolegend_')

ax1 = df_grossrev_99p.plot.barh(y='worldwide_gross_m', figsize=(12,12), **shared_bar_options)
ax1.set_title('Top Revenue Grossing Movies')
ax1.set_xlabel('Millions $')
ax1.set_ylabel('Movies')
ax1.invert_yaxis()  # first row of the frame at the top

ax2 = df_grossrev_99p.plot.barh(y='ROIpct', figsize=(12,10), **shared_bar_options)
ax2.set_title('ROI of Top Revenue Grossing Movies')
ax2.set_xlabel('ROI (%)')
ax2.set_ylabel('Movies')
ax2.invert_yaxis()

# The style is switched only after both axes have been created
plt.style.use('ggplot')
plt.show()

In [None]:
# Bars show gross revenue (left axis); dots show ROI % (right axis) for the same movies
fig, ax1 = plt.subplots(figsize=(14,9))
ax2 = ax1.twinx()

title = df_grossrev_99p['title']
ww_gross = df_grossrev_99p['worldwide_gross_m']
ROIp = df_grossrev_99p['ROIpct']

ax1.bar(title, ww_gross, color='lightsteelblue')
ax2.plot(title, ROIp, linestyle='None', marker='.', markersize=12, color='navy')

ax1.set_xticklabels(title, rotation=90, ha='center')

# Both y axes use the same 0-3000 tick grid in steps of 250
axis_ticks = np.arange(0, 3250, 250)
ax1.set_ylabel('Gross Revenue (Millions $)', color='gray')
ax1.set_yticks(axis_ticks)
ax2.set_ylabel('ROI (Percent)', color='navy')
ax2.set_yticks(axis_ticks)

plt.show()

In [None]:
# Question: What is total revenue trend per year?
df_financials.groupby(['year'])['worldwide_gross_m'].agg('sum')

In [None]:
# Question: What are the top grossing studios by year?
# Create studio df, remove NaN
df_studio_tot_rev = df_financials[df_financials['studio'].notnull()]
df_studio_tot_rev.year.value_counts(ascending = True)

In [None]:
# More studio data available from 2010 onwards
# Sum of each studio's total revenue from 2010-present
df_studio_tot_rev = df_studio_tot_rev[df_studio_tot_rev.year>2010].groupby('studio')['worldwide_gross_m'].agg(['sum'])
df_studio_tot_rev.reset_index(inplace = True)
df_studio_tot_rev.head()

In [None]:
# Rename column to 'total_revenue'
df_studio_tot_rev = df_studio_tot_rev.rename(columns={"sum": "total_revenue"})
# Sort in alphabetical order by studio
df_studio_tot_rev = df_studio_tot_rev.sort_values(by='studio', ascending=True)
df_studio_tot_rev

In [None]:
# Top 20 studios by total revenue, 2010-2019
df_studio_tot_rev_2010_2019 = df_studio_tot_rev.sort_values('total_revenue', ascending = False).head(20)
df_studio_tot_rev_2010_2019.head()

In [None]:
# Horizontal bars, one per studio, in the order of the top-20 frame
fig, ax = plt.subplots(figsize=(14,6))

studio = df_studio_tot_rev_2010_2019['studio']
tot_revenue = df_studio_tot_rev_2010_2019['total_revenue']
ax.barh(studio, tot_revenue, color='navy')

ax.set(title='Top Studios by Total Revenue (2010-2019)',
       xlabel='Total Revenue (Millions $)')
ax.set_xticks(np.arange(0, 37500, 2500))

ax.invert_yaxis()  # first row of the frame at the top
plt.tight_layout()
plt.show()

In [None]:
## Top studios by revenue from 2015-2019

In [None]:
# Top grossing studios (year 2015 to present)
# Create studio df, remove NaN
df_studio_tot_rev_2015_2019 = df_financials[df_financials['studio'].notnull()]
df_studio_tot_rev_2015_2019.head()

In [None]:
df_studio_tot_rev_2015_2019 = df_studio_tot_rev_2015_2019[df_studio_tot_rev_2015_2019.year>2014].groupby('studio')['worldwide_gross_m'].agg(['sum'])
df_studio_tot_rev_2015_2019.reset_index(inplace = True)
df_studio_tot_rev_2015_2019.head()

In [None]:
df_studio_tot_rev_2015_2019 = df_studio_tot_rev_2015_2019.rename(columns={"sum": "total_revenue"})
df_studio_tot_rev_2015_2019.head()

In [None]:
# Top 25 studios by total revenue, 2015-2019
df_studio_tot_rev_2015_2019 = df_studio_tot_rev_2015_2019.sort_values('total_revenue', ascending = False).head(20)
df_studio_tot_rev_2015_2019.head()

In [None]:
# Horizontal bars, one per studio, in the order of the 2015-2019 frame
fig, ax = plt.subplots(figsize=(14,6))

studio_2015_2019 = df_studio_tot_rev_2015_2019['studio']
tot_revenue_2015_2019 = df_studio_tot_rev_2015_2019['total_revenue']
ax.barh(studio_2015_2019, tot_revenue_2015_2019, color='navy')

ax.set(title='Top Studios by Total Revenue (2015-2019)',
       xlabel='Total Revenue (Millions $)')
ax.set_xticks(np.arange(0, 27500, 2500))

ax.invert_yaxis()  # first row of the frame at the top
plt.tight_layout()
plt.show()

In [None]:
# Plot top studios' revenue by year

In [None]:
# Top grossing studios by year: 2010-present
# Create studio df, remove NaN

df_studio_tot_rev_annual = df_financials[df_financials['studio'].notnull()]
df_studio_tot_rev_annual = df_studio_tot_rev_annual[df_studio_tot_rev_annual.year>2009]
df_studio_tot_rev_annual.reset_index(inplace = True)
df_studio_tot_rev_annual.head()

In [None]:
# Create a subset
df_studio_tot_rev_annual = df_studio_tot_rev_annual.loc[:, ['studio', 'year', 'worldwide_gross_m']]
df_studio_tot_rev_annual.head()

In [None]:
# Separate top 6 studios from others
top6_studios = ['BV', 'Uni.', 'Fox', 'WB', 'Sony', 'Par.']

df_top6_studios_tot_rev_annual = df_studio_tot_rev_annual.loc[df_studio_tot_rev_annual['studio'].isin(top6_studios)]
df_top6_studios_tot_rev_annual.head()


In [None]:
# Group by multiple categories
df_top6_studios_tot_rev_annual = df_top6_studios_tot_rev_annual.groupby(['year', 'studio']).sum().unstack()
df_top6_studios_tot_rev_annual.tail()

In [None]:
from matplotlib import cm  # import retained so `cm` stays available to other cells
# plt.get_cmap replaces matplotlib.cm.get_cmap, which was deprecated in
# Matplotlib 3.7 and removed in later releases.
viridis = plt.get_cmap('viridis', 6)

fig, ax = plt.subplots(figsize=(12,8))

# stacked takes a boolean; the string 'True' only worked because any non-empty
# string is truthy.
df_top6_studios_tot_rev_annual.plot(kind='bar', stacked=True, color=viridis.colors, ax=ax)
# Legend labels are read from the studio level of the unstacked columns, so
# they always match the order in which the bars are drawn.
studio_labels = list(df_top6_studios_tot_rev_annual.columns.get_level_values('studio'))
ax.legend(studio_labels, loc='center left', bbox_to_anchor=(1, 0.5))
ax.set_title('Top Six Studios - Gross Revenue (Millions $)')
ax.set_ylabel('Millions $')
ax.set_xlabel('Year')
plt.show()


In [None]:
## QUESTIONS - PART 2: INDUSTRY TRENDS: IS MOVIE INDUSTRY PROFITABLE? Analysis w/o outliers and from year 2000 - present

# Create ROI df, remove NaN
df_roi = df_financials[df_financials['ROI'].notnull()]
display(df_financials.describe())
df_roi.describe()

In [None]:
# Outlier detection
min_roi_threshold, max_roi_threshold = df_roi['ROI'].quantile([0.00, 0.90]).round(2)
min_roi_threshold, max_roi_threshold

In [None]:
df_roi[df_roi['ROI'] > max_roi_threshold]

In [None]:
# new DF without ROI outliers and with movies from year 2000 - present
df_roi = df_roi[(df_roi['ROI'] < max_roi_threshold) & 
                       (df_roi['ROI'] > min_roi_threshold) & 
                       (df_roi['year'] >= 2000)]
display(df_roi.sort_values('ROI', ascending = False).head(10))

In [None]:
fig, ax = plt.subplots(figsize=(6,6))
ax.hist(df_roi['ROI'], bins='auto', alpha = 0.5)
ax.set_title('Distribution of ROI')
ax.set_xlabel('ROI')
ax.set_ylabel('Number of Movies')
ax.axvline(df_roi['ROI'].mean(), color = 'black')
ax.axvline(df_roi['ROI'].median(), color = 'red')

In [None]:
display(df_roi.ROI.apply(['mean', 'median', 'std']))

In [None]:
df_roi.boxplot(column=['ROI'])

In [None]:
#create new ROI buckets column 
def ROI_buckets(r):
    """Map an ROI ratio to a human-readable bucket label.

    Parameters
    ----------
    r : float
        ROI expressed as a ratio (1.0 means a 100% return).

    Returns
    -------
    str
        'Not profitable' for r < 0; otherwise the label of the first
        100-point band whose upper bound is >= r ('0 - 100%' through
        '700% - 800%'); 'Other' for r > 8 and for NaN, which fails every
        comparison.
    """
    if r < 0:
        return 'Not profitable'

    # (inclusive upper bound, label), checked in ascending order
    bands = (
        (1, '0 - 100%'),
        (2, '100% - 200%'),
        (3, '200% - 300%'),
        (4, '300% - 400%'),
        (5, '400% - 500%'),
        (6, '500% - 600%'),
        (7, '600% - 700%'),  # was '600% - 700&' (typo in the label)
        (8, '700% - 800%'),
    )
    for upper_bound, label in bands:
        if r <= upper_bound:
            return label
    return 'Other'

# assign() returns a new frame, so the bucket column is added without writing
# into a frame that was produced by boolean filtering (which triggers
# pandas' SettingWithCopyWarning).
df_roi = df_roi.assign(ROI_buckets=df_roi['ROI'].apply(ROI_buckets))
display(df_roi.head(10))
df_roi.info()

In [None]:
#Distribution of ROI by year
df_roi_yr = df_roi.groupby('year')['ROI_buckets'].value_counts(normalize = True)
df_roi_yr.tail(10)

In [None]:
# Analysis of ROI of movies in 90th percentile by ROI and after year 2000 (3,676 total movies)
df_yr_median = df_roi.groupby(['year']).agg(['median'])
df_yr_median

In [None]:
df_yr_median_roi = df_yr_median['ROIpct']
df_yr_median_roi = df_yr_median_roi.rename(columns={"median": "ROIpct_median"})
df_yr_median_roi.columns

In [None]:
df_yr_median_roi.head()

In [None]:
# Bar chart of the yearly median ROI (%)

fig, ax = plt.subplots(figsize=(10,8))

ROIpct_yr_median = df_yr_median_roi['ROIpct_median']
yr = df_yr_median_roi.index
ax.bar(yr, ROIpct_yr_median, width=0.8, color='navy')

ax.set(title='Median ROI - Movies in 90th percentile',
       xlabel='Year',
       ylabel='Median ROI (%)')

# One tick per year 2000-2019; y ticks every 10 points up to 120
ax.set_xticks(np.arange(2000, 2020, 1))
ax.set_yticks(np.arange(0, 130, 10))

plt.style.use('ggplot')
plt.tight_layout()
plt.show()

In [None]:
df_roi_pct = ((df_roi.groupby(['year', 'ROI_buckets'])['ROI_buckets'].count()/df_roi.groupby(['year'])['ROI_buckets'].count()))*100
display(df_roi_pct.tail(8))

In [None]:
viridis = cm.get_cmap('viridis', 9)

ax = df_roi_pct.unstack().plot.bar(stacked = True, figsize=(14,10), color=viridis.colors)

ax.set_title('ROI - Movies in 90th percentile', fontsize = 18)
ax.set_xlabel('Year', fontsize = 14)
ax.set_ylabel('Percent of movies (%)', fontsize = 14)
ax.yaxis.set_ticks(np.arange(0, 110, 10))
ax.tick_params(labeltop=False, labelright=True)

# Reverse legend order
handles, labels = ax.get_legend_handles_labels()
ax.legend(reversed(handles), reversed(labels), loc='center left', bbox_to_anchor=(1.05,0.5))

plt.show()

In [None]:
## Explore distributions and relationships
from scipy.stats import linregress

# ROI (%) on the x axis, production budget on the y axis
x = df_roi['ROIpct']
y = df_roi['production_budget']

df_roi.plot.scatter(x='ROIpct', y='production_budget', c='blue', figsize=(10,6))

# Least-squares fit of budget on ROI, drawn over the scatter
stats = linregress(x, y)
m, b = stats.slope, stats.intercept
plt.plot(x, m * x + b, color="red", linewidth=3)

plt.title('Relationship between ROI and Production Budget', fontsize=12)
plt.xlabel('ROI (%)', fontsize=12)

plt.show()

In [None]:
# Median production budget (millions $) per ROI bucket, as a bar chart
median_budget_by_bucket = df_roi.groupby('ROI_buckets')['production_budget_m'].median()
median_budget_by_bucket.plot.bar(figsize=(10,6))
plt.title('Median Production Budget (Millions $)', fontsize=12)
plt.xlabel('ROI', fontsize=12)
plt.yticks(np.arange(0, 45, 5))
plt.show()


In [None]:
df_yr_median.reset_index(inplace = True)
df_yr_median.head()

In [None]:
yr = df_yr_median[['year']]
worldwide_gross = df_yr_median[['worldwide_gross_m']]
production_budget = df_yr_median[['production_budget_m']]

df_roi_comp = df_yr_median[['year','worldwide_gross_m','production_budget_m', 'domestic_gross_m', 'foreign_gross_m']]
df_roi_comp = df_roi_comp.rename(columns = {'year':'Year', 'worldwide_gross_m': 'Gross Revenue', 'production_budget_m': 'Production Budget', 'domestic_gross_m': 'Domestic Gross', 'foreign_gross_m': 'Foreign Gross'})
df_roi_comp.head()

In [None]:
# Graph gross revenue and production budget by year

df_roi_comp.plot(x="Year", y=["Production Budget", "Gross Revenue"], kind="bar", figsize=(8,8))
plt.title('Gross Revenue and Production Budget (Millions $)', fontsize = 12)
plt.xlabel('Year', fontsize = 12)
plt.yticks(np.arange (0, 80, 5))

plt.legend()

In [None]:
# Breakdown of domestic and foreign gross revenue

df_roi_comp.plot(x="Year", y=["Domestic Gross", "Foreign Gross"], figsize=(12,8))
plt.title('Gross Revenue and Domestic Revenue (Millions $)', fontsize = 12)
plt.xlabel('Year', fontsize = 12)
plt.xticks(np.arange (2000, 2020, 1))
plt.yticks(np.arange (0, 50, 5))
plt.legend()
plt.show()

In [None]:
df_roi_comp_from2014 = df_yr_median[['year','worldwide_gross_m','production_budget_m']][df_yr_median['year']>=2014]
df_roi_comp_from2014.head()

In [None]:
df_roi_comp_from2014.plot.scatter(x = 'worldwide_gross_m',
                       y = 'production_budget_m', 
                       c = 'blue', 
                   figsize = (10,6))

from scipy.stats import linregress
x = df_roi_comp_from2014.worldwide_gross_m
y = df_roi_comp_from2014.production_budget_m

m = stats.slope
b = stats.intercept

plt.title('Relationship between Gross Revenue and Production Budget', fontsize = 12)
#plt.xlabel('Gross Revenue', fontsize = 12)
#plt.ylabel('Production Budget', fontsize = 12)

plt.show()

In [None]:
## ROI and year movie released
# Bar chart of the yearly median ROI (%), without a legend
df_yr_median.plot.bar(
    x='year',
    y='ROIpct',
    color='navy',
    figsize=(10,8),
    legend=None,
)

plt.title('Median ROI by Year (%)', fontsize=12)
plt.xlabel('Year', fontsize=12)
plt.yticks(np.arange(0, 160, 20))

plt.show()


In [None]:
#Distribution of ROI by month
df_roi_mo_median = df_roi.groupby(['month_num']).agg(['median'])
df_roi_mo_median.head()

In [None]:
df_roi_mo_median.reset_index(inplace = True)

In [None]:
df_roi_mo_median['month'] = pd.to_datetime(df_roi_mo_median['month_num'], format='%m').dt.month_name().str.slice(stop=3)
df_roi_mo_median.head()

In [None]:
df_roi_mo_median.plot.bar(x='month', y='ROIpct', color = 'green', figsize=(10,6), legend=None)
plt.title('Median ROI by Month (%)', fontsize = 12)
plt.xlabel('Month', fontsize = 12)
plt.yticks(np.arange (0, 120, 20))

plt.show()

In [None]:
# Which studios produce high ROI movies?

In [None]:
df_roi['studio'].nunique()

In [None]:
# Create new df - top studios by ROI
df_studio_roi = df_roi.dropna(subset=['studio'])
df_studio_roi.head()

In [None]:
df_studio_roi = df_roi.sort_values('ROIpct', ascending = False)
df_studio_roi = df_roi.groupby('studio')['ROIpct'].agg(['median'])
df_studio_roi.reset_index(inplace = True)

In [None]:
df_studio_roi.rename(columns = {'median':'ROIpct_median'}, inplace = True)

In [None]:
df_studio_roi.sort_values('ROIpct_median', ascending = False)

In [None]:
df_studio_roi_top25 = df_studio_roi.sort_values('ROIpct_median', ascending = False).head(25)
df_studio_roi_top50 = df_studio_roi.sort_values('ROIpct_median', ascending = False).head(50)
df_studio_positiveROI = df_studio_roi[df_studio_roi.ROIpct_median>100].sort_values('ROIpct_median', ascending = False)
#df_financials[df_financials.worldwide_gross_m >= 8.938658e+02].sort_values('worldwide_gross_m', ascending = False)

display(df_studio_roi_top25.shape)
display(df_studio_roi_top50.shape)
display(df_studio_positiveROI.shape)

In [None]:
# Horizontal bars of median ROI (%) per studio, without a legend
df_studio_positiveROI.plot.barh(x='studio', y='ROIpct_median', color = 'navy', figsize=(10,8), legend=None)

plt.title('Top Studios by ROI')
plt.xlabel('Median ROI (%)')
# Call plt.ylabel. The previous `plt.ylabel=('Studio')` was an assignment: it set
# no label and replaced the pyplot function with a string for the rest of the session.
plt.ylabel('Studio')

plt.gca().invert_yaxis()  # first row of the frame at the top
plt.tight_layout()
plt.show()

In [None]:
## QUESTIONS - PART 3: WHAT IS THE BUSINESS MODEL OF TOP STUDIOS? (Movie runtime, genre)

In [None]:
# Explore df4 and df8

In [None]:
df4.columns

In [None]:
df8.columns

In [None]:
df8.studio.value_counts().nlargest(25)

In [None]:
# Explore genres, runtimes for top revenue grossing studios

In [None]:
#Create new df - studios' business model (genres, runtime for top revenue grossing studios)
df_topstudio_model = df8[(df8['studio'] == 'Universal Pictures') | 
                         (df8['studio'] == 'Paramount Pictures') |
                         (df8['studio'] == '20th Century Fox') |
                         (df8['studio'] == 'Sony Pictures Classics') |
                         (df8['studio'] == 'Warner Bros. Pictures') |
                        (df8['studio'] == 'Warner Bros.') |
                        (df8['studio'] =='Buenva Vista Pictures') |
                        (df8['studio'] == 'Walt Disney Pictures')]

df_topstudio_model.head()


In [None]:
# Clean up studio names
df_topstudio_model['studio'] = df_topstudio_model['studio'].replace({'Warner Bros.' : 'Warner Bros. Pictures'})
df_topstudio_model['studio'] = df_topstudio_model['studio'].replace({'Walt Disney Pictures' : 'Buena Vista Pictures'})
df_topstudio_model.head(10)

In [None]:
# Average runtime by studio
df_topstudio_model.info()

In [None]:
df_topstudio_model = df_topstudio_model.set_index('studio')
df_topstudio_model.head()

In [None]:
df_topstudio_model['runtime'] = df_topstudio_model.runtime.str.split(" ", n = 1, expand = True)
df_topstudio_model = df_topstudio_model.rename(columns={'runtime': 'runtime_minutes'})
df_topstudio_model.head()

In [None]:
df_topstudio_model.info()

In [None]:
#Convert runtime_minutes col to int
df_topstudio_model_runtime = pd.to_numeric(df_topstudio_model.runtime_minutes, errors='coerce')
df_topstudio_model_runtime = df_topstudio_model_runtime.reset_index()
df_topstudio_model_runtime.head()

In [None]:
df_topstudio_model_runtime.describe()

In [None]:
df_topstudio_model_runtime = df_topstudio_model_runtime.groupby('studio')['runtime_minutes'].agg('mean')
df_topstudio_model_runtime = df_topstudio_model_runtime.reset_index()
df_topstudio_model_runtime.head()

In [None]:
df_topstudio_model_runtime = df_topstudio_model_runtime.sort_values(by = 'runtime_minutes', ascending = False)
df_topstudio_model_runtime.head()

In [None]:
df_topstudio_model_runtime.plot.barh(x='studio',  y = 'runtime_minutes', color = 'navy', figsize=(6, 8), legend=None)

plt.title('Average Runtime by Studio')
plt.xlabel('Minutes')

plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

In [None]:
# Create new df splitting genres
df_topstudio_model_genres = df_topstudio_model
df_topstudio_model_genres.head()

In [None]:
df_topstudio_model_genres = df_topstudio_model_genres['genre'].str.split("|", n=10, expand = True).rename(columns = lambda x: f"genre_{x+1}")
df_topstudio_model_genres.head(10)

In [None]:
# Genre value counts by studio

In [None]:
genres_pct = df_topstudio_model_genres.stack().value_counts(normalize=True, dropna=True).mul(100).round(1)

In [None]:
genres_pct

In [None]:
genres_pct.plot(kind = 'barh', color = 'navy', figsize=(6,8))
plt.title('Genres Produced by Top Studios')
plt.gca().invert_yaxis()
plt.xlabel('Percent of films produced')
plt.show()