In [15]:
# Note: the 'Short' (short films) genre is intentionally left out of the per-genre analyses in this notebook.

import numpy as np
import pandas as pd
import hvplot.pandas
import holoviews as hv
import matplotlib.pyplot as plt

In [2]:
# Load the cleaned IMDB dataset and drop the 'Unnamed: 0' column that comes
# with the CSV (presumably a saved index column -- verify against the file).
df = pd.read_csv('../Data/imdb_clean.csv').drop('Unnamed: 0', axis=1)
# NOTE: 'ROI' as used throughout this notebook is the absolute difference
# gross - budget, not a ratio relative to budget.
df['ROI'] = df['gross'] - df['budget']
# Preview the first rows of the prepared frame.
df.head()

Unnamed: 0,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,actor_1_name,movie_title,...,Sport,Crime,Horror,War,Biography,Music,Documentary,Short,Film-Noir,ROI
0,James Cameron,723,178,0,855.0,Joel David Moore,1000.0,760505847,CCH Pounder,Avatar,...,0,0,0,0,0,0,0,0,0,523505847
1,Gore Verbinski,302,169,563,1000.0,Orlando Bloom,40000.0,309404152,Johnny Depp,Pirates of the Caribbean: At World's End,...,0,0,0,0,0,0,0,0,0,9404152
2,Sam Mendes,602,148,0,161.0,Rory Kinnear,11000.0,200074175,Christoph Waltz,Spectre,...,0,0,0,0,0,0,0,0,0,-44925825
3,Christopher Nolan,813,164,22000,23000.0,Christian Bale,27000.0,448130642,Tom Hardy,The Dark Knight Rises,...,0,0,0,0,0,0,0,0,0,198130642
4,Andrew Stanton,462,132,475,530.0,Samantha Morton,640.0,73058679,Daryl Sabara,John Carter,...,0,0,0,0,0,0,0,0,0,-190641321


In [3]:
def get_genre_gross(df, genre):
    """Return the mean ``gross`` of the rows flagged with ``genre``, rounded to 2 decimals.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``gross`` column and an indicator column named ``genre``.
    genre : str
        Name of the indicator column; only rows where it equals 1 are averaged.

    Returns
    -------
    float
        Mean gross over the selected rows, rounded to two decimal places.
    """
    in_genre = df[genre] == 1
    gross_values = df[in_genre]['gross']
    return round(np.mean(gross_values), 2)

get_genre_gross(df.copy(), 'Action')

76430991.33

In [4]:
def get_all_genre_gross(df):
    """Build a table of mean gross per genre, sorted from lowest to highest.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame passed (as a copy) to ``get_genre_gross`` for every genre.

    Returns
    -------
    pandas.DataFrame
        Indexed by genre name with a single ``'Mean Gross'`` column,
        sorted ascending by that column.
    """
    # Genres covered by the summary ('Short' is not part of this list).
    genres = (
        'Action', 'Adventure', 'Fantasy', 'Sci-Fi', 'Thriller', 'Romance',
        'Animation', 'Comedy', 'Family', 'Musical', 'Mystery', 'Western',
        'Drama', 'History', 'Sport', 'Crime', 'Horror', 'War', 'Biography',
        'Music', 'Documentary', 'Film-Noir',
    )

    mean_gross_by_genre = {
        name: get_genre_gross(df.copy(), name) for name in genres
    }

    summary = pd.DataFrame.from_dict(
        mean_gross_by_genre, orient='index', columns=['Mean Gross']
    )
    return summary.sort_values(by='Mean Gross')

def plot_all_genre_gross():
    """Bar chart of mean gross per genre with a reference line.

    Works on a copy of the notebook-level ``df``. The horizontal line is drawn
    at the mean of the per-genre mean values.

    Returns
    -------
    The bar plot overlaid (``*``) with an ``hv.HLine``.
    """
    gross_by_genre = get_all_genre_gross(df.copy())
    bars = gross_by_genre.hvplot(title='Mean Gross by Genre', rot=90, kind='bar')
    average_of_means = gross_by_genre['Mean Gross'].mean()
    return bars * hv.HLine(average_of_means)

plot_all_genre_gross()

In [5]:
def get_genre_budget(df, genre):
    """Return the mean ``budget`` of the rows flagged with ``genre``, rounded to 2 decimals.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``budget`` column and an indicator column named ``genre``.
    genre : str
        Name of the indicator column; only rows where it equals 1 are averaged.

    Returns
    -------
    float
        Mean budget over the selected rows, rounded to two decimal places.
    """
    genre_rows = df[df[genre] == 1]
    average_budget = np.mean(genre_rows['budget'])
    return round(average_budget, 2)

get_genre_budget(df.copy(), 'Action')

70676747.21

In [6]:
def get_all_genre_budget(df):
    """Build a table of mean budget per genre, sorted from lowest to highest.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame passed (as a copy) to ``get_genre_budget`` for every genre.

    Returns
    -------
    pandas.DataFrame
        Indexed by genre name with a single ``'Mean Budget'`` column,
        sorted ascending by that column.
    """
    # Genres covered by the summary ('Short' is not part of this list).
    genres = (
        'Action', 'Adventure', 'Fantasy', 'Sci-Fi', 'Thriller', 'Romance',
        'Animation', 'Comedy', 'Family', 'Musical', 'Mystery', 'Western',
        'Drama', 'History', 'Sport', 'Crime', 'Horror', 'War', 'Biography',
        'Music', 'Documentary', 'Film-Noir',
    )

    mean_budget_by_genre = {
        name: get_genre_budget(df.copy(), name) for name in genres
    }

    summary = pd.DataFrame.from_dict(
        mean_budget_by_genre, orient='index', columns=['Mean Budget']
    )
    return summary.sort_values(by='Mean Budget')

def plot_all_genre_budget():
    """Bar chart of mean budget per genre with a reference line.

    Works on a copy of the notebook-level ``df``. The horizontal line is drawn
    at the mean of the per-genre mean budgets.

    Returns
    -------
    The bar plot overlaid (``*``) with an ``hv.HLine``.
    """
    df_budget = get_all_genre_budget(df.copy())
    # Title fixed: the chart shows mean *budget* per genre; the previous
    # 'Mean Gross by Budget' was a copy-paste leftover from the gross plot.
    bars = df_budget.hvplot(title='Mean Budget by Genre', rot=90, kind='bar')
    return bars * hv.HLine(df_budget['Mean Budget'].mean())

plot_all_genre_budget()

In [7]:
def get_genre_roi(df, genre):
    """Return the mean ``ROI`` of the rows flagged with ``genre``, rounded to 2 decimals.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``ROI`` column and an indicator column named ``genre``.
    genre : str
        Name of the indicator column; only rows where it equals 1 are averaged.

    Returns
    -------
    float
        Mean ROI over the selected rows, rounded to two decimal places.
    """
    is_genre = df[genre] == 1
    roi_values = df[is_genre]['ROI']
    mean_roi = np.mean(roi_values)
    return round(mean_roi, 2)

get_genre_roi(df.copy(), 'Action')

5754244.12

In [8]:
def get_all_genre_roi(df):
    """Build a table of mean ROI per genre, sorted from lowest to highest.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame passed (as a copy) to ``get_genre_roi`` for every genre.

    Returns
    -------
    pandas.DataFrame
        Indexed by genre name with a single ``'Mean ROI'`` column,
        sorted ascending by that column.
    """
    # Genres covered by the summary ('Short' is not part of this list).
    genres = (
        'Action', 'Adventure', 'Fantasy', 'Sci-Fi', 'Thriller', 'Romance',
        'Animation', 'Comedy', 'Family', 'Musical', 'Mystery', 'Western',
        'Drama', 'History', 'Sport', 'Crime', 'Horror', 'War', 'Biography',
        'Music', 'Documentary', 'Film-Noir',
    )

    mean_roi_by_genre = {
        name: get_genre_roi(df.copy(), name) for name in genres
    }

    summary = pd.DataFrame.from_dict(
        mean_roi_by_genre, orient='index', columns=['Mean ROI']
    )
    return summary.sort_values(by='Mean ROI')

def plot_all_genre_roi():
    """Bar chart of mean ROI per genre with a reference line.

    Works on a copy of the notebook-level ``df``. The horizontal line is drawn
    at the mean of the per-genre mean ROI values.

    Returns
    -------
    The bar plot overlaid (``*``) with an ``hv.HLine``.
    """
    roi_by_genre = get_all_genre_roi(df.copy())
    bars = roi_by_genre.hvplot(title='Mean ROI by Genre', rot=90, kind='bar')
    average_of_means = roi_by_genre['Mean ROI'].mean()
    return bars * hv.HLine(average_of_means)

plot_all_genre_roi()

In [9]:
def get_genre_imdb_score(df, genre):
    """Return the mean ``imdb_score`` of the rows flagged with ``genre``, rounded to 2 decimals.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``imdb_score`` column and an indicator column named ``genre``.
    genre : str
        Name of the indicator column; only rows where it equals 1 are averaged.

    Returns
    -------
    float
        Mean IMDB score over the selected rows, rounded to two decimal places.
    """
    genre_rows = df[df[genre] == 1]
    scores = genre_rows['imdb_score']
    return round(np.mean(scores), 2)

get_genre_imdb_score(df.copy(), 'Action')

6.29

In [10]:
def get_all_genre_imdb_score(df):
    """Build a table of mean IMDB score per genre, sorted from lowest to highest.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame passed (as a copy) to ``get_genre_imdb_score`` for every genre.

    Returns
    -------
    pandas.DataFrame
        Indexed by genre name with a single ``'Mean IMDB Score'`` column,
        sorted ascending by that column.
    """
    # Genres covered by the summary ('Short' is not part of this list).
    genres = (
        'Action', 'Adventure', 'Fantasy', 'Sci-Fi', 'Thriller', 'Romance',
        'Animation', 'Comedy', 'Family', 'Musical', 'Mystery', 'Western',
        'Drama', 'History', 'Sport', 'Crime', 'Horror', 'War', 'Biography',
        'Music', 'Documentary', 'Film-Noir',
    )

    mean_score_by_genre = {
        name: get_genre_imdb_score(df.copy(), name) for name in genres
    }

    summary = pd.DataFrame.from_dict(
        mean_score_by_genre, orient='index', columns=['Mean IMDB Score']
    )
    return summary.sort_values(by='Mean IMDB Score')

def plot_all_genre_imdb_score():
    """Bar chart of mean IMDB score per genre with a reference line.

    Works on a copy of the notebook-level ``df``. The horizontal line is drawn
    at the mean of the per-genre mean scores.

    Returns
    -------
    The bar plot overlaid (``*``) with an ``hv.HLine``.
    """
    score_by_genre = get_all_genre_imdb_score(df.copy())
    bars = score_by_genre.hvplot(title='Mean IMDB by Genre', rot=90, kind='bar')
    average_of_means = score_by_genre['Mean IMDB Score'].mean()
    return bars * hv.HLine(average_of_means)

plot_all_genre_imdb_score()

In [11]:
def plot_top_ten_roi(df, genre):
    """Bar chart of the ten highest-ROI movies within ``genre``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``ROI`` and ``movie_title`` columns plus an indicator
        column named ``genre``.
    genre : str
        Name of the indicator column; only rows where it equals 1 are considered.

    Returns
    -------
    The object produced by the frame's ``hvplot`` accessor (requires
    ``hvplot.pandas`` to have been imported).
    """
    genre_rows = df[df[genre] == 1]
    # Ascending sort followed by tail(10): bars run from the lowest to the
    # highest ROI among the top ten.
    ranked = genre_rows.loc[:, ['ROI', 'movie_title']].sort_values(by='ROI')
    top_ten = ranked.tail(10)
    plot_options = dict(
        x='movie_title',
        y='ROI',
        rot=25,
        title=f'Top Ten ROI Movies in {genre} Genre',
        kind='bar',
    )
    return top_ten.hvplot(**plot_options)
    
plot_top_ten_roi(df.copy(), 'Action')

In [14]:
def plot_top_ten_gross(df, genre):
    """Bar chart of the ten highest-grossing movies within ``genre``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``gross`` and ``movie_title`` columns plus an indicator
        column named ``genre``.
    genre : str
        Name of the indicator column; only rows where it equals 1 are considered.

    Returns
    -------
    The object produced by the frame's ``hvplot`` accessor (requires
    ``hvplot.pandas`` to have been imported).
    """
    genre_rows = df[df[genre] == 1]
    # Ascending sort followed by tail(10): bars run from the lowest to the
    # highest gross among the top ten.
    ranked = genre_rows.loc[:, ['gross', 'movie_title']].sort_values(by='gross')
    top_ten = ranked.tail(10)
    plot_options = dict(
        x='movie_title',
        y='gross',
        rot=25,
        title=f'Top Ten Grossing Movies in {genre} Genre',
        kind='bar',
    )
    return top_ten.hvplot(**plot_options)
    
plot_top_ten_gross(df.copy(), 'Action')  