In [116]:
import pandas as pd
import numpy as np
import hvplot.pandas
import plotly.express as px
from pathlib import Path
import panel as pn
import wikipedia as wikipedia
from panel.interact import interact
# Initialise Panel with its plotly extension; the plotly express figures built
# further down are placed inside Panel layouts.
pn.extension("plotly")



In [94]:
# Location of the cleaned IMDB export, given relative to the working directory.
data_path = 'IMDB_Dataset_Cleaned.csv'

# Read the full CSV into the frame that every later cell works from.
df = pd.read_csv(filepath_or_buffer=data_path)

In [95]:
def _split_genres(genre_text):
    """Return the comma-separated genre string as a list of genre names."""
    return genre_text.split(',')


# Replace each genre string with its list form so later cells can iterate it.
df['genres'] = df['genres'].map(_split_genres)

In [98]:
# Genre names that each get their own indicator column.
unique = [
    'Action', 'Adventure', 'Fantasy', 'Sci-Fi', 'Thriller', 'Romance',
    'Animation', 'Comedy', 'Family', 'Musical', 'Mystery', 'Western',
    'Drama', 'History', 'Sport', 'Crime', 'Horror', 'War', 'Biography',
    'Music', 'Documentary', 'Short', 'Film-Noir',
]

# Add (or reset) one integer column per genre, all initialised to 0.
df = df.assign(**dict.fromkeys(unique, 0))

In [97]:
# One-hot encode the per-row genre lists in a single vectorised pass instead of
# writing one cell at a time with df.loc (slow, and it silently produced
# NaN-filled columns for any genre that had not been pre-created).
#
# explode() yields one row per (movie, genre) while keeping the movie's index,
# get_dummies() turns each genre into an indicator column, and the groupby on
# the original index collapses those back to one 0/1 row per movie.
genre_flags = (
    pd.get_dummies(df['genres'].explode())
    .groupby(level=0)
    .max()
    .astype(int)
)

# Assign column by column so genres seen in the data overwrite their
# pre-created all-zero column, while genres that never occur are left as-is.
for genre in genre_flags.columns:
    df[genre] = genre_flags[genre]

# The list-valued column is no longer needed once the indicators exist.
df = df.drop(columns=['genres'])

In [86]:
# NOTE(review): despite the name, this is gross minus budget (absolute profit),
# not a ratio. The name is kept because later cells refer to the column.
return_on_investment = df["gross"].sub(df["budget"])
df['return_on_investment'] = return_on_investment

# Map each profit value to its position in [0, 1] within the dataset.
# Ranking replaces the earlier value-by-value replace(sorted, linspace) lookup:
# that approach scaled poorly, left ties and NaN handling to replace()
# internals, and could collide when a profit value equalled a generated
# quantile (e.g. exactly 0.0 or 1.0).
# Ties share their average rank; missing values stay missing.
roi_rank = df['return_on_investment'].rank(method='average')
roi_count = df['return_on_investment'].count()
# max(..., 1) keeps a single-row frame from dividing by zero (it maps to 0.0).
df['uniform_roi'] = (roi_rank - 1) / max(roi_count - 1, 1)

In [83]:
# Mean of every numeric column per lead actor, highest average ROI first.
# numeric_only=True is required on pandas >= 2.0, where averaging a frame that
# still contains text columns raises a TypeError instead of dropping them.
lead_actor_average_roi = (
    df.groupby(['actor_1_name'])
    .mean(numeric_only=True)
    .sort_values('return_on_investment', ascending=False)
)

# The ten lead actors with the highest average ROI.
lead_actor_average_roi_top_ten = lead_actor_average_roi.head(10)

In [102]:
# Bar chart of the average ROI for the ten best-performing lead actors.
top_ten_roi_values = lead_actor_average_roi_top_ten['return_on_investment']
roi_actor_bar_plot = top_ten_roi_values.hvplot.bar(
    title="Top Ten Average Return on Investment by Lead Actor",
    rot=90,
    height=500,
    yformatter='$%.2f',
    xlabel="Lead Actor",
    ylabel="ROI",
)

In [101]:
# Human-readable axis/legend names for the columns used in the scatter below.
roi_scatter_labels = {
    "gross": "Average Gross Profit",
    "uniform_roi": "Average Return on Investment Quantile",
    "budget": "Average Budget",
    "actor_1_name": "Lead Actor",
}

# Gross vs. budget for the top-ten ROI actors; marker size follows the
# average ROI quantile and each actor gets its own colour.
roi_actor_scatter = px.scatter(
    lead_actor_average_roi_top_ten,
    x="gross",
    y="budget",
    size="uniform_roi",
    color=lead_actor_average_roi_top_ten.index,
    title="Top Ten Average Return on Investment Quantile by Lead Actor",
    labels=roi_scatter_labels,
)

In [142]:
# Top row: the ROI bar chart next to the ROI scatter.
row_of_lead_actor = pn.Row(roi_actor_bar_plot, roi_actor_scatter)

# (article title, number of sentences) for each Wikipedia blurb, in display order.
roi_wiki_requests = [
    ("Wayne Elliot Knight", 2),
    ("Rupert Everett", 2),
    ("Henry Thomas", 3),
    ("Kathleen Freeman", 2),
    ("Hattie McDaniel", 2),
]
wiki_row = pn.Row(
    *[wikipedia.summary(title, sentences=count) for title, count in roi_wiki_requests]
)

# Heading, plots and blurbs stacked vertically; the bare name displays it inline.
column_of_lead_actor = pn.Column(
    "# Which Actor's Movies have the Highest Average ROI?",
    row_of_lead_actor,
    wiki_row,
)
column_of_lead_actor

In [123]:
# Serve the ROI column on its own; the recorded output below shows a local
# server being launched on a localhost port.
pn.serve(column_of_lead_actor)

Launching server at http://localhost:59620


<bokeh.server.server.Server at 0x7f9e0bf02a50>

In [82]:
# Mean of every numeric column per lead actor, highest average gross first.
# This sorts the frame computed here; the earlier version sorted
# `lead_actor_average_roi` by mistake, which discarded this groupby result and
# made the cell depend on a variable from a different cell.
# numeric_only=True is required on pandas >= 2.0, where averaging text columns
# raises a TypeError.
lead_actor_average_gross = (
    df.groupby(['actor_1_name'])
    .mean(numeric_only=True)
    .sort_values('gross', ascending=False)
)

# The ten lead actors with the highest average gross.
lead_actor_average_gross_top_ten = lead_actor_average_gross.head(10)

In [107]:
# Bar chart of the average gross for the ten highest-grossing lead actors.
top_ten_gross_values = lead_actor_average_gross_top_ten['gross']
gross_actor_bar_plot = top_ten_gross_values.hvplot.bar(
    title="Top Ten Average Gross Profit by Lead Actor",
    rot=90,
    height=500,
    yformatter='$%.2f',
    xlabel="Lead Actor",
    ylabel="Gross Profit",
)

In [108]:
# Mean of every numeric column per lead actor, highest average budget first.
# numeric_only=True is required on pandas >= 2.0, where averaging a frame that
# still contains text columns raises a TypeError instead of dropping them.
lead_actor_average_budget = (
    df.groupby(['actor_1_name'])
    .mean(numeric_only=True)
    .sort_values('budget', ascending=False)
)

# The ten lead actors with the highest average budget.
lead_actor_average_budget_top_ten = lead_actor_average_budget.head(10)

In [109]:
# Bar chart of the average budget for the ten highest-budget lead actors.
top_ten_budget_values = lead_actor_average_budget_top_ten['budget']
budget_actor_bar_plot = top_ten_budget_values.hvplot.bar(
    title="Top Ten Average Budget by Lead Actor",
    rot=90,
    height=500,
    yformatter='$%.2f',
    xlabel="Lead Actor",
    ylabel="Budget",
)

In [138]:
# Top row: the gross bar chart next to the budget bar chart.
row_of_lead_actor2 = pn.Row(gross_actor_bar_plot, budget_actor_bar_plot)

# (article title, number of sentences) for each Wikipedia blurb, in display order.
gross_wiki_requests = [
    ("Rupert Everett", 2),
    ("Wayne Elliot Knight", 2),
    ("William Hootkins", 1),
    ("Steve Bastoni", 2),
    ("Phaldut Sharma", 1),
]
wiki_row2 = pn.Row(
    *[wikipedia.summary(title, sentences=count) for title, count in gross_wiki_requests]
)

# Heading, plots and blurbs stacked vertically; the bare name displays it inline.
column_of_lead_actor2 = pn.Column(
    "# Which Actor's Movies have the Highest Average Gross Profits as Leads, and which have the Highest Average Budgets?",
    row_of_lead_actor2,
    wiki_row2,
)
column_of_lead_actor2

In [80]:
# Mean of every numeric column per lead actor, highest average IMDB score first.
# numeric_only=True is required on pandas >= 2.0, where averaging a frame that
# still contains text columns raises a TypeError instead of dropping them.
lead_actor_average_ratings = (
    df.groupby(['actor_1_name'])
    .mean(numeric_only=True)
    .sort_values('imdb_score', ascending=False)
)

# The ten lead actors with the highest average IMDB score.
lead_actor_average_ratings_top_ten = lead_actor_average_ratings.head(10)

In [161]:
# Facebook likes vs. average IMDB score for the ten best-rated lead actors.
#
# The figure is built from the ratings top ten so the plotted points, the
# colour legend and the title all describe the same actors. Previously the
# points came from the ROI top ten while the legend names came from the ratings
# top ten, so markers were labelled with the wrong actors.
#
# Marker size uses the ROI quantile (0..1) rather than raw gross-minus-budget:
# plotly marker sizes must be non-negative, and raw profit can be negative for
# highly rated but commercially unsuccessful films.
imdb_actor_scatter = px.scatter(
    lead_actor_average_ratings_top_ten,
    x="actor_1_facebook_likes",
    y="imdb_score",
    size="uniform_roi",
    color=lead_actor_average_ratings_top_ten.index,
    title="Top Ten Average IMDB Score by Lead Actor",
    labels={
        "actor_1_facebook_likes": "Lead Actor Facebook Likes",
        "imdb_score": "Average IMDB Score",
        "uniform_roi": "Average Return on Investment Quantile",
    },
)

In [162]:
# Layout for the IMDB-score tab: the scatter on top, Wikipedia blurbs below.
# An earlier draft also rebuilt `row_of_lead_actor2` here; that row was never
# placed in any layout, it clobbered the name of the gross/budget row, and it
# triggered an extra Wikipedia request, so it has been removed.
row_of_lead_actor3 = pn.Row(imdb_actor_scatter)
wiki_row3 = pn.Row(
    wikipedia.summary("Scatman Crothers", sentences=2),
    wikipedia.summary("Takashi Shimura", sentences=2),
    wikipedia.summary("Paulette Goddard", sentences=3),
)

# Heading, plot and blurbs stacked vertically; the bare name displays it inline.
column_of_lead_actor3 = pn.Column(
    "# Which Actors have the Highest Average IMDB Score?",
    row_of_lead_actor3,
    wiki_row3,
)
column_of_lead_actor3

In [163]:
# One tab per question, each holding the column layout built for it above.
dashboard_tabs = [
    ("Return on Investment", column_of_lead_actor),
    ("Gross Profit and Budget", column_of_lead_actor2),
    ("IMDB Score", column_of_lead_actor3),
]
panel = pn.Tabs(*dashboard_tabs)

In [164]:
# Mark the tabbed dashboard as servable; as the cell's last expression its
# return value is also displayed inline.
# NOTE(review): presumably intended for deployment via `panel serve` — confirm.
panel.servable()

In [165]:
# Serve the full tabbed dashboard; the recorded output below shows a local
# server being launched on a localhost port.
pn.serve(panel)

Launching server at http://localhost:59901


<bokeh.server.server.Server at 0x7f9e0d836410>