# Question 4 - Exploring Potential Ratings Drivers

In [1]:
#Importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sqlite3
import seaborn as sns
import os
from glob import glob
import plotly.express as px
%matplotlib inline

In [2]:
#Importing Datasets
#Pulling in my initial cleaned dataset and creating a profit column
df = pd.read_csv('cleaned_genre_exploration.csv')
df['profit'] = df['worldwide_gross']-df['production_budget']
#Pulling all the csv files into one list
csv_files = glob("./zippedData/*.csv.gz")
#and forming a dictionary of cleaned names
type(csv_files)
d = {}
for file in csv_files:
    d[file] = pd.read_csv(file)    
csv_files_dict = {}
# cleaning the filenames
for filename in csv_files:
    filename_cleaned = os.path.basename(filename).replace(".csv", "").replace(".", "_") 
    filename_df = pd.read_csv(filename, index_col=0)
    csv_files_dict[filename_cleaned] = filename_df
#Initializing connection:
conn = sqlite3.connect("movies_db.sqlite")
#Filling the DB with all the tables. . .  Commented out for now as it's already been done.
#Uncomment and run block if running notebook for first time.
"""
def create_sql_table_from_df(df, name, conn):
    try:
        df.to_sql(name, conn)
        print(f"Created table {name}")
    
    except Exception as e:
        print(f"could not make table {name}")
        print(e)
        
for name, table in csv_files_dict.items():
    create_sql_table_from_df(table, name, conn)
""";
#Adding my clean df to the sql db - Commented out as well for now
"""
clean_genre = pd.read_csv('cleaned_genre_exploration.csv')
clean_genre.to_sql('clean_genre', conn)
"""

In [3]:
#Showing table names for later reference
conn.execute("select name from sqlite_master where type='table';").fetchall()

[('bom_movie_gross_gz',),
 ('imdb_name_basics_gz',),
 ('imdb_title_akas_gz',),
 ('imdb_title_basics_gz',),
 ('imdb_title_crew_gz',),
 ('imdb_title_principals_gz',),
 ('imdb_title_ratings_gz',),
 ('tmdb_movies_gz',),
 ('tn_movie_budgets_gz',),
 ('clean_genre',)]

In [4]:
#Initializing cursor:
cur = conn.cursor()

<b>Pulling Dataframes:</b><br>
Now that we have our SQL database all set up, let's pull some useful dataframes for later use.<br><br>
The purpose here is to explore ratings.  What drives them?  Our other questions are focused more on gross revenue,<br>
but what can we do to keep ratings high and thus customers coming back? <br>
Is there any way we can set up Microsoft for success by recommending certain strategies?

<b>General Movie Data - DF4

In [86]:
#DF4 is a cleaned list of movies that shows info such as production budget, average rating, and more
cur.execute("""SELECT production_budget, release_date, primary_title,
               runtime_minutes, G1, original_language, vote_count, vote_average,
               c.domestic_gross, worldwide_gross, m.studio
            
            FROM clean_genre as c
            LEFT JOIN bom_movie_gross_gz as m
            ON primary_title = m.title
            WHERE NOT m.studio = 'None' AND vote_count > 100 AND runtime_minutes >30
            ORDER BY vote_count DESC
            LIMIT 1000;""")
#Limited to movies with more than 100 votes for rating signifigance and runtime over 30 to weed out bad data
df4 = pd.DataFrame(cur.fetchall())
df4.columns = [i[0] for i in cur.description]
display(df4.head())
df4.info()

Unnamed: 0,production_budget,release_date,primary_title,runtime_minutes,G1,original_language,vote_count,vote_average,domestic_gross,worldwide_gross,studio
0,160000000,2010-07-16,Inception,148.0,Action,en,22186,8.3,292576195,835524642,WB
1,58000000,2016-02-12,Deadpool,108.0,Action,en,20175,7.6,363070709,801025593,Fox
2,165000000,2014-11-05,Interstellar,169.0,Adventure,en,18597,8.2,188017894,666379375,Par.
3,170000000,2014-08-01,Guardians of the Galaxy,121.0,Action,en,17958,7.9,333172112,770867516,BV
4,100000000,2012-12-25,Django Unchained,165.0,Drama,en,15725,8.0,162805434,449948323,Wein.


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
production_budget    1000 non-null int64
release_date         1000 non-null object
primary_title        1000 non-null object
runtime_minutes      1000 non-null float64
G1                   1000 non-null object
original_language    1000 non-null object
vote_count           1000 non-null int64
vote_average         1000 non-null float64
domestic_gross       1000 non-null int64
worldwide_gross      1000 non-null int64
studio               1000 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 86.1+ KB


<b>Studio Data - DF5

In [113]:
#DF5 is a df grouped by studio so we can see info such as average rating,
#or average worldwide gross revenue.  
#This can help us identify successful studios!
cur.execute("""SELECT m.studio,
               AVG(production_budget) as avg_budg,
               SUM(production_budget) as tot_budg,
               AVG(runtime_minutes) as avg_runtime, 
               SUM(vote_count) as total_votes, 
               AVG(vote_average) as avg_rating,
               AVG(c.domestic_gross) as avg_dom_gross,
               SUM(c.domestic_gross) as tot_dom_gross,
               AVG(worldwide_gross) as avg_world_gross,
               COUNT(tconst) as num_movies
            
            FROM clean_genre as c
            LEFT JOIN bom_movie_gross_gz as m
            ON primary_title = m.title
            WHERE vote_count > 100
            GROUP BY m.studio
            ORDER BY avg_rating DESC;""")
#Dropping studios with less than 100 votes behind their movies
df5 = pd.DataFrame(cur.fetchall())
df5.columns = [i[0] for i in cur.description]
df5.dropna(axis = 0, subset = ['studio'], inplace = True) #Dropping null rows
display(df5.head())
df5.info() #appears to be clean

Unnamed: 0,studio,avg_budg,tot_budg,avg_runtime,total_votes,avg_rating,avg_dom_gross,tot_dom_gross,avg_world_gross,num_movies
0,Orch.,2500000.0,2500000,101.0,915,7.8,5205471.0,5205471,23845530.0,1
1,Cleopatra,8000000.0,8000000,103.0,500,7.5,82703.0,82703,17382690.0,1
2,Neon,11000000.0,11000000,120.0,2904,7.5,30014530.0,30014534,53797410.0,1
3,Osci.,4333333.0,13000000,106.0,1695,7.433333,1796292.0,5388877,6200627.0,3
4,MBox,46300000.0,92600000,120.0,4426,7.3,53171430.0,106342853,127336200.0,2


<class 'pandas.core.frame.DataFrame'>
Int64Index: 71 entries, 0 to 71
Data columns (total 10 columns):
studio             71 non-null object
avg_budg           71 non-null float64
tot_budg           71 non-null int64
avg_runtime        71 non-null float64
total_votes        71 non-null int64
avg_rating         71 non-null float64
avg_dom_gross      71 non-null float64
tot_dom_gross      71 non-null int64
avg_world_gross    71 non-null float64
num_movies         71 non-null int64
dtypes: float64(5), int64(4), object(1)
memory usage: 6.1+ KB


<b>Genre Data - DF6

In [46]:
#DF6 is a df grouped by genre so we can see info such as average rating,
#or average worldwide gross revenue.  
#This can help us identify genres that rate better than others
cur.execute("""SELECT G1,
               AVG(production_budget) as avg_budg,
               SUM(production_budget) as tot_budg,
               AVG(runtime_minutes) as avg_runtime, 
               SUM(vote_count) as total_votes, 
               AVG(vote_average) as avg_rating,
               AVG(domestic_gross) as avg_dom_gross,
               SUM(domestic_gross) as tot_dom_gross,
               AVG(worldwide_gross) as avg_world_gross,
               SUM(worldwide_gross) as tot_world_gross
            
            FROM clean_genre
            WHERE vote_count > 100
            GROUP BY G1
            ORDER BY avg_dom_gross DESC
            LIMIT 10;""")

df6 = pd.DataFrame(cur.fetchall())
df6.columns = [i[0] for i in cur.description]
display(df6.head(10))
df6.info() #appears to be clean

Unnamed: 0,G1,avg_budg,tot_budg,avg_runtime,total_votes,avg_rating,avg_dom_gross,tot_dom_gross,avg_world_gross,tot_world_gross
0,Family,61142860.0,428000000,97.312464,25745,7.371429,259928400.0,1819498894,549230700.0,3844615149
1,Music,48950000.0,97900000,120.0,9034,6.7,143075700.0,286151353,399071400.0,798142768
2,Musical,48950000.0,195800000,140.0,18068,6.7,143075700.0,572302706,399071400.0,1596285536
3,Animation,52884620.0,687500000,75.475942,51833,6.8,119178100.0,1549315301,299903600.0,3898746443
4,Adventure,78240990.0,12596800000,102.685556,436064,6.331056,99935310.0,16089584972,280327400.0,45132715303
5,Mystery,37000000.0,185000000,98.837449,27609,7.14,86733110.0,433665528,188856900.0,944284346
6,Action,79117490.0,35286400000,111.654985,1484974,6.215695,84051160.0,37486818171,229495600.0,102355048703
7,Fantasy,42123080.0,547600000,99.764326,36733,6.146154,68794880.0,894333427,177676400.0,2309793691
8,Comedy,24649230.0,7641260000,100.862691,388860,6.236129,40009240.0,12402863764,74614080.0,23130366000
9,Biography,25439700.0,3358040000,108.075758,201053,6.82803,38089430.0,5027804951,75012330.0,9901628219


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 10 columns):
G1                 10 non-null object
avg_budg           10 non-null float64
tot_budg           10 non-null int64
avg_runtime        10 non-null float64
total_votes        10 non-null int64
avg_rating         10 non-null float64
avg_dom_gross      10 non-null float64
tot_dom_gross      10 non-null int64
avg_world_gross    10 non-null float64
tot_world_gross    10 non-null int64
dtypes: float64(5), int64(4), object(1)
memory usage: 928.0+ bytes


In [62]:
"""
This dataframe was pulled together in SQL and cleaned and arranged 
to show the top 100 directors by total domestic gross.
"""
df7 = pd.read_csv('100_directors.csv')
display(df7.head())
df7.info()

Unnamed: 0.1,Unnamed: 0,nconst,primary_name,category,tot_dom_gross,tot_wor_gross,avg_dom_gross,avg_wor_gross,avg_rating,avg_budg,num_movies
0,0,nm0751577,Anthony Russo,director,1346646789,3902605502,448882300.0,1300869000.0,7.8,240000000.0,3
1,1,nm0751648,Joe Russo,director,1346646789,3902605502,448882300.0,1300869000.0,7.8,240000000.0,3
2,2,nm1349376,Francis Lawrence,director,1149112056,2543191543,229822400.0,508638300.0,6.88,104400000.0,5
3,3,nm0269463,Jon Favreau,director,1142562693,2614568760,285640700.0,653642200.0,7.15,112825000.0,4
4,4,nm0923736,Joss Whedon,director,1128220169,2992084614,282055000.0,748021200.0,7.425,155900000.0,4


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 11 columns):
Unnamed: 0       100 non-null int64
nconst           100 non-null object
primary_name     100 non-null object
category         100 non-null object
tot_dom_gross    100 non-null int64
tot_wor_gross    100 non-null int64
avg_dom_gross    100 non-null float64
avg_wor_gross    100 non-null float64
avg_rating       100 non-null float64
avg_budg         100 non-null float64
num_movies       100 non-null int64
dtypes: float64(4), int64(4), object(3)
memory usage: 8.7+ KB


## Graphs:

We're searching how to maximize ratings for a future movie streaming service using these variables.  <br>
For example, what runtime should we target?  What directors can we work with?<br><br>
Below we charted the following variables:
- Budget
- Runtime
- Studio
- Genre
- Director

Surprisingly, there's not a significant correlation between budget and rating.  As such, it seems wise to not focus  <br>specifically on high budget studios when aiming for high ratings.

### Budget:

In [129]:
#Graph of budget vs vote average
print('Little Correlation Between Budget and Vote Average:')
fig1 = px.scatter(df4, x='production_budget', y='vote_average',
                size = 'domestic_gross', trendline = 'ols', 
                hover_data=['primary_title'])
fig1.show();

Little Correlation Between Budget and Vote Average:


### Runtime:

In [130]:
#Graph of Vote Average over Runtime
print('Ratings over Runtime')
print('Clear correlation between longer runtim and better rating')
fig2 = px.scatter(df4, x='runtime_minutes', y='vote_average',
                size = 'production_budget', trendline = 'ols',
                hover_data=['primary_title'])
fig2.show()

Ratings over Runtime
Clear correlation between longer runtim and better rating


### Studio:

Given that production doesn't correlate meaningfully with ratings, let's search for Studios with experience making movies that can create well rated movies on a modest budget.  From there, we can partner with or acquire some of these studios to build a pipeline of high quality movies!

In [131]:
#Graph of studio success
print('Top 100 Studios Based on Budget, Rating, Number of Movies (color), and Domestic Gross (size):')
fig3 = px.scatter(df5, x='avg_budg', y='avg_rating',
                size = 'avg_dom_gross', color = 'num_movies',
                hover_data=['studio'])
fig3.show()

Top 100 Studios Based on Budget, Rating, Number of Movies (color), and Domestic Gross (size):


### Genre:

Here we see that mystery films and biography films often rate the best.  Obviously we can't focus on these movies and nothing else (you need variety), but it might help to have more of these films than the average studio or streaming service. 

In [132]:
print('Genres to Target on Basis of Average Rating:')
fig4 = px.box(df4, x='G1', y='vote_average',
                hover_data=['G1'])
fig4.show()

Genres to Target on Basis of Average Rating:


### Directors:

In [133]:
#Graph of Successful Directors to work with
#Who has experience, and who can do the most with the smallest budget?
print('Which directors can create the most successful movies with reasonable budgets?')
print('Top 100 directors by domestic gross')
fig5 = px.scatter(df7, x='tot_dom_gross', y='avg_rating', 
                hover_data=['primary_name', 'num_movies'], color = 'avg_budg')
fig5.update_traces(mode='markers', marker_line_width=1, marker_size=15)
fig5.show()

Which directors can create the most successful movies with reasonable budgets?
Top 100 directors by domestic gross


# Conclusions:

- __Budget isn't Everything:__ 
    - When it comes to ratings, there's little to no correlation between budget and rating.
- __Directors to Work With:__ (ordered by domestic gross, avg rating above 7.5, avg budget around or under 100m)
    - Marilyn Barnes - Chicago and Beauty and the Beast
    - Tim Miller - Deadpool
    - Martin Scoresese - The Wolf of Wall Street
- __Runtime to Target:__
    - Longer runtimes tend to rate better (within reason!).  Aim for 120 minutes or more.
- __Studios to Acquire/Partner:__ (medium budget, medium gross revenue level, but above average ratings)
    - MBox - Girl with Dragon Tattoo and Ida, med budget and gross, but great ratings.
    - Neon - I, Tonya, 30mil for 11mil budget, high rating 
    - Lantern Entertainment (previously known as Weinstein company), high movie count, avg domestic gross, better than average ratings
    - TriS - John Wick, Looper, similarly lower budget, higher rated movies with avg domestic gross
- __Genres to Pursue:__
    - Mystery
    - Biography