In [91]:
import pandas as pd
import sqlite3 
import numpy as np
from scipy import stats
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from mpl_toolkits import mplot3d
import seaborn as sns
import sklearn.metrics as metrics
import statsmodels.api as sm
from lin_reg import best_line
%matplotlib inline

ModuleNotFoundError: No module named 'lin_reg'

In [29]:
# Reading in data files

movie_budgets = pd.read_csv('../data/tn.movie_budgets.csv.gz')
tmdb_movies = pd.read_csv('../data/tmdb.movies.csv.gz')
rt_reviews = pd.read_csv('../data/rt.reviews.tsv.gz', sep = '\t', encoding='latin-1')
rt_movie_info = pd.read_csv('../data/rt.movie_info.tsv.gz', sep = '\t')
movie_gross = pd.read_csv('../data/bom.movie_gross.csv.gz')

con = sqlite3.connect('../data/im.db')
schema_imdb = pd.read_sql(
"""
SELECT *
FROM sqlite_master;
""", con)
schema_imdb

Unnamed: 0,type,name,tbl_name,rootpage,sql
0,table,movie_basics,movie_basics,2,"CREATE TABLE ""movie_basics"" (\n""movie_id"" TEXT..."
1,table,directors,directors,3,"CREATE TABLE ""directors"" (\n""movie_id"" TEXT,\n..."
2,table,known_for,known_for,4,"CREATE TABLE ""known_for"" (\n""person_id"" TEXT,\..."
3,table,movie_akas,movie_akas,5,"CREATE TABLE ""movie_akas"" (\n""movie_id"" TEXT,\..."
4,table,movie_ratings,movie_ratings,6,"CREATE TABLE ""movie_ratings"" (\n""movie_id"" TEX..."
5,table,persons,persons,7,"CREATE TABLE ""persons"" (\n""person_id"" TEXT,\n ..."
6,table,principals,principals,8,"CREATE TABLE ""principals"" (\n""movie_id"" TEXT,\..."
7,table,writers,writers,9,"CREATE TABLE ""writers"" (\n""movie_id"" TEXT,\n ..."


## Looking at imdb

In [30]:
directors_experienced = pd.read_sql(
"""
SELECT DISTINCT person_id
FROM directors
GROUP BY person_id
HAVING COUNT() >5;
""", con)
directors_experienced

Unnamed: 0,person_id
0,nm0000095
1,nm0000108
2,nm0000110
3,nm0000116
4,nm0000123
...,...
9737,nm9983975
9738,nm9985563
9739,nm9986248
9740,nm9987882


In [31]:
directors_not_experienced = pd.read_sql(
"""
SELECT DISTINCT person_id
FROM directors
GROUP BY person_id
HAVING COUNT() <=5;
""", con)
directors_not_experienced

Unnamed: 0,person_id
0,nm0000080
1,nm0000118
2,nm0000127
3,nm0000128
4,nm0000136
...,...
99506,nm9993281
99507,nm9993379
99508,nm9993380
99509,nm9993381


In [32]:
writers = pd.read_sql(
"""
SELECT DISTINCT person_id
FROM writers
GROUP BY person_id
HAVING COUNT() >5;
""", con)
writers

Unnamed: 0,person_id
0,nm0000095
1,nm0000108
2,nm0000116
3,nm0000118
4,nm0000175
...,...
5906,nm9977220
5907,nm9983975
5908,nm9985563
5909,nm9987882


In [33]:
known_for = pd.read_sql(
"""
SELECT *
FROM known_for
""", con)
known_for

Unnamed: 0,person_id,movie_id
0,nm0061671,tt0837562
1,nm0061671,tt2398241
2,nm0061671,tt0844471
3,nm0061671,tt0118553
4,nm0061865,tt0896534
...,...,...
1638255,nm9990690,tt9090932
1638256,nm9990690,tt8737130
1638257,nm9991320,tt8734436
1638258,nm9991320,tt9615610


In [34]:
movie_ratings = pd.read_sql(
"""
SELECT *
FROM movie_ratings
""", con)
movie_ratings

Unnamed: 0,movie_id,averagerating,numvotes
0,tt10356526,8.3,31
1,tt10384606,8.9,559
2,tt1042974,6.4,20
3,tt1043726,4.2,50352
4,tt1060240,6.5,21
...,...,...,...
73851,tt9805820,8.1,25
73852,tt9844256,7.5,24
73853,tt9851050,4.7,14
73854,tt9886934,7.0,5


In [35]:
movie_akas = pd.read_sql(
"""
SELECT *
FROM movie_akas
""", con)
movie_akas

Unnamed: 0,movie_id,ordering,title,region,language,types,attributes,is_original_title
0,tt0369610,10,Джурасик свят,BG,bg,,,0.0
1,tt0369610,11,Jurashikku warudo,JP,,imdbDisplay,,0.0
2,tt0369610,12,Jurassic World: O Mundo dos Dinossauros,BR,,imdbDisplay,,0.0
3,tt0369610,13,O Mundo dos Dinossauros,BR,,,short title,0.0
4,tt0369610,14,Jurassic World,FR,,imdbDisplay,,0.0
...,...,...,...,...,...,...,...,...
331698,tt9827784,2,Sayonara kuchibiru,,,original,,1.0
331699,tt9827784,3,Farewell Song,XWW,en,imdbDisplay,,0.0
331700,tt9880178,1,La atención,,,original,,1.0
331701,tt9880178,2,La atención,ES,,,,0.0


In [36]:
movie_basics = pd.read_sql(
"""
SELECT *
FROM movie_basics
""", con)
movie_basics

Unnamed: 0,movie_id,primary_title,original_title,start_year,runtime_minutes,genres
0,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama"
1,tt0066787,One Day Before the Rainy Season,Ashad Ka Ek Din,2019,114.0,"Biography,Drama"
2,tt0069049,The Other Side of the Wind,The Other Side of the Wind,2018,122.0,Drama
3,tt0069204,Sabse Bada Sukh,Sabse Bada Sukh,2018,,"Comedy,Drama"
4,tt0100275,The Wandering Soap Opera,La Telenovela Errante,2017,80.0,"Comedy,Drama,Fantasy"
...,...,...,...,...,...,...
146139,tt9916538,Kuambil Lagi Hatiku,Kuambil Lagi Hatiku,2019,123.0,Drama
146140,tt9916622,Rodolpho Teóphilo - O Legado de um Pioneiro,Rodolpho Teóphilo - O Legado de um Pioneiro,2015,,Documentary
146141,tt9916706,Dankyavar Danka,Dankyavar Danka,2013,,Comedy
146142,tt9916730,6 Gunn,6 Gunn,2017,116.0,


In [37]:
principals = pd.read_sql(
"""
SELECT *
FROM principals
WHERE category = 'actor' OR category = 'actress' OR characters != 'None'
GROUP BY person_id
HAVING COUNT() >5
""", con)
principals

Unnamed: 0,movie_id,ordering,person_id,category,job,characters
0,tt1637702,3,nm0000026,archive_footage,,"[""Himself""]"
1,tt1979172,2,nm0000092,actor,,"[""John Cleese"",""Exploding Don"",""David Frost""]"
2,tt1001526,3,nm0000093,actor,,"[""Metro Man""]"
3,tt1849742,3,nm0000095,self,,"[""Himself""]"
4,tt1770734,3,nm0000096,actress,,"[""Kate Fletcher""]"
...,...,...,...,...,...,...
8938,tt2284964,1,nm9575332,actor,,
8939,tt4898004,1,nm9706424,actor,,
8940,tt8310586,3,nm9779533,actor,,
8941,tt10457158,4,nm9799185,actor,,"[""Kondo""]"


In [38]:
persons = pd.read_sql(
"""
SELECT *
FROM persons
""", con)
persons

Unnamed: 0,person_id,primary_name,birth_year,death_year,primary_profession
0,nm0061671,Mary Ellen Bauder,,,"miscellaneous,production_manager,producer"
1,nm0061865,Joseph Bauer,,,"composer,music_department,sound_department"
2,nm0062070,Bruce Baum,,,"miscellaneous,actor,writer"
3,nm0062195,Axel Baumann,,,"camera_department,cinematographer,art_department"
4,nm0062798,Pete Baxter,,,"production_designer,art_department,set_decorator"
...,...,...,...,...,...
606643,nm9990381,Susan Grobes,,,actress
606644,nm9990690,Joo Yeon So,,,actress
606645,nm9991320,Madeline Smith,,,actress
606646,nm9991786,Michelle Modigliani,,,producer


## Testing experienced vs. not experienced Directors and Writers

In [39]:
d_n_e = pd.read_sql(
"""
SELECT m.averagerating, person_id
FROM movie_ratings as m
    JOIN (SELECT DISTINCT movie_id, person_id
FROM directors
GROUP BY person_id
HAVING COUNT() <=5) as d_e
    ON d_e.movie_id = m.movie_id
""", con)
d_n_e.describe(include='all')

Unnamed: 0,averagerating,person_id
count,45250.0,45250
unique,,45250
top,,nm5632132
freq,,1
mean,6.480986,
std,1.488203,
min,1.0,
25%,5.6,
50%,6.6,
75%,7.5,


In [40]:
d_e = pd.read_sql(
"""
SELECT m.averagerating, person_id
FROM movie_ratings as m
    JOIN (SELECT DISTINCT movie_id, person_id
FROM directors
GROUP BY person_id
HAVING COUNT() >5) as d_n_e
    ON d_n_e.movie_id = m.movie_id
""", con)
d_e.describe(include='all')

Unnamed: 0,averagerating,person_id
count,7081.0,7081
unique,,7081
top,,nm6732253
freq,,1
mean,6.096639,
std,1.341364,
min,1.0,
25%,5.3,
50%,6.2,
75%,7.0,


In [41]:
w_n_e = pd.read_sql(
"""
SELECT m.averagerating, person_id
FROM movie_ratings as m
    JOIN (SELECT DISTINCT movie_id, person_id
FROM writers
GROUP BY person_id
HAVING COUNT() <=5) as w_n_e
    ON w_n_e.movie_id = m.movie_id
""", con)
w_n_e.describe(include='all')

Unnamed: 0,averagerating,person_id
count,67670.0,67670
unique,,67670
top,,nm8729167
freq,,1
mean,6.268011,
std,1.448802,
min,1.0,
25%,5.4,
50%,6.4,
75%,7.3,


In [42]:
w_e = pd.read_sql(
"""
SELECT m.averagerating, person_id
FROM movie_ratings as m
    JOIN (SELECT DISTINCT movie_id, person_id
FROM writers
GROUP BY person_id
HAVING COUNT() >5) as w_e
    ON w_e.movie_id = m.movie_id
""", con)
w_e.describe(include='all')

Unnamed: 0,averagerating,person_id
count,4383.0,4383
unique,,4383
top,,nm0061019
freq,,1
mean,6.041273,
std,1.34407,
min,1.0,
25%,5.2,
50%,6.1,
75%,7.0,


## Testing Runtime

In [67]:
movie_budgets.rename(columns={"movie":"primary_title"}, inplace=True)

In [68]:
movie_budgets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5782 entries, 0 to 5781
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 5782 non-null   int64 
 1   release_date       5782 non-null   object
 2   primary_title      5782 non-null   object
 3   production_budget  5782 non-null   object
 4   domestic_gross     5782 non-null   object
 5   worldwide_gross    5782 non-null   object
dtypes: int64(1), object(5)
memory usage: 271.2+ KB


In [69]:
movie_basics.head()

Unnamed: 0,movie_id,primary_title,original_title,start_year,runtime_minutes,genres
0,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama"
1,tt0066787,One Day Before the Rainy Season,Ashad Ka Ek Din,2019,114.0,"Biography,Drama"
2,tt0069049,The Other Side of the Wind,The Other Side of the Wind,2018,122.0,Drama
3,tt0069204,Sabse Bada Sukh,Sabse Bada Sukh,2018,,"Comedy,Drama"
4,tt0100275,The Wandering Soap Opera,La Telenovela Errante,2017,80.0,"Comedy,Drama,Fantasy"


In [70]:
movie_basics_budget = pd.merge(movie_budgets, movie_basics, how ='left', on='primary_title')

In [71]:
movie_basics_budget.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7221 entries, 0 to 7220
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 7221 non-null   int64  
 1   release_date       7221 non-null   object 
 2   primary_title      7221 non-null   object 
 3   production_budget  7221 non-null   object 
 4   domestic_gross     7221 non-null   object 
 5   worldwide_gross    7221 non-null   object 
 6   movie_id           3815 non-null   object 
 7   original_title     3814 non-null   object 
 8   start_year         3815 non-null   float64
 9   runtime_minutes    3328 non-null   float64
 10  genres             3743 non-null   object 
dtypes: float64(2), int64(1), object(8)
memory usage: 677.0+ KB


In [99]:
movie_basics_budget['runtime_minutes'] = movie_basics_budget['runtime_minutes'].dropna(inplace=True)
movie_basics_budget['production_budget'] = movie_basics_budget['production_budget'].str.replace('$','')
movie_basics_budget['production_budget'] = movie_basics_budget['production_budget'].str.replace(',','')
movie_basics_budget['worldwide_gross'] = movie_basics_budget['worldwide_gross'].str.replace('$','')
movie_basics_budget['worldwide_gross'] = movie_basics_budget['worldwide_gross'].str.replace(',','')


In [104]:
x = sm.add_constant(movie_basics_budget['runtime_minutes'].values.astype(int))
x1 = sm.add_constant(movie_basics_budget['production_budget'].values.astype(int))
y = movie_basics_budget['worldwide_gross'].values.astype(int)

TypeError: int() argument must be a string, a bytes-like object or a number, not 'NoneType'

In [105]:
lin_reg_model = sm.OLS(y, x1).fit().summary()
lin_reg_model

0,1,2,3
Dep. Variable:,y,R-squared:,0.552
Model:,OLS,Adj. R-squared:,0.552
Method:,Least Squares,F-statistic:,8905.0
Date:,"Mon, 04 Dec 2023",Prob (F-statistic):,0.0
Time:,15:21:29,Log-Likelihood:,-144230.0
No. Observations:,7221,AIC:,288500.0
Df Residuals:,7219,BIC:,288500.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-8.616e+06,1.69e+06,-5.111,0.000,-1.19e+07,-5.31e+06
x1,3.1201,0.033,94.364,0.000,3.055,3.185

0,1,2,3
Omnibus:,5563.208,Durbin-Watson:,0.826
Prob(Omnibus):,0.0,Jarque-Bera (JB):,267048.398
Skew:,3.257,Prob(JB):,0.0
Kurtosis:,32.071,Cond. No.,63900000.0


In [107]:
lin_reg_model_2 = sm.OLS(y, x).fit().summary()


ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).

## Looking at movie_budgets

In [103]:
movie_budgets.head()

Unnamed: 0,id,release_date,primary_title,production_budget,domestic_gross,worldwide_gross
0,1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279"
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875"
2,3,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350"
3,4,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963"
4,5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,"$317,000,000","$620,181,382","$1,316,721,747"


In [None]:
movie_budgets.rename(columns={"movie":"primary_title"})

In [None]:
movie_budgets.info()

In [None]:
movie_budgets.describe(include='all')

## Looking at tmdb_movies

In [None]:
tmdb_movies.head()

In [None]:
tmdb_movies.info()

In [None]:
tmdb_movies.describe(include='all')

## Looking at rt_reviews

In [None]:
rt_reviews.head()

In [None]:
rt_reviews.info()

In [None]:
rt_reviews.describe(include='all')

## Looking at rt_movie_info

In [None]:
rt_movie_info.head()

In [None]:
rt_movie_info.info()

In [None]:
rt_movie_info.describe(include='all')

## Looking at movie_gross

In [None]:
movie_gross.head()

In [None]:
movie_gross.info()

In [None]:
movie_gross.describe(include='all')