In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import sqlite3

# Import TMDB Data

In [2]:
tmdb_movies = pd.read_csv('../data/tmdb.movies.csv.gz')
tmdb_movies.rename(columns={'title': 'primary_title'}, inplace=True)
tmdb_movies.drop(columns=['genre_ids', 'original_language', 'original_title'], inplace=True)
tmdb_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26517 entries, 0 to 26516
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     26517 non-null  int64  
 1   id             26517 non-null  int64  
 2   popularity     26517 non-null  float64
 3   release_date   26517 non-null  object 
 4   primary_title  26517 non-null  object 
 5   vote_average   26517 non-null  float64
 6   vote_count     26517 non-null  int64  
dtypes: float64(2), int64(3), object(2)
memory usage: 1.4+ MB


# The Numbers Data

In [3]:
tn_movie_budgets = pd.read_csv('../data/tn.movie_budgets.csv.gz')
tn_movie_budgets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5782 entries, 0 to 5781
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 5782 non-null   int64 
 1   release_date       5782 non-null   object
 2   movie              5782 non-null   object
 3   production_budget  5782 non-null   object
 4   domestic_gross     5782 non-null   object
 5   worldwide_gross    5782 non-null   object
dtypes: int64(1), object(5)
memory usage: 271.2+ KB


In [4]:
tn_movie_budgets['release_date'] = pd.to_datetime(tn_movie_budgets['release_date'])
tn_movie_budgets['release_month'] = pd.to_datetime(tn_movie_budgets['release_date']).dt.month_name()
tn_movie_budgets['release_year'] = pd.to_datetime(tn_movie_budgets['release_date']).dt.year

tn_movie_budgets['domestic_gross'] = tn_movie_budgets['domestic_gross'].str.replace('$', '')
tn_movie_budgets['domestic_gross'] = tn_movie_budgets['domestic_gross'].str.replace(',', '').astype(int) 

tn_movie_budgets['worldwide_gross'] = tn_movie_budgets['worldwide_gross'].str.replace('$', '')
tn_movie_budgets['worldwide_gross'] = tn_movie_budgets['worldwide_gross'].str.replace(',', '').astype(int)

tn_movie_budgets['production_budget'] = tn_movie_budgets['production_budget'].str.replace('$', '')
tn_movie_budgets['production_budget'] = tn_movie_budgets['production_budget'].str.replace(',', '').astype(int)

tn_movie_budgets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5782 entries, 0 to 5781
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   id                 5782 non-null   int64         
 1   release_date       5782 non-null   datetime64[ns]
 2   movie              5782 non-null   object        
 3   production_budget  5782 non-null   int64         
 4   domestic_gross     5782 non-null   int64         
 5   worldwide_gross    5782 non-null   int64         
 6   release_month      5782 non-null   object        
 7   release_year       5782 non-null   int64         
dtypes: datetime64[ns](1), int64(5), object(2)
memory usage: 361.5+ KB


In [5]:
months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
tn_movie_budgets['release_month'] = pd.Categorical(tn_movie_budgets['release_month'], categories=months, ordered=True)
tn_movie_budgets['release_month'].value_counts()

December     745
October      573
August       496
September    493
November     486
June         479
March        470
April        454
July         440
May          407
February     392
January      347
Name: release_month, dtype: int64

In [6]:
tn_movie_budgets.rename(columns={'movie': 'primary_title'}, inplace=True)
tn_movie_budgets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5782 entries, 0 to 5781
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   id                 5782 non-null   int64         
 1   release_date       5782 non-null   datetime64[ns]
 2   primary_title      5782 non-null   object        
 3   production_budget  5782 non-null   int64         
 4   domestic_gross     5782 non-null   int64         
 5   worldwide_gross    5782 non-null   int64         
 6   release_month      5782 non-null   category      
 7   release_year       5782 non-null   int64         
dtypes: category(1), datetime64[ns](1), int64(5), object(1)
memory usage: 322.4+ KB


# Removing zeros


In [7]:
tn_movie_budgets_nonzero = tn_movie_budgets[tn_movie_budgets['domestic_gross'] != 0].copy()

# SQL DB

In [8]:
conn = sqlite3.connect('../data/im.db')
pd.read_sql('SELECT name AS "Table Names" FROM sqlite_master WHERE type="table"', conn)

Unnamed: 0,Table Names
0,movie_basics
1,directors
2,known_for
3,movie_akas
4,movie_ratings
5,persons
6,principals
7,writers


In [9]:
# Making the IMDB movie_basics table into a pandas df
imdb_movie_basics = pd.read_sql('SELECT * FROM movie_basics', conn)
imdb_movie_basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146144 entries, 0 to 146143
Data columns (total 6 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   movie_id         146144 non-null  object 
 1   primary_title    146144 non-null  object 
 2   original_title   146123 non-null  object 
 3   start_year       146144 non-null  int64  
 4   runtime_minutes  114405 non-null  float64
 5   genres           140736 non-null  object 
dtypes: float64(1), int64(1), object(4)
memory usage: 6.7+ MB


In [10]:
# Making the IMDB directors table into a pandas df
imdb_directors = pd.read_sql('SELECT * FROM directors', conn)
imdb_directors.value_counts()

movie_id   person_id
tt2249786  nm4630460    52
           nm1667641    52
           nm3952326    52
           nm3911770    52
           nm3863962    52
                        ..
tt3835080  nm0957772     1
tt3835110  nm7064225     1
tt7083156  nm1962933     1
tt3835134  nm7775933     1
tt4105968  nm1297226     1
Length: 163535, dtype: int64

In [11]:
imdb_movie_akas = pd.read_sql('SELECT * FROM movie_akas', conn)
imdb_movie_akas.drop(columns=['is_original_title'], inplace=True)
imdb_movie_akas.head()

Unnamed: 0,movie_id,ordering,title,region,language,types,attributes
0,tt0369610,10,Джурасик свят,BG,bg,,
1,tt0369610,11,Jurashikku warudo,JP,,imdbDisplay,
2,tt0369610,12,Jurassic World: O Mundo dos Dinossauros,BR,,imdbDisplay,
3,tt0369610,13,O Mundo dos Dinossauros,BR,,,short title
4,tt0369610,14,Jurassic World,FR,,imdbDisplay,


In [12]:
imdb_writers = pd.read_sql('SELECT * FROM writers', conn)
imdb_writers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 255873 entries, 0 to 255872
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   movie_id   255873 non-null  object
 1   person_id  255873 non-null  object
dtypes: object(2)
memory usage: 3.9+ MB


In [13]:
imdb_writers_and_directors2 = pd.read_sql("""
            SELECT *, 
            FROM movie_ratings
            INNER JOIN directors
                USING(movie_id)
            INNER JOIN writers
                USING(movie_id)

            
            """, conn)

DatabaseError: Execution failed on sql '
            SELECT *, 
            FROM movie_ratings
            INNER JOIN directors
                USING(movie_id)
            INNER JOIN writers
                USING(movie_id)

            
            ': near "FROM": syntax error

In [None]:
d_e = pd.read_sql("""
SELECT m.averagerating, person_id, b.primary_title
FROM movie_ratings as m
    JOIN (SELECT DISTINCT movie_id, person_id
FROM directors
GROUP BY person_id
HAVING COUNT() >5) as d_n_e
    ON d_n_e.movie_id = m.movie_id
    JOIN movie_basics AS b
        USING(movie_id)
""", conn)
d_e.info()

In [None]:
# using left join
imdb_writers_and_directors.info()

In [None]:
# using inner join
imdb_writers_and_directors2.info()

In [None]:
pd.read_sql("""
            SELECT 
            FROM movie_basics AS m
            JOIN directors AS d
                USING(movie_id)
            JOIN writers AS w
                USING(movie_id)

            
            """, conn)

# Combining Dataframes


In [None]:
# Combing the IMDB 'movie_basics' table with the TMDB 'tmdb_movies' table

merged_movie_basics = pd.merge(imdb_movie_basics, tmdb_movies, on='primary_title')
merged_movie_basics.info()

In [None]:
merged_exp_directors = pd.merge(d_e, tmdb_movies, on='primary_title')
merged_exp_directors = pd.merge(merged_exp_directors, tn_movie_budgets, on='primary_title')
merged_exp_directors.info()