# Data Collection and Cleaning

## Importing Libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sqlite3
import seaborn as sns
import os
%matplotlib inline

## Importing the CSV Files

In [3]:
from glob import glob

# Inventory of every gzipped CSV under ./zippedData.
# glob() makes no promise about the order of its matches, so sort them to
# keep the listing stable across runs and machines.
csv_files = sorted(glob("./zippedData/*.csv.gz"))

In [4]:
# Gross data: title, studio, domestic_gross, foreign_gross, year.
df_mov_gross = pd.read_csv('zippedData/bom.movie_gross.csv.gz')

# Treat a missing foreign gross as 0.
# Assign the result back rather than calling fillna(inplace=True) on the
# column selection: with pandas Copy-on-Write that chained form works on a
# temporary object and leaves df_mov_gross unchanged.
df_mov_gross['foreign_gross'] = df_mov_gross['foreign_gross'].fillna(0)

# Remaining cleaning notes:
# TODO: drop rows with a null studio.
# TODO: convert foreign_gross to a numeric dtype (int or float).
# TODO: add a total-gross column and drop rows that have no gross figures.


In [5]:
# IMDB name basics: one row per person (nconst) with primary_name,
# birth_year, death_year, primary_profession and known_for_titles.
df_names = pd.read_csv(filepath_or_buffer='zippedData/imdb.name.basics.csv.gz')

# Not useful right now. Cleaning ideas for when it is:
# TODO: drop everyone who has a death_year.
# TODO: split primary_profession into separate columns, then count the
#       unique values.
# TODO: drop anyone who is not known for any titles.

In [6]:
# IMDB alternate titles: title_id, ordering, title, region, language, ...
# Only needed as a lookup from a title string to its title id for the other
# DataFrames.
df_title_akas = pd.read_csv(filepath_or_buffer='zippedData/imdb.title.akas.csv.gz')

In [7]:
# Title basics: one row per IMDB title (tconst).
df_title_basics = pd.read_csv('zippedData/imdb.title.basics.csv.gz')

# Split the comma-separated genres string into G1/G2/G3 and drop the
# original column. Titles with fewer genres get missing values in the
# trailing columns.
# NOTE(review): the three-column assignment assumes the split yields exactly
# three columns, i.e. no title lists more than three genres.
df_title_basics[['G1', 'G2', 'G3']] = df_title_basics['genres'].str.split(",", expand=True)
df_title_basics = df_title_basics.drop(columns=['genres'])

# Impute missing runtimes with the mean runtime. The result is assigned back
# because fillna(inplace=True) on a column selection is a chained assignment
# that does not reach the parent frame under pandas Copy-on-Write.
mean_runtime = df_title_basics['runtime_minutes'].mean()
df_title_basics['runtime_minutes'] = df_title_basics['runtime_minutes'].fillna(mean_runtime)

# Fall back to the primary title where the original title is missing.
# The fill value must be the column itself: passing the string
# 'primary_title' would write that literal text into every gap.
df_title_basics['original_title'] = df_title_basics['original_title'].fillna(
    df_title_basics['primary_title']
)

display(df_title_basics.head())
# info() prints its report and returns None, so it is called directly
# instead of being wrapped in display() (which would also render "None").
df_title_basics.info()
display(df_title_basics.isna().sum())

# Everything except the genre columns (G1-G3) is now free of nulls.
# Next steps:
# TODO: join ratings.
# TODO: join a more granular release date.
# TODO: join budgets, etc.

Unnamed: 0,tconst,primary_title,original_title,start_year,runtime_minutes,G1,G2,G3
0,tt0063540,Sunghursh,Sunghursh,2013,175.0,Action,Crime,Drama
1,tt0066787,One Day Before the Rainy Season,Ashad Ka Ek Din,2019,114.0,Biography,Drama,
2,tt0069049,The Other Side of the Wind,The Other Side of the Wind,2018,122.0,Drama,,
3,tt0069204,Sabse Bada Sukh,Sabse Bada Sukh,2018,86.187247,Comedy,Drama,
4,tt0100275,The Wandering Soap Opera,La Telenovela Errante,2017,80.0,Comedy,Drama,Fantasy


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146144 entries, 0 to 146143
Data columns (total 8 columns):
tconst             146144 non-null object
primary_title      146144 non-null object
original_title     146144 non-null object
start_year         146144 non-null int64
runtime_minutes    146144 non-null float64
G1                 140736 non-null object
G2                 59378 non-null object
G3                 29436 non-null object
dtypes: float64(1), int64(1), object(6)
memory usage: 8.9+ MB


None

tconst                  0
primary_title           0
original_title          0
start_year              0
runtime_minutes         0
G1                   5408
G2                  86766
G3                 116708
dtype: int64

In [8]:
# IMDB crew: directors and writers (nconst ids) for each title (tconst).
df_crew = pd.read_csv(filepath_or_buffer='zippedData/imdb.title.crew.csv.gz')

# Great info here. Ideas:
# TODO: add year, gross, profit and actors.
# TODO: this could make a great scatter plot or bar chart.

In [9]:
# IMDB principals: tconst, ordering, nconst, category, job, characters.
# tconst, nconst and category are the useful columns; job and characters
# are not very useful.
df_princ = pd.read_csv(filepath_or_buffer='zippedData/imdb.title.principals.csv.gz')

In [10]:
# IMDB ratings: tconst, averagerating, numvotes.
# Already clean; useful as the ratings source.
df_rat = pd.read_csv(filepath_or_buffer='zippedData/imdb.title.ratings.csv.gz')

In [11]:
# TMDB movies: genre_ids, id, original_language, original_title, popularity,
# release_date, title, vote_average, vote_count.
# The data itself is clean.
df_mov = pd.read_csv(filepath_or_buffer='zippedData/tmdb.movies.csv.gz')

# TODO: drop the leftover first index column ('Unnamed: 0').

In [12]:
# Movie budgets: id, release_date, movie, production_budget, domestic_gross,
# worldwide_gross. The money columns are "$1,234"-style strings.
df_budg = pd.read_csv(filepath_or_buffer='zippedData/tn.movie_budgets.csv.gz')
display(df_budg.head(2))

# Clean data with great info.
# TODO: attach the names of the people involved and the tconst.
# TODO: add a new column for total profit.

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279"
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875"


## Joins

In [13]:
# Left-join the gross data to the alternate-titles table on the title string
# so each gross row picks up title_id and the other akas columns.
# Gross rows without a matching akas title are kept with missing values.
df_mov_gross_tconst = df_mov_gross.join(
    df_title_akas.set_index('title'),
    on='title',
    how='left',
    rsuffix='_akas',
)

In [15]:
# Joining TITLE BASICS to MOVIES on the original title.

# Index both frames by original_title. The guards keep this cell re-runnable:
# once the index is set, 'original_title' is no longer a column, so an
# unconditional set_index would raise KeyError on a second run.
if df_title_basics.index.name != 'original_title':
    df_title_basics = df_title_basics.set_index('original_title')
if df_mov.index.name != 'original_title':
    df_mov = df_mov.set_index('original_title')

# Left join: every movies row is kept; rows with no title-basics match have
# a missing tconst.
df3 = df_mov.join(df_title_basics, rsuffix='_title', on='original_title', how='left')

# Keep only matched rows, then one row per tconst (first occurrence wins).
# df4 is built as a new frame so df3 remains the raw join result; previously
# df4 was just another name for df3 and the in-place drops changed both.
df4 = df3.dropna(subset=['tconst']).drop_duplicates(subset='tconst')
display(df4.head(2))

Unnamed: 0_level_0,Unnamed: 0,genre_ids,id,original_language,popularity,release_date,title,vote_average,vote_count,tconst,primary_title,start_year,runtime_minutes,G1,G2,G3
original_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Harry Potter and the Deathly Hallows: Part 1,0,"[12, 14, 10751]",12444,en,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788,tt0926084,Harry Potter and the Deathly Hallows: Part 1,2010.0,146.0,Adventure,Fantasy,Mystery
How to Train Your Dragon,1,"[14, 12, 16, 10751]",10191,en,28.734,2010-03-26,How to Train Your Dragon,7.7,7610,tt0892769,How to Train Your Dragon,2010.0,98.0,Action,Adventure,Animation


In [16]:
# Inner join against the budget data on the movie title, so only rows with
# complete data survive, then discard rows that have no first genre (G1).
# Columns present in both frames get the '_budg' suffix on the budget side.
# NOTE(review): the match is on the title string alone; presumably distinct
# films sharing a title would be paired up. Confirm whether that matters.
df5 = (
    df4.set_index('title')
    .join(df_budg.set_index('movie'), how='inner', rsuffix='_budg')
    .dropna(subset=['G1'])
)

In [17]:
#Next steps
#Convert dates to date and budgets to ints
def convert_amt_to_int(df, col):
    """Convert a currency-formatted string column to 64-bit integers, in place.

    Removes every "$" and "," from the values of ``df[col]`` (for example
    "$1,500,000" becomes 1500000) and stores the integers back in the same
    column. A column that already has an integer dtype is only normalised to
    int64, so calling the function twice is safe.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; it is modified in place.
    col : str
        Name of the column to convert.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so ``df = convert_amt_to_int(df, col)`` works.
    """
    if pd.api.types.is_integer_dtype(df[col]):
        # Already converted (e.g. the cell was re-run); the .str accessor
        # would raise on a non-string column.
        df[col] = df[col].astype('int64')
        return df

    # regex=False: "$" is a regex anchor, so force a literal match instead of
    # relying on the pandas version's default for `regex`.
    digits_only = (
        df[col]
        .str.replace("$", "", regex=False)
        .str.replace(",", "", regex=False)
    )
    # int64 rather than 'int': the platform integer can be 32 bits wide,
    # which cannot hold amounts above roughly 2.1 billion.
    df[col] = digits_only.astype('int64')
    return df
# Budget and gross columns arrive as "$1,234"-style strings; turn each one
# into integers.
money_cols = ['production_budget', 'domestic_gross', 'worldwide_gross']
for col in money_cols:
    df5 = convert_amt_to_int(df5, col)

# Parse the release date and derive the calendar month (1-12) from it.
df5['release_date'] = pd.to_datetime(df5['release_date'])
df5['release_month'] = df5['release_date'].dt.month

display(df5.head())
df5.info()

Unnamed: 0.1,Unnamed: 0,genre_ids,id,original_language,popularity,release_date,vote_average,vote_count,tconst,primary_title,...,runtime_minutes,G1,G2,G3,id_budg,release_date_budg,production_budget,domestic_gross,worldwide_gross,release_month
#Horror,14656,"[18, 9648, 27, 53]",301325,de,6.099,2015-11-20,3.3,102,tt3526286,#Horror,...,101.0,Crime,Drama,Horror,16,"Nov 20, 2015",1500000,0,0,11
10 Cloverfield Lane,17422,"[53, 878, 18]",333371,en,17.892,2016-03-11,6.9,4629,tt1179933,10 Cloverfield Lane,...,103.0,Drama,Horror,Mystery,54,"Mar 11, 2016",5000000,72082999,108286422,3
10 Days in a Madhouse,15907,[18],345003,en,0.955,2015-11-20,5.4,7,tt3453052,10 Days in a Madhouse,...,111.0,Drama,,,48,"Nov 11, 2015",12000000,14616,14616,11
12 Strong,24032,"[10752, 18, 36, 28]",429351,en,13.183,2018-01-19,5.6,1312,tt1413492,12 Strong,...,130.0,Action,Drama,History,64,"Jan 19, 2018",35000000,45819713,71118378,1
12 Years a Slave,7911,"[18, 36]",76203,en,16.493,2013-10-30,7.9,6631,tt2024544,12 Years a Slave,...,134.0,Biography,Drama,History,18,"Oct 18, 2013",20000000,56671993,181025343,10


<class 'pandas.core.frame.DataFrame'>
Index: 2756 entries, #Horror to xXx: Return of Xander Cage
Data columns (total 21 columns):
Unnamed: 0           2756 non-null int64
genre_ids            2756 non-null object
id                   2756 non-null int64
original_language    2756 non-null object
popularity           2756 non-null float64
release_date         2756 non-null datetime64[ns]
vote_average         2756 non-null float64
vote_count           2756 non-null int64
tconst               2756 non-null object
primary_title        2756 non-null object
start_year           2756 non-null float64
runtime_minutes      2756 non-null float64
G1                   2756 non-null object
G2                   1949 non-null object
G3                   1330 non-null object
id_budg              2756 non-null int64
release_date_budg    2756 non-null object
production_budget    2756 non-null int64
domestic_gross       2756 non-null int64
worldwide_gross      2756 non-null int64
release_month        2756

## Exporting the Cleaned and Compiled Data

In [18]:
# Write the compiled frame to disk: ~2.7k rows of clean data with genre,
# budget, gross, and more broken out.
df5.to_csv(path_or_buf='cleaned_genre_exploration.csv')

## Heads for Reference

In [30]:
# Two-row preview of every frame, in load order, kept for my own reference.
display(*[
    frame.head(2)
    for frame in (
        df_mov_gross,
        df_names,
        df_title_akas,
        df_title_basics,
        df_crew,
        df_princ,
        df_rat,
        df_mov,
        df_budg,
        df5,
    )
])

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
0,Toy Story 3,BV,415000000.0,652000000,2010
1,Alice in Wonderland (2010),BV,334200000.0,691300000,2010


Unnamed: 0,nconst,primary_name,birth_year,death_year,primary_profession,known_for_titles
0,nm0061671,Mary Ellen Bauder,,,"miscellaneous,production_manager,producer","tt0837562,tt2398241,tt0844471,tt0118553"
1,nm0061865,Joseph Bauer,,,"composer,music_department,sound_department","tt0896534,tt6791238,tt0287072,tt1682940"


Unnamed: 0,title_id,ordering,title,region,language,types,attributes,is_original_title
0,tt0369610,10,Джурасик свят,BG,bg,,,0.0
1,tt0369610,11,Jurashikku warudo,JP,,imdbDisplay,,0.0


Unnamed: 0_level_0,tconst,primary_title,start_year,runtime_minutes,G1,G2,G3
original_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Sunghursh,tt0063540,Sunghursh,2013,175.0,Action,Crime,Drama
Ashad Ka Ek Din,tt0066787,One Day Before the Rainy Season,2019,114.0,Biography,Drama,


Unnamed: 0,tconst,directors,writers
0,tt0285252,nm0899854,nm0899854
1,tt0438973,,"nm0175726,nm1802864"


Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0111414,1,nm0246005,actor,,"[""The Man""]"
1,tt0111414,2,nm0398271,director,,


Unnamed: 0,tconst,averagerating,numvotes
0,tt10356526,8.3,31
1,tt10384606,8.9,559


Unnamed: 0_level_0,Unnamed: 0,genre_ids,id,original_language,popularity,release_date,title,vote_average,vote_count
original_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Harry Potter and the Deathly Hallows: Part 1,0,"[12, 14, 10751]",12444,en,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788
How to Train Your Dragon,1,"[14, 12, 16, 10751]",10191,en,28.734,2010-03-26,How to Train Your Dragon,7.7,7610


Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279"
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875"


Unnamed: 0.1,Unnamed: 0,genre_ids,id,original_language,popularity,release_date,vote_average,vote_count,tconst,primary_title,...,runtime_minutes,G1,G2,G3,id_budg,release_date_budg,production_budget,domestic_gross,worldwide_gross,release_month
#Horror,14656,"[18, 9648, 27, 53]",301325,de,6.099,2015-11-20,3.3,102,tt3526286,#Horror,...,101.0,Crime,Drama,Horror,16,"Nov 20, 2015",1500000,0,0,11
10 Cloverfield Lane,17422,"[53, 878, 18]",333371,en,17.892,2016-03-11,6.9,4629,tt1179933,10 Cloverfield Lane,...,103.0,Drama,Horror,Mystery,54,"Mar 11, 2016",5000000,72082999,108286422,3
