# Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

# Looking at `imdb.title.principals`

In [2]:
# Load the IMDb principals table; compression is inferred by pandas from the
# file name, then preview the first five rows.
imdb_title = pd.read_csv(
    'Data/imdb.title.principals.csv.gz',
    compression='infer',
)
imdb_title.head(n=5)

Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0111414,1,nm0246005,actor,,"[""The Man""]"
1,tt0111414,2,nm0398271,director,,
2,tt0111414,3,nm3739909,producer,producer,
3,tt0323808,10,nm0059247,editor,,
4,tt0323808,1,nm3579312,actress,,"[""Beth Boothby""]"


In [3]:
# Summarise column dtypes and non-null counts for the principals table.
imdb_title.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1028186 entries, 0 to 1028185
Data columns (total 6 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   tconst      1028186 non-null  object
 1   ordering    1028186 non-null  int64 
 2   nconst      1028186 non-null  object
 3   category    1028186 non-null  object
 4   job         177684 non-null   object
 5   characters  393360 non-null   object
dtypes: int64(1), object(5)
memory usage: 47.1+ MB


In [4]:
# Count missing values per column.
imdb_title.isna().sum()

tconst             0
ordering           0
nconst             0
category           0
job           850502
characters    634826
dtype: int64

In [5]:
# List the column labels of the principals table.
imdb_title.columns

Index(['tconst', 'ordering', 'nconst', 'category', 'job', 'characters'], dtype='object')

In [6]:
# Distinct values per column (printed), followed by the total row count as
# the cell's displayed result.
distinct_per_column = imdb_title.nunique()
print(distinct_per_column)
imdb_title.shape[0]

tconst        143454
ordering          10
nconst        604546
category          12
job             2965
characters    174762
dtype: int64


1028186

In [7]:
# Preview the first five rows again (the frame is unchanged since it was loaded).
imdb_title.head()

Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0111414,1,nm0246005,actor,,"[""The Man""]"
1,tt0111414,2,nm0398271,director,,
2,tt0111414,3,nm3739909,producer,producer,
3,tt0323808,10,nm0059247,editor,,
4,tt0323808,1,nm3579312,actress,,"[""Beth Boothby""]"


## Conclusions


A lot of data is missing from `job` (850,502 of 1,028,186 rows) and from `characters` (634,826 rows). Will check whether the other IMDb `.csv` files contain this information.

# Looking at `imdb.title.crew`

In [8]:
# Load the IMDb crew table (directors / writers per title); compression is
# inferred by pandas from the file name. Preview the first five rows.
imdb_crew = pd.read_csv(
    'Data/imdb.title.crew.csv.gz',
    compression='infer',
)
imdb_crew.head(n=5)

Unnamed: 0,tconst,directors,writers
0,tt0285252,nm0899854,nm0899854
1,tt0438973,,"nm0175726,nm1802864"
2,tt0462036,nm1940585,nm1940585
3,tt0835418,nm0151540,"nm0310087,nm0841532"
4,tt0878654,"nm0089502,nm2291498,nm2292011",nm0284943


In [9]:
# Summarise column dtypes and non-null counts for the crew table.
imdb_crew.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146144 entries, 0 to 146143
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   tconst     146144 non-null  object
 1   directors  140417 non-null  object
 2   writers    110261 non-null  object
dtypes: object(3)
memory usage: 3.3+ MB


In [10]:
# Count missing values per column.
imdb_crew.isna().sum()

tconst           0
directors     5727
writers      35883
dtype: int64

In [11]:
# List the column labels of the crew table.
imdb_crew.columns

Index(['tconst', 'directors', 'writers'], dtype='object')

In [12]:
# Preview the first five rows again (the frame is unchanged since it was loaded).
imdb_crew.head()

Unnamed: 0,tconst,directors,writers
0,tt0285252,nm0899854,nm0899854
1,tt0438973,,"nm0175726,nm1802864"
2,tt0462036,nm1940585,nm1940585
3,tt0835418,nm0151540,"nm0310087,nm0841532"
4,tt0878654,"nm0089502,nm2291498,nm2292011",nm0284943


## Conclusions

`writers` and `directors` have missing values (35,883 and 5,727 rows, respectively). Will decide how to fill them in as I look at more data.

# Looking at `imdb.title.ratings`

In [13]:
# Load the IMDb ratings table (average rating and vote count per title);
# compression is inferred by pandas from the file name.
rating = pd.read_csv(
    'Data/imdb.title.ratings.csv.gz',
    compression='infer',
)
rating.head(n=5)

Unnamed: 0,tconst,averagerating,numvotes
0,tt10356526,8.3,31
1,tt10384606,8.9,559
2,tt1042974,6.4,20
3,tt1043726,4.2,50352
4,tt1060240,6.5,21


In [14]:
# Summarise column dtypes and non-null counts for the ratings table.
rating.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73856 entries, 0 to 73855
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   tconst         73856 non-null  object 
 1   averagerating  73856 non-null  float64
 2   numvotes       73856 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 1.7+ MB


In [15]:
# Count missing values per column.
rating.isna().sum()

tconst           0
averagerating    0
numvotes         0
dtype: int64

In [16]:
# List the column labels of the ratings table.
rating.columns

Index(['tconst', 'averagerating', 'numvotes'], dtype='object')

In [17]:
# Preview the first five rows again (the frame is unchanged since it was loaded).
rating.head()

Unnamed: 0,tconst,averagerating,numvotes
0,tt10356526,8.3,31
1,tt10384606,8.9,559
2,tt1042974,6.4,20
3,tt1043726,4.2,50352
4,tt1060240,6.5,21


## Conclusions

No data is missing, so no cleaning was needed.

# Merging the data

In [18]:
# Join principals -> crew -> ratings on the shared `tconst` key.
# Both joins are inner joins (the pandas default), so only titles present in
# all three tables are kept.
merged_data = (
    imdb_title
    .merge(imdb_crew, how='inner', on='tconst')
    .merge(rating, how='inner', on='tconst')
)
merged_data.head(n=5)

Unnamed: 0,tconst,ordering,nconst,category,job,characters,directors,writers,averagerating,numvotes
0,tt0323808,10,nm0059247,editor,,,nm0362736,nm0362736,3.9,2328
1,tt0323808,1,nm3579312,actress,,"[""Beth Boothby""]",nm0362736,nm0362736,3.9,2328
2,tt0323808,2,nm2694680,actor,,"[""Steve Thomson""]",nm0362736,nm0362736,3.9,2328
3,tt0323808,3,nm0574615,actor,,"[""Sir Lachlan Morrison""]",nm0362736,nm0362736,3.9,2328
4,tt0323808,4,nm0502652,actress,,"[""Lady Delia Morrison""]",nm0362736,nm0362736,3.9,2328


In [19]:
# Summarise column dtypes and non-null counts for the merged table.
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 629755 entries, 0 to 629754
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         629755 non-null  object 
 1   ordering       629755 non-null  int64  
 2   nconst         629755 non-null  object 
 3   category       629755 non-null  object 
 4   job            124945 non-null  object 
 5   characters     248129 non-null  object 
 6   directors      626240 non-null  object 
 7   writers        558935 non-null  object 
 8   averagerating  629755 non-null  float64
 9   numvotes       629755 non-null  int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 52.9+ MB


In [20]:
# Count distinct values per column in the merged table.
merged_data.nunique()

tconst            73685
ordering             10
nconst           359919
category             12
job                2557
characters       121452
directors         51146
writers           54414
averagerating        91
numvotes           7349
dtype: int64

# Good Films

Films that have an `averagerating` > **8** and at least **500** votes.

In [21]:
# Keep rows rated strictly above 8 that also have at least 500 votes.
# NOTE(review): merged_data appears to hold several rows per `tconst`, so the
# result is not one row per film - confirm before counting films.
highly_rated = merged_data['averagerating'] > 8
enough_votes = merged_data['numvotes'] >= 500
good_films = merged_data[highly_rated & enough_votes]

In [22]:
# Show five randomly chosen rows. The fixed seed makes the sample identical
# on every run, so the displayed output is reproducible.
good_films.sample(5, random_state=42)

Unnamed: 0,tconst,ordering,nconst,category,job,characters,directors,writers,averagerating,numvotes
598771,tt7380226,9,nm0007124,actress,,"[""Minister""]",nm9276879,nm9276879,8.8,694
578943,tt6628102,3,nm0947986,actress,,"[""Asuman Karasu""]",nm0149196,"nm9831079,nm0946324,nm0149196",8.2,11928
263294,tt2659414,5,nm1085908,director,,,nm1085908,"nm1085908,nm5587369,nm4494126,nm3858594",8.2,11282
196042,tt1201607,8,nm0057655,producer,producer,,nm0946734,"nm0460141,nm0746830",8.1,691835
498544,tt6156350,4,nm2962374,archive_footage,,"[""Himself - NYC Workers League""]",nm0193231,nm0193231,8.3,4014


In [23]:
# Summarise column dtypes and non-null counts for the filtered rows.
good_films.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4373 entries, 635 to 628842
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   tconst         4373 non-null   object 
 1   ordering       4373 non-null   int64  
 2   nconst         4373 non-null   object 
 3   category       4373 non-null   object 
 4   job            1006 non-null   object 
 5   characters     1770 non-null   object 
 6   directors      4369 non-null   object 
 7   writers        3729 non-null   object 
 8   averagerating  4373 non-null   float64
 9   numvotes       4373 non-null   int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 375.8+ KB


In [24]:
# Count missing values per column in the filtered rows.
good_films.isna().sum()

tconst              0
ordering            0
nconst              0
category            0
job              3367
characters       2603
directors           4
writers           644
averagerating       0
numvotes            0
dtype: int64

# Final Conclusions

## Still have to merge with the other data from Nate to put it all together.

Only wanted to look at data that has a rating above **8** (the filter is `averagerating > 8`), while having at least **500** votes.

There are still values missing for `job`, `characters`, `directors`, and `writers`. Will determine at a later time whether they should be dropped.

In [25]:
# Load the previously cleaned budget/gross table and display it.
# NOTE(review): the file carries an `Unnamed: 0` column - presumably an index
# saved by an earlier `to_csv`; confirm whether it should be dropped.
tn_cleaned = pd.read_csv('Data/tn_cleaned.csv')
tn_cleaned

Unnamed: 0.1,Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross,release_month,release_day,release_year
0,0,1,Dec 18 2009,Avatar,425000000,760507625,2776345279,12,18,2009
1,1,2,May 20 2011,Pirates of the Caribbean: On Stranger Tides,410600000,241063875,1045663875,5,20,2011
2,2,3,Jun 7 2019,Dark Phoenix,350000000,42762350,149762350,6,7,2019
3,3,4,May 1 2015,Avengers: Age of Ultron,330600000,459005868,1403013963,5,1,2015
4,4,5,Dec 15 2017,Star Wars Ep. VIII: The Last Jedi,317000000,620181382,1316721747,12,15,2017
...,...,...,...,...,...,...,...,...,...,...
5777,5777,78,Dec 31 2018,Red 11,7000,0,0,12,31,2018
5778,5778,79,Apr 2 1999,Following,6000,48482,240495,4,2,1999
5779,5779,80,Jul 13 2005,Return to the Land of Wonders,5000,1338,1338,7,13,2005
5780,5780,81,Sep 29 2015,A Plague So Pleasant,1400,0,0,9,29,2015


In [27]:
# Lower-case the movie titles (presumably so they can be matched against
# titles from the other datasets - confirm).
# The vectorized `.str.lower()` leaves missing values as NaN, whereas calling
# `x.lower()` on each element raises AttributeError on a NaN.
tn_cleaned['movie'] = tn_cleaned['movie'].str.lower()
tn_cleaned

Unnamed: 0.1,Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross,release_month,release_day,release_year
0,0,1,Dec 18 2009,avatar,425000000,760507625,2776345279,12,18,2009
1,1,2,May 20 2011,pirates of the caribbean: on stranger tides,410600000,241063875,1045663875,5,20,2011
2,2,3,Jun 7 2019,dark phoenix,350000000,42762350,149762350,6,7,2019
3,3,4,May 1 2015,avengers: age of ultron,330600000,459005868,1403013963,5,1,2015
4,4,5,Dec 15 2017,star wars ep. viii: the last jedi,317000000,620181382,1316721747,12,15,2017
...,...,...,...,...,...,...,...,...,...,...
5777,5777,78,Dec 31 2018,red 11,7000,0,0,12,31,2018
5778,5778,79,Apr 2 1999,following,6000,48482,240495,4,2,1999
5779,5779,80,Jul 13 2005,return to the land of wonders,5000,1338,1338,7,13,2005
5780,5780,81,Sep 29 2015,a plague so pleasant,1400,0,0,9,29,2015


In [30]:
# Load the previously cleaned TMDB table and display it.
# NOTE(review): the variable is spelled `tndb_cleaned` while the file is
# `tmdb_cleaned.csv`; kept as-is because other cells reference this name.
# NOTE(review): the file carries an `Unnamed: 0` column that repeats across
# rows - presumably the pre-explode index, one row per `genre_id`; confirm.
tndb_cleaned = pd.read_csv('Data/tmdb_cleaned.csv')
tndb_cleaned

Unnamed: 0.1,Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count,genre_id
0,0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788,12.0
1,0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788,14.0
2,0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788,10751.0
3,1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610,14.0
4,1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610,12.0
...,...,...,...,...,...,...,...,...,...,...,...
47829,26515,"[10751, 12, 28]",366854,en,Trailer Made,0.600,2018-06-22,Trailer Made,0.0,1,10751.0
47830,26515,"[10751, 12, 28]",366854,en,Trailer Made,0.600,2018-06-22,Trailer Made,0.0,1,12.0
47831,26515,"[10751, 12, 28]",366854,en,Trailer Made,0.600,2018-06-22,Trailer Made,0.0,1,28.0
47832,26516,"[53, 27]",309885,en,The Church,0.600,2018-10-05,The Church,0.0,1,53.0


In [31]:
# Lower-case `original_title` (presumably so it can be matched against titles
# from the other datasets - confirm). Only `original_title` is changed.
# The vectorized `.str.lower()` leaves missing values as NaN, whereas calling
# `x.lower()` on each element raises AttributeError on a NaN.
tndb_cleaned['original_title'] = tndb_cleaned['original_title'].str.lower()
tndb_cleaned

Unnamed: 0.1,Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count,genre_id
0,0,"[12, 14, 10751]",12444,en,harry potter and the deathly hallows: part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788,12.0
1,0,"[12, 14, 10751]",12444,en,harry potter and the deathly hallows: part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788,14.0
2,0,"[12, 14, 10751]",12444,en,harry potter and the deathly hallows: part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788,10751.0
3,1,"[14, 12, 16, 10751]",10191,en,how to train your dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610,14.0
4,1,"[14, 12, 16, 10751]",10191,en,how to train your dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610,12.0
...,...,...,...,...,...,...,...,...,...,...,...
47829,26515,"[10751, 12, 28]",366854,en,trailer made,0.600,2018-06-22,Trailer Made,0.0,1,10751.0
47830,26515,"[10751, 12, 28]",366854,en,trailer made,0.600,2018-06-22,Trailer Made,0.0,1,12.0
47831,26515,"[10751, 12, 28]",366854,en,trailer made,0.600,2018-06-22,Trailer Made,0.0,1,28.0
47832,26516,"[53, 27]",309885,en,the church,0.600,2018-10-05,The Church,0.0,1,53.0


In [32]:
# Load the previously cleaned Box Office Mojo table and display it.
# NOTE(review): the file carries an `Unnamed: 0` column - presumably an index
# saved before rows were dropped; confirm whether it should be removed.
cleaned_bom = pd.read_csv('Data/cleaned_bom.csv')
cleaned_bom

Unnamed: 0.1,Unnamed: 0,title,studio,domestic_gross,year
0,0,Toy Story 3,BV,415000000.0,2010
1,1,Alice in Wonderland (2010),BV,334200000.0,2010
2,2,Harry Potter and the Deathly Hallows Part 1,WB,296000000.0,2010
3,3,Inception,WB,292600000.0,2010
4,4,Shrek Forever After,P/DW,238700000.0,2010
...,...,...,...,...,...
3354,3382,The Quake,Magn.,6200.0,2018
3355,3383,Edward II (2018 re-release),FM,4800.0,2018
3356,3384,El Pacto,Sony,2500.0,2018
3357,3385,The Swan,Synergetic,2400.0,2018


In [33]:
# Lower-case the titles (presumably so they can be matched against titles
# from the other datasets - confirm).
# The vectorized `.str.lower()` leaves missing values as NaN, whereas calling
# `x.lower()` on each element raises AttributeError on a NaN.
cleaned_bom['title'] = cleaned_bom['title'].str.lower()
cleaned_bom

Unnamed: 0.1,Unnamed: 0,title,studio,domestic_gross,year
0,0,toy story 3,BV,415000000.0,2010
1,1,alice in wonderland (2010),BV,334200000.0,2010
2,2,harry potter and the deathly hallows part 1,WB,296000000.0,2010
3,3,inception,WB,292600000.0,2010
4,4,shrek forever after,P/DW,238700000.0,2010
...,...,...,...,...,...
3354,3382,the quake,Magn.,6200.0,2018
3355,3383,edward ii (2018 re-release),FM,4800.0,2018
3356,3384,el pacto,Sony,2500.0,2018
3357,3385,the swan,Synergetic,2400.0,2018
