# Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

# Looking at `imdb.title.principals`

In [2]:
# Load the principals table and preview its first rows.
# NOTE(review): the variable is called `imdb_title`, but it holds the
# principals data (credits keyed by `tconst` / `nconst`, per the preview).
imdb_title = pd.read_csv(
    filepath_or_buffer='Data/imdb.title.principals.csv.gz',
    compression='infer',  # default: the .gz suffix selects gzip decompression
)
imdb_title.head()

Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0111414,1,nm0246005,actor,,"[""The Man""]"
1,tt0111414,2,nm0398271,director,,
2,tt0111414,3,nm3739909,producer,producer,
3,tt0323808,10,nm0059247,editor,,
4,tt0323808,1,nm3579312,actress,,"[""Beth Boothby""]"


In [3]:
# Column dtypes, non-null counts, and memory usage for the principals table.
imdb_title.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1028186 entries, 0 to 1028185
Data columns (total 6 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   tconst      1028186 non-null  object
 1   ordering    1028186 non-null  int64 
 2   nconst      1028186 non-null  object
 3   category    1028186 non-null  object
 4   job         177684 non-null   object
 5   characters  393360 non-null   object
dtypes: int64(1), object(5)
memory usage: 47.1+ MB


In [4]:
# Number of missing values in each column.
imdb_title.isna().sum()

tconst             0
ordering           0
nconst             0
category           0
job           850502
characters    634826
dtype: int64

In [5]:
# Column labels of the principals table.
imdb_title.columns

Index(['tconst', 'ordering', 'nconst', 'category', 'job', 'characters'], dtype='object')

In [6]:
# Print the number of distinct values in each column, then show the total
# row count as the cell result so the two can be compared.
print(imdb_title.nunique())
imdb_title.shape[0]

tconst        143454
ordering          10
nconst        604546
category          12
job             2965
characters    174762
dtype: int64


1028186

In [7]:
# Same preview as the first `head()` call, repeated ahead of the conclusions.
imdb_title.head()

Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0111414,1,nm0246005,actor,,"[""The Man""]"
1,tt0111414,2,nm0398271,director,,
2,tt0111414,3,nm3739909,producer,producer,
3,tt0323808,10,nm0059247,editor,,
4,tt0323808,1,nm3579312,actress,,"[""Beth Boothby""]"


## Conclusions


A lot of data is missing from `job` and `characters`. I will check whether the other IMDb `.csv` files contain this information.

# Looking at `imdb.title.crew`

In [8]:
# Load the crew table (title id plus comma-separated director and writer
# person ids, as the preview shows) and display its first rows.
imdb_crew = pd.read_csv(
    filepath_or_buffer='Data/imdb.title.crew.csv.gz',
    compression='infer',  # default: the .gz suffix selects gzip decompression
)
imdb_crew.head()

Unnamed: 0,tconst,directors,writers
0,tt0285252,nm0899854,nm0899854
1,tt0438973,,"nm0175726,nm1802864"
2,tt0462036,nm1940585,nm1940585
3,tt0835418,nm0151540,"nm0310087,nm0841532"
4,tt0878654,"nm0089502,nm2291498,nm2292011",nm0284943


In [9]:
# Column dtypes, non-null counts, and memory usage for the crew table.
imdb_crew.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146144 entries, 0 to 146143
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   tconst     146144 non-null  object
 1   directors  140417 non-null  object
 2   writers    110261 non-null  object
dtypes: object(3)
memory usage: 3.3+ MB


In [10]:
# Number of missing values in each column.
imdb_crew.isna().sum()

tconst           0
directors     5727
writers      35883
dtype: int64

In [11]:
# Column labels of the crew table.
imdb_crew.columns

Index(['tconst', 'directors', 'writers'], dtype='object')

In [12]:
# Same preview as the first `head()` call, repeated ahead of the conclusions.
imdb_crew.head()

Unnamed: 0,tconst,directors,writers
0,tt0285252,nm0899854,nm0899854
1,tt0438973,,"nm0175726,nm1802864"
2,tt0462036,nm1940585,nm1940585
3,tt0835418,nm0151540,"nm0310087,nm0841532"
4,tt0878654,"nm0089502,nm2291498,nm2292011",nm0284943


## Conclusions

`writers` and `directors` have missing values. I will decide how to fill them in as I look at more data.

# Looking at `imdb.title.ratings`

In [13]:
# Load the ratings table (title id, average rating, and vote count, as the
# preview shows) and display its first rows.
rating = pd.read_csv(
    filepath_or_buffer='Data/imdb.title.ratings.csv.gz',
    compression='infer',  # default: the .gz suffix selects gzip decompression
)
rating.head()

Unnamed: 0,tconst,averagerating,numvotes
0,tt10356526,8.3,31
1,tt10384606,8.9,559
2,tt1042974,6.4,20
3,tt1043726,4.2,50352
4,tt1060240,6.5,21


In [14]:
# Column dtypes, non-null counts, and memory usage for the ratings table.
rating.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73856 entries, 0 to 73855
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   tconst         73856 non-null  object 
 1   averagerating  73856 non-null  float64
 2   numvotes       73856 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 1.7+ MB


In [15]:
# Number of missing values in each column.
rating.isna().sum()

tconst           0
averagerating    0
numvotes         0
dtype: int64

In [16]:
# Column labels of the ratings table.
rating.columns

Index(['tconst', 'averagerating', 'numvotes'], dtype='object')

In [17]:
# Same preview as the first `head()` call, repeated ahead of the conclusions.
rating.head()

Unnamed: 0,tconst,averagerating,numvotes
0,tt10356526,8.3,31
1,tt10384606,8.9,559
2,tt1042974,6.4,20
3,tt1043726,4.2,50352
4,tt1060240,6.5,21


## Conclusions

No data missing. No cleaning done.

# Merging the data

In [18]:
# Join principals -> crew -> ratings on the shared title id `tconst`.
# Both joins are inner joins (the pandas default, spelled out here), so a
# title must appear in all three tables to be kept. The result keeps one
# row per principals record, so each title appears on several rows.
merged_data = (
    imdb_title
    .merge(imdb_crew, on='tconst', how='inner')
    .merge(rating, on='tconst', how='inner')
)
merged_data.head()

Unnamed: 0,tconst,ordering,nconst,category,job,characters,directors,writers,averagerating,numvotes
0,tt0323808,10,nm0059247,editor,,,nm0362736,nm0362736,3.9,2328
1,tt0323808,1,nm3579312,actress,,"[""Beth Boothby""]",nm0362736,nm0362736,3.9,2328
2,tt0323808,2,nm2694680,actor,,"[""Steve Thomson""]",nm0362736,nm0362736,3.9,2328
3,tt0323808,3,nm0574615,actor,,"[""Sir Lachlan Morrison""]",nm0362736,nm0362736,3.9,2328
4,tt0323808,4,nm0502652,actress,,"[""Lady Delia Morrison""]",nm0362736,nm0362736,3.9,2328


In [19]:
# Row count, dtypes, and non-null counts after the merge.
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 629755 entries, 0 to 629754
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         629755 non-null  object 
 1   ordering       629755 non-null  int64  
 2   nconst         629755 non-null  object 
 3   category       629755 non-null  object 
 4   job            124945 non-null  object 
 5   characters     248129 non-null  object 
 6   directors      626240 non-null  object 
 7   writers        558935 non-null  object 
 8   averagerating  629755 non-null  float64
 9   numvotes       629755 non-null  int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 52.9+ MB


In [20]:
# Distinct values per column; the `tconst` entry is the number of distinct
# titles remaining after the merge.
merged_data.nunique()

tconst            73685
ordering             10
nconst           359919
category             12
job                2557
characters       121452
directors         51146
writers           54414
averagerating        91
numvotes           7349
dtype: int64

# Good Films

Films that have an `averagerating` > **8** and at least **500** votes.

In [21]:
# Keep rows whose title is rated strictly above 8 and has at least 500 votes.
# The selection is row-wise on `merged_data`, so `good_films` still holds
# several rows per title (one per principals record).
good_films = merged_data.loc[
    merged_data['averagerating'].gt(8) & merged_data['numvotes'].ge(500)
]

In [22]:
# Preview five random rows. The seed is fixed so the same rows are shown on
# every run (Restart & Run All reproducibility); without it each run shows a
# different sample.
good_films.sample(5, random_state=42)

Unnamed: 0,tconst,ordering,nconst,category,job,characters,directors,writers,averagerating,numvotes
319202,tt2700330,9,nm2740351,editor,,,"nm1905310,nm3104562","nm1905310,nm3104562",8.2,1468
49279,tt2215151,9,nm0194125,cinematographer,,,nm0466428,,8.1,6263
508841,tt5963218,9,nm8359421,composer,,,"nm8359417,nm8529623",nm5465931,9.5,6509
197631,tt2140203,2,nm0651534,actor,,"[""Wolf Man""]",nm0396074,"nm0396074,nm0645766",8.1,31619
473106,tt5773402,6,nm3041597,composer,,,nm6856823,nm6856823,9.2,1096


In [23]:
# Row count, dtypes, and non-null counts of the filtered rows.
good_films.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4373 entries, 635 to 628842
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   tconst         4373 non-null   object 
 1   ordering       4373 non-null   int64  
 2   nconst         4373 non-null   object 
 3   category       4373 non-null   object 
 4   job            1006 non-null   object 
 5   characters     1770 non-null   object 
 6   directors      4369 non-null   object 
 7   writers        3729 non-null   object 
 8   averagerating  4373 non-null   float64
 9   numvotes       4373 non-null   int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 375.8+ KB


In [24]:
# Number of missing values in each column of the filtered rows.
good_films.isna().sum()

tconst              0
ordering            0
nconst              0
category            0
job              3367
characters       2603
directors           4
writers           644
averagerating       0
numvotes            0
dtype: int64

# Final Conclusions

## Still have to merge with the other data from Nate to put it all together.

I only wanted to look at data with a rating above **8** and at least **500** votes.

There are still values missing for `job`, `characters`, `directors`, and `writers`. I will determine later whether they should be dropped.