### Imports 

In [21]:
# example making new folder with os
import os
os.makedirs('Data/',exist_ok=True) # Confirm folder created
os.listdir("Data/")



['.ipynb_checkpoints',
 'title-akas-us-only.csv',
 'title.basics.tsv.gz',
 'title.ratings.tsv.gz']

In [22]:
import pandas as pd
import numpy as np

### Cleaning AKAs 

In [23]:
#Load akas data 
df_akas='Data/title-akas-us-only.csv'
df = pd.read_csv(df_akas,  low_memory=False)
df.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,\N,imdbDisplay,\N,0
1,tt0000002,7,The Clown and His Dogs,US,\N,\N,literal English title,0
2,tt0000005,10,Blacksmith Scene,US,\N,imdbDisplay,\N,0
3,tt0000005,1,Blacksmithing Scene,US,\N,alternative,\N,0
4,tt0000005,6,Blacksmith Scene #1,US,\N,alternative,\N,0


In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1452564 entries, 0 to 1452563
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1452564 non-null  object
 1   ordering         1452564 non-null  int64 
 2   title            1452564 non-null  object
 3   region           1452564 non-null  object
 4   language         1452564 non-null  object
 5   types            1452564 non-null  object
 6   attributes       1452564 non-null  object
 7   isOriginalTitle  1452564 non-null  object
dtypes: int64(1), object(7)
memory usage: 88.7+ MB


In [25]:
#Check value count for region (only want US)
df['region'].value_counts()

US    1452564
Name: region, dtype: int64

#### Replace "\N" with np.nan

In [26]:
#Replace and make permanent
df.replace({'\\N':np.nan}, inplace =True)

### Cleaning Title Basics 


In [27]:
#load tile basics data 
fpath='Data/title.basics.tsv.gz'
df_tb = pd.read_csv(fpath, sep='\t', low_memory=False)
df_tb.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [57]:
## Save current dataframe to file.
df_tb.to_csv("Data/title.basics.tsv.gz",compression='gzip',index=False)

In [29]:
# Open saved file and preview again
df_tb = pd.read_csv("Data/title.basics.tsv.gz", low_memory = False)
df_tb.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [30]:
df_tb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10115816 entries, 0 to 10115815
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 694.6+ MB


####  Replace "\N" with np.nan

In [31]:
#Replace and make permanent
df_tb.replace({'\\N':np.nan}, inplace =True)

#### Eliminate movies that are null for runtimeMinutes, genres, startYear

In [32]:
#Check for missing values 
df_tb['startYear'].isna().sum()

1359351

In [33]:
#Drop missing values from genres 
df_tb.dropna(subset=['startYear'], inplace=True)
#Verify missing values were removed 
df_tb['startYear'].isna().sum()

0

In [34]:
#Check for missing values 
df_tb['genres'].isna().sum()

373874

In [35]:
#Drop missing values from genres 
df_tb.dropna(subset=['genres'], inplace=True)
#Verify missing values were removed 
df_tb['genres'].isna().sum()

0

In [36]:
#Check for missing values 
df_tb['runtimeMinutes'].isna().sum()

5611458

In [37]:
#Drop missing values from genres 
df_tb.dropna(subset=['runtimeMinutes'], inplace=True)
#Verify missing values were removed 
df_tb['runtimeMinutes'].isna().sum()

0

#### Keep only titleType==Movie

In [38]:
#Check different title types 
df_tb['titleType'].value_counts()

tvEpisode       1372982
short            604415
movie            381419
video            183784
tvMovie           92307
tvSeries          90872
tvSpecial         18696
tvMiniSeries      17700
tvShort            8629
videoGame           329
Name: titleType, dtype: int64

In [39]:
#Keep only movie titles 
remove_type = ['tvEpisode', 'short', 'video', 'tvSeries', 'tvMovie', 
               'tvMiniSeries', 'tvSpecial','videoGame', 'tvShort','tvPilot']
df_tb = df_tb[df_tb['titleType'].isin(remove_type) == False]
df_tb['titleType'].value_counts()

movie    381419
Name: titleType, dtype: int64

#### Convert the startYear column to float data type

In [40]:
#Check data type
df_tb['startYear'].dtypes

dtype('O')

In [41]:
#Check values for startYear 
df_tb['startYear'].value_counts()

2017    14424
2018    14406
2019    14153
2016    14003
2015    13512
        ...  
1904        1
1897        1
2027        1
1896        1
1894        1
Name: startYear, Length: 131, dtype: int64

In [42]:
#Change datatype 
df_tb['startYear'].astype(float)

8           1894.0
144         1897.0
570         1906.0
587         1907.0
672         1908.0
             ...  
10115666    2019.0
10115707    2015.0
10115734    2007.0
10115756    2017.0
10115766    2013.0
Name: startYear, Length: 381419, dtype: float64

#### Filter the dataframe using startYear. Keep years between 2000-2021 (Including 2000 and 2021)

In [43]:
df_tb_years = df_tb[(df_tb['startYear'] > "2000") & (df_tb['startYear'] < "2021")]
df_tb_years

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34800,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61112,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
67486,tt0068865,movie,Lives of Performers,Lives of Performers,0,2016,,90,Drama
67664,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
80549,tt0082328,movie,Embodiment of Evil,Encarnação do Demônio,0,2008,,94,Horror
...,...,...,...,...,...,...,...,...,...
10115666,tt9916538,movie,Kuambil Lagi Hatiku,Kuambil Lagi Hatiku,0,2019,,123,Drama
10115707,tt9916622,movie,Rodolpho Teóphilo - O Legado de um Pioneiro,Rodolpho Teóphilo - O Legado de um Pioneiro,0,2015,,57,Documentary
10115734,tt9916680,movie,De la ilusión al desconcierto: cine colombiano...,De la ilusión al desconcierto: cine colombiano...,0,2007,,100,Documentary
10115756,tt9916730,movie,6 Gunn,6 Gunn,0,2017,,116,Drama


#### Eliminate movies that include "Documentary" in the genre

In [44]:
#Check values for genre
df_tb['genres'].value_counts()

Drama                         69886
Documentary                   66498
Comedy                        29353
Comedy,Drama                  10818
Drama,Romance                 10174
                              ...  
Adventure,Musical,War             1
Animation,History,Musical         1
Film-Noir,Romance,Thriller        1
Adventure,Comedy,Film-Noir        1
Crime,Fantasy,Sci-Fi              1
Name: genres, Length: 1367, dtype: int64

In [45]:
# Exclude movies that are included in the documentary category.
is_documentary = df_tb['genres'].str.contains('documentary',case=False)
df_tb = df_tb[~is_documentary]


In [46]:
#Check data type
df_tb['genres'].dtypes

dtype('O')

In [47]:
#Convert genres to string type 
df_tb['genres'] = df_tb['genres'].astype(str)
# Exclude movies that are included in the documentary category.
is_documentary = df_tb['genres'].str.contains('documentary',case=False)
df_tb = df_tb[~is_documentary]

#### Keep only US movies 

In [49]:
# Filter the basics table down to only include the US by using the filter akas dataframe
keepers_tb =df_tb['tconst'].isin(df['titleId'])
keepers_tb

8            True
570          True
587          True
672          True
930         False
            ...  
10115498     True
10115537    False
10115582     True
10115666    False
10115756    False
Name: tconst, Length: 289009, dtype: bool

In [50]:
df_tb = df_tb[keepers_tb]
df_tb

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,,45,Romance
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,,90,Drama
672,tt0000679,movie,The Fairylogue and Radio-Plays,The Fairylogue and Radio-Plays,0,1908,,120,"Adventure,Fantasy"
1273,tt0001285,movie,The Life of Moses,The Life of Moses,0,1909,,50,"Biography,Drama,Family"
...,...,...,...,...,...,...,...,...,...
10114955,tt9914942,movie,Life Without Sara Amat,La vida sense la Sara Amat,0,2019,,74,Drama
10115349,tt9915872,movie,The Last White Witch,Boku no kanojo wa mahoutsukai,0,2019,,97,"Comedy,Drama,Fantasy"
10115489,tt9916170,movie,The Rehearsal,O Ensaio,0,2019,,51,Drama
10115498,tt9916190,movie,Safeguard,Safeguard,0,2020,,95,"Action,Adventure,Thriller"


### Cleaning Ratings

In [51]:
#Load ratings data 
fpath='Data/title.ratings.tsv.gz'
df_tr = pd.read_csv(fpath, sep='\t', low_memory=False)
df_tr.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1989
1,tt0000002,5.8,264
2,tt0000003,6.5,1866
3,tt0000004,5.5,177
4,tt0000005,6.2,2651


In [52]:
## Save current dataframe to file.
df_tr.to_csv("Data/title.ratings.tsv.gz",compression='gzip',index=False)


In [53]:
# Open saved file and preview again
df_tr = pd.read_csv("Data/title.ratings.tsv.gz", low_memory = False)
df_tr.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1989
1,tt0000002,5.8,264
2,tt0000003,6.5,1866
3,tt0000004,5.5,177
4,tt0000005,6.2,2651


#### Replace "\N" with np.nan

In [54]:
#Replace and make permanent
df_tr.replace({'\\N':np.nan}, inplace =True)

#### Keep only movies that were included in your final title basics dataframe

In [55]:
# Filter the basics table down to only include the US by using the filter akas dataframe
keepers_tr =df_tr['tconst'].isin(df_tb['tconst'])
keepers_tr

0          False
1          False
2          False
3          False
4          False
           ...  
1342913    False
1342914    False
1342915    False
1342916    False
1342917    False
Name: tconst, Length: 1342918, dtype: bool

In [56]:
df_tr = df_tr[keepers_tr]
df_tr

Unnamed: 0,tconst,averageRating,numVotes
8,tt0000009,5.3,205
363,tt0000574,6.0,845
371,tt0000591,4.4,20
424,tt0000679,5.1,68
747,tt0001285,5.4,59
...,...,...,...
1342837,tt9914942,6.6,181
1342864,tt9915872,6.4,9
1342877,tt9916170,7.0,7
1342878,tt9916190,3.7,243
