### Imports 

In [13]:
# example making new folder with os
import os
os.makedirs('Data/',exist_ok=True) # Confirm folder created
os.listdir("Data/")



['.ipynb_checkpoints',
 'title-akas-us-only.csv',
 'title.basics.tsv.gz',
 'title.ratings.tsv.gz']

In [2]:
import pandas as pd
import numpy as np

### Cleaning AKAs 

In [72]:
#Load akas data 
df_akas='Data/title-akas-us-only.csv'
df = pd.read_csv(df_akas,  low_memory=False)
df.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,\N,imdbDisplay,\N,0
1,tt0000002,7,The Clown and His Dogs,US,\N,\N,literal English title,0
2,tt0000005,10,Blacksmith Scene,US,\N,imdbDisplay,\N,0
3,tt0000005,1,Blacksmithing Scene,US,\N,alternative,\N,0
4,tt0000005,6,Blacksmith Scene #1,US,\N,alternative,\N,0


In [74]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1452564 entries, 0 to 1452563
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1452564 non-null  object
 1   ordering         1452564 non-null  int64 
 2   title            1452564 non-null  object
 3   region           1452564 non-null  object
 4   language         1452564 non-null  object
 5   types            1452564 non-null  object
 6   attributes       1452564 non-null  object
 7   isOriginalTitle  1452564 non-null  object
dtypes: int64(1), object(7)
memory usage: 88.7+ MB


In [69]:
#Check value count for region (only want US)
df['region'].value_counts()

US    1452564
Name: region, dtype: int64

#### Replace "\N" with np.nan

In [75]:
#Replace and make permanent
df.replace({'\\N':np.nan}, inplace =True)

### Cleaning Ratings

In [7]:
#Load ratings data 
fpath='Data/title.ratings.tsv.gz'
df_tr = pd.read_csv(fpath, sep='\t', low_memory=False)
df_tr.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1991
1,tt0000002,5.8,265
2,tt0000003,6.5,1862
3,tt0000004,5.5,178
4,tt0000005,6.2,2648


In [85]:
## Save current dataframe to file.
df_tr.to_csv("Data/title.ratings.tsv.gz",compression='gzip',index=False)


In [86]:
# Open saved file and preview again
df_tr = pd.read_csv("Data/title.ratings.tsv.gz", low_memory = False)
df_tr.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0035423,6.4,87333
1,tt0050396,4.6,212
2,tt0062336,6.4,179
3,tt0068865,5.4,75
4,tt0069049,6.7,7793


#### Replace "\N" with np.nan

In [None]:
#Replace and make permanent
df_tr.replace({'\\N':np.nan}, inplace =True)

#### Keep only movies that were included in your final title basics dataframe

In [83]:
# Filter the basics table down to only include the US by using the filter akas dataframe
keepers_tr =df_tr['tconst'].isin(df_tb['tconst'])
keepers_tr

0          False
1          False
4          False
5          False
6          False
           ...  
1339554     True
1339555     True
1339562     True
1339563     True
1339568    False
Name: tconst, Length: 505066, dtype: bool

In [84]:
df_tr = df_tr[keepers_tr]
df_tr

Unnamed: 0,tconst,averageRating,numVotes
17882,tt0035423,6.4,87333
30227,tt0050396,4.6,212
40632,tt0062336,6.4,179
46345,tt0068865,5.4,75
46502,tt0069049,6.7,7793
...,...,...,...
1339552,tt9916190,3.7,243
1339554,tt9916200,8.1,233
1339555,tt9916204,8.1,267
1339562,tt9916348,8.3,18


### Cleaning Title Basics 


In [48]:
#load tile basics data 
fpath='Data/title.basics.tsv.gz'
df_tb = pd.read_csv(fpath, sep='\t', low_memory=False)
df_tb.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [87]:
## Save current dataframe to file.
df_tb.to_csv("Data/title.basics.tsv.gz",compression='gzip',index=False)

In [88]:
# Open saved file and preview again
df_tb = pd.read_csv("Data/title.basics.tsv.gz", low_memory = False)
df_tb.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0034413,short,Youth Gets a Break,Youth Gets a Break,0,2001,,20,Short
1,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
2,tt0050396,short,Final Curtain,Final Curtain,0,2012,,20,"Horror,Short"
3,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
4,tt0068865,movie,Lives of Performers,Lives of Performers,0,2016,,90,Drama


In [49]:
df_tb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10094451 entries, 0 to 10094450
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 693.1+ MB


#### Keep only US movies 

In [80]:
# Filter the basics table down to only include the US by using the filter akas dataframe
keepers_tb =df_tb['tconst'].isin(df['titleId'])
keepers_tb

33800        True
34800        True
49491        True
59199       False
61112        True
            ...  
10094391    False
10094407    False
10094442    False
10094449    False
10094450    False
Name: tconst, Length: 1455716, dtype: bool

In [81]:
df_tb = df_tb[keepers_tb]
df_tb

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
33800,tt0034413,short,Youth Gets a Break,Youth Gets a Break,0,2001,,20,Short
34800,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
49491,tt0050396,short,Final Curtain,Final Curtain,0,2012,,20,"Horror,Short"
61112,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
67486,tt0068865,movie,Lives of Performers,Lives of Performers,0,2016,,90,Drama
...,...,...,...,...,...,...,...,...,...
10094139,tt9916204,tvEpisode,Better Angels,Better Angels,0,2019,,42,"Drama,Thriller"
10094144,tt9916214,short,Drown the Clown,Drown the Clown,0,2019,,8,"Drama,Short"
10094164,tt9916254,video,Big Tit Cream Pie 32,Big Tit Cream Pie 32,1,2015,,226,Adult
10094210,tt9916348,video,Ancient World Exposed,Ancient World Exposed,0,2019,,67,History


####  Replace "\N" with np.nan

In [50]:
#Replace and make permanent
df_tb.replace({'\\N':np.nan}, inplace =True)

#### Eliminate movies that are null for runtimeMinutes

In [51]:
#Check for missing values 
df_tb['runtimeMinutes'].isna().sum()

7083914

In [53]:
#Drop missing values from genres 
df_tb.dropna(subset=['runtimeMinutes'], inplace=True)
#Verify missing values were removed 
df_tb['runtimeMinutes'].isna().sum()

0

#### Eliminate movies that are null for genre

In [26]:
#Check for missing values 
df_tb['genres'].isna().sum()

17

In [33]:
#Drop missing values from genres 
df_tb.dropna(subset=['genres'], inplace=True)

#Verify missing values removed 
df_tb['genres'].isna().sum()

#### Keep only titleType==Movie

In [38]:
#Check different title types 
df_tb['titleType'].value_counts()

tvEpisode       7684171
short            947157
movie            654573
video            279177
tvSeries         248018
tvMovie          142728
tvMiniSeries      50056
tvSpecial         42863
videoGame         35680
tvShort           10010
tvPilot               1
Name: titleType, dtype: int64

In [39]:
#Keep only movie titles 
remove_type = ['tvEpisode', 'short', 'video', 'tvSeries', 'tvMovie', 
               'tvMiniSeries', 'tvSpecial','videoGame', 'tvShort','tvPilot']
df_tb = df_tb[df_tb['titleType'].isin(remove_type) == False]
df_tb['titleType'].value_counts()

movie    654573
Name: titleType, dtype: int64

#### Convert the startYear column to float data type

In [57]:
#Check data type
df_tb['startYear'].dtypes

dtype('O')

In [54]:
#Check values for startYear 
df_tb['startYear'].value_counts()

2017    138669
2018    136882
2016    130926
2019    125376
2015    125245
         ...  
1889         2
2029         2
1874         1
1883         1
1885         1
Name: startYear, Length: 149, dtype: int64

In [58]:
#Change datatype 
df_tb['startYear'].astype(float)

0           1894.0
1           1892.0
2           1892.0
3           1892.0
4           1893.0
             ...  
10094401    2013.0
10094407    2019.0
10094442    2014.0
10094449    2015.0
10094450    2014.0
Name: startYear, Length: 3010537, dtype: float64

#### Filter the dataframe using startYear. Keep years between 2000-2021 (Including 2000 and 2021)

In [62]:
df_tb_years = df_tb[(df_tb['startYear'] > "2000") & (df_tb['startYear'] < "2021")]
df_tb_years

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
33800,tt0034413,short,Youth Gets a Break,Youth Gets a Break,0,2001,,20,Short
34800,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
49491,tt0050396,short,Final Curtain,Final Curtain,0,2012,,20,"Horror,Short"
59199,tt0060361,short,EMS nr 1,EMS nr 1,0,2016,,14,Short
59204,tt0060366,short,A Embalagem de Vidro,A Embalagem de Vidro,0,2020,,11,"Documentary,Short"
...,...,...,...,...,...,...,...,...,...
10094401,tt9916754,movie,Chico Albuquerque - Revelações,Chico Albuquerque - Revelações,0,2013,,49,Documentary
10094407,tt9916766,tvEpisode,Episode #10.15,Episode #10.15,0,2019,,43,"Family,Game-Show,Reality-TV"
10094442,tt9916840,tvEpisode,Horrid Henry's Comic Caper,Horrid Henry's Comic Caper,0,2014,,11,"Adventure,Animation,Comedy"
10094449,tt9916856,short,The Wind,The Wind,0,2015,,27,Short


#### Eliminate movies that include "Documentary" in the genre

In [64]:
#Check values for genre
df_tb['genres'].value_counts()

Documentary                123112
Drama                       94284
Drama,Short                 92370
Comedy                      82552
Adult                       75977
                            ...  
Action,Family,Musical           1
Family,Game-Show,Sci-Fi         1
Drama,Game-Show,Short           1
Biography,Horror                1
Biography,Music,News            1
Name: genres, Length: 2034, dtype: int64

In [66]:
# Exclude movies that are included in the documentary category.
is_documentary = df_tb['genres'].str.contains('documentary',case=False)
df_tb = df_tb[~is_documentary]


TypeError: bad operand type for unary ~: 'float'

In [67]:
#Check data type
df_tb['genres'].dtypes

dtype('O')

In [68]:
#Convert genres to string type 
df_tb['genres'] = df_tb['genres'].astype(str)
# Exclude movies that are included in the documentary category.
is_documentary = df_tb['genres'].str.contains('documentary',case=False)
df_tb = df_tb[~is_documentary]