In [2]:
import pandas as pd
import numpy as np

Filtering/Cleaning Steps:
- Title Basics:
- Replace "\N" with np.nan
- Eliminate movies that are null for runtimeMinutes
- Eliminate movies that are null for genre
- keep only titleType==Movie
- keep startYear 2000-2022
- Eliminate movies that include  "Documentary" in genre (see tip below)

In [3]:
basics_url="https://datasets.imdbws.com/title.basics.tsv.gz"


In [5]:
basics = pd.read_csv(basics_url,sep='\t', low_memory=False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


- Eliminate movies that are null for genre

In [None]:
basics.drop(basics[basics['genres']=='NaN'].index, inplace = True)

In [6]:
basics.replace({'\\N':np.nan}, inplace=True)

- keep only titleType==Movie

In [7]:
# basics.drop(basics[basics['startYear']=='NaN'].index, inplace = True)
basics = basics[basics['startYear'].notna()]


In [8]:
# movie_fil=basics['titleType']=='movie'
basics=basics.loc[(basics.titleType == 'movie')]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
498,tt0000502,movie,Bohemios,Bohemios,0,1905,,100,
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,,90,Drama
610,tt0000615,movie,Robbery Under Arms,Robbery Under Arms,0,1907,,,Drama
625,tt0000630,movie,Hamlet,Amleto,0,1908,,,Drama
...,...,...,...,...,...,...,...,...,...
8880091,tt9916622,movie,Rodolpho Teóphilo - O Legado de um Pioneiro,Rodolpho Teóphilo - O Legado de um Pioneiro,0,2015,,57,Documentary
8880118,tt9916680,movie,De la ilusión al desconcierto: cine colombiano...,De la ilusión al desconcierto: cine colombiano...,0,2007,,100,Documentary
8880130,tt9916706,movie,Dankyavar Danka,Dankyavar Danka,0,2013,,,Comedy
8880141,tt9916730,movie,6 Gunn,6 Gunn,0,2017,,116,


- keep 2000-2022

In [11]:
print(basics['startYear'].value_counts())

2018    18331
2017    18313
2016    17881
2019    17832
2021    17281
        ...  
1903        3
2029        3
1904        2
1897        1
1896        1
Name: startYear, Length: 134, dtype: int64


In [12]:
basics['startYear'].unique()

array(['1905', '1906', '1907', '1908', '1909', '1910', '1912', '1911',
       '1913', '1915', '1914', '1919', '1916', '1917', '1936', '1925',
       '1918', '1920', '1922', '1921', '1924', '1923', '1928', '2019',
       '1926', '1927', '1929', '2000', '1993', '1935', '1930', '1942',
       '1932', '1931', '1934', '1939', '1937', '1933', '1950', '1938',
       '1951', '1946', '1996', '1940', '1944', '1947', '1941', '1952',
       '1970', '1957', '1943', '1948', '1945', '2001', '1949', '1953',
       '1954', '1965', '1983', '1980', '1973', '1961', '1955', '1962',
       '1958', '1956', '1977', '1964', '1960', '1959', '1967', '1968',
       '1963', '1971', '1969', '1972', '1966', '1976', '1990', '1979',
       '1981', '2020', '1975', '1978', '1989', '1974', '1986', '1995',
       '1987', '1985', '2018', '1984', '1982', '1988', '1991', '1994',
       '1992', '2005', '2004', '1998', '2016', '2002', '2017', '1997',
       '2021', '1999', '2006', '2008', '2009', '2003', '2007', '2022',
      

In [13]:
basics['startYear']=basics['startYear'].astype(int)
# year_fill=basics[(basics['startYear']>=2000) & (basics['startYear']<=2022)]
basics=basics.loc[((basics['startYear'] >=2000) & (basics['startYear'] <= 2022))]

Eliminate movies that include  "Documentary" in genre

In [14]:
# Exclude movies that are included in the documentary category.
is_documentary = basics['genres'].str.contains('documentary',case=False, 
                                               na=False)
basics[~is_documentary]

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
11636,tt0011801,movie,Tötet nicht mehr,Tötet nicht mehr,0,2019,,,"Action,Crime"
15179,tt0015414,movie,La tierra de los toros,La tierra de los toros,0,2000,,60,
34805,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61119,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
67672,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
...,...,...,...,...,...,...,...,...,...
8879966,tt9916362,movie,Coven,Akelarre,0,2020,,92,"Drama,History"
8879998,tt9916428,movie,The Secret of China,Hong xing zhao yao Zhong guo,0,2019,,,"Adventure,History,War"
8880050,tt9916538,movie,Kuambil Lagi Hatiku,Kuambil Lagi Hatiku,0,2019,,123,Drama
8880130,tt9916706,movie,Dankyavar Danka,Dankyavar Danka,0,2013,,,Comedy


In [18]:
basics.dropna(subset=['genres','runtimeMinutes'],inplace=True)

AKAs:
keep only US entries.
Replace "\N" with np.nan

In [19]:
akas_url="https://datasets.imdbws.com/title.akas.tsv.gz"

In [20]:
akas =pd.read_csv(akas_url,sep='\t', low_memory=False)
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [21]:
akas.replace({'\\N':np.nan}, inplace=True)

In [22]:
# akas=akas.loc['region']=='US'

akas=akas.loc[(akas.region == 'US')]
akas

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
36,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
41,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0
...,...,...,...,...,...,...,...,...
31795580,tt9916702,1,Loving London: The Playground,US,,imdbDisplay,,0
31795618,tt9916720,10,The Demonic Nun,US,,tv,,0
31795620,tt9916720,12,The Nun 2,US,,imdbDisplay,,0
31795637,tt9916756,1,Pretty Pretty Black Girl,US,,imdbDisplay,,0


Ratings:
Replace "\N" with np.nan (if any)

In [23]:
ratings_url="https://datasets.imdbws.com/title.ratings.tsv.gz"

In [24]:
ratings =pd.read_csv(ratings_url,sep='\t', low_memory=False)
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1874
1,tt0000002,5.9,248
2,tt0000003,6.5,1647
3,tt0000004,5.8,160
4,tt0000005,6.2,2475


In [25]:
ratings.replace({'\\N':np.nan})

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1874
1,tt0000002,5.9,248
2,tt0000003,6.5,1647
3,tt0000004,5.8,160
4,tt0000005,6.2,2475
...,...,...,...
1238771,tt9916690,6.5,6
1238772,tt9916720,5.1,209
1238773,tt9916730,8.7,6
1238774,tt9916766,6.7,19


In [26]:
# Filter the basics table down to only include the US by using the filter akas dataframe
keepers =basics['tconst'].isin(akas['titleId'])
keepers

34805       True
61119       True
67672       True
77968      False
86806       True
           ...  
8879966     True
8880050    False
8880091    False
8880118    False
8880151    False
Name: tconst, Length: 209955, dtype: bool

In [27]:
basics = basics[keepers]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34805,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61119,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
67672,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
86806,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
91077,tt0093119,movie,Grizzly II: Revenge,Grizzly II: The Predator,0,2020,,74,"Horror,Music,Thriller"
...,...,...,...,...,...,...,...,...,...
8879554,tt9915436,movie,Vida em Movimento,Vida em Movimento,0,2019,,70,Documentary
8879733,tt9915872,movie,The Last White Witch,My Girlfriend is a Wizard,0,2019,,97,"Comedy,Drama,Fantasy"
8879873,tt9916170,movie,The Rehearsal,O Ensaio,0,2019,,51,Drama
8879882,tt9916190,movie,Safeguard,Safeguard,0,2020,,90,"Action,Adventure,Thriller"


In [28]:
import os
os.makedirs('Data/',exist_ok=True) 
# Confirm folder created
os.listdir("Data/")

['title_akas.csv.gz', 'title_basics.csv.gz', 'title_ratings.csv.gz']

In [29]:
## Save current dataframe to file.
basics.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)

In [30]:
akas.to_csv("Data/title_akas.csv.gz",compression='gzip',index=False)

In [31]:
ratings.to_csv("Data/title_ratings.csv.gz",compression='gzip',index=False)

In [32]:
# Open saved file and preview again
basics = pd.read_csv("Data/title_basics.csv.gz", low_memory = False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
4,tt0093119,movie,Grizzly II: Revenge,Grizzly II: The Predator,0,2020,,74,"Horror,Music,Thriller"
