# The Data

- IMDB Provides Several Files with varied information for Movies, TV Shows, Made for TV Movies, etc.

  - Overview/Data Dictionary: https://www.imdb.com/interfaces/
  - Downloads page: https://datasets.imdbws.com/



- Files of focus:
 - title.basics.tsv.gz
 - title.ratings.tsv.gz
 - title.akas.tsv.gz

# Specifications

- Exclude any movie with missing values for genre or runtime
- Include only full-length movies (titleType = "movie").
- Include only fictional movies (not from documentary genre)
- Include only movies that were released 2000 - 2021 (include 2000 and 2021)
- Include only movies that were released in the United States

In [1]:
import pandas as pd
import numpy as np

In [2]:
akas_url = 'https://datasets.imdbws.com/title.akas.tsv.gz'
basics_url = 'https://datasets.imdbws.com/title.basics.tsv.gz'
ratings_url = 'https://datasets.imdbws.com/title.ratings.tsv.gz'

In [3]:
akas = pd.read_csv(akas_url,sep='\t', low_memory=False)

In [4]:
basics = pd.read_csv(basics_url,sep='\t', low_memory=False)

In [5]:
ratings = pd.read_csv(ratings_url,sep='\t', low_memory=False)

# Encoding null values from \N to np.nan

In [6]:
akas = akas.replace({'\\N':np.nan})

In [7]:
basics = basics.replace({'\\N':np.nan})

In [8]:
ratings = ratings.replace({'\\N':np.nan})

# Eliminate movies that are null for runtimeMinutes, genre, startyear

In [9]:
basics = basics.dropna(subset = ['runtimeMinutes', 'genres', 'startYear'])

# Include only full-length movies (titleType = "movie").

In [10]:
basics = basics.loc[basics['titleType'] == 'movie']

# Include only movies that were released 2000 - 2021 (including 2000 and 2021)

In [11]:
basics['startYear'] = basics['startYear'].astype(int)

In [12]:
basics = basics.loc[(basics['startYear'] >= 2000) & (basics['startYear'] <= 2021)]

# Only include fictional movies (not including documentaries)

In [13]:
# Exclude movies that are included in the documentary category.
is_documentary = basics['genres'].str.contains('documentary',case=False)
basics = basics[~is_documentary]

# Include only movies that were released in the United States

In [14]:
# Filter the basics table down to only include the US by using the filter akas dataframe
keepers = basics['tconst'].isin(akas['titleId'])
keepers

34805      True
61119      True
67672      True
77968      True
86806      True
           ... 
8870259    True
8870268    True
8870307    True
8870352    True
8870436    True
Name: tconst, Length: 135550, dtype: bool

In [15]:
basics = basics[keepers]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34805,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61119,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
67672,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
77968,tt0079644,movie,November 1828,November 1828,0,2001,,140,"Drama,War"
86806,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
...,...,...,...,...,...,...,...,...,...
8870259,tt9916170,movie,The Rehearsal,O Ensaio,0,2019,,51,Drama
8870268,tt9916190,movie,Safeguard,Safeguard,0,2020,,90,"Action,Adventure,Thriller"
8870307,tt9916270,movie,Il talento del calabrone,Il talento del calabrone,0,2020,,84,Thriller
8870352,tt9916362,movie,Coven,Akelarre,0,2020,,92,"Drama,History"


In [16]:
akas = akas.loc[akas['region'] == 'US']

In [17]:
basics = basics.loc[basics['tconst'].isin(akas['titleId'])]

In [18]:
ratings = ratings.loc[ratings['tconst'].isin(basics['tconst'])]

In [19]:
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34805,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61119,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
67672,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
86806,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
91077,tt0093119,movie,Grizzly II: Revenge,Grizzly II: The Predator,0,2020,,74,"Horror,Music,Thriller"


In [20]:
#making new folder with os
import os
os.makedirs('Data/',exist_ok=True) 
# Confirm folder created
os.listdir("Data/")

[]

In [21]:
akas.to_csv("Data/akas.csv.gz",compression='gzip',index=False)

In [22]:
basics.to_csv("Data/basics.csv.gz",compression='gzip',index=False)

In [23]:
ratings.to_csv("Data/ratings.csv.gz",compression='gzip',index=False)

In [24]:
basics = pd.read_csv("Data/basics.csv.gz", low_memory = False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
4,tt0093119,movie,Grizzly II: Revenge,Grizzly II: The Predator,0,2020,,74,"Horror,Music,Thriller"
