# Project 3 Part 1
- Michael Vincent
- 9/12

## Imports

In [1]:
# Imports
import os

import numpy as np
import pandas as pd

## Load the data

In [2]:
# Read the three raw IMDb extracts (tab-separated files)
akas_path = 'title.akas.tsv.gz'
basics_path = 'title.basics.tsv.gz'
ratings_path = 'title.ratings.tsv.gz'

# Every file is read with the same options: tab delimiter and
# low_memory = False (parse the whole file at once rather than in chunks).
read_options = {'sep': '\t', 'low_memory': False}
akas = pd.read_csv(akas_path, **read_options)
basics = pd.read_csv(basics_path, **read_options)
ratings = pd.read_csv(ratings_path, **read_options)

# Preview the first rows of each frame
for frame in (akas, basics, ratings):
    display(frame.head())

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1910
1,tt0000002,5.8,256
2,tt0000003,6.5,1713
3,tt0000004,5.6,169
4,tt0000005,6.2,2527


## Data Preprocessing

> These preprocessing steps have been given to us by our stakeholder, so we are not going through our usual data exploration steps.

In [3]:
# IMDb writes missing values as the literal string \N.
# Swap it for np.nan in every frame so pandas treats those cells as null.
for frame in (akas, basics, ratings):
    frame.replace('\\N', np.nan, inplace = True)

In [4]:
# Remove every title that has no runtimeMinutes value
runtime_column = 'runtimeMinutes'
basics.dropna(subset = [runtime_column], inplace = True)

# Verify the drop: the count of remaining nulls is printed below
remaining_nulls = basics[runtime_column].isna().sum()
print(remaining_nulls)

0


In [5]:
# Remove every title that has no genres value
genre_column = 'genres'
basics.dropna(subset = [genre_column], inplace = True)

# Verify the drop: the count of remaining nulls is printed below
remaining_nulls = basics[genre_column].isna().sum()
print(remaining_nulls)

0


In [6]:
# Only keep rows whose titleType is 'movie'.
# The mask is named is_movie rather than `filter` so that Python's built-in
# filter() is not shadowed for the rest of the notebook.
is_movie = basics['titleType'] == 'movie'

# .copy() makes basics an independent frame instead of a slice of the
# unfiltered data, which avoids SettingWithCopyWarning on later modifications.
basics = basics[is_movie].copy()

# Make sure the only titleType is 'movie'
basics['titleType'].value_counts()

movie    367884
Name: titleType, dtype: int64

In [7]:
# Keep startYear between 2000 and 2022 (both years included)

# Start by dropping null years.
# Reassign rather than use inplace = True: basics may be a filtered slice of
# the original frame here, and an in-place dropna on a slice raises
# SettingWithCopyWarning.
basics = basics.dropna(subset = ['startYear'])

# Convert the year to int once, then keep only 2000-2022.
# Series.between includes both endpoints by default.
start_year = basics['startYear'].astype(int)
basics = basics[start_year.between(2000, 2022)]

# Make sure the years were filtered out
basics['startYear'].value_counts()

2017    14208
2018    14147
2016    13830
2019    13819
2015    13330
2014    12982
2013    12278
2021    11757
2012    11542
2020    11261
2011    10687
2010    10129
2009     9277
2022     8196
2008     8075
2007     6889
2006     6435
2005     5761
2004     5135
2003     4531
2002     4087
2001     3816
2000     3595
Name: startYear, dtype: int64

In [8]:
# Eliminate movies whose genres include "Documentary" (case-insensitive match).
# na = False counts a missing genres value as "not a documentary", so the mask
# is always boolean and can be negated safely.
# The mask is not called `filter`, to avoid shadowing the built-in filter().
is_documentary = basics['genres'].str.contains('documentary', case = False, na = False)
basics = basics[~is_documentary]

# Make sure the changes were made
basics['genres'].str.contains('documentary', case = False).sum()

0

In [9]:
# Drop all non-US rows from the akas data frame.
# The mask is not called `filter`, to avoid shadowing the built-in filter().
is_us_region = akas['region'] == 'US'
akas = akas[is_us_region]

# Make sure the changes were made
akas['region'].value_counts()

US    1348566
Name: region, dtype: int64

In [10]:
# Keep only the basics rows whose tconst appears among the akas titleIds
# (akas was reduced to US rows earlier in the notebook)
us_title_ids = akas['titleId']
keepers = basics['tconst'].isin(us_title_ids)
basics = basics.loc[keepers]

In [11]:
# Keep only the ratings rows whose tconst appears among the akas titleIds
# (akas was reduced to US rows earlier in the notebook)
us_title_ids = akas['titleId']
keepers = ratings['tconst'].isin(us_title_ids)
ratings = ratings.loc[keepers]

In [12]:
# Print the .info() summary of each data frame, separated by a blank line
for position, frame in enumerate((akas, basics, ratings)):
    if position > 0:
        print()
    frame.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1348566 entries, 5 to 33164340
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1348566 non-null  object
 1   ordering         1348566 non-null  int64 
 2   title            1348566 non-null  object
 3   region           1348566 non-null  object
 4   language         3702 non-null     object
 5   types            963958 non-null   object
 6   attributes       44938 non-null    object
 7   isOriginalTitle  1347191 non-null  object
dtypes: int64(1), object(7)
memory usage: 92.6+ MB

<class 'pandas.core.frame.DataFrame'>
Int64Index: 82471 entries, 34792 to 9221105
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   tconst          82471 non-null  object
 1   titleType       82471 non-null  object
 2   primaryTitle    82471 non-null  object
 3   originalTitle   82471 non-

In [13]:
# Save the processed data frames as gzip-compressed CSV files.
# to_csv does not create missing folders, so make sure Data/ exists first;
# exist_ok = True makes this a no-op when the folder is already there.
os.makedirs('Data', exist_ok = True)
akas.to_csv('Data/title_akas.csv.gz', compression = 'gzip', index = False)
basics.to_csv('Data/title_basics.csv.gz', compression = 'gzip', index = False)
ratings.to_csv('Data/title_ratings.csv.gz', compression = 'gzip', index = False)

In [14]:
# Reload the saved files to confirm they were written correctly
saved_paths = (
    'Data/title_akas.csv.gz',
    'Data/title_basics.csv.gz',
    'Data/title_ratings.csv.gz',
)
akas, basics, ratings = [pd.read_csv(path) for path in saved_paths]

# Preview the first rows of each reloaded frame
display(akas.head(), basics.head(), ratings.head())

# Shape of the reloaded akas frame
akas.shape

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,,imdbDisplay,,0.0
1,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0.0
2,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0.0
3,tt0000005,1,Blacksmithing Scene,US,,alternative,,0.0
4,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0.0


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,,126,Drama


Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1910
1,tt0000002,5.8,256
2,tt0000005,6.2,2527
3,tt0000006,5.1,173
4,tt0000007,5.4,789


(1348566, 8)