# Cleaning and Prep Work

Specifications<br>
Your stakeholder only wants you to include information for movies based on the following specifications:

Filters:<br>
- Exclude any movie with missing values for genre or runtime
- Include only full-length movies (titleType = "movie").
- Include only fictional movies (not from documentary genre)
- Include only movies that were released 2000 - 2021 (include 2000 and 2021)
- Include only movies that were released in the United States

Deliverable:<br>
- Save to CSV in repo
- Commit your changes to repo
- Publish repo and Push Changes.

## Imports

In [1]:
import pandas as pd
import numpy as np
import os

## Data

### Save Raw Version 
Save the raw downloads to the `data/` folder in the repository.
The code below is kept (commented out) for reference only; it does not need to be re-run once the raw files have been saved.

In [2]:
# # Filepaths
# basics_url = 'https://datasets.imdbws.com/title.basics.tsv.gz'
# akas_url = 'https://datasets.imdbws.com/title.akas.tsv.gz'
# ratings_url = 'https://datasets.imdbws.com/title.ratings.tsv.gz'

# # Read in Dataframe
# basics = pd.read_csv(basics_url, sep='\t', low_memory = False)
# akas = pd.read_csv(akas_url, sep='\t', low_memory = False)
# ratings = pd.read_csv(ratings_url, sep='\t', low_memory = False)

In [3]:
# os.makedirs('data/',exist_ok=True)
# os.listdir("data/")

['title_akas.csv.gz', 'title_basics.csv.gz', 'title_ratings.csv.gz']

In [4]:
# basics.to_csv("data/title_basics.csv.gz",compression='gzip',index=False)
# akas.to_csv("data/title_akas.csv.gz",compression='gzip',index=False)
# ratings.to_csv("data/title_ratings.csv.gz",compression='gzip',index=False)

### Load for Use

In [5]:
# Read the three locally cached IMDb tables in one pass, in the order
# basics -> akas -> ratings. low_memory=False makes pandas parse each file
# in a single chunk so column dtypes are inferred consistently.
basics, akas, ratings = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (
        "data/title_basics.csv.gz",
        "data/title_akas.csv.gz",
        "data/title_ratings.csv.gz",
    )
)

# Quick look at the first rows of the basics table.
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


## Cleaning

### Ratings DF

In [6]:
# Preview the first five rows of the ratings table, leave some vertical
# space, then print the column dtypes and non-null counts.
display(ratings.head(5))
print('\n\n')
ratings.info()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1874
1,tt0000002,5.9,248
2,tt0000003,6.5,1647
3,tt0000004,5.8,160
4,tt0000005,6.2,2475





<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1237390 entries, 0 to 1237389
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1237390 non-null  object 
 1   averageRating  1237390 non-null  float64
 2   numVotes       1237390 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 28.3+ MB


In [7]:
# The placeholder being replaced is the literal two-character string '\N'
# (backslash + capital N), not '/n'. Swap it for NaN so pandas treats those
# cells as missing values.
ratings = ratings.replace(to_replace={'\\N': np.nan})

### AKAs DF

In [8]:
# The placeholder being replaced is the literal two-character string '\N'
# (backslash + capital N), not '/n'. Swap it for NaN so pandas treats those
# cells as missing values.
akas = akas.replace(to_replace={'\\N': np.nan})

In [9]:
# Keep only the alternate-title rows whose region is the United States.
akas = akas.loc[akas['region'].eq('US')]

### Basics DF

In [10]:
# Preview the first five rows of the basics table, leave some vertical
# space, then print the column dtypes.
display(basics.head(5))
print('\n\n')
basics.info()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"





<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8870622 entries, 0 to 8870621
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 609.1+ MB


In [11]:
# The placeholder being replaced is the literal two-character string '\N'
# (backslash + capital N), not '/n'. Swap it for NaN so pandas treats those
# cells as missing values.
basics = basics.replace(to_replace={'\\N': np.nan})

In [12]:
# Preview the first five rows of the US-only akas table, leave some vertical
# space, then print the column dtypes and non-null counts.
display(akas.head(5))
print('\n\n')
akas.info()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
36,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
41,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0





<class 'pandas.core.frame.DataFrame'>
Int64Index: 1316473 entries, 5 to 31770106
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1316473 non-null  object
 1   ordering         1316473 non-null  int64 
 2   title            1316473 non-null  object
 3   region           1316473 non-null  object
 4   language         3507 non-null     object
 5   types            1023430 non-null  object
 6   attributes       43841 non-null    object
 7   isOriginalTitle  1315098 non-null  object
dtypes: int64(1), object(7)
memory usage: 90.4+ MB


In [13]:
# Filter and Clean
# Build one boolean mask per stakeholder requirement, then apply them together.
genre_notna = basics['genres'].notna()  # genre must be present
runtime_notna = basics['runtimeMinutes'].notna()  # runtime must be present
type_movie = basics['titleType'] == 'movie'  # full-length movies only

# Compare release years as numbers rather than as strings. The string
# comparison only works while startYear is an object column of 4-digit
# strings; it raises TypeError if the column is numeric (e.g. when this
# notebook is re-run on a file whose startYear parses as integers).
# errors='coerce' turns anything unparsable into NaN, and NaN compares as
# False, so rows with a missing year are dropped exactly as before.
start_year = pd.to_numeric(basics['startYear'], errors='coerce')
after2000 = start_year >= 2000  # released in 2000 or later (inclusive)
before2022 = start_year <= 2021  # released in 2021 or earlier (inclusive)

basics = basics[genre_notna & runtime_notna & type_movie & after2000 & before2022]

# Fictional movies only: drop every title whose genre list mentions
# "documentary" (case-insensitive). genres has no NaN left at this point
# because of the genre_notna mask above.
is_documentary = basics['genres'].str.contains('documentary',case=False)
basics = basics[~is_documentary]

In [14]:
# Flag the titles whose id appears in the akas table (filtered to region
# 'US' earlier in this notebook), keep only those, then show the result.
keepers = basics['tconst'].isin(akas['titleId'])
basics = basics.loc[keepers]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34805,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61119,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
67672,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
86806,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
91077,tt0093119,movie,Grizzly II: Revenge,Grizzly II: The Predator,0,2020,,74,"Horror,Music,Thriller"
...,...,...,...,...,...,...,...,...,...
8869758,tt9914942,movie,Life Without Sara Amat,La vida sense la Sara Amat,0,2019,,74,Drama
8870154,tt9915872,movie,The Last White Witch,My Girlfriend is a Wizard,0,2019,,97,"Comedy,Drama,Fantasy"
8870294,tt9916170,movie,The Rehearsal,O Ensaio,0,2019,,51,Drama
8870303,tt9916190,movie,Safeguard,Safeguard,0,2020,,90,"Action,Adventure,Thriller"


## Save Cleaned DFs

In [15]:
# Write each cleaned frame to a gzip-compressed CSV without the index.
# NOTE(review): these are the same paths the "Load for Use" cell reads, so
# the raw downloads are overwritten by the cleaned data - confirm this is
# intended before re-running the notebook from the top.
for _frame, _out_path in (
    (basics, "data/title_basics.csv.gz"),
    (akas, "data/title_akas.csv.gz"),
    (ratings, "data/title_ratings.csv.gz"),
):
    _frame.to_csv(_out_path, compression='gzip', index=False)