### Imports 

In [1]:
# example making new folder with os
import os
os.makedirs('Data/',exist_ok=True) # Confirm folder created
os.listdir("Data/")



['.ipynb_checkpoints',
 'final_tmdb_data_2000.csv.gz',
 'final_tmdb_data_2001.csv.gz',
 'title-akas-us-only.csv',
 'title.basics.csv.gz',
 'title.basics.tsv.gz',
 'title.ratings.csv.gz',
 'title.ratings.tsv.gz',
 'tmdb_api_results_2000.json',
 'tmdb_api_results_2001.json']

In [2]:
import pandas as pd
import numpy as np

### Cleaning AKAs 

In [3]:
#Load akas data 
df_akas='Data/title-akas-us-only.csv'
df = pd.read_csv(df_akas,  low_memory=False)
df.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,\N,imdbDisplay,\N,0
1,tt0000002,7,The Clown and His Dogs,US,\N,\N,literal English title,0
2,tt0000005,10,Blacksmith Scene,US,\N,imdbDisplay,\N,0
3,tt0000005,1,Blacksmithing Scene,US,\N,alternative,\N,0
4,tt0000005,6,Blacksmith Scene #1,US,\N,alternative,\N,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1452564 entries, 0 to 1452563
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1452564 non-null  object
 1   ordering         1452564 non-null  int64 
 2   title            1452564 non-null  object
 3   region           1452564 non-null  object
 4   language         1452564 non-null  object
 5   types            1452564 non-null  object
 6   attributes       1452564 non-null  object
 7   isOriginalTitle  1452564 non-null  object
dtypes: int64(1), object(7)
memory usage: 88.7+ MB


In [5]:
#Check value count for region (only want US)
df['region'].value_counts()

US    1452564
Name: region, dtype: int64

#### Replace "\N" with np.nan

In [6]:
#Replace and make permanent
df.replace({'\\N':np.nan}, inplace =True)

### Cleaning Title Basics 


In [8]:
#load tile basics data 
fpath='Data/title.basics.tsv.gz'
basics = pd.read_csv(fpath, sep='\t', low_memory=False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [9]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10118048 entries, 0 to 10118047
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 694.8+ MB


####  Replace "\N" with np.nan

In [10]:
#Replace and make permanent
basics.replace({'\\N':np.nan}, inplace =True)

#### Eliminate movies that are null for runtimeMinutes, genres, startYear

In [11]:
#Check for missing values 
basics['startYear'].isna().sum()

1359478

In [13]:
#Drop missing values from genres 
basics.dropna(subset=['startYear'], inplace=True)
#Verify missing values were removed 
basics['startYear'].isna().sum()

0

In [14]:
#Check for missing values 
basics['genres'].isna().sum()

373857

In [15]:
#Drop missing values from genres 
basics.dropna(subset=['genres'], inplace=True)
#Verify missing values were removed 
basics['genres'].isna().sum()

0

In [17]:
#Check for missing values 
basics['runtimeMinutes'].isna().sum()

5613109

In [18]:
#Drop missing values from genres 
basics.dropna(subset=['runtimeMinutes'], inplace=True)
#Verify missing values were removed 
basics['runtimeMinutes'].isna().sum()

0

#### Keep only titleType==Movie

In [19]:
#Check different title types 
basics['titleType'].value_counts()

tvEpisode       1373212
short            604528
movie            381473
video            183818
tvMovie           92316
tvSeries          90885
tvSpecial         18703
tvMiniSeries      17709
tvShort            8629
videoGame           331
Name: titleType, dtype: int64

In [35]:
#Keep only movies where titleType==Movie
basics = basics.loc[basics['titleType'] == 'movie']


#### Filter the dataframe using startYear. Keep years between 2000-2021 (Including 2000 and 2021)

In [36]:
# Replace startYear '\\N' with NaN
basics['startYear'] = pd.to_numeric(basics['startYear'], errors='coerce')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  basics['startYear'] = pd.to_numeric(basics['startYear'], errors='coerce')


In [37]:
#Filter the dataframe using startYear. Keep years between 2000-2021
basics = basics[basics["startYear"]>= 2000]

In [38]:
#Filter the dataframe using startYear. Keep years between 2000-2021
basics = basics[basics["startYear"]<= 2021]

#### Convert the startYear column to float data type

In [39]:
#Check data type
basics['startYear'].dtypes

dtype('int64')

In [26]:
#Change datatype 
basics['startYear'].astype(float)

13081       2021.0
33800       2001.0
34800       2001.0
49491       2012.0
55754       2021.0
             ...  
10117998    2013.0
10118004    2019.0
10118039    2014.0
10118046    2015.0
10118047    2014.0
Name: startYear, Length: 1854980, dtype: float64

#### Eliminate movies that include "Documentary" in the genre

In [40]:
#Check values for genre
basics['genres'].value_counts()

Drama                          16134
Comedy                          6736
Comedy,Drama                    3813
Horror                          3739
Drama,Romance                   2523
                               ...  
Crime,Thriller,War                 1
History,Sci-Fi                     1
Comedy,Mystery,Sport               1
Crime,Western                      1
Mystery,Reality-TV,Thriller        1
Name: genres, Length: 842, dtype: int64

In [41]:
# Exclude movies that are included in the documentary category.
is_documentary = basics['genres'].str.contains('documentary',case=False)
basics = basics[~is_documentary]


In [42]:
#Check data type
basics['genres'].dtypes

dtype('O')

In [43]:
#Convert genres to string type 
basics['genres'] = basics['genres'].astype(str)
# Exclude movies that are included in the documentary category.
is_documentary = basics['genres'].str.contains('documentary',case=False)
basics = basics[~is_documentary]

#### Keep only US movies 

In [44]:
# Filter the basics table down to only include the US by using the filter akas dataframe
keepers_basics = basics['tconst'].isin(df['titleId'])
keepers_basics

34800       True
61112       True
67486       True
67664       True
86791       True
            ... 
10117187    True
10117581    True
10117721    True
10117730    True
10117814    True
Name: tconst, Length: 81888, dtype: bool

In [45]:
basics = basics[keepers_basics]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34800,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61112,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
67486,tt0068865,movie,Lives of Performers,Lives of Performers,0,2016,,90,Drama
67664,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
86791,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
...,...,...,...,...,...,...,...,...,...
10117187,tt9914942,movie,Life Without Sara Amat,La vida sense la Sara Amat,0,2019,,74,Drama
10117581,tt9915872,movie,The Last White Witch,Boku no kanojo wa mahoutsukai,0,2019,,97,"Comedy,Drama,Fantasy"
10117721,tt9916170,movie,The Rehearsal,O Ensaio,0,2019,,51,Drama
10117730,tt9916190,movie,Safeguard,Safeguard,0,2020,,95,"Action,Adventure,Thriller"


In [46]:
## Save current dataframe to file.
basics.to_csv("Data/title.basics.csv.gz",compression='gzip',index=False)

In [47]:
# Open saved file and preview again
basics = pd.read_csv("Data/title.basics.csv.gz", low_memory = False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
2,tt0068865,movie,Lives of Performers,Lives of Performers,0,2016,,90,Drama
3,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
4,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"


### Cleaning Ratings

In [49]:
#Load ratings data 
fpath='Data/title.ratings.tsv.gz'
ratings = pd.read_csv(fpath, sep='\t', low_memory=False)
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1990
1,tt0000002,5.8,265
2,tt0000003,6.5,1867
3,tt0000004,5.5,177
4,tt0000005,6.2,2653


#### Replace "\N" with np.nan

In [50]:
#Replace and make permanent
ratings.replace({'\\N':np.nan}, inplace =True)

#### Keep only movies that were included in your final title basics dataframe

In [51]:
# Filter the basics table down to only include the US by using the filter akas dataframe
keepers_ratings =ratings['tconst'].isin(basics['tconst'])
keepers_ratings

0          False
1          False
2          False
3          False
4          False
           ...  
1343366    False
1343367    False
1343368    False
1343369    False
1343370    False
Name: tconst, Length: 1343371, dtype: bool

In [52]:
ratings = ratings[keepers_ratings]
ratings

Unnamed: 0,tconst,averageRating,numVotes
17888,tt0035423,6.4,87411
40643,tt0062336,6.4,179
46356,tt0068865,5.4,75
46513,tt0069049,6.7,7806
63467,tt0088751,5.2,339
...,...,...,...
1343290,tt9914942,6.6,181
1343317,tt9915872,6.4,9
1343330,tt9916170,7.0,7
1343331,tt9916190,3.7,243


In [53]:
## Save current dataframe to file.
ratings.to_csv("Data/title.ratings.csv.gz",compression='gzip',index=False)


In [54]:
# Open saved file and preview again
ratings = pd.read_csv("Data/title.ratings.csv.gz", low_memory = False)
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0035423,6.4,87411
1,tt0062336,6.4,179
2,tt0068865,5.4,75
3,tt0069049,6.7,7806
4,tt0088751,5.2,339
