# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import os, json, math, time

from tqdm.notebook import tqdm_notebook

In [2]:
# Using tmdbsimple Package
#!pip install tmdbsimple

## Already pre-loaded

## Load URLs

# Download CSVs

In [3]:
basics_url="https://datasets.imdbws.com/title.basics.tsv.gz"
akas_url="https://datasets.imdbws.com/title.akas.tsv.gz"
ratings_url="https://datasets.imdbws.com/title.ratings.tsv.gz"

In [4]:
# run csv files for IMDB information
basics = pd.read_csv(basics_url, sep='\t', low_memory=False)

In [5]:
akas = pd.read_csv(akas_url, sep='\t', low_memory=False)

In [6]:
ratings = pd.read_csv(ratings_url, sep='\t', low_memory=False)

## Specifications
Your stakeholder only wants you to include information for movies based on the following specifications:

- [ ] Exclude any movie with missing values for genre or runtime
- [ ] Include only full-length movies (titleType = "movie").
- [ ] Include only fictional movies (not from documentary genre)
- [ ] Include only movies that were released 2000 - 2021 (include 2000 and 2021)
- [ ] Include only movies that were released in the United States


## Deliverable
After filtering out movies that do not meet the stakeholder's specifications:

- Before saving, run a final .info() for each of the dataframes to show a summary of how many movies remain and the datatypes of each feature
- Save each file to a compressed csv file "Data/" folder inside your repository.
- Commit your changes to your repository in GitHub desktop and Publish repository / Push Changes.
- Submit the link to your repository

## Handling \N Placeholder Values
According to the data dictionary, null values have been encoding as \N.
- You will want to find those and replace them with np.nan.
- However, the backslash (\) character is a special one that tells the computer to ignore whatever character comes next.
    - So if we were to say df.replace({'\N':np.nan}), the computer would see \N as an empty string.
    - To fix this, add a second backslash character, which will tell the computer that you actually WANTED to use a literal \.
    - df.replace({'\\N':np.nan})
    - Don't forget to make these replacements permanent!

## Required Preprocessing - Details
- Filtering/Cleaning Steps:
    - Title Basics:
        - [x] Replace "\N" with np.nan
        - [x] Eliminate movies that are null for runtimeMinutes
        - [x] Eliminate movies that are null for genre
        - [x] Keep only titleType==Movie
        - [x] Keep startYear 2000-2021
        - [x] Eliminate movies that include "Documentary" in genre (see tip below)
        - [x] Keep only US movies (Use AKAs table, see "Filtering one dataframe based on another" section below)
- AKAs:
        - [x] Keep only US movies.
        - [x] Replace "\N" with np.nan
- Ratings:
        - [x] Replace "\N" with np.nan (if any)
        - [ ] Keep only US movies (Use AKAs table, see 
        "Filtering one dataframe based on another" section below)

### Tip: Excluding Documentaries

To filter out documentaries, you will need to check if the movie's value in the Genre column contains the word documentary. **(Instead of =='documentary')**
You will also want to use the **~** operator to take the inverse of your Trues/Falses.
Example:

```
# Exclude movies that are included in the documentary category.
is_documentary = df['genres'].str.contains('documentary',case=False)
df = df[~is_documentary]
```

### Filtering One Dataframe Based On Another
Next you will filter the basics df to only include the movies that are present in your filter akas dataframe. This is how you will ultimately be able to filter the movies by region being in the US.

Here is how you can achieve this:

```
# Filter the basics table down to only include the US by using the filter akas dataframe
keepers =basics['tconst'].isin(akas['titleId'])
keepers
```
---
Now filter basics:

```
basics = basics[keepers]
basics
```

## LOAD DATA HEADS

In [7]:
# checking basics null values
basics.isna().sum()

tconst             0
titleType          0
primaryTitle      11
originalTitle     11
isAdult            0
startYear          0
endYear            0
runtimeMinutes     0
genres            10
dtype: int64

There is 32 missing values in the basics file

In [8]:
# checking akas null values
akas.isna().sum()

titleId              0
ordering             0
title                5
region             108
language             0
types                0
attributes           0
isOriginalTitle      0
dtype: int64

There is 113 missing values in the akas file

In [9]:
# checking titles null values
ratings.isna().sum()

tconst           0
averageRating    0
numVotes         0
dtype: int64

There is 0 missing values in the titles file

## BASICS PREPROCESSING

In [10]:
# replacing missing values with nan
basics.replace({'\\N':np.nan}, inplace=True)

In [11]:
# double checking nan value counts
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle           11
originalTitle          11
isAdult                 1
startYear         1290492
endYear           9477711
runtimeMinutes    6782768
genres             434472
dtype: int64

In [12]:
# dropping sub-columns that 
basics = basics.dropna(subset=['runtimeMinutes', 'genres', 'startYear'])

In [13]:
# double check null value counts
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle            1
originalTitle           1
isAdult                 0
startYear               0
endYear           2518556
runtimeMinutes          0
genres                  0
dtype: int64

Can keep na values for endYear currently,
Replace akas nan values

In [14]:
basics['titleType'].value_counts()

tvEpisode       1213602
short            583319
movie            369944
video            178063
tvMovie           90403
tvSeries          88160
tvSpecial         17582
tvMiniSeries      16670
tvShort            8556
videoGame           317
Name: titleType, dtype: int64

### Include only full-length movies

In [15]:
# creating a filter for only movie types
movie_filter = basics['titleType']=="movie"

In [16]:
basics = basics.loc[movie_filter]

In [17]:
# sanity check each filter
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,,45,Romance
144,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897,,100,"Documentary,News,Sport"
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,,90,Drama
672,tt0000679,movie,The Fairylogue and Radio-Plays,The Fairylogue and Radio-Plays,0,1908,,120,"Adventure,Fantasy"


In [18]:
basics['titleType'].value_counts()

movie    369944
Name: titleType, dtype: int64

### Year Filtering 2000-2021

In [19]:
# Change dtype to float
#basics['startYear'] = basics['startYear'].astype('float')

In [20]:
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,,45,Romance
144,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897,,100,"Documentary,News,Sport"
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,,90,Drama
672,tt0000679,movie,The Fairylogue and Radio-Plays,The Fairylogue and Radio-Plays,0,1908,,120,"Adventure,Fantasy"


In [21]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 369944 entries, 8 to 9579697
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          369944 non-null  object
 1   titleType       369944 non-null  object
 2   primaryTitle    369944 non-null  object
 3   originalTitle   369944 non-null  object
 4   isAdult         369944 non-null  object
 5   startYear       369944 non-null  object
 6   endYear         0 non-null       object
 7   runtimeMinutes  369944 non-null  object
 8   genres          369944 non-null  object
dtypes: object(9)
memory usage: 28.2+ MB


In [22]:
# Change'startYear' to float
basics['startYear'] = basics['startYear'].astype('float')

In [23]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 369944 entries, 8 to 9579697
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          369944 non-null  object 
 1   titleType       369944 non-null  object 
 2   primaryTitle    369944 non-null  object 
 3   originalTitle   369944 non-null  object 
 4   isAdult         369944 non-null  object 
 5   startYear       369944 non-null  float64
 6   endYear         0 non-null       object 
 7   runtimeMinutes  369944 non-null  object 
 8   genres          369944 non-null  object 
dtypes: float64(1), object(8)
memory usage: 28.2+ MB


In [24]:
# Create filters with startYear 2000, and endYear 2022
# Keep start year 2000-2021
year_filter2000_2022 = (basics['startYear']>=2000) & (basics['startYear']<=2022)

In [25]:
basics = basics[year_filter2000_2022]
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
13082,tt0013274,movie,Istoriya grazhdanskoy voyny,Istoriya grazhdanskoy voyny,0,2021.0,,133,Documentary
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
61116,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
67669,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
76059,tt0077684,movie,Histórias de Combóios em Portugal,Histórias de Combóios em Portugal,0,2022.0,,46,Documentary


### Eliminate movies that include "Documentary" (basics)

In [26]:
# Exclude movies that are included in the documentary category.
is_documentary = basics['genres'].str.contains('Documentary',case=False)
basics = basics[~is_documentary]

In [27]:
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
61116,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
67669,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
77964,tt0079644,movie,November 1828,November 1828,0,2001.0,,140,"Drama,War"
86801,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"


## AKAS PREPROCESSING

In [28]:
# Replace akas null values
akas.replace({'\\N':np.nan}, inplace=True)

In [29]:
# US filter
us_filter = akas['region'] == 'US'

In [30]:
akas = akas[us_filter]
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
36,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
41,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0


## Keepers Filtering (basics to akas)

In [31]:
keepers = basics['tconst'].isin(akas['titleId'])
keepers

34803       True
61116       True
67669       True
77964      False
86801       True
           ...  
9579420     True
9579429     True
9579468    False
9579513     True
9579597    False
Name: tconst, Length: 146329, dtype: bool

In [32]:
# Now filter basics
basics = basics[keepers]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
61116,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
67669,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
86801,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
93938,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,,126,Drama
...,...,...,...,...,...,...,...,...,...
9578884,tt9914942,movie,Life Without Sara Amat,La vida sense la Sara Amat,0,2019.0,,74,Drama
9579280,tt9915872,movie,The Last White Witch,My Girlfriend is a Wizard,0,2019.0,,97,"Comedy,Drama,Fantasy"
9579420,tt9916170,movie,The Rehearsal,O Ensaio,0,2019.0,,51,Drama
9579429,tt9916190,movie,Safeguard,Safeguard,0,2020.0,,95,"Action,Adventure,Thriller"


## RATINGS PREPROCESSING

In [33]:
# replace titles null values
ratings.replace({'\\N':np.nan}, inplace=True)

# New Data Files

In [34]:
# saving compressed csv.gz files

basics.to_csv("Data/title_basics.csv.gz", compression='gzip', index=False)
akas.to_csv("Data/title_akas.csv.gz", compression='gzip', index=False)
ratings.to_csv("Data/title.ratings.tsv.gz", compression='gzip', index=False)

In [35]:
# Confirm saved file can be previewed
basics = pd.read_csv("Data/title_basics.csv.gz", low_memory=False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,,126,Drama


# Part II | Adding tmdb_api.json

In [36]:
import json
with open('/Users/njfor/.secret/tmdb_api.json') as f:
    login = json.load(f)
## Display the keys of the loaded dictionary
login.keys()

dict_keys(['api-key'])

In [37]:
import tmdbsimple as tmdb
tmdb.API_KEY = login['api-key']

## Querying Movies by ID (Example: The Matrix)

In [38]:
## make a movie object using the .Movies function from tmdb
movie = tmdb.Movies(603)

# 603 = The Matrix

In [39]:
# extract data from movie object
info = movie.info()
info

{'adult': False,
 'backdrop_path': '/waCRuAW5ocONRehP556vPexVXA9.jpg',
 'belongs_to_collection': {'id': 2344,
  'name': 'The Matrix Collection',
  'poster_path': '/bV9qTVHTVf0gkW0j7p7M0ILD4pG.jpg',
  'backdrop_path': '/bRm2DEgUiYciDw3myHuYFInD7la.jpg'},
 'budget': 63000000,
 'genres': [{'id': 28, 'name': 'Action'},
  {'id': 878, 'name': 'Science Fiction'}],
 'homepage': 'http://www.warnerbros.com/matrix',
 'id': 603,
 'imdb_id': 'tt0133093',
 'original_language': 'en',
 'original_title': 'The Matrix',
 'overview': 'Set in the 22nd century, The Matrix tells the story of a computer hacker who joins a group of underground insurgents fighting the vast and powerful computers who now rule the earth.',
 'popularity': 77.213,
 'poster_path': '/f89U3ADr1oiB1s9GkdPOEpXUk5H.jpg',
 'production_companies': [{'id': 79,
   'logo_path': '/tpFpsqbleCzEE2p5EgvUq6ozfCA.png',
   'name': 'Village Roadshow Pictures',
   'origin_country': 'US'},
  {'id': 174,
   'logo_path': '/IuAlhI9eVC9Z8UQWOIDdWRKSEJ.png'

# CF: get_movie_with_rating 

In [40]:
# custom function to get_movies_with_rating
def get_movie_with_rating(movie_id):
    """Adapted from source = https://github.com/celiao/tmdbsimple"""
    # Get the movie object for the current id
    movie = tmdb.Movies(movie_id)
    
    # save the .info .releases dictionaries
    info = movie.info()
    
    releases = movie.releases()
    # Loop through countires in releases
    for c in releases['countries']:
        # if the country abbreviation==US
        if c['iso_3166_1' ]=='US':
            ## save a "certification" key in info w/ the certification
            info['certification'] = c['certification']
            
    return info

## Testing function (Example: The Avengers)

In [41]:
test = get_movie_with_rating("tt0848228") # put your function name here
test

{'adult': False,
 'backdrop_path': '/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg',
 'belongs_to_collection': {'id': 86311,
  'name': 'The Avengers Collection',
  'poster_path': '/yFSIUVTCvgYrpalUktulvk3Gi5Y.jpg',
  'backdrop_path': '/zuW6fOiusv4X9nnW3paHGfXcSll.jpg'},
 'budget': 220000000,
 'genres': [{'id': 878, 'name': 'Science Fiction'},
  {'id': 28, 'name': 'Action'},
  {'id': 12, 'name': 'Adventure'}],
 'homepage': 'https://www.marvel.com/movies/the-avengers',
 'id': 24428,
 'imdb_id': 'tt0848228',
 'original_language': 'en',
 'original_title': 'The Avengers',
 'overview': 'When an unexpected enemy emerges and threatens global safety and security, Nick Fury, director of the international peacekeeping agency known as S.H.I.E.L.D., finds himself in need of a team to pull the world back from the brink of disaster. Spanning the globe, a daring recruitment effort begins!',
 'popularity': 122.502,
 'poster_path': '/tYqp6vEOo8YlVWrYQvt9nyOhsA2.jpg',
 'production_companies': [{'id': 420,
   'logo_pat

## Saving Our Error Messages

In [42]:
## testing our function by looping through a list of ids
import pandas as pd
test_ids = ["tt0848228", "tt0115937","tt0848228","tt0332280"]
results = []
errors = []
for movie_id in test_ids:
    
    try:
        movie_info = get_movie_with_rating(movie_id)
        results.append(movie_info)
        
    except Exception as e: 
        errors.append([movie_id, e])
    
pd.DataFrame(results)

Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,False,/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg,"{'id': 86311, 'name': 'The Avengers Collection...",220000000,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",https://www.marvel.com/movies/the-avengers,24428,tt0848228,en,The Avengers,...,1518815515,143,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Some assembly required.,The Avengers,False,7.707,28131,PG-13
1,False,/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg,"{'id': 86311, 'name': 'The Avengers Collection...",220000000,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",https://www.marvel.com/movies/the-avengers,24428,tt0848228,en,The Avengers,...,1518815515,143,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Some assembly required.,The Avengers,False,7.707,28131,PG-13
2,False,/qom1SZSENdmHFNZBXbtJAU0WTlC.jpg,,29000000,"[{'id': 10749, 'name': 'Romance'}, {'id': 18, ...",http://www.newline.com/properties/notebookthe....,11036,tt0332280,en,The Notebook,...,115603229,123,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Behind every great love is a great story.,The Notebook,False,7.879,10118,PG-13


In [43]:
# Check error list
print(f"- Number of errors: {len(errors)}")
errors

- Number of errors: 1


[['tt0115937',
  requests.exceptions.HTTPError('404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/tt0115937?api_key=dacfc8ec089e1a3da2f7c298778f49b1')]]

# BEFORE THE LOOPS (Designate a folder)

In [44]:
FOLDER = "Data/"
os.makedirs(FOLDER, exist_ok=True)
os.listdir(FOLDER)

['.ipynb_checkpoints',
 'title.ratings.tsv.gz',
 'title_akas.csv.gz',
 'title_basics.csv.gz',
 'tmdb_api_results_2001.json']

# CF: write_json

In [45]:
def write_json(new_data, filename):
    """Appends a list of records (new_data) to a json file (filename).
    Adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/"""
    
    with open(filename, 'r+') as file:
        # First we load existing data into a dict.
        file_data = json.load(file)
        ## Choose extend or append
        if (type(new_data) == list) & (type(file_data) == list):
            file_data.extend(new_data)
        else:
            file_data.append(new_data)
        # Sets file's current position at offset.
        filel.seek(0)
        # convert back to json.
        json.dump(file_data, file)

# Create Required Lists for the Loop

In [46]:
# Define a list of the Years to Extract from the API
YEARS_TO_GET = [2000,2001]

In [47]:
errors = []

## Start OUTER Loop

### Set Up Progress Bar

In [48]:
# Start of OUTER loop
for YEAR in tqdm_notebook(YEARS_TO_GET, desc='YEARS', position=0):i = 0

YEARS:   0%|          | 0/2 [00:00<?, ?it/s]

## Select a JSON_FILE filename to save the results in progress.

In [49]:
# Define the JSON file to store results for the year
JSON_FILE=f'{FOLDER}tmdb_api_results_{YEAR}.json'

# Check if file already exists
file_exists = os.path.isfile(JSON_FILE)

In [50]:
# If file doesn't exists: create it
if file_exists == False:
    # save an empty dict with just "imdb_id" to the new json file
    with open(JSON_FILE,'w') as f:
        json.dump([{'imdb_id':0}],f)

## Define/Filter the IDs to call

In [51]:
# Saving new year as the current df
df = basics.loc[ basics['startYear']==YEAR].copy()
# saving movie ids to list
movie_ids = df['tconst'].copy()

# Check for & remove any previously downloaded movie id's

- Load in any existing API resultsl with pd.read_json
- Check to see if any of the movie_ids to get are already in the JSON_FILE
- Filter out only movies that are missing from the JSON_FILE to use in the for loop:

In [52]:
# load existing data from .json into a new dataframe called "previous_df"
previous_df = pd.read_json(JSON_FILE)

In [53]:
## filter out any ids that are already in the JSON_FILE
movie_ids_to_get = movie_ids[~movie_ids.isin(previous_df['imdb_id'])]

## Creation of "movie_idsmovie_ids_to_get"
We have now deine the "movie_ids_to_get", It includes the ids from our dataframe in the year we are seeking & it excludes any that we have already made calls for.

We will use this list for our inner loop of API calls.

# Start INNER LOOP

## Iterate through the list of Movie IDs & make the calls

- **Make sure both functions are defined in your code file before you try to call them!**

In [54]:
# Get index & movie_id from list
# INNER LOOP
for movie_id in tqdm_notebook(movie_ids_to_get,
                             desc=f'Movies from {YEAR}',
                             position=1,
                             leave=True):
    try:
        # Retrieve the data for the movie_id
        temp = get_movie_with_rating(movie_id)
        # Append/Extend results to existing file using a pre-made function
        write_json(temp,JSON_FILE)
        # Short 20ms sleep to preven overwhelming the server
        time.sleep(0.02) 
        
    except Exception as e:
        errors.append([movie_id, e])

Movies from 2001:   0%|          | 0/1543 [00:00<?, ?it/s]

## After the Inner Loop
**Save the year's results as csv.gz file**

In [55]:
final_year_df = pd.read_json(JSON_FILE)
final_year_df.to_csv(f"{FOLDER}final_tmdb_data_{YEAR}.csv.gz", compression="gzip", index=False)