In [1]:
import pandas as pd
import numpy as np

Filtering/Cleaning Steps:
- Title Basics:
- Replace "\N" with np.nan
- Eliminate movies that are null for runtimeMinutes
- Eliminate movies that are null for genre
- keep only titleType==Movie
- keep startYear 2000-2022
- Eliminate movies that include  "Documentary" in genre (see tip below)

In [2]:
basics_url="https://datasets.imdbws.com/title.basics.tsv.gz"


In [4]:
basics = pd.read_csv(basics_url,sep='\t', low_memory=False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [None]:
basics.dropna(subset=['genres','runtimeMinutes'],inplace=True)

- Eliminate movies that are null for genre

In [None]:
# basics.drop(basics[basics['genres']=='NaN'].index, inplace = True)

In [None]:
basics.replace({'\\N':np.nan}, inplace=True)

- keep only titleType==Movie

In [None]:
# basics.drop(basics[basics['startYear']=='NaN'].index, inplace = True)
basics = basics[basics['startYear'].notna()]


In [None]:
# movie_fil=basics['titleType']=='movie'
basics=basics.loc[(basics.titleType == 'movie')]
basics

- keep 2000-2022

In [None]:
print(basics['startYear'].value_counts())

In [None]:
basics['startYear'].unique()

In [None]:
basics['startYear']=basics['startYear'].astype(int)
# year_fill=basics[(basics['startYear']>=2000) & (basics['startYear']<=2022)]
basics=basics.loc[((basics['startYear'] >=2000) & (basics['startYear'] <= 2022))]

Eliminate movies that include  "Documentary" in genre

In [None]:
# Exclude movies that are included in the documentary category.
is_documentary = basics['genres'].str.contains('documentary',case=False, 
                                               na=False)
basics[~is_documentary]

AKAs:
keep only US entries.
Replace "\N" with np.nan

In [None]:
akas_url="https://datasets.imdbws.com/title.akas.tsv.gz"

In [None]:
akas =pd.read_csv(akas_url,sep='\t', low_memory=False)
akas.head()

In [None]:
akas.replace({'\\N':np.nan}, inplace=True)

In [None]:
# akas=akas.loc['region']=='US'

akas=akas.loc[(akas.region == 'US')]
akas

Ratings:
Replace "\N" with np.nan (if any)

In [None]:
ratings_url="https://datasets.imdbws.com/title.ratings.tsv.gz"

In [None]:
ratings =pd.read_csv(ratings_url,sep='\t', low_memory=False)
ratings.head()

In [None]:
ratings.replace({'\\N':np.nan})

In [None]:
# Filter the basics table down to only include the US by using the filter akas dataframe
keepers =basics['tconst'].isin(akas['titleId'])
keepers

In [None]:
basics = basics[keepers]
basics

In [None]:
import os
os.makedirs('Data/',exist_ok=True) 
# Confirm folder created
os.listdir("Data/")

In [None]:
## Save current dataframe to file.
basics.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)

In [None]:
akas.to_csv("Data/title_akas.csv.gz",compression='gzip',index=False)

In [None]:
ratings.to_csv("Data/title_ratings.csv.gz",compression='gzip',index=False)

In [5]:
# Open saved file and preview again
basics = pd.read_csv("Data/title_basics.csv.gz", low_memory = False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118.0,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70.0,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122.0,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100.0,"Comedy,Horror,Sci-Fi"
4,tt0093119,movie,Grizzly II: Revenge,Grizzly II: The Predator,0,2020,,74.0,"Horror,Music,Thriller"


- Part 2: Extraction TMDB API

In [6]:
!pip install tmdbsimple



In [7]:
import json
with open('/Users/marya/.secret/tmdb_api.json', 'r') as f:
    login = json.load(f)
## Display the keys of the loaded dict
login.keys()

dict_keys(['api-key'])

In [8]:
import tmdbsimple as tmdb
tmdb.API_KEY =  login['api-key']

In [9]:
movie=tmdb.Movies(650)

In [10]:
info=movie.info()
info

{'adult': False,
 'backdrop_path': '/6SnFa3qfCvh4aARg6y3UevKLROh.jpg',
 'belongs_to_collection': None,
 'budget': 6500000,
 'genres': [{'id': 18, 'name': 'Drama'}, {'id': 80, 'name': 'Crime'}],
 'homepage': '',
 'id': 650,
 'imdb_id': 'tt0101507',
 'original_language': 'en',
 'original_title': 'Boyz n the Hood',
 'overview': 'Boyz n the Hood is the popular and successful film and social criticism from John Singleton about the conditions in South Central Los Angeles where teenagers are involved in gun fights and drug dealing on a daily basis.',
 'popularity': 46.002,
 'poster_path': '/v4ox4aSCNT5vyLXl4Q71JiWwCXW.jpg',
 'production_companies': [{'id': 5,
   'logo_path': '/71BqEFAF4V3qjjMPCpLuyJFB9A.png',
   'name': 'Columbia Pictures',
   'origin_country': 'US'}],
 'production_countries': [{'iso_3166_1': 'US',
   'name': 'United States of America'}],
 'release_date': '1991-07-12',
 'revenue': 57504069,
 'runtime': 112,
 'spoken_languages': [{'english_name': 'English',
   'iso_639_1': 'en

In [12]:
def get_movie_with_rating(movie_id):
    movie = tmdb.Movies(movie_id)
    movie_info = movie.info()
    releases = movie.releases()
    for c in releases['countries']:
        if c['iso_3166_1']=='US':
            movie_info['certification'] = c['certification']
        return movie_info

In [13]:
trial = get_movie_with_rating("tt0848228") 
trial

{'adult': False,
 'backdrop_path': '/nNmJRkg8wWnRmzQDe2FwKbPIsJV.jpg',
 'belongs_to_collection': {'id': 86311,
  'name': 'The Avengers Collection',
  'poster_path': '/yFSIUVTCvgYrpalUktulvk3Gi5Y.jpg',
  'backdrop_path': '/zuW6fOiusv4X9nnW3paHGfXcSll.jpg'},
 'budget': 220000000,
 'genres': [{'id': 878, 'name': 'Science Fiction'},
  {'id': 28, 'name': 'Action'},
  {'id': 12, 'name': 'Adventure'}],
 'homepage': 'https://www.marvel.com/movies/the-avengers',
 'id': 24428,
 'imdb_id': 'tt0848228',
 'original_language': 'en',
 'original_title': 'The Avengers',
 'overview': 'When an unexpected enemy emerges and threatens global safety and security, Nick Fury, director of the international peacekeeping agency known as S.H.I.E.L.D., finds himself in need of a team to pull the world back from the brink of disaster. Spanning the globe, a daring recruitment effort begins!',
 'popularity': 263.142,
 'poster_path': '/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg',
 'production_companies': [{'id': 420,
   'logo_path

In [14]:
import os
FOLDER = "Data/"
os.makedirs(FOLDER, exist_ok=True)
os.listdir(FOLDER)

['title_akas.csv.gz',
 'title_basics.csv.gz',
 'title_ratings.csv.gz',
 'tmdb_api_results_2000.json',
 'tmdb_api_results_2001.json']

In [16]:
YEARS_TO_GET = [2000, 2001]

In [17]:
YEAR = YEARS_TO_GET

In [22]:
from tqdm.notebook import tqdm_notebook
# Start of OUTER loop
for YEAR in tqdm_notebook(YEARS_TO_GET,desc='YEARS',position=0):
    JSON_FILE = f'{FOLDER}tmdb_api_results_{YEAR}.json' 
    file_exists = os.path.isfile(JSON_FILE)
# If it does not exist: create it
if file_exists == False:
# save an empty dict with just "imdb_id" to the new json file.
    with open(JSON_FILE,'w') as f:
        json.dump([{'imdb_id':0}],f)
 #Saving new year as the current df
df = basics.loc[ basics['startYear']==YEAR].copy()
# saving movie ids to list
movie_ids = df['tconst'].copy()#.to_list()
previous_df = pd.read_json(JSON_FILE)
# filter out any ids that are already in the JSON_FILE
movie_ids_to_get = movie_ids[~movie_ids.isin(previous_df['imdb_id'])]

#Get index and movie id from list
# INNER Loop
for movie_id in tqdm_notebook(movie_ids_to_get,
                                  desc=f'Movies from {YEAR}',
                                  position=1,
                                  leave=True):
        # Attempt to retrieve then data for the movie id
    try:
            
        temp = get_movie_with_rating(movie_id)  #This uses your pre-made function
            # Append/extend results to existing file using a pre-made function
        write_json(temp,JSON_FILE)
            # Short 20 ms sleep to prevent overwhelming server
        time.sleep(0.02)
            
        # If it fails,  make a dict with just the id and None for certification.
    except Exception as e:
            continue

final_year_df = pd.read_json(JSON_FILE)
final_year_df.to_csv(f"{FOLDER}final_tmdb_data_{YEAR}.csv.gz", compression="gzip", index=False)


YEARS:   0%|          | 0/2 [00:00<?, ?it/s]

Movies from 2001:   0%|          | 0/2321 [00:00<?, ?it/s]