In [1]:
import pandas as pd
import numpy as np

Filtering/Cleaning Steps:
- Title Basics:
- Replace "\N" with np.nan
- Eliminate movies that are null for runtimeMinutes
- Eliminate movies that are null for genre
- keep only titleType==Movie
- keep startYear 2000-2022
- Eliminate movies that include  "Documentary" in genre (see tip below)

In [2]:
basics_url="https://datasets.imdbws.com/title.basics.tsv.gz"


In [None]:
basics = pd.read_csv(basics_url,sep='\t', low_memory=False)
basics.head()

In [None]:
basics.dropna(subset=['genres','runtimeMinutes'],inplace=True)

- Eliminate movies that are null for genre

In [None]:
# basics.drop(basics[basics['genres']=='NaN'].index, inplace = True)

In [None]:
basics.replace({'\\N':np.nan}, inplace=True)

- keep only titleType==Movie

In [None]:
# basics.drop(basics[basics['startYear']=='NaN'].index, inplace = True)
basics = basics[basics['startYear'].notna()]


In [None]:
# movie_fil=basics['titleType']=='movie'
basics=basics.loc[(basics.titleType == 'movie')]
basics

- keep 2000-2022

In [None]:
print(basics['startYear'].value_counts())

In [None]:
basics['startYear'].unique()

In [None]:
basics['startYear']=basics['startYear'].astype(int)
# year_fill=basics[(basics['startYear']>=2000) & (basics['startYear']<=2022)]
basics=basics.loc[((basics['startYear'] >=2000) & (basics['startYear'] <= 2022))]

Eliminate movies that include  "Documentary" in genre

In [None]:
# Exclude movies that are included in the documentary category.
is_documentary = basics['genres'].str.contains('documentary',case=False, 
                                               na=False)
basics[~is_documentary]

AKAs:
keep only US entries.
Replace "\N" with np.nan

In [None]:
akas_url="https://datasets.imdbws.com/title.akas.tsv.gz"

In [None]:
akas =pd.read_csv(akas_url,sep='\t', low_memory=False)
akas.head()

In [None]:
akas.replace({'\\N':np.nan}, inplace=True)

In [None]:
# akas=akas.loc['region']=='US'

akas=akas.loc[(akas.region == 'US')]
akas

Ratings:
Replace "\N" with np.nan (if any)

In [None]:
ratings_url="https://datasets.imdbws.com/title.ratings.tsv.gz"

In [None]:
ratings =pd.read_csv(ratings_url,sep='\t', low_memory=False)
ratings.head()

In [None]:
ratings.replace({'\\N':np.nan})

In [None]:
# Filter the basics table down to only include the US by using the filter akas dataframe
keepers =basics['tconst'].isin(akas['titleId'])
keepers

In [None]:
basics = basics[keepers]
basics

In [None]:
import os
os.makedirs('Data/',exist_ok=True) 
# Confirm folder created
os.listdir("Data/")

In [None]:
## Save current dataframe to file.
basics.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)

In [None]:
akas.to_csv("Data/title_akas.csv.gz",compression='gzip',index=False)

In [None]:
ratings.to_csv("Data/title_ratings.csv.gz",compression='gzip',index=False)

In [51]:
# Open saved file and preview again
basics = pd.read_csv("Data/title_basics.csv.gz", low_memory = False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118.0,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70.0,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122.0,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100.0,"Comedy,Horror,Sci-Fi"
4,tt0093119,movie,Grizzly II: Revenge,Grizzly II: The Predator,0,2020,,74.0,"Horror,Music,Thriller"


- Part 2: Extraction TMDB API

In [None]:
!pip install tmdbsimple

In [None]:
import json
with open('/Users/marya/.secret/tmdb_api.json', 'r') as f:
    login = json.load(f)
## Display the keys of the loaded dict
login.keys()

In [None]:
import tmdbsimple as tmdb
tmdb.API_KEY =  login['api-key']

In [32]:
movie=tmdb.Movies(650)

In [33]:
info=movie.info()
info

{'adult': False,
 'backdrop_path': '/6SnFa3qfCvh4aARg6y3UevKLROh.jpg',
 'belongs_to_collection': None,
 'budget': 6500000,
 'genres': [{'id': 18, 'name': 'Drama'}, {'id': 80, 'name': 'Crime'}],
 'homepage': '',
 'id': 650,
 'imdb_id': 'tt0101507',
 'original_language': 'en',
 'original_title': 'Boyz n the Hood',
 'overview': 'Boyz n the Hood is the popular and successful film and social criticism from John Singleton about the conditions in South Central Los Angeles where teenagers are involved in gun fights and drug dealing on a daily basis.',
 'popularity': 46.002,
 'poster_path': '/v4ox4aSCNT5vyLXl4Q71JiWwCXW.jpg',
 'production_companies': [{'id': 5,
   'logo_path': '/71BqEFAF4V3qjjMPCpLuyJFB9A.png',
   'name': 'Columbia Pictures',
   'origin_country': 'US'}],
 'production_countries': [{'iso_3166_1': 'US',
   'name': 'United States of America'}],
 'release_date': '1991-07-12',
 'revenue': 57504069,
 'runtime': 112,
 'spoken_languages': [{'english_name': 'English',
   'iso_639_1': 'en

In [34]:
def get_movie_with_rating(movie_id):
    movie = tmdb.Movies(movie_id)
    movie_info = movie.info()
    releases = movie.releases()
    for c in releases['countries']:
        if c['iso_3166_1']=='US':
            movie_info['certification'] = c['certification']
        return movie_info

In [35]:
get_movie_with_rating(650)

{'adult': False,
 'backdrop_path': '/6SnFa3qfCvh4aARg6y3UevKLROh.jpg',
 'belongs_to_collection': None,
 'budget': 6500000,
 'genres': [{'id': 18, 'name': 'Drama'}, {'id': 80, 'name': 'Crime'}],
 'homepage': '',
 'id': 650,
 'imdb_id': 'tt0101507',
 'original_language': 'en',
 'original_title': 'Boyz n the Hood',
 'overview': 'Boyz n the Hood is the popular and successful film and social criticism from John Singleton about the conditions in South Central Los Angeles where teenagers are involved in gun fights and drug dealing on a daily basis.',
 'popularity': 46.002,
 'poster_path': '/v4ox4aSCNT5vyLXl4Q71JiWwCXW.jpg',
 'production_companies': [{'id': 5,
   'logo_path': '/71BqEFAF4V3qjjMPCpLuyJFB9A.png',
   'name': 'Columbia Pictures',
   'origin_country': 'US'}],
 'production_countries': [{'iso_3166_1': 'US',
   'name': 'United States of America'}],
 'release_date': '1991-07-12',
 'revenue': 57504069,
 'runtime': 112,
 'spoken_languages': [{'english_name': 'English',
   'iso_639_1': 'en

In [36]:
import os
folder = "Data/"
os.makedirs(folder, exist_ok=True)
os.listdir(folder)

['title_akas.csv.gz', 'title_basics.csv.gz', 'title_ratings.csv.gz']

In [37]:
years_to_get = [2000,2001]

In [44]:
from tqdm.notebook import tqdm_notebook
for YEAR in tqdm_notebook(years_to_get,desc='YEARS',position=0):
    #Defining the JSON file to store results for year
    JSON_FILE = f'{folder}tmdb_api_results_{YEAR}.json'
# Check if file exists
file_exists = os.path.isfile(JSON_FILE)
# If it does not exist: create it
if file_exists == False:
# save an empty dict with just "imdb_id" to the new json file.
    with open(JSON_FILE,'w') as f:
        json.dump([{'imdb_id':0}],f)

YEARS:   0%|          | 0/2 [00:00<?, ?it/s]

In [45]:
basics = pd.read_csv('Data/title_basics.csv.gz')


In [48]:
df=basics.loc[basics['startYear'] == YEAR].copy()
movies_ids = df['tconst'].copy()

In [49]:
if_exist_df = pd.read_json(JSON_FILE)

In [50]:
# filter out any ids that are already in the JSON_FILE
movie_ids_to_get = movie_ids[~movie_ids.isin(if_exist_df['imdb_id'])]


NameError: name 'movie_ids' is not defined

In [52]:
def write_json(new_data, filename): 
    """Adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/"""    
    
    with open(filename,'r+') as file:
        # First we load existing data into a dict.
        file_data = json.load(file)
        ## Choose extend or append
        if (type(new_data) == list) & (type(file_data) == list):
            file_data.extend(new_data)
        else:
             file_data.append(new_data)
        # Sets file's current position at offset.
        file.seek(0)
        # convert back to json.
        json.dump(file_data, file)


In [54]:
#Get index and movie id from list
# INNER Loop
for movie_id in tqdm_notebook(movie_ids_to_get,
                                  desc=f'Movies from {YEAR}',
                                  position=1,
                                  leave=True):
        # Attempt to retrieve then data for the movie id
        try:
            temp = get_movie_with_rating(movie_id)  #This uses your pre-made function
            # Append/extend results to existing file using a pre-made function
            write_json(temp,JSON_FILE)
            # Short 20 ms sleep to prevent overwhelming server
            time.sleep(0.02)
            
        # If it fails,  make a dict with just the id and None for certification.
        except Exception as e:
            continue

NameError: name 'movie_ids_to_get' is not defined