### Project 3 IMDB Part 4 - get more data
- Paula Pipkin

In [1]:
import pandas as pd
import os, time,json
import matplotlib.pyplot as plt
import seaborn as sns

import tmdbsimple as tmdb 
from tqdm.notebook import tqdm_notebook



In [3]:
FOLDER = "big_data/"
os.makedirs(FOLDER, exist_ok=True)
os.listdir(FOLDER)

['final_tmdb_data_2012.csv.gz',
 'final_tmdb_data_2013.csv.gz',
 'final_tmdb_data_2014.csv.gz',
 'final_tmdb_data_2015.csv.gz',
 'final_tmdb_data_2016.csv.gz',
 'final_tmdb_data_2017.csv.gz',
 'final_tmdb_data_2018.csv.gz',
 'tmdb_api_results_2012.json',
 'tmdb_api_results_2013.json',
 'tmdb_api_results_2014.json',
 'tmdb_api_results_2015.json',
 'tmdb_api_results_2016.json',
 'tmdb_api_results_2017.json',
 'tmdb_api_results_2018.json',
 'tmdb_api_results_2019.json']

In [4]:
def get_movies_cert(movie_id):
    # Get the movie object for the current id
    movie = tmdb.Movies(movie_id)
    # save the .info .releases dictionaries
    info = movie.info()
    releases = movie.releases()
    # Loop through countries in releases
    for c in releases['countries']:
        # if the country abbreviation==US
        if c['iso_3166_1' ] =='US':
            ## save a "certification" key in the info dict with the certification
           info['certification'] = c['certification']
    return info

In [5]:
def write_json(new_data, filename): 
    """Appends a list of records (new_data) to a json file (filename). 
    Adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/"""  
    
    with open(filename,'r+') as file:
        # First we load existing data into a dict.
        file_data = json.load(file)
        ## Choose extend or append
        if (type(new_data) == list) & (type(file_data) == list):
            file_data.extend(new_data)
        else:
             file_data.append(new_data)
        # Sets file's current position at offset.
        file.seek(0)
        # convert back to json.
        json.dump(file_data, file)


In [6]:
with open('/Users/paula/.secret/tmdb_api.json', 'r') as f:
    login = json.load(f)
tmdb.API_KEY =  login['api-key']

In [7]:
# Load in the dataframe from project part 1 as basics:
basics = pd.read_csv('data/title_basics.csv.gz')
basics.head()



Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,100,"Comedy,Horror,Sci-Fi"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,126,Drama


In [8]:
# I will use the last 10 completed years, so 2012 - 2021 
YEARS_TO_GET = [*range(2012, 2022, 1)]
errors = [ ]


In [9]:
# Start of OUTER loop
for YEAR in tqdm_notebook(YEARS_TO_GET, desc='YEARS', position=0):
    #Defining the JSON file to store results for year
    JSON_FILE = f'{FOLDER}tmdb_api_results_{YEAR}.json'
    # Check if file exists
    file_exists = os.path.isfile(JSON_FILE)
    # If it does not exist: create it
    if file_exists == False:
        # save an empty dict with just "imdb_id" to the new json file.
        with open(JSON_FILE,'w') as f:
            json.dump([{'imdb_id':0}],f)
    #Saving new year as the current df
    df = basics.loc[ basics['startYear']==YEAR].copy()
    # saving movie ids to list
    movie_ids = df['tconst'].copy()
    # Load existing data from json into a dataframe called "previous_df"
    previous_df = pd.read_json(JSON_FILE)
    # filter out any ids that are already in the JSON_FILE
    movie_ids_to_get = movie_ids[~movie_ids.isin(previous_df['imdb_id'])]
        #Get index and movie id from list
    # INNER Loop
    for movie_id in tqdm_notebook(movie_ids_to_get,
                                  desc=f'Movies from {YEAR}',
                                  position=1,
                                  leave=True):
        try:
            # Retrieve then data for the movie id
            temp = get_movies_cert(movie_id)  
            # Append/extend results to existing file using a pre-made function
            write_json(temp,JSON_FILE)
            # Short 20 ms sleep to prevent overwhelming server
            time.sleep(0.02)
            
        except Exception as e:
            errors.append([movie_id, e])
    final_year_df = pd.read_json(JSON_FILE)
    final_year_df.to_csv(f"{FOLDER}final_tmdb_data_{YEAR}.csv.gz", compression="gzip", index=False)

print(f"- Total errors: {len(errors)}")






    











YEARS:   0%|          | 0/3 [00:00<?, ?it/s]

Movies from 2019:   0%|          | 0/4721 [00:00<?, ?it/s]

Movies from 2020:   0%|          | 0/4868 [00:00<?, ?it/s]

Movies from 2021:   0%|          | 0/4841 [00:00<?, ?it/s]

- Total errors: 3153


Concatenate dfs and organize data

In [2]:
#from https://stackoverflow.com/questions/20906474/import-multiple-csv-files-into-pandas-and-concatenate-into-one-dataframe

from pathlib import Path

path =  'big_data/' 

# Get the files from the path provided in the OP
files = Path(path).glob('*.csv.gz')  



In [3]:
dfs = list()
for f in files:
    data = pd.read_csv(f)
    # .stem is method for pathlib objects to get the filename w/o the extension
    data['file'] = f.stem
    dfs.append(data)

dfs

[        imdb_id  adult                     backdrop_path  \
 0             0    NaN                               NaN   
 1     tt0249516    0.0  /t7zb6CnRQwhzQSq0apR4ESFYiWN.jpg   
 2     tt0285252    0.0  /pKOHinAVIkyqLliMaTHCSL1T4zU.jpg   
 3     tt0293069    0.0  /cj2isIuskSePAwpb2D6yUBZrhkJ.jpg   
 4     tt0337692    0.0  /5dUOTVeNPU2CmEfFniQ8TE6HChG.jpg   
 ...         ...    ...                               ...   
 3209  tt7547032    0.0                               NaN   
 3210  tt7867026    0.0  /4H3gxcDyFbyD3syAHcfGfu42zJ4.jpg   
 3211  tt8675644    0.0  /bkPwmPPVxSaT4lUztUOccHgWFi2.jpg   
 3212  tt8698020    0.0                               NaN   
 3213  tt9033128    0.0                               NaN   
 
                                   belongs_to_collection      budget  \
 0                                                   NaN         NaN   
 1                                                   NaN  65000000.0   
 2                                                

In [4]:
df = pd.concat(dfs, ignore_index=True)
df.head(3)

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification,file
0,0,,,,,,,,,,...,,,,,,,,,,final_tmdb_data_2012.csv
1,tt0249516,0.0,/t7zb6CnRQwhzQSq0apR4ESFYiWN.jpg,,65000000.0,"[{'id': 28, 'name': 'Action'}, {'id': 16, 'nam...",http://www.thresholdanimationstudios.com/video...,116977.0,en,Foodfight!,...,87.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,,Foodfight!,0.0,1.856,108.0,PG-13,final_tmdb_data_2012.csv
2,tt0285252,0.0,/pKOHinAVIkyqLliMaTHCSL1T4zU.jpg,,4000000.0,"[{'id': 35, 'name': 'Comedy'}]",,112074.0,pl,Life's a Beach,...,100.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,,Life's a Beach,0.0,2.5,5.0,R,final_tmdb_data_2012.csv


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39156 entries, 0 to 39155
Data columns (total 27 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_id                39156 non-null  object 
 1   adult                  39146 non-null  float64
 2   backdrop_path          26759 non-null  object 
 3   belongs_to_collection  2318 non-null   object 
 4   budget                 39146 non-null  float64
 5   genres                 39146 non-null  object 
 6   homepage               10512 non-null  object 
 7   id                     39146 non-null  float64
 8   original_language      39146 non-null  object 
 9   original_title         39146 non-null  object 
 10  overview               38353 non-null  object 
 11  popularity             39146 non-null  float64
 12  poster_path            36751 non-null  object 
 13  production_companies   39146 non-null  object 
 14  production_countries   39146 non-null  object 
 15  re

In [6]:
#adding files from 2012 to 2022
final_df = df[['imdb_id','budget', 'revenue', 'certification', 'runtime','genres']]
final_df.head()

Unnamed: 0,imdb_id,budget,revenue,certification,runtime,genres
0,0,,,,,
1,tt0249516,65000000.0,73706.0,PG-13,87.0,"[{'id': 28, 'name': 'Action'}, {'id': 16, 'nam..."
2,tt0285252,4000000.0,0.0,R,100.0,"[{'id': 35, 'name': 'Comedy'}]"
3,tt0293069,0.0,0.0,,86.0,"[{'id': 53, 'name': 'Thriller'}]"
4,tt0337692,25000000.0,8784318.0,R,137.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 18, '..."


In [7]:
final_df.shape

(39156, 6)

In [8]:
final_df.to_csv('big_data/combined_years.csv.gz', compression="gzip", index=False)