In [2]:
import pandas as pd
from tqdm.notebook import tqdm_notebook
import tmdbsimple as tmdb
import json, os, time

# Functions

In [3]:
# Helper function to retrieve movie info from tmdb with certification info
def get_movie_with_rating(movie_id):
    movie = tmdb.Movies(movie_id)
    info = movie.info()
    releases = movie.releases()

    for c in releases['countries']:
        if c['iso_3166_1'] == 'US':
            info['certification'] = c['certification']
    return info

In [4]:
def create_json_file(JSON_FILE, delete_if_exists=True):
    file_exists = os.path.isfile(JSON_FILE)
    
    if file_exists:
        
        if delete_if_exists:
            print(f'{JSON_FILE} exists, deleting file')
            os.remove(JSON_FILE)
            create_json_file(JSON_FILE)
        else:
            print(f'{JSON_FILE} already exists')
    
    else:
        print(f'{JSON_FILE} does not exists, creating new file as empty list')
        folder = os.path.dirname(JSON_FILE)
        if len(folder) > 0:
            os.makedirs(folder, exist_ok=True)
        with open(JSON_FILE, 'w') as f:
            json.dump([],f)

In [5]:
def write_json(new_data, filename): 
    """Appends a list of records (new_data) to a json file (filename). 
    Adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/"""  
    
    with open(filename,'r+') as file:
        # First we load existing data into a dict.
        file_data = json.load(file)
        ## Choose extend or append
        if (type(new_data) == list) & (type(file_data) == list):
            file_data.extend(new_data)
        else:
             file_data.append(new_data)
        # Sets file's current position at offset.
        file.seek(0)
        # convert back to json.
        json.dump(file_data, file)

# Create temporary download files

In [6]:
# create save file
progress_file_2000 = 'Data/tmdb_in_progress_2000.json'
create_json_file(progress_file_2000, delete_if_exists=True)
progress_file_2001 = 'Data/tmdb_in_progress_2001.json'
create_json_file(progress_file_2001, delete_if_exists=True)

Data/tmdb_in_progress_2000.json exists, deleting file
Data/tmdb_in_progress_2000.json does not exists, creating new file as empty list
Data/tmdb_in_progress_2001.json exists, deleting file
Data/tmdb_in_progress_2001.json does not exists, creating new file as empty list


# Retrieve Movie Information

In [7]:
# retrieve api-key

with open('/Users/caell/.secret/tmdb_api.json') as f:
    tmdb.API_KEY = json.load(f)['api-key']


In [8]:
movies = pd.read_csv('Data/title_basics.csv.gz')
movies.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,,126,Drama


In [9]:
# Outer loop: iterate over years 2000, 2001
errors = []
years = (2000, 2001)
for year in tqdm_notebook(years, desc='Years', position=0):
    # Filter movies by year
    year_movies = movies[movies['startYear'] == year].copy()
    
    # Set file to save progress to
    fname = f'Data/tmdb_in_progress_{year}.json'

    # Load previous progress
    previous_df = pd.read_json(fname)
    # Remove duplicates
    try:
        year_movies = year_movies[~year_movies['tconst'].isin(previous_df['imdb_id'])]
    except:
        pass
    # extract ids to retrieve
    ids_to_get = year_movies['tconst']
    
    # inner loop: iterate over movies in year
    for movie in tqdm_notebook(ids_to_get,
                               desc=f'movies from {year}',
                               position=1,
                               leave=True):
        
        # Retrieve and save next movie
        try:
            new_movie = get_movie_with_rating(movie)
            write_json(new_movie, fname)
            time.sleep(.02)
            
        except Exception as e:
            errors.append([movie, e])
            



errors

Years:   0%|          | 0/2 [00:00<?, ?it/s]

movies from 2000:   0%|          | 0/1412 [00:00<?, ?it/s]

movies from 2001:   0%|          | 0/1530 [00:00<?, ?it/s]

[['tt0115937',
  requests.exceptions.HTTPError('404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/tt0115937?api_key=6290f3166491331b1bebf8f61d8a0edf')],
 ['tt0116628',
  requests.exceptions.HTTPError('404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/tt0116628?api_key=6290f3166491331b1bebf8f61d8a0edf')],
 ['tt0118710',
  requests.exceptions.HTTPError('404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/tt0118710?api_key=6290f3166491331b1bebf8f61d8a0edf')],
 ['tt0119966',
  requests.exceptions.HTTPError('404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/tt0119966?api_key=6290f3166491331b1bebf8f61d8a0edf')],
 ['tt0137698',
  requests.exceptions.HTTPError('404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/tt0137698?api_key=6290f3166491331b1bebf8f61d8a0edf')],
 ['tt0139159',
  requests.exceptions.HTTPError('404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/tt013

In [10]:
len(errors)

441

# Convert Movie Info to DataFrames and save them as CSV files

In [11]:
# Transform movie year dictionaries into dataframes
year_2000_movies = pd.read_json('Data/tmdb_in_progress_2000.json')
year_2001_movies = pd.read_json('Data/tmdb_in_progress_2001.json')
year_2001_movies.head()

Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,False,,,0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",,240697,tt0266629,en,Hollywood Sex Fantasy,...,0,99,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Some women will do anything to score with a Ho...,Hollywood Sex Fantasy,False,5.225,20,R
1,False,/xtq2MYV3hJjdG23Wi6YnCXkvr4X.jpg,,0,"[{'id': 10749, 'name': 'Romance'}, {'id': 18, ...",,296854,tt0266824,en,The Painting,...,0,95,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Two worlds. One love. Turbulent times.,The Painting,False,7.3,3,
2,False,/zzFTSEAZcLGSbGipQVflSNUqpij.jpg,"{'id': 90863, 'name': 'Rush Hour Collection', ...",90000000,"[{'id': 28, 'name': 'Action'}, {'id': 35, 'nam...",,5175,tt0266915,en,Rush Hour 2,...,347325802,90,"[{'english_name': 'Cantonese', 'iso_639_1': 'c...",Released,Get ready for a second Rush!,Rush Hour 2,False,6.688,3382,PG-13
3,False,/zSAtGMQPOY05hHeIdVEeX72jfz3.jpg,,115000000,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",,1535,tt0266987,en,Spy Game,...,143049560,126,"[{'english_name': 'Cantonese', 'iso_639_1': 'c...",Released,It's not how you play the game. It's how the g...,Spy Game,False,6.919,1763,R
4,False,/qzYeWgYPZNALHXqYlyoK8n3m7kh.jpg,,9360460,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,56625,tt0267044,fr,Barnie et ses petites contrariétés,...,0,80,"[{'english_name': 'French', 'iso_639_1': 'fr',...",Released,,Barnie's Minor Annoyances,False,5.3,21,


In [12]:
# Save each year as separate json files
year_2000_movies.to_csv('Data/year_2000_movies.csv.gz', 
                        compression='gzip', 
                        index=False)
year_2001_movies.to_csv('Data/year_2001_movies.csv.gz', 
                        compression='gzip', 
                        index=False)

# Concatenate Year DataFrames and Check Some Basic Stats

In [13]:
years_movies = pd.concat([year_2000_movies, year_2001_movies])
years_movies.to_csv('Data/tmdb_results_combined.csv.gz', 
                    compression='gzip', 
                    index=False)
years_movies.shape

(691, 26)

In [14]:
years_movies['imdb_id'].duplicated().sum()

0

In [15]:
years_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 691 entries, 0 to 690
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  691 non-null    bool   
 1   backdrop_path          309 non-null    object 
 2   belongs_to_collection  45 non-null     object 
 3   budget                 691 non-null    int64  
 4   genres                 691 non-null    object 
 5   homepage               691 non-null    object 
 6   id                     691 non-null    int64  
 7   imdb_id                691 non-null    object 
 8   original_language      691 non-null    object 
 9   original_title         691 non-null    object 
 10  overview               691 non-null    object 
 11  popularity             691 non-null    float64
 12  poster_path            596 non-null    object 
 13  production_companies   691 non-null    object 
 14  production_countries   691 non-null    object 
 15  releas

# Answer EDA Questions

### How many movies have SOME financial information?

In [16]:
financial = (years_movies['budget'] > 0) | (years_movies['revenue'] > 0)
print(f'{len(years_movies[financial])} movies have financial information')

105 movies have financial information


### How many movies are in each certification category?

In [17]:
print(f'The number of movies in each certification category are:')
years_movies["certification"].value_counts()

The number of movies in each certification category are:


         236
R         62
NR        19
PG-13     18
PG        13
G          6
NC-17      2
-          1
Name: certification, dtype: int64

### What is the average revenue per certification category

In [18]:
cert_groups = years_movies.groupby('certification').mean()['revenue']
print('The average revenue for each certification category is:')
cert_groups

The average revenue for each certification category is:


  cert_groups = years_movies.groupby('certification').mean()['revenue']


certification
         4.587570e+05
-        0.000000e+00
G        1.659515e+07
NC-17    0.000000e+00
NR       1.052632e+06
PG       1.020267e+07
PG-13    4.906197e+07
R        7.839085e+06
Name: revenue, dtype: float64

### What is the average budget by certification category

In [19]:
cert_groups = years_movies.groupby('certification').mean()['budget']
print('The average budget for each certification category is:')
cert_groups

The average budget for each certification category is:


  cert_groups = years_movies.groupby('certification').mean()['budget']


certification
         4.116178e+05
-        0.000000e+00
G        8.333333e+06
NC-17    0.000000e+00
NR       6.210526e+05
PG       9.807693e+06
PG-13    1.729488e+07
R        5.065475e+06
Name: budget, dtype: float64