In [25]:
import pandas as pd
from tqdm.notebook import tqdm_notebook
import tmdbsimple as tmdb
import json, os, time

# Functions

In [5]:
# Helper function to retrieve movie info from tmdb with certification info
def get_movie_with_rating(movie_id):
    """Fetch a movie's TMDB info dict and attach its US certification.

    Calls ``info()`` and then ``releases()`` on ``tmdb.Movies(movie_id)``.
    Every entry in ``releases['countries']`` whose ``iso_3166_1`` is ``'US'``
    is considered; when there are several, the last one supplies the value.
    If there is no US entry, the returned dict has no ``'certification'`` key.

    Args:
        movie_id: Identifier passed straight to ``tmdb.Movies``.

    Returns:
        The dict returned by ``info()``, possibly extended with
        ``'certification'``.
    """
    tmdb_movie = tmdb.Movies(movie_id)
    details = tmdb_movie.info()
    release_data = tmdb_movie.releases()

    us_certifications = [
        entry['certification']
        for entry in release_data['countries']
        if entry['iso_3166_1'] == 'US'
    ]
    if us_certifications:
        # Later US entries take precedence over earlier ones.
        details['certification'] = us_certifications[-1]
    return details

In [78]:
def create_json_file(JSON_FILE, delete_if_exists=True):
    """Make sure JSON_FILE exists, initialised as an empty JSON list.

    Args:
        JSON_FILE: Path of the JSON file to create.
        delete_if_exists: When the file is already present, True removes it
            and writes a fresh empty list; False leaves it untouched.

    Returns:
        None. A status message is printed for each step taken.
    """
    if os.path.isfile(JSON_FILE):
        if not delete_if_exists:
            print(f'{JSON_FILE} already exists')
            return
        print(f'{JSON_FILE} exists, deleting file')
        os.remove(JSON_FILE)

    # Reached when the file was never there or has just been removed.
    print(f'{JSON_FILE} does not exists, creating new file as empty list')
    parent_dir = os.path.dirname(JSON_FILE)
    if parent_dir:
        # Create any missing parent folders before writing.
        os.makedirs(parent_dir, exist_ok=True)
    with open(JSON_FILE, 'w') as handle:
        json.dump([], handle)

In [76]:
def write_json(new_data, filename):
    """Add new_data to the JSON list stored in filename.

    The file is read in full, updated in memory and rewritten in place.
    Adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/

    Args:
        new_data: A list of records (each element is added individually) or a
            single record (added as one element).
        filename: Path to an existing JSON file whose top-level value is a list.

    Raises:
        FileNotFoundError: If filename does not exist ('r+' never creates it).
        json.JSONDecodeError: If the existing content is not valid JSON.
        AttributeError: If the stored top-level value has no ``append``
            (for example, a JSON object).
    """
    with open(filename, 'r+') as file:
        # Load the existing content (expected to be a list of records).
        file_data = json.load(file)
        # Extend when both sides are lists, otherwise append one record.
        if isinstance(new_data, list) and isinstance(file_data, list):
            file_data.extend(new_data)
        else:
            file_data.append(new_data)
        # Rewind so the updated content overwrites the old content.
        file.seek(0)
        json.dump(file_data, file)
        # Drop leftover bytes when the new serialization is shorter than the
        # old content (e.g. the file was pretty-printed); without this the
        # stale tail would leave the file as invalid JSON.
        file.truncate()

# Create temporary download files

In [79]:
# Create one progress file per year. delete_if_exists=True removes any
# existing file first, so each run of this cell starts from an empty list.
progress_file_2000 = 'Data/tmdb_in_progress_2000.json'
progress_file_2001 = 'Data/tmdb_in_progress_2001.json'
for progress_file in (progress_file_2000, progress_file_2001):
    create_json_file(progress_file, delete_if_exists=True)

Data/tmdb_in_progress_2000.json exists, deleting file
Data/tmdb_in_progress_2000.json does not exists, creating new file as empty list
Data/tmdb_in_progress_2001.json exists, deleting file
Data/tmdb_in_progress_2001.json does not exists, creating new file as empty list


# Retrieve Movie Information

In [36]:
# Retrieve the TMDB API key from a local JSON secrets file.
# The file location can be overridden with the TMDB_API_KEY_FILE environment
# variable so the notebook is not tied to one user's home directory; when the
# variable is unset, the original path is used.
api_key_path = os.environ.get('TMDB_API_KEY_FILE',
                              '/Users/caell/.secret/tmdb_api.json')
with open(api_key_path) as f:
    # The secrets file is a JSON object holding the key under 'api-key'.
    tmdb.API_KEY = json.load(f)['api-key']


In [82]:
# Outer loop: iterate over years 2000, 2001
errors = []
years = (2000, 2001)
for year in tqdm_notebook(years, desc='Years', position=0):
    # Filter movies by year.
    # NOTE(review): `movies` is not defined in any cell visible in this
    # notebook; it must be loaded (with 'startYear' and 'tconst' columns)
    # before this cell can run on a fresh kernel.
    year_movies = movies[movies['startYear'] == year].copy()

    # Set file to save progress to
    fname = f'Data/tmdb_in_progress_{year}.json'

    # Load previous progress
    previous_df = pd.read_json(fname)
    # Skip ids that were already downloaded. A freshly created progress file
    # holds an empty list, in which case there is no 'imdb_id' column to
    # compare against and every id still has to be retrieved. Checking for the
    # column explicitly avoids a bare `except` that would hide other errors.
    if 'imdb_id' in previous_df.columns:
        year_movies = year_movies[~year_movies['tconst'].isin(previous_df['imdb_id'])]
    # extract ids to retrieve
    ids_to_get = year_movies['tconst']

    # inner loop: iterate over movies in year
    for movie in tqdm_notebook(ids_to_get,
                               desc=f'movies from {year}',
                               position=1,
                               leave=True):

        # Retrieve and save next movie
        try:
            new_movie = get_movie_with_rating(movie)
            write_json(new_movie, fname)
            # Brief pause between requests.
            time.sleep(.02)

        except Exception as e:
            # Record the failing id and keep going with the remaining movies.
            errors.append([movie, e])

# NOTE(review): the recorded HTTPError messages embed the full request URL,
# including the api_key query parameter, so displaying `errors` writes the key
# into the saved notebook output.
errors

Years:   0%|          | 0/2 [00:00<?, ?it/s]

movies from 2000:   0%|          | 0/1404 [00:00<?, ?it/s]

movies from 2001:   0%|          | 0/1520 [00:00<?, ?it/s]

[['tt0115937',
  requests.exceptions.HTTPError('404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/tt0115937?api_key=6290f3166491331b1bebf8f61d8a0edf')],
 ['tt0116628',
  requests.exceptions.HTTPError('404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/tt0116628?api_key=6290f3166491331b1bebf8f61d8a0edf')],
 ['tt0118710',
  requests.exceptions.HTTPError('404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/tt0118710?api_key=6290f3166491331b1bebf8f61d8a0edf')],
 ['tt0119966',
  requests.exceptions.HTTPError('404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/tt0119966?api_key=6290f3166491331b1bebf8f61d8a0edf')],
 ['tt0137698',
  requests.exceptions.HTTPError('404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/tt0137698?api_key=6290f3166491331b1bebf8f61d8a0edf')],
 ['tt0139159',
  requests.exceptions.HTTPError('404 Client Error: Not Found for url: https://api.themoviedb.org/3/movie/tt013

In [83]:
# Number of movie ids recorded in `errors` by the download loop
len(errors)

444

# Convert Movie Info to DataFrames and save them as CSV files

In [84]:
# Read each year's progress file (a JSON list of movie records) into its own
# DataFrame, then preview the 2001 frame.
year_2000_movies, year_2001_movies = map(
    pd.read_json,
    ('Data/tmdb_in_progress_2000.json', 'Data/tmdb_in_progress_2001.json'),
)
year_2001_movies.head()

Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,False,/ab5yL8zgRotrICzGbEl10z24N71.jpg,,48000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 14, 'nam...",,11232,tt0035423,en,Kate & Leopold,...,76019048,118,"[{'english_name': 'Italian', 'iso_639_1': 'it'...",Released,If they lived in the same century they'd be pe...,Kate & Leopold,False,6.322,1148,PG-13
1,False,,,0,"[{'id': 53, 'name': 'Thriller'}, {'id': 28, 'n...",,151007,tt0114447,en,The Silent Force,...,0,90,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,They left him for dead... They should have fin...,The Silent Force,False,5.0,3,
2,False,/9NZAirJahVilTiDNCHLFcdkwkiy.jpg,,22000000,"[{'id': 18, 'name': 'Drama'}, {'id': 10402, 'n...",,10696,tt0118589,en,Glitter,...,5271666,104,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,"In music she found her dream, her love, herself.",Glitter,False,4.6,118,PG-13
3,False,/mWxJEFRMvkG4UItYJkRDMgWQ08Y.jpg,,1000000,"[{'id': 27, 'name': 'Horror'}, {'id': 9648, 'n...",,17140,tt0118652,en,The Attic Expeditions,...,0,100,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,His search for peace of mind... will leave his...,The Attic Expeditions,False,5.1,28,R
4,False,/7xrlSPGDO4CDT6IHTctDlkYxTzw.jpg,,0,"[{'id': 18, 'name': 'Drama'}]",,37857,tt0119004,en,Don's Plum,...,6297,89,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,,Don's Plum,False,5.4,63,


In [85]:
# Save each year's DataFrame as its own gzip-compressed CSV file
# (the row index is not written).
for year_df, csv_path in (
    (year_2000_movies, 'Data/year_2000_movies.csv.gz'),
    (year_2001_movies, 'Data/year_2001_movies.csv.gz'),
):
    year_df.to_csv(csv_path, compression='gzip', index=False)

# Concatenate Year DataFrames and Check Some Basic Stats

In [105]:
# Stack the two yearly frames row-wise (each frame keeps its own index
# labels), write the combined result as a gzip-compressed CSV, and show its
# shape.
years_movies = pd.concat(objs=[year_2000_movies, year_2001_movies], axis=0)
years_movies.to_csv(
    'Data/tmdb_results_combined.csv.gz',
    index=False,
    compression='gzip',
)
years_movies.shape

(2498, 26)

In [103]:
# Count repeated imdb_id values in the combined frame (0 means every id is unique)
years_movies['imdb_id'].duplicated().sum()

0

In [106]:
# Summarize the combined frame: column names, non-null counts and dtypes
years_movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2498 entries, 0 to 1289
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  2498 non-null   bool   
 1   backdrop_path          1309 non-null   object 
 2   belongs_to_collection  197 non-null    object 
 3   budget                 2498 non-null   int64  
 4   genres                 2498 non-null   object 
 5   homepage               2498 non-null   object 
 6   id                     2498 non-null   int64  
 7   imdb_id                2498 non-null   object 
 8   original_language      2498 non-null   object 
 9   original_title         2498 non-null   object 
 10  overview               2498 non-null   object 
 11  popularity             2498 non-null   float64
 12  poster_path            2215 non-null   object 
 13  production_companies   2498 non-null   object 
 14  production_countries   2498 non-null   object 
 15  rele

# Answer EDA Questions

### How many movies have SOME financial information?

In [107]:
# A movie counts as having financial information when either its budget or
# its revenue is greater than zero.
has_budget = years_movies['budget'].gt(0)
has_revenue = years_movies['revenue'].gt(0)
financial = has_budget | has_revenue
print(f'{financial.sum()} movies have financial information')

623 movies have financial information


### How many movies are in each certification category?

In [108]:
# Tally movies per certification value. Missing values are excluded
# (dropna=True), while blank-string certifications form their own category.
print('The number of movies in each certification category are:')
years_movies['certification'].value_counts(dropna=True)

The number of movies in each certification category are:


           858
R          453
PG-13      180
NR          67
PG          64
G           24
NC-17        6
Unrated      1
-            1
Name: certification, dtype: int64

### What is the average revenue per certification category?

In [109]:
# Average revenue per certification category.
# The 'revenue' column is selected before aggregating so the mean is computed
# for that column only; calling .mean() on the whole grouped frame also tries
# to average the non-numeric columns, which raises a TypeError on pandas >= 2.0.
# NOTE(review): revenue values of 0 are included in the mean; the earlier
# "financial information" cell treats values > 0 as real data, so zeros may
# denote missing figures and pull these averages down - confirm.
cert_groups = years_movies.groupby('certification')['revenue'].mean()
print('The average revenue for each certification category is:')
cert_groups

The average revenue for each certification category is:


certification
           1.611388e+06
-          0.000000e+00
G          7.216332e+07
NC-17      0.000000e+00
NR         2.255067e+06
PG         6.143535e+07
PG-13      7.186567e+07
R          1.678400e+07
Unrated    0.000000e+00
Name: revenue, dtype: float64

### What is the average budget by certification category?

In [110]:
# Average budget per certification category.
# The 'budget' column is selected before aggregating so the mean is computed
# for that column only; calling .mean() on the whole grouped frame also tries
# to average the non-numeric columns, which raises a TypeError on pandas >= 2.0.
# NOTE(review): budget values of 0 are included in the mean; the earlier
# "financial information" cell treats values > 0 as real data, so zeros may
# denote missing figures and pull these averages down - confirm.
cert_groups = years_movies.groupby('certification')['budget'].mean()
print('The average budget for each certification category is:')
cert_groups

The average budget for each certification category is:


certification
           8.425661e+05
-          0.000000e+00
G          2.383333e+07
NC-17      0.000000e+00
NR         1.598509e+06
PG         2.459766e+07
PG-13      3.098838e+07
R          9.976831e+06
Unrated    0.000000e+00
Name: budget, dtype: float64