# Movie Project
- Clint Atterberry

**Business Problem**

*For this project, you have been hired to produce a MySQL database on Movies from a subset of IMDB's publicly available dataset. Ultimately, you will use this database to analyze what makes a movie successful, and will provide recommendations to the stakeholder on how to make a successful movie.*

# Setup

## Import

In [47]:
# Standard Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Additional Imports
import os, json, math, time
import tmdbsimple as tmdb
from tqdm.notebook import tqdm_notebook

In [48]:
# Install tmdbsimple (only need to run once)
!pip install tmdbsimple



## Load API

In [49]:
# load login keys
with open('/Users/oneda/.secret/movie_api.json', 'r') as f:
    login = json.load(f)
## Display the keys of the loaded dict
login.keys()

dict_keys(['client-id', 'api-key'])

In [50]:
# login with API-Key
tmdb.API_KEY =  login['api-key']

## Load Previous Data

In [51]:
# Load previous data
FOLDER = "Data/"
os.makedirs(FOLDER, exist_ok=True)
os.listdir(FOLDER)

['.ipynb_checkpoints',
 'Basics_Data.tsv',
 'final_tmdb_data_2000.csv.gz',
 'final_tmdb_data_2001.csv.gz',
 'Ratings_Data.csv',
 'titles_akas.csv.gz',
 'title_basics.csv.gz',
 'title_ratings.csv.gz',
 'tmdb_api_results_2000.json',
 'tmdb_api_results_2001.json']

## Open title basics

In [52]:
# Open saved file and preview again
basics = pd.read_csv("Data/title_basics.csv.gz", low_memory = False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
4,tt0094859,movie,Chief Zabu,Chief Zabu,0,2016,,74,Comedy


# Working with The Movie Database

In [53]:
movie = tmdb.Movies(120)

In [54]:
info = movie.info()
info

{'adult': False,
 'backdrop_path': '/tqj7NKj11keFuLzPsBDMUq2dOUO.jpg',
 'belongs_to_collection': {'id': 119,
  'name': 'The Lord of the Rings Collection',
  'poster_path': '/oENY593nKRVL2PnxXsMtlh8izb4.jpg',
  'backdrop_path': '/bccR2CGTWVVSZAG0yqmy3DIvhTX.jpg'},
 'budget': 93000000,
 'genres': [{'id': 12, 'name': 'Adventure'},
  {'id': 14, 'name': 'Fantasy'},
  {'id': 28, 'name': 'Action'}],
 'homepage': 'http://www.lordoftherings.net/',
 'id': 120,
 'imdb_id': 'tt0120737',
 'original_language': 'en',
 'original_title': 'The Lord of the Rings: The Fellowship of the Ring',
 'overview': 'Young hobbit Frodo Baggins, after inheriting a mysterious ring from his uncle Bilbo, must leave his home in order to keep it from falling into the hands of its evil creator. Along the way, a fellowship is formed to protect the ringbearer and make sure that the ring arrives at its final destination: Mt. Doom, the only place where it can be destroyed.',
 'popularity': 217.296,
 'poster_path': '/6oom5QYQ2y

The movie was loaded with no issues.

## Looking at New Info

In [55]:
info['budget']

93000000

In [56]:
info['revenue']

871368364

In [57]:
info['imdb_id']

'tt0120737'

In [58]:
# info['certification'] # produces error

In [59]:
# using IMDB_id
movie = tmdb.Movies('tt1361336')
info = movie.info()
info['budget']

50000000

In [60]:
releases = movie.releases()

In [61]:
movie_info = tmdb.Movies('tt1361336')
# Create Loop to add certification data
for c in movie.countries:
    if c['iso_3166_1'] == 'US':
        print(c['certification'])

PG
PG
PG


In [62]:
# Save to db
for c in releases['countries']:
    if c['iso_3166_1'] == 'US':
        info['certification'] = c['certification']

In [63]:
info

{'adult': False,
 'backdrop_path': '/9ns9463dwOeo1CK1JU2wirL5Yi1.jpg',
 'belongs_to_collection': None,
 'budget': 50000000,
 'genres': [{'id': 35, 'name': 'Comedy'},
  {'id': 10751, 'name': 'Family'},
  {'id': 16, 'name': 'Animation'}],
 'homepage': 'https://www.tomandjerrymovie.com',
 'id': 587807,
 'imdb_id': 'tt1361336',
 'original_language': 'en',
 'original_title': 'Tom & Jerry',
 'overview': 'Tom the cat and Jerry the mouse get kicked out of their home and relocate to a fancy New York hotel, where a scrappy employee named Kayla will lose her job if she can’t evict Jerry before a high-class wedding at the hotel. Her solution? Hiring Tom to get rid of the pesky mouse.',
 'popularity': 172.288,
 'poster_path': '/8XZI9QZ7Pm3fVkigWJPbrXCMzjq.jpg',
 'production_companies': [{'id': 174,
   'logo_path': '/IuAlhI9eVC9Z8UQWOIDdWRKSEJ.png',
   'name': 'Warner Bros. Pictures',
   'origin_country': 'US'},
  {'id': 8922,
   'logo_path': '/yZWehAyjfKi4KvKeg1bkJ1bm5H8.png',
   'name': 'Turner En

Certification was saved to info

## Making a Function for Certification

In [64]:
def get_movie_with_rating(movie_id):
    movie = tmdb.Movies(movie_id)
    info = movie.info()
    releases = movie.releases()
    for c in releases['countries']:
        if c['iso_3166_1'] == 'US':
            info['certification'] = c['certification']
    return info

In [65]:
# Test with The Avengers movie
test = get_movie_with_rating("tt0848228") #put your function name here
test

{'adult': False,
 'backdrop_path': '/nNmJRkg8wWnRmzQDe2FwKbPIsJV.jpg',
 'belongs_to_collection': {'id': 86311,
  'name': 'The Avengers Collection',
  'poster_path': '/yFSIUVTCvgYrpalUktulvk3Gi5Y.jpg',
  'backdrop_path': '/zuW6fOiusv4X9nnW3paHGfXcSll.jpg'},
 'budget': 220000000,
 'genres': [{'id': 878, 'name': 'Science Fiction'},
  {'id': 28, 'name': 'Action'},
  {'id': 12, 'name': 'Adventure'}],
 'homepage': 'https://www.marvel.com/movies/the-avengers',
 'id': 24428,
 'imdb_id': 'tt0848228',
 'original_language': 'en',
 'original_title': 'The Avengers',
 'overview': 'When an unexpected enemy emerges and threatens global safety and security, Nick Fury, director of the international peacekeeping agency known as S.H.I.E.L.D., finds himself in need of a team to pull the world back from the brink of disaster. Spanning the globe, a daring recruitment effort begins!',
 'popularity': 233.53,
 'poster_path': '/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg',
 'production_companies': [{'id': 420,
   'logo_path'

In [66]:
# Test with The Notebook
test = get_movie_with_rating("tt0332280") #put your function name here
test

{'adult': False,
 'backdrop_path': '/qom1SZSENdmHFNZBXbtJAU0WTlC.jpg',
 'belongs_to_collection': None,
 'budget': 29000000,
 'genres': [{'id': 10749, 'name': 'Romance'}, {'id': 18, 'name': 'Drama'}],
 'homepage': 'http://www.newline.com/properties/notebookthe.html',
 'id': 11036,
 'imdb_id': 'tt0332280',
 'original_language': 'en',
 'original_title': 'The Notebook',
 'overview': "An epic love story centered around an older man who reads aloud to a woman with Alzheimer's. From a faded notebook, the old man's words bring to life the story about a couple who is separated by World War II, and is then passionately reunited, seven years later, after they have taken different paths.",
 'popularity': 73.519,
 'poster_path': '/rNzQyW4f8B8cQeg7Dgj3n6eT5k9.jpg',
 'production_companies': [{'id': 12,
   'logo_path': '/iaYpEp3LQmb8AfAtmTvpqd4149c.png',
   'name': 'New Line Cinema',
   'origin_country': 'US'},
  {'id': 1565, 'logo_path': None, 'name': 'Avery Pix', 'origin_country': 'US'},
  {'id': 26

Success!

# Creating Loop

In [75]:
# # https://github.com/coding-dojo-data-science/data-enrichment-helper-functions

# def read_and_fix_json(JSON_FILE):
# #     """Attempts to read in json file of records and fixes the final character
# #     to end with a ] if it errors.
    
# #     Args:
# #         JSON_FILE (str): filepath of JSON file
        
# #     Returns:
# #         DataFrame: the corrected data from the bad json file
# #     """
#     try: 
#         previous_df =  pd.read_json(JSON_FILE)
    
#     ## If read_json throws an error
#     except:
        
#         ## manually open the json file
#         with open(JSON_FILE,'r+') as f:
#             ## Read in the file as a STRING
#             bad_json = f.read()
            
#             ## if the final character doesn't match first, select the right bracket
#             first_char = bad_json[0]
#             final_brackets = {'[':']', 
#                            "{":"}"}
#             ## Select expected final brakcet
#             final_char = final_brackets[first_char]
            
#             ## if the last character in file doen't match the first char, add it
#             if bad_json[-1] != final_char:
#                 good_json = bad_json[:-1]
#                 good_json+=final_char
#             else:
#                 raise Exception('ERROR is not due to mismatched final bracket.')
            
#             ## Rewind to start of file and write new good_json to disk
#             f.seek(0)
#             f.write(good_json)
           
#         ## Load the json file again now that its fixed
#         previous_df =  pd.read_json(JSON_FILE)
        
#     return previous_df
	
	
	
	

# def write_json(new_data, filename): 
# #     """Adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/"""    
    
#     with open(filename,'r+') as file:
#         # First we load existing data into a dict.
#         file_data = json.load(file)
#         ## Choose extend or append
#         if (type(new_data) == list) & (type(file_data) == list):
#             file_data.extend(new_data)
#         else:
#              file_data.append(new_data)
#         # Sets file's current position at offset.
#         file.seek(0)
#         # convert back to json.
#         json.dump(file_data, file)

In [68]:
def write_json(new_data, filename): 
#Adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/

    with open(filename,'r+') as file:
        # First we load existing data into a dict.
        file_data = json.load(file)
        ## Choose extend or append
        if (type(new_data) == list) & (type(file_data) == list):
            file_data.extend(new_data)
        else:
             file_data.append(new_data)
        # Sets file's current position at offset.
        file.seek(0)
        # convert back to json.
        json.dump(file_data, file)

In [69]:
# problem wants the years 2000 and 2001
YEARS_TO_GET = ['2000', '2001']

In [70]:
# Load in the dataframe from project part 1 as basics:
basics = pd.read_csv('Data/title_basics.csv.gz', low_memory = False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
4,tt0094859,movie,Chief Zabu,Chief Zabu,0,2016,,74,Comedy


In [83]:
# Start of OUTER loop
for YEAR in tqdm_notebook(YEARS_TO_GET,desc='YEARS',position=0):
    JSON_FILE = f'{FOLDER}tmdb_api_results_{YEAR}.json'
    file_exists = os.path.isfile(JSON_FILE)# If it does not exist: create it

    if file_exists == False:
    # save an empty dict with just "imdb_id" to the new json file.
        with open(JSON_FILE,'w') as f:
            json.dump([{'imdb_id':0}],f)

        basics = pd.read_csv('Data/title_basics.csv.gz', low_memory = False)

        df = basics.loc[basics['startYear']==YEAR].copy()
        movie_ids = df['tconst'].copy()
        prev_df = pd.read_json(JSON_FILE)

        movie_ids_to_get = movie_ids[~movie_ids.isin(prev_df['imdb_id'])]

    # INNER LOOP
    for movie_id in tqdm_notebook(movie_ids_to_get, desc=f"Movies from {YEAR}", position=1, leave=True):
        try:
            temp = get_movie_with_rating(movie_id)
            write_json(temp, JSON_FILE)
            time.sleep(0.04)

        except Exception as e:
            continue

final_year_df = pd.read_json(JSON_FILE)
final_year_df.to_csv(f"{FOLDER}final_tmdb_data_{YEAR}.csv.gz", compression='gzip',
                     index=False)

YEARS:   0%|          | 0/2 [00:00<?, ?it/s]

Movies from 2000: 0it [00:00, ?it/s]

Movies from 2001: 0it [00:00, ?it/s]

In [81]:
read_and_fix_json(JSON_FILE)

Unnamed: 0,imdb_id
0,0
