In [1]:
# Import dependencies
import requests
from pprint import pprint
import os
import csv
import pandas as pd
import numpy as np
from itertools import chain

omdb_key = "9b9d69c4"

In [3]:
# Read in csv file & create data frame
top_grossing_csv = 'Output/Top_Grossing_Films.csv'
top_grossing_df = pd.read_csv(top_grossing_csv)
top_grossing_df

Unnamed: 0,Year,Movie ID,Movie Title,Revenue
0,1927,287262,The Battles of the Coronel and Falkland Islands,6241682
1,1928,173689,Lights of New York,1200000
2,1929,65203,The Broadway Melody,4358000
3,1930,166307,Tom Sawyer,11000000
4,1931,3035,Frankenstein,12000000
...,...,...,...,...
88,2015,140607,Star Wars: The Force Awakens,2068223624
89,2016,271110,Captain America: Civil War,1153296293
90,2017,181808,Star Wars: The Last Jedi,1332539889
91,2018,299536,Avengers: Infinity War,2046239637


In [4]:
# Establish base url for API requests
url = f"http://www.omdbapi.com/?apikey={omdb_key}&t="

In [5]:
# Create new columns to append API Requests data to
top_grossing_df['Movie Genre'] = ""
top_grossing_df['Runtime'] = ""
top_grossing_df['Actors'] = ""
top_grossing_df['IMDB Rating'] = ""
top_grossing_df['Rotten Tomatoes Rating'] = ""

top_grossing_df.head()

Unnamed: 0,Year,Movie ID,Movie Title,Revenue,Movie Genre,Runtime,Actors,IMDB Rating,Rotten Tomatoes Rating
0,1927,287262,The Battles of the Coronel and Falkland Islands,6241682,,,,,
1,1928,173689,Lights of New York,1200000,,,,,
2,1929,65203,The Broadway Melody,4358000,,,,,
3,1930,166307,Tom Sawyer,11000000,,,,,
4,1931,3035,Frankenstein,12000000,,,,,


In [6]:
# Cleaning data and replacing false titles of movies
top_grossing_df = top_grossing_df.replace(to_replace = ["La figlia di Frankenstein"], value ="Lady Frankenstein")
top_grossing_df = top_grossing_df.replace(to_replace = ["Harry Potter and the Philosopher's Stone"], value ="Harry Potter and the Sorcerer's Stone")

In [7]:
# Iterate through data frame to make API requests and append to columns in data frame
for index, row in top_grossing_df.iterrows():
    
    query_url = url + (row['Movie Title'])
    
    response_url = requests.get(query_url).json()
    
    try:
        top_grossing_df.loc[index, 'Movie Genre'] = response_url['Genre']
        top_grossing_df.loc[index, 'Actors'] = response_url['Actors']
        top_grossing_df.loc[index, 'Runtime'] = response_url['Runtime']
        top_grossing_df.loc[index, 'IMDB Rating'] = response_url['Ratings'][0]['Value']
        top_grossing_df.loc[index, 'Rotten Tomatoes Rating'] = response_url['Ratings'][1]['Value']
        
    except (KeyError,IndexError):
        print(f"Problem with movie: '{row['Movie Title']}'")

Problem with movie: 'The Battles of the Coronel and Falkland Islands'
Problem with movie: 'Lights of New York'
Problem with movie: 'Tom Sawyer'
Problem with movie: 'Seeds of Destiny'
Problem with movie: 'Spartacus'
Problem with movie: 'Hawaii'
Problem with movie: 'Lady Frankenstein'
Problem with movie: 'To Fly!'
Problem with movie: 'Star Wars: The Force Awakens'
Problem with movie: 'Star Wars: The Last Jedi'


In [8]:
# Verify results from API requests
top_grossing_df.head()

Unnamed: 0,Year,Movie ID,Movie Title,Revenue,Movie Genre,Runtime,Actors,IMDB Rating,Rotten Tomatoes Rating
0,1927,287262,The Battles of the Coronel and Falkland Islands,6241682,,,,,
1,1928,173689,Lights of New York,1200000,"Crime, Drama, Music, Romance",57 min,"Helene Costello, Cullen Landis, Mary Carr, Whe...",5.7/10,
2,1929,65203,The Broadway Melody,4358000,"Drama, Musical, Romance",100 min,"Charles King, Anita Page, Bessie Love",5.7/10,35%
3,1930,166307,Tom Sawyer,11000000,"Adventure, Musical, Family",103 min,"Johnny Whitaker, Celeste Holm, Warren Oates, J...",6.5/10,
4,1931,3035,Frankenstein,12000000,"Drama, Horror, Sci-Fi",70 min,"Colin Clive, Mae Clarke, John Boles, Boris Kar...",7.8/10,100%


In [9]:
# Create a new data frame to cross reference against other data based on actors
actor_list_df = top_grossing_df[['Year','Movie Title', 'Actors']]
actor_list_df.head()

Unnamed: 0,Year,Movie Title,Actors
0,1927,The Battles of the Coronel and Falkland Islands,
1,1928,Lights of New York,"Helene Costello, Cullen Landis, Mary Carr, Whe..."
2,1929,The Broadway Melody,"Charles King, Anita Page, Bessie Love"
3,1930,Tom Sawyer,"Johnny Whitaker, Celeste Holm, Warren Oates, J..."
4,1931,Frankenstein,"Colin Clive, Mae Clarke, John Boles, Boris Kar..."


In [10]:
# Split actors column and duplicate rows so that each actor has own row for cross referencing
def column_splitter(column_to_split):
    return list(chain.from_iterable(column_to_split.str.split(',')))

lengths = actor_list_df['Actors'].str.split(',').map(len)

actor_list_df = pd.DataFrame({
    'Year': np.repeat(actor_list_df['Year'],lengths),
    'Movie Title': np.repeat(actor_list_df['Movie Title'],lengths),
    'Actors': column_splitter(actor_list_df['Actors'])
})

actor_list_df.head()

Unnamed: 0,Year,Movie Title,Actors
0,1927,The Battles of the Coronel and Falkland Islands,
1,1928,Lights of New York,Helene Costello
1,1928,Lights of New York,Cullen Landis
1,1928,Lights of New York,Mary Carr
1,1928,Lights of New York,Wheeler Oakman


In [11]:
# Export data frame to csv file for further analysis
top_grossing_df.to_csv('Final_Grossing_Data.csv', index = False)