In [275]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import random
import time

In [300]:
# Scrape Box Office Mojo once per year (2010-2019). Each value stored below is
# the tuple returned by scrape_movie_list_opening_weekend:
# (True, table rows) on success, (False, reason) on failure.

all_movies_html = {}

for year in range(2010, 2020):
    scrape_result = scrape_movie_list_opening_weekend(year)
    all_movies_html[year] = scrape_result

    # Random pause between 0.5 and 2.5 seconds before the next request.
    pause_seconds = .5 + 2 * random.random()
    time.sleep(pause_seconds)

In [325]:
# Turn the raw table rows scraped for each year into per-movie dictionaries.
#   all_movies_clean[year] -> list of [movie_dict] for rows that parsed cleanly
#   all_errors_clean[year] -> list of [partial_dict] for rows that raised; each
#                             partial dict also gets 'Counter' (1-based position
#                             among data rows) and 'Error Message' (the exception)
all_movies_clean = dict()
all_errors_clean = dict()

# NOTE: the loop variable must stay named `year` -- HTML_table_row_to_dict reads
# the notebook-level `year` when it builds the 'Release Date' field.
for year in range(2010,2020,1):

    scrape_ok, scrape_payload = all_movies_html[year]

    if not scrape_ok:
        # Previously a failed scrape was dropped silently, so the year was simply
        # missing from both dictionaries. Surface the reason instead.
        print(f"Skipping {year}: scrape failed ({scrape_payload})")
        continue

    movies_log = list()
    error_log = list()
    counter = 1

    for table_row in scrape_payload:

        # Rows without any <td> cell carry no movie data.
        if not table_row.find('td'):
            continue

        table_row_data = table_row.find_all('td')
        movie_info_clean = HTML_table_row_to_dict(table_row_data)

        if isinstance(movie_info_clean, dict):
            movies_log.append([movie_info_clean])
        else:
            # Failure: movie_info_clean is (exception, partially filled dict).
            movie_info_clean[1]['Counter'] = counter
            movie_info_clean[1]['Error Message'] = movie_info_clean[0]
            error_log.append([movie_info_clean[1]])
        counter += 1

    all_movies_clean[year] = movies_log
    all_errors_clean[year] = error_log


In [327]:
# Report how many rows failed to parse for each year.
for error_year, year_errors in all_errors_clean.items():
    error_count = len(year_errors)
    print(f"{error_year}: {error_count}")
print("----")
# for movie_year, year_movies in all_movies_clean.items():
#     print(f"{movie_year}: {len(year_movies)}")


print("---")
# Dump the raw error records collected for 2019.
errors_2019 = all_errors_clean[2019]
print(errors_2019)

2010: 25
2011: 43
2012: 43
2013: 46
2014: 25
2015: 37
2016: 32
2017: 35
2018: 36
2019: 36
----
---


IndexError: list index out of range

In [305]:
def scrape_movie_list_opening_weekend(year, timeout=10):
    """
    Scrapes the list of movies present in a table format in Box Office Mojo
    Checks if the required table data was retrieved from the website
    Returns the parsed HTML table rows

    Input: year --> year inserted into the Box Office Mojo URL
           timeout --> seconds passed to requests.get as its timeout (default 10).
                       Without it a stalled connection could block indefinitely.
    Output: Tuple whose first element is True or False.
            if True, second element is the table rows (from the second <tr> onward)
            if False, second element is an error message / reason for the failure
            The function never raises: every exception is converted to a False tuple.
    """

    url = "https://www.boxofficemojo.com/year/{}/?sort=openingWeekendGross&grossesOption=totalGrosses"

    try:
        response = requests.get(url.format(year), timeout=timeout)

        # Anything other than 200 is reported back together with the status code.
        if response.status_code != 200:
            return (False, "For year {} Got status code: {}".format(year,response.status_code))

        soup = BeautifulSoup(response.text, 'html5lib')

        # Locate the rows once, then confirm the second row has a movie title in
        # its second cell; that is the check that the page layout is as expected.
        try:
            table_rows = soup.find('div', id='table').find('tbody').find_all('tr')
            movie_title = table_rows[1].find_all('td')[1].text
        except Exception as error_message:
            return (False, "Retrieving table/movie title gave following exception: {}".format(error_message))

        if movie_title:
            return (True, table_rows[1:])
        return (False, "No movie title in second <tr> row. Different HTML formatting")

    except Exception as error_message:
        # Covers network errors (including timeouts) and parser failures.
        return (False, "Error Somewhere in entire block with message: {}".format(error_message))

In [323]:
def HTML_table_row_to_dict(table_row_data, release_year=None):
    """
    Takes in table_row_data for a single table row in HTML syntax for Box Office Mojo
    and returns a dictionary with necessary values

    Input: table_row_data --> sequence of the row's <td> cells; indexes 1, 5-10 and 12 are read
           release_year --> year appended to the 'Release Date' text. When None,
                            the notebook-level variable `year` is used instead
                            (the behaviour existing callers rely on).
    Output: dict with the parsed fields when every field could be read;
            otherwise a tuple (exception, partially filled dict) holding the
            fields parsed before the failure.
    """

    temp_dict = dict()
    try:
        # Cell 1 (author's note: class "mojo-field-type-release"): release link and title.
        # NOTE(review): if the href already starts with "/", this yields a double
        # slash after the host -- confirm whether that matters downstream.
        temp_dict['Release Link'] = "https://www.boxofficemojo.com/"+table_row_data[1].find('a').get('href')
        temp_dict['Title'] = table_row_data[1].text

        # Cell 5 (author's note: "mojo-field-type-money mojo-estimatable"): gross.
        # Pass the cell's text, not the cell itself: currency_to_int calls
        # .strip(), which is a string method, so handing it the cell object
        # made it fall into its error path and always return 0.
        temp_dict['Domestic Gross'] = currency_to_int(table_row_data[5].text)

        # Cell 6 (author's note: "mojo-field-type-positive_integer"): max theatres, kept as text.
        temp_dict['Max Theatres'] = table_row_data[6].text.strip().replace(',','')

        # Cell 7 (author's note: "mojo-field-type-money mojo-sort-column"): opening weekend.
        temp_dict['Opening Weekend Gross'] = currency_to_int(table_row_data[7].text)

        # Cell 8 (author's note: "mojo-field-type-percent").
        temp_dict['Domestic to Opening Gross'] = percent_to_float(table_row_data[8].text)

        # Cell 9 (author's note: "mojo-field-type-positive_integer"): opening theatres, raw text.
        temp_dict['Opening Weekend Theatres'] = table_row_data[9].text

        # Cell 10 (author's note: "mojo-field-type-date a-nowrap"): the year is appended to the cell text.
        # The fallback to the global `year` is resolved here, not at the top, so a
        # missing global still leaves the earlier fields in the partial dict.
        year_for_date = release_year if release_year is not None else year
        temp_dict['Release Date'] = table_row_data[10].text + " "+str(year_for_date)

        # Cell 12 (author's note: "mojo-field-type-studio"): studio link (href as-is) and name.
        temp_dict['Studio Link'] = table_row_data[12].find('a').get('href')
        temp_dict['Studio Name'] = table_row_data[12].text.strip()
    except Exception as ex:
        # Callers tell failures apart by the return type (tuple instead of dict).
        return (ex, temp_dict)

    return temp_dict


def currency_to_int(amount):
    """
    Converts a given currency string to an integer

    Input: amount --> '$123,562,324'
           An object exposing a string `.text` attribute (for example a parsed
           table cell) is also accepted; its text is converted.
    Output: Output --> 123562324
            Returns 0 when the value cannot be converted (empty text, a '-'
            placeholder, None, ...).
    """
    try:
        # Plain strings have no `.text`, so they are used as-is.
        text = getattr(amount, 'text', amount)
        # Drop surrounding whitespace/newlines first, then the currency sign.
        return int(text.strip().strip('$ ').replace(',',''))
    except (AttributeError, TypeError, ValueError):
        # Only conversion problems are swallowed; KeyboardInterrupt etc. propagate.
        return 0

def percent_to_float(percentage):
    """
    Converts a given string percentage to float

    Input: 90.4%
    Output: 0.904
            Returns 0.0 when the value cannot be converted (empty text, a '-'
            placeholder, None, ...).
    """

    try:
        # Remove surrounding whitespace before stripping '%' and '-', so that
        # padded text such as ' 90.4% ' still converts.
        return float(percentage.strip().strip('%-'))/100
    except (AttributeError, TypeError, ValueError):
        # Only conversion problems are swallowed; KeyboardInterrupt etc. propagate.
        return 0.0


In [307]:
#NOT SURE IF I WILL NEED THE BELOW FUNCTION as a FUNCTION


def scrape_movies_list(start_year,end_year):
    """
    Scrapes the movies list from box office mojo and returns the relevant info in HTML format

    Input: Start Year --> eg: 2010
           End Year --> eg: 2019 (inclusive)
    Output: Dictionary with one key per year. Each value is the tuple returned by
            scrape_movie_list_opening_weekend for that year:
            (True, table rows) on success or (False, reason) on failure.
            An empty dictionary is returned when start_year > end_year.
    """
    movies_html_by_year = dict()

    for year in range(start_year,end_year+1,1):
        movies_html_by_year[year] = scrape_movie_list_opening_weekend(year)

        # Random 0.5-2.5 second pause between requests; not needed after the last one.
        if year != end_year:
            time.sleep(.5+2*random.random())

    return movies_html_by_year


In [None]:
#Initial Testing Code
# Delete in the end
# NOTE(review): `response` is not assigned in this cell or any visible cell --
# presumably left over from an earlier interactive request; confirm before re-running.

soup = BeautifulSoup(response.text, 'html5lib')
all_tr = soup.find('div', id='table').find('tbody').find_all('tr')
all_tr[1].find_all('td')[1].text

movies_2020 = []
error_log = []
counter = 1

for table_row in all_tr:

    # Rows without any <td> cell are skipped and do not advance the counter.
    if not table_row.find('td'):
        continue

    table_row_data = table_row.find_all('td')
    movie_info_clean = HTML_table_row_to_dict(table_row_data)

    if isinstance(movie_info_clean, dict):
        movies_2020.append([movie_info_clean])
    else:
        # Failure: the helper returned (exception, partially filled dict).
        row_error, partial_info = movie_info_clean
        partial_info['Counter'] = counter
        partial_info['Error Message'] = row_error
        error_log.append([partial_info])

    counter += 1

all_tr[210].find_all('td')