In [13]:
import datetime
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
# Request the website and download its HTML contents
movie_url = "https://www.boxofficemojo.com/daily/2024/?view=year"

# Make a request; the timeout keeps a stalled connection from hanging the kernel
req = requests.get(movie_url, timeout=30)
# Stop here on an HTTP error instead of parsing an error page as if it were data
req.raise_for_status()
content = req.text

# Use Beautiful Soup with an explicit parser so the parse tree does not depend
# on which optional parsers happen to be installed
soup = BeautifulSoup(content, "html.parser")


In [4]:
# Look at the cells of one sample table row to decide how to set up the scraper
rows = soup.find_all('tr')
data = rows[46].find_all('td')
data

[<td class="a-text-left mojo-header-column mojo-truncate mojo-field-type-date_interval mojo-sort-column"><a class="a-link-normal" href="/date/2024-03-31/?ref_=bo_di_table_46">Mar 31</a><div class="a-section a-spacing-none"><span class="a-size-small a-color-secondary">Easter Sunday</span></div></td>,
 <td class="a-text-left mojo-field-type-date_interval"><a class="a-link-normal" href="/date/2024-03-31/?ref_=bo_di_table_46">Sunday</a></td>,
 <td class="a-text-right mojo-field-type-date_interval"><a class="a-link-normal" href="/date/2024-03-31/?ref_=bo_di_table_46">91</a></td>,
 <td class="a-text-right mojo-field-type-money">$29,220,346</td>,
 <td class="a-text-right mojo-number-negative mojo-number-delta mojo-field-type-percent_delta mojo-estimatable">-36.7%</td>,
 <td class="a-text-right mojo-number-delta mojo-field-type-percent_delta mojo-estimatable">+10.2%</td>,
 <td class="a-text-right mojo-field-type-positive_integer">42</td>,
 <td class="a-text-left mojo-field-type-release mojo-ce

In [5]:
# Print the text of every cell in the sample row, one cell per line
for cell in data:
    print(cell.text)
    

Mar 31Easter Sunday
Sunday
91
$29,220,346
-36.7%
+10.2%
42
Godzilla x Kong: The New Empire
$16,784,910


In [6]:
# When an occasion is listed, the first cell holds the date in its <a> link
# and the occasion name in its <span>
date_cell = data[0]
print(date_cell.find_all('a')[0].text)
print(date_cell.find_all('span')[0].text)

Mar 31
Easter Sunday


In [7]:
# Number of <span> tags in the first cell of the sample row
len(data[0].find_all('span'))

1

In [8]:
#Test data construction

appended_data = []
# Loop names differ from the notebook-level `data` used by the inspection cells,
# so re-running those cells after this one still shows the sampled row.
for table_row in rows:
    cells = table_row.find_all('td')
    # Skip rows without <td> cells (presumably header rows) and any row too
    # short to hold all nine fields read below, which would raise IndexError.
    if len(cells) < 9:
        continue

    data_row = {}
    occasion_spans = cells[0].find_all('span')
    if len(occasion_spans) > 0:
        # Day with a named occasion: the span holds the occasion, the link holds the date
        data_row['occasion'] = occasion_spans[0].text
        data_row['date'] = cells[0].find_all('a')[0].text
    else:
        # Ordinary day: the whole first cell is the date
        data_row['occasion'] = ""
        data_row['date'] = cells[0].text

    data_row['day_of_week'] = cells[1].text
    data_row['day_number'] = cells[2].text
    data_row['top_10_gross'] = cells[3].text
    data_row['top_10_gross_change_yesterday'] = cells[4].text
    data_row['top_10_gross_change_week'] = cells[5].text
    data_row['number_releases'] = cells[6].text
    data_row['movie_title'] = cells[7].text
    data_row['gross'] = cells[8].text
    appended_data.append(data_row)

# Build the frame once from the list of records, fixing the column order
year_data = pd.DataFrame(appended_data, columns = ['date','occasion', 'day_of_week', 'day_number', 'top_10_gross', 'top_10_gross_change_yesterday', 'top_10_gross_change_week', 'number_releases', 'movie_title', 'gross'])


In [9]:
# Preview the first three rows of the frame built above
year_data.head(3)

Unnamed: 0,date,occasion,day_of_week,day_number,top_10_gross,top_10_gross_change_yesterday,top_10_gross_change_week,number_releases,movie_title,gross
0,May 15,,Wednesday,136,"$6,608,650",-35.7%,+36.2%,20,Kingdom of the Planet of the Apes,"$3,467,083"
1,May 14,,Tuesday,135,"$10,283,507",+30.6%,+34.1%,21,Kingdom of the Planet of the Apes,"$5,992,353"
2,May 13,,Monday,134,"$7,873,794",-71%,+39%,22,Kingdom of the Planet of the Apes,"$4,455,596"


In [22]:
# Create the scrape-and-write function

# Column order of the yearly summary CSV.
_YEAR_SUMMARY_COLUMNS = [
    'date', 'occasion', 'day_of_week', 'day_number', 'top_10_gross',
    'top_10_gross_change_yesterday', 'top_10_gross_change_week',
    'number_releases', 'movie_title', 'gross',
]

# Number of <td> cells a row must have for every field to be readable.
_CELLS_PER_ROW = 9


def _parse_daily_row(cells):
    """Turn the <td> cells of one table row into a column-name -> text dict."""
    # When an occasion is listed, the first cell carries it in a <span> and the
    # date is the text of the cell's <a> link; otherwise the cell text is the date.
    occasion_spans = cells[0].find_all('span')
    if occasion_spans:
        occasion = occasion_spans[0].text
        date = cells[0].find_all('a')[0].text
    else:
        occasion = ""
        date = cells[0].text

    # Cells 1..8 map one-to-one onto the columns after 'date' and 'occasion'.
    remaining = [cell.text for cell in cells[1:_CELLS_PER_ROW]]
    return dict(zip(_YEAR_SUMMARY_COLUMNS, [date, occasion] + remaining))


def scrape_year_summary(year, output_dir='/Users/beek/Desktop/boxofficemojo_scraping/year_summary_datasource', timeout=30):
    """Scrape one year's daily table from Box Office Mojo and save it as CSV.

    Args:
        year: Year inserted into the ``/daily/{year}/?view=year`` URL and into
            the output file name.
        output_dir: Directory that receives ``year_summary_{year}.csv``. It is
            created if it does not exist. The default is the original
            hard-coded location.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The DataFrame that was written, with every value kept as the raw text
        scraped from the page.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    url = f'https://www.boxofficemojo.com/daily/{year}/?view=year'
    req = requests.get(url, timeout=timeout)
    # Fail loudly rather than writing an empty CSV built from an error page.
    req.raise_for_status()
    # Name the parser so results do not depend on which parsers are installed.
    soup = BeautifulSoup(req.text, 'html.parser')

    appended_data = []
    for row in soup.find_all('tr'):
        cells = row.find_all('td')
        # Skip rows without <td> cells and rows too short to index safely.
        if len(cells) < _CELLS_PER_ROW:
            continue
        appended_data.append(_parse_daily_row(cells))

    year_data = pd.DataFrame(appended_data, columns=_YEAR_SUMMARY_COLUMNS)

    output_path = Path(output_dir) / f'year_summary_{year}.csv'
    output_path.parent.mkdir(parents=True, exist_ok=True)
    year_data.to_csv(output_path, index=False)
    return year_data

In [23]:
# Get the current year so the scrape runs through the present
todays_date = datetime.date.today()
current_year = todays_date.year

# Years to scrape: 1982 up to and including the current year
years = range(1982, current_year+1)

# Scrape each year in turn, printing the year first as a progress marker
for year in years:
    print(year)
    scrape_year_summary(year)

1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
