In [1]:
import pandas as pd
from datetime import datetime, timedelta
import os
import numpy as np
import requests
from bs4 import BeautifulSoup
import pandas_gbq

In [2]:
# define first week ending date of report to pull
report_weekend_date = '2017-05-28'

# define last week of report to pull
end_date = '2023-05-21'

# function to add a week to a string date formatted yyyy-mm-dd
def add_week_to_date_str(date_str):
    return (datetime.strptime(date_str,'%Y-%m-%d') + timedelta(weeks=1)).strftime('%Y-%m-%d')

#print(next_weekend_date)
base_url = 'https://playbill.com/grosses?week='


# loop through reporting weeks and scrape data from site into pandas dataframe
while(report_weekend_date <= end_date):
    url = f'{base_url}{report_weekend_date}'
    
    #only process weeks that Broadway shows were running (i.e. exclude weeks during Pandemic shutdown)
    if(report_weekend_date <= '2020-03-08' or  report_weekend_date >= '2021-08-08'):
        print(url)
        response = requests.get(url)
        if response.status_code == 200:
            html_content = response.text
        else:
            print("Failed to fetch the webpage.")
            break

        soup = BeautifulSoup(html_content, 'html5lib') # oddly, html.parser was not finding the header row tr tag of the table
        break

    
    # increment reporting week
    report_weekend_date = add_week_to_date_str(report_weekend_date)



https://playbill.com/grosses?week=2017-05-28


In [3]:
def generate_week_data():
    tdelta_begin_wk = timedelta(days = -6)
    tdelta_next_week_end_date = timedelta(days = 7)

    season_weeks = [
        ['2017-18',datetime(2017,5, 28),0,53],
        ['2018-19',datetime(2018,6, 3),0,52],
        ['2019-20',datetime(2019,6, 2),0,41],
        ['2020-21',datetime(2020,6, 1),-1,52],
        ['2021-22',datetime(2021,5, 30),10,52],
        ['2022-23',datetime(2022,5, 29),0,52]
    ]

    week_df_columns = ['season','week_num','wk_start_date','week_end_date']
    week_data = []

    for row in season_weeks:
        season = row[0]
        season_start_date_we = row[1]
        num_weeks_offset = row[2]
        week_end_date = season_start_date_we + timedelta(days=7*num_weeks_offset)
        num_weeks = row[3]

        if not(season == '2020-21'): #exclude the 2020-21 season because of Pandemic
            
            for i in range(num_weeks_offset+1,num_weeks+1):
                week_start_date = week_end_date + tdelta_begin_wk
                
                week_data.append([season,i,week_start_date,week_end_date])
    
                week_end_date += tdelta_next_week_end_date
            
    df_week_data = pd.DataFrame(week_data,columns=week_df_columns)

    return df_week_data

In [4]:


df_week_data = generate_week_data()
df_week_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   season         240 non-null    object        
 1   week_num       240 non-null    int64         
 2   wk_start_date  240 non-null    datetime64[ns]
 3   week_end_date  240 non-null    datetime64[ns]
dtypes: datetime64[ns](2), int64(1), object(1)
memory usage: 7.6+ KB


In [5]:
data = []

if soup:
    #print("there is soup")

    report_we_date = soup.find(id='vault-search-results-sort-select').find('option', selected=True).text
    report_we_date_dt = datetime.strptime(report_we_date,'%Y-%m-%d')

    # get the week number
    weeknum = soup.find('div',class_='week-count').find('span').text
    print(f"weeknum: {weeknum}")

    table = soup.find('div', class_='vault-grosses-result').find('table')
    #tbody = table.find('tbody')

    #header_row = table.find('thead')


    #print(header_row)

    

    rows = table.find_all('tr')
    #print(tbody)
    for row in rows:

        cells = row.find_all(['th', 'td'])
        row_data = []

        #print(row.parent.name)

        for cell in cells:
            if cell.name=='th':
                row_data.append(cell.a.text.strip())

                if(cell.find('span',class_='subtext')):
                    row_data.append(cell.find('span',class_='subtext').text.strip())
                else:
                    row_data.append(None)

            else:
                if(cell.find('span',class_='data-value')):
                    row_data.append(cell.find('span',class_='data-value').text.strip())
                else:
                    row_data.append(None)

                if(cell.find('span',class_='subtext')):
                    row_data.append(cell.find('span',class_='subtext').text.strip())
                else:
                    row_data.append(None)
        
        data.append(row_data)

    df = pd.DataFrame(data[1:], columns=data[0])
    df['report_we_date'] = report_we_date_dt

else:
    print("there is no soup")


df

weeknum: 1


Unnamed: 0,Show,None,This Week Gross,Potential Gross,Diff $,None.1,Avg Ticket,Top Ticket,Seats Sold,Seats in Theatre,Perfs,Previews,% Cap,None.2,Diff % cap,None.3,report_we_date
0,1984,Hudson Theatre,"$298,437.82","$664,746.00","$85,812.28",,$58.76,$249.00,5079,970,0,7,74.80%,,-12.26%,,2017-05-28
1,Aladdin,New Amsterdam Theatre,"$1,542,463.00","$1,580,992.00","$89,799.00",,$113.46,$227.50,13595,1727,8,0,98.40%,,0.38%,,2017-05-28
2,Anastasia,Broadhurst Theatre,"$1,166,324.10","$1,153,441.00","$46,189.60",,$125.55,$350.00,9290,1143,8,0,101.60%,,0.28%,,2017-05-28
3,Bandstand,Bernard B. Jacobs Theatre,"$596,302.15","$1,034,928.00","-$22,609.95",,$86.25,$227.00,6914,1033,8,0,83.66%,,-0.38%,,2017-05-28
4,Beautiful: The Carole King Musical,Stephen Sondheim Theatre,"$895,880.94","$1,198,122.00","-$67,041.66",,$117.54,$249.00,7622,1026,8,0,92.86%,,-4.29%,,2017-05-28
5,The Book of Mormon,Eugene O'Neill Theatre,"$1,265,001.00","$1,345,936.00","-$24,628.00",,$145.10,$477.00,8718,1066,8,0,102.23%,,0.06%,,2017-05-28
6,A Bronx Tale The Musical,Longacre Theatre,"$770,786.32","$951,630.00","-$42,493.48",,$100.84,$275.00,7644,1044,8,0,91.52%,,-1.58%,,2017-05-28
7,Cats,Neil Simon Theatre,"$821,765.50","$1,426,700.00","$88,532.00",,$90.72,$197.00,9058,1375,8,0,82.35%,,4.96%,,2017-05-28
8,Charlie and the Chocolate Factory,Lunt-Fontanne Theatre,"$1,221,821.00","$1,390,624.00","$73,040.50",,$104.82,$233.00,11656,1496,8,0,97.39%,,-0.61%,,2017-05-28
9,Chicago,Ambassador Theatre,"$711,717.35","$947,952.00","$84,579.19",,$88.94,$187.00,8002,1080,8,0,92.62%,,5.87%,,2017-05-28


In [6]:
# clean up the dataframe

# add a column name for the Venue
df.columns.values[1] = 'Venue'

df

Unnamed: 0,Show,Venue,This Week Gross,Potential Gross,Diff $,None,Avg Ticket,Top Ticket,Seats Sold,Seats in Theatre,Perfs,Previews,% Cap,None.1,Diff % cap,None.2,report_we_date
0,1984,Hudson Theatre,"$298,437.82","$664,746.00","$85,812.28",,$58.76,$249.00,5079,970,0,7,74.80%,,-12.26%,,2017-05-28
1,Aladdin,New Amsterdam Theatre,"$1,542,463.00","$1,580,992.00","$89,799.00",,$113.46,$227.50,13595,1727,8,0,98.40%,,0.38%,,2017-05-28
2,Anastasia,Broadhurst Theatre,"$1,166,324.10","$1,153,441.00","$46,189.60",,$125.55,$350.00,9290,1143,8,0,101.60%,,0.28%,,2017-05-28
3,Bandstand,Bernard B. Jacobs Theatre,"$596,302.15","$1,034,928.00","-$22,609.95",,$86.25,$227.00,6914,1033,8,0,83.66%,,-0.38%,,2017-05-28
4,Beautiful: The Carole King Musical,Stephen Sondheim Theatre,"$895,880.94","$1,198,122.00","-$67,041.66",,$117.54,$249.00,7622,1026,8,0,92.86%,,-4.29%,,2017-05-28
5,The Book of Mormon,Eugene O'Neill Theatre,"$1,265,001.00","$1,345,936.00","-$24,628.00",,$145.10,$477.00,8718,1066,8,0,102.23%,,0.06%,,2017-05-28
6,A Bronx Tale The Musical,Longacre Theatre,"$770,786.32","$951,630.00","-$42,493.48",,$100.84,$275.00,7644,1044,8,0,91.52%,,-1.58%,,2017-05-28
7,Cats,Neil Simon Theatre,"$821,765.50","$1,426,700.00","$88,532.00",,$90.72,$197.00,9058,1375,8,0,82.35%,,4.96%,,2017-05-28
8,Charlie and the Chocolate Factory,Lunt-Fontanne Theatre,"$1,221,821.00","$1,390,624.00","$73,040.50",,$104.82,$233.00,11656,1496,8,0,97.39%,,-0.61%,,2017-05-28
9,Chicago,Ambassador Theatre,"$711,717.35","$947,952.00","$84,579.19",,$88.94,$187.00,8002,1080,8,0,92.62%,,5.87%,,2017-05-28


In [7]:
df.rename(columns={'% Cap' : 'Pct Cap',f"Diff % cap": 'Diff Pct Cap'},inplace=True)

In [8]:
# drop NA columns
df = df.dropna(axis=1, how='all')

df

Unnamed: 0,Show,Venue,This Week Gross,Potential Gross,Diff $,Avg Ticket,Top Ticket,Seats Sold,Seats in Theatre,Perfs,Previews,Pct Cap,Diff Pct Cap,report_we_date
0,1984,Hudson Theatre,"$298,437.82","$664,746.00","$85,812.28",$58.76,$249.00,5079,970,0,7,74.80%,-12.26%,2017-05-28
1,Aladdin,New Amsterdam Theatre,"$1,542,463.00","$1,580,992.00","$89,799.00",$113.46,$227.50,13595,1727,8,0,98.40%,0.38%,2017-05-28
2,Anastasia,Broadhurst Theatre,"$1,166,324.10","$1,153,441.00","$46,189.60",$125.55,$350.00,9290,1143,8,0,101.60%,0.28%,2017-05-28
3,Bandstand,Bernard B. Jacobs Theatre,"$596,302.15","$1,034,928.00","-$22,609.95",$86.25,$227.00,6914,1033,8,0,83.66%,-0.38%,2017-05-28
4,Beautiful: The Carole King Musical,Stephen Sondheim Theatre,"$895,880.94","$1,198,122.00","-$67,041.66",$117.54,$249.00,7622,1026,8,0,92.86%,-4.29%,2017-05-28
5,The Book of Mormon,Eugene O'Neill Theatre,"$1,265,001.00","$1,345,936.00","-$24,628.00",$145.10,$477.00,8718,1066,8,0,102.23%,0.06%,2017-05-28
6,A Bronx Tale The Musical,Longacre Theatre,"$770,786.32","$951,630.00","-$42,493.48",$100.84,$275.00,7644,1044,8,0,91.52%,-1.58%,2017-05-28
7,Cats,Neil Simon Theatre,"$821,765.50","$1,426,700.00","$88,532.00",$90.72,$197.00,9058,1375,8,0,82.35%,4.96%,2017-05-28
8,Charlie and the Chocolate Factory,Lunt-Fontanne Theatre,"$1,221,821.00","$1,390,624.00","$73,040.50",$104.82,$233.00,11656,1496,8,0,97.39%,-0.61%,2017-05-28
9,Chicago,Ambassador Theatre,"$711,717.35","$947,952.00","$84,579.19",$88.94,$187.00,8002,1080,8,0,92.62%,5.87%,2017-05-28


In [9]:
# convert the currency strings to floats
from decimal import Decimal

def convert_currency_to_float(value):
    # Check if the value is a string and contains a dollar sign
    if isinstance(value, str) and '$' in value:
        # Remove dollar sign and comma, and convert to float
        return Decimal(value.replace(',', '').replace('$', ''))
    return value


def convert_str_numbers_to_numbers(value):
    converted_value = value
    if isinstance(value, str):
        if ',' in value:
            converted_value = converted_value.replace(',','')
        return int(converted_value)
    return converted_value

def convert_pct_to_float(value):
    if isinstance(value, str) and '%' in value:
        return float(value.replace('%', '')) / 100
    return value

currency_columns_to_convert = ['This Week Gross','Potential Gross','Diff $','Avg Ticket','Top Ticket']

number_columns_to_convert = ['Seats Sold','Seats in Theatre','Perfs','Previews']

pct_columns_to_convert = ['Pct Cap','Diff Pct Cap']

df[currency_columns_to_convert] = df[currency_columns_to_convert].applymap(convert_currency_to_float)

df[number_columns_to_convert] = df[number_columns_to_convert].applymap(convert_str_numbers_to_numbers)

df[pct_columns_to_convert] = df[pct_columns_to_convert].applymap(convert_pct_to_float)


df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[currency_columns_to_convert] = df[currency_columns_to_convert].applymap(convert_currency_to_float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[number_columns_to_convert] = df[number_columns_to_convert].applymap(convert_str_numbers_to_numbers)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  

Unnamed: 0,Show,Venue,This Week Gross,Potential Gross,Diff $,Avg Ticket,Top Ticket,Seats Sold,Seats in Theatre,Perfs,Previews,Pct Cap,Diff Pct Cap,report_we_date
0,1984,Hudson Theatre,298437.82,664746.0,85812.28,58.76,249.0,5079,970,0,7,0.748,-0.1226,2017-05-28
1,Aladdin,New Amsterdam Theatre,1542463.0,1580992.0,89799.0,113.46,227.5,13595,1727,8,0,0.984,0.0038,2017-05-28
2,Anastasia,Broadhurst Theatre,1166324.1,1153441.0,46189.6,125.55,350.0,9290,1143,8,0,1.016,0.0028,2017-05-28
3,Bandstand,Bernard B. Jacobs Theatre,596302.15,1034928.0,-22609.95,86.25,227.0,6914,1033,8,0,0.8366,-0.0038,2017-05-28
4,Beautiful: The Carole King Musical,Stephen Sondheim Theatre,895880.94,1198122.0,-67041.66,117.54,249.0,7622,1026,8,0,0.9286,-0.0429,2017-05-28
5,The Book of Mormon,Eugene O'Neill Theatre,1265001.0,1345936.0,-24628.0,145.1,477.0,8718,1066,8,0,1.0223,0.0006,2017-05-28
6,A Bronx Tale The Musical,Longacre Theatre,770786.32,951630.0,-42493.48,100.84,275.0,7644,1044,8,0,0.9152,-0.0158,2017-05-28
7,Cats,Neil Simon Theatre,821765.5,1426700.0,88532.0,90.72,197.0,9058,1375,8,0,0.8235,0.0496,2017-05-28
8,Charlie and the Chocolate Factory,Lunt-Fontanne Theatre,1221821.0,1390624.0,73040.5,104.82,233.0,11656,1496,8,0,0.9739,-0.0061,2017-05-28
9,Chicago,Ambassador Theatre,711717.35,947952.0,84579.19,88.94,187.0,8002,1080,8,0,0.9262,0.0587,2017-05-28


In [10]:
# convert datetime fields into dates
#df['report_we_date'] = pd.to_datetime(df['report_we_date']).dt.date()

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34 entries, 0 to 33
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Show              34 non-null     object        
 1   Venue             34 non-null     object        
 2   This Week Gross   34 non-null     object        
 3   Potential Gross   34 non-null     object        
 4   Diff $            34 non-null     object        
 5   Avg Ticket        34 non-null     object        
 6   Top Ticket        34 non-null     object        
 7   Seats Sold        34 non-null     int64         
 8   Seats in Theatre  34 non-null     int64         
 9   Perfs             34 non-null     int64         
 10  Previews          34 non-null     int64         
 11  Pct Cap           34 non-null     float64       
 12  Diff Pct Cap      34 non-null     float64       
 13  report_we_date    34 non-null     datetime64[ns]
dtypes: datetime64[ns](1), float6

In [11]:
df_merged = pd.merge(df,df_week_data,how='left',left_on='report_we_date',right_on='week_end_date')

df_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 34 entries, 0 to 33
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Show              34 non-null     object        
 1   Venue             34 non-null     object        
 2   This Week Gross   34 non-null     object        
 3   Potential Gross   34 non-null     object        
 4   Diff $            34 non-null     object        
 5   Avg Ticket        34 non-null     object        
 6   Top Ticket        34 non-null     object        
 7   Seats Sold        34 non-null     int64         
 8   Seats in Theatre  34 non-null     int64         
 9   Perfs             34 non-null     int64         
 10  Previews          34 non-null     int64         
 11  Pct Cap           34 non-null     float64       
 12  Diff Pct Cap      34 non-null     float64       
 13  report_we_date    34 non-null     datetime64[ns]
 14  season            34 non-nul

In [12]:
df_final = df_merged[[
    'Show',
    'Venue',
    'season',
    'week_num',
    'wk_start_date',
    'week_end_date',
    'This Week Gross',
    'Potential Gross',
    'Diff $',
    'Avg Ticket',
    'Top Ticket',
    'Seats Sold',
    'Seats in Theatre',
    'Perfs',
    'Previews',
    'Pct Cap',
    'Diff Pct Cap'
]].copy()

df_final.rename(columns={
    'Show' : 'show_name',
    'Venue' : 'venue_name',
    'season':'season',
    'week_num' : 'week_num',
    'wk_start_date' : 'week_start_date',
    'week_end_date' : 'week_end_date',
    'This Week Gross' : 'tw_gross',
    'Potential Gross' : 'potential_gross',
    'Diff $' : 'diff_tw_lw_gross',
    'Avg Ticket' : 'avg_ticket_price',
    'Top Ticket' : 'max_ticket_price',
    'Seats Sold' : 'seats_sold',
    'Seats in Theatre' : 'seats_in_theater',
    'Perfs' : 'performances',
    'Previews' : 'preview_performances',
    'Pct Cap' : 'pct_capacity',
    'Diff Pct Cap' : 'diff_tw_lw_pct_capacity'
},inplace=True)

df_final.head()

Unnamed: 0,show_name,venue_name,season,week_num,week_start_date,week_end_date,tw_gross,potential_gross,diff_tw_lw_gross,avg_ticket_price,max_ticket_price,seats_sold,seats_in_theater,performances,preview_performances,pct_capacity,diff_tw_lw_pct_capacity
0,1984,Hudson Theatre,2017-18,1,2017-05-22,2017-05-28,298437.82,664746.0,85812.28,58.76,249.0,5079,970,0,7,0.748,-0.1226
1,Aladdin,New Amsterdam Theatre,2017-18,1,2017-05-22,2017-05-28,1542463.0,1580992.0,89799.0,113.46,227.5,13595,1727,8,0,0.984,0.0038
2,Anastasia,Broadhurst Theatre,2017-18,1,2017-05-22,2017-05-28,1166324.1,1153441.0,46189.6,125.55,350.0,9290,1143,8,0,1.016,0.0028
3,Bandstand,Bernard B. Jacobs Theatre,2017-18,1,2017-05-22,2017-05-28,596302.15,1034928.0,-22609.95,86.25,227.0,6914,1033,8,0,0.8366,-0.0038
4,Beautiful: The Carole King Musical,Stephen Sondheim Theatre,2017-18,1,2017-05-22,2017-05-28,895880.94,1198122.0,-67041.66,117.54,249.0,7622,1026,8,0,0.9286,-0.0429


In [15]:
proj_id = 'broadway-data-393814'
dest_table = 'broadway_grosses.weekly_grosses'

table_schema = [
    #{   'name':'week_start_date', 'type':'DATE'},
    #{   'name':'week_end_date', 'type':'DATE'},
    {   'name':'tw_gross', 'type':'NUMERIC'},
    {   'name':'potential_gross', 'type':'NUMERIC'},
    {   'name':'diff_tw_lw_gross', 'type':'NUMERIC'},
    {   'name':'avg_ticket_price', 'type':'NUMERIC'},
    {   'name':'max_ticket_price', 'type':'NUMERIC'}
]

pandas_gbq.to_gbq(dataframe=df_final, project_id=proj_id, destination_table = dest_table,if_exists='append')

100%|██████████| 1/1 [00:00<?, ?it/s]


In [None]:
pandas_gbq.to_gbq(dataframe=df_final, project_id=proj_id, destination_table = dest_table,table_schema=table_schema,if_exists='append')

100%|██████████| 1/1 [00:00<?, ?it/s]
