In [1]:
import pandas as pd
import os
from datetime import date, timedelta

def date_range_list(start_date, end_date):
    """Return every day from start_date to end_date, both endpoints included.

    Parameters
    ----------
    start_date, end_date : datetime.date (or any type supporting ``<=`` and ``+ timedelta``)
        First and last day of the range.

    Returns
    -------
    list
        One entry per day in ascending order; empty when start_date is after end_date.
    """
    date_list = []
    curr_date = start_date
    while curr_date <= end_date:
        date_list.append(curr_date)
        curr_date += timedelta(days=1)
    # Return the full list: slicing off the last element would silently drop
    # end_date and contradict the inclusive contract.
    return date_list

Things to note:
* The 2017 file contributes more entries because it also includes the 2018 holidays
* The 'Friday before AFL grand final' holiday was only introduced in 2013 (but perhaps the day was still notable before then?)
* Other discrepancies in the number of public holidays per year arise when, for instance, Christmas Day falls on a Saturday and the holiday is rolled over to another day

In [2]:
# Collect one frame per source file and concatenate once at the end, instead of
# re-copying the accumulated frame (starting from an empty one) on every iteration.
ph_frames = []

# 2011-2013: Melbourne-specific files without a header row.
for year in [2011, 2012, 2013]:
    ph = pd.read_csv("../Data/PublicHolidays/melbourne_public_holidays_{}.csv".format(year),
                     names=['Date', 'Holiday Name'], header=None)
    ph['Date'] = pd.to_datetime(ph['Date'])
    ph_frames.append(ph)

# 2014-2017: national files whose name is the year followed by the next year's
# two-digit suffix (e.g. 201415); keep only rows applicable to VIC or nationally.
for year in range(2014, 2018):
    ph = pd.read_csv("../Data/PublicHolidays/australianpublicholidays-{}{}.csv".format(year, (year-2000)+1))
    # .copy() gives an independent frame, so the column assignment below does not
    # write into a slice of the frame just read (avoids SettingWithCopyWarning).
    ph = ph[ph['Applicable To'].str.contains("VIC|NAT")].copy()
    ph['Date'] = pd.to_datetime(ph['Date'], format='%Y%m%d')
    # The 2017 file is deliberately not filtered by year: it is also the source of
    # the 2018 holidays (no separate 2018 file is read).
    if year != 2017:
        ph = ph[ph['Date'].dt.year == year]
    ph_frames.append(ph[['Date', 'Holiday Name']])

# 2019-2022: national files with a 'Jurisdiction' column; keep the "vic" rows.
for year in range(2019, 2023):
    ph = pd.read_csv("../Data/PublicHolidays/australian_public_holidays_{}.csv".format(year))
    ph = ph[ph['Jurisdiction'].str.contains("vic")].copy()
    ph['Date'] = pd.to_datetime(ph['Date'], format='%Y%m%d')
    ph_frames.append(ph[['Date', 'Holiday Name']])

# Single concatenation; each file's original row labels are kept, as before.
all_phs = pd.concat(ph_frames, axis=0)

### Format data

In [3]:
# Convert date to datetime
all_phs['datetime'] = pd.to_datetime(all_phs['Date'])
# Rename column to indicate it relates to public holidays, and set values to 1
all_phs.rename(columns={'Holiday Name':'public_holiday'}, inplace=True)
all_phs['public_holiday'] = 1
# Drop date column 
all_phs = all_phs.drop(['Date'], axis=1)

In [5]:
# Write the public-holiday table to the cleaned-data folder, omitting the index column.
all_phs.to_csv(path_or_buf='../Cleaned_data/publicholidays.csv', index=False)

### School holidays
https://www.education.wa.edu.au/past-term-dates  
2012 from https://www.littleaussietravellers.com.au/australian-school-term-dates-2012/
Extra dates for 2012 (holidays at the start of the year belong to the previous school year and would otherwise be excluded) from: https://www.abcdiamond.com.au/school-holidays-in-australia/

In [163]:
# Load the school-holiday sheet; header=None treats the first row as data,
# so the columns receive integer labels.
school_hols = pd.read_csv(filepath_or_buffer="../Data/SchoolHolidays.csv", header=None)

In [169]:
def date_range_list(start_date, end_date):
    """Return a list of consecutive days from start_date through end_date (inclusive).

    Each element is start_date advanced by a whole number of days, so the
    elements keep the type (and any time-of-day) of start_date. The list is
    empty when start_date is later than end_date.
    """
    # Whole days separating the endpoints; negative for a reversed range,
    # in which case range() below is empty and so is the result.
    span_in_days = (end_date - start_date).days
    return [start_date + timedelta(days=offset) for offset in range(span_in_days + 1)]

def split(row):
    """Expand a 'start-end' date-range string into the list of days it covers.

    The string is cut on '-' and the first two pieces are taken as the start
    and end dates. If the string contains '/', both pieces are parsed as
    day/month/year; otherwise parsing is left to pandas' default inference.
    The resulting pair is handed to date_range_list.
    """
    pieces = row.split('-')
    start_text, end_text = pieces[0], pieces[1]
    # Only slash-separated dates get an explicit format.
    parse_options = {'format': '%d/%m/%Y'} if '/' in row else {}
    first_day = pd.to_datetime(start_text, **parse_options)
    last_day = pd.to_datetime(end_text, **parse_options)
    return date_range_list(first_day, last_day)

#### Create dataframe containing all the dates which are school holidays

In [198]:
# Flatten every 'start-end' range in the sheet into one list of individual days.
all_holidays = []
for column in school_hols.columns:
    # Each cell becomes a list of days; extend directly rather than flattening
    # with sum(days, []), which rebuilds the growing list for every cell.
    days = school_hols[column].apply(split)
    for days_in_range in days:
        all_holidays.extend(days_in_range)
# Add extra dates from 2012 (1 January to 1 February, both inclusive).
# NOTE(review): no de-duplication is done, so any of these days already covered
# by a range in the sheet would appear twice.
extras_2012 = date_range_list(pd.to_datetime('01/01/2012', format='%d/%m/%Y'),
                              pd.to_datetime('01/02/2012', format='%d/%m/%Y'))
all_holidays.extend(extras_2012)
# Convert to dataframe and sort in date order
all_holidays_df = pd.DataFrame({'Dates': all_holidays}).sort_values(by='Dates')

In [199]:
# Sanity check: print how many school-holiday days were collected for each year.
year_of_each_day = all_holidays_df['Dates'].dt.year
for year in range(2012, 2022):
    one_year = all_holidays_df.loc[year_of_each_day == year]
    print(year, one_year.shape[0])

2012 91
2013 88
2014 88
2015 88
2016 86
2017 87
2018 88
2019 89
2020 94
2021 90


In [201]:
# Write the sorted school-holiday dates to the cleaned-data folder, omitting the index column.
all_holidays_df.to_csv(path_or_buf='../Cleaned_data/schoolholidays.csv', index=False)