In [2]:
import pandas as pd
import os
from datetime import date, timedelta

def date_range_list(start_date, end_date):
    """Return every day from start_date to end_date, both ends included.

    Parameters
    ----------
    start_date, end_date : date-like
        Objects that can be compared with ``<=`` and advanced by a
        ``timedelta`` (for example ``datetime.date``).

    Returns
    -------
    list
        One entry per day, in ascending order. Empty when start_date is
        later than end_date.
    """
    date_list = []
    curr_date = start_date
    while curr_date <= end_date:
        date_list.append(curr_date)
        curr_date += timedelta(days=1)
    # Return the whole list: slicing with [:-1] dropped end_date, which
    # contradicted the documented inclusive behaviour.
    return date_list

Things to note:
* 2017 has more public holidays listed because the 2017 source file also includes the 2018 dates
* The 'Friday before AFL grand final' was only introduced in 2013 (but it may still have been a notable day in earlier years?)
* Other discrepancies in the number of public holidays per year relate to, for instance, Xmas falling on a Saturday so the bank holiday being rolled over

In [10]:
# Build one table of public holidays ('Date', 'Holiday Name') from three
# differently formatted sets of source files.
# Each file's frame is collected in a list and concatenated once at the end;
# calling pd.concat inside the loops re-copies all earlier rows on every pass.
ph_frames = []

# 2011-2013: headerless Melbourne files with day/month/year dates.
for year in [2011, 2012, 2013]:
    ph = pd.read_csv("../../Data/Holidays/melbourne_public_holidays_{}.csv".format(year), names=['Date', 'Holiday Name'],
                     header=None)
    ph['Date'] = pd.to_datetime(ph['Date'], format='%d/%m/%Y')
    ph_frames.append(ph)

# 2014-2017: files whose names carry a two-year suffix (e.g. 201415).
for year in range(2014, 2018):
    ph = pd.read_csv("../../Data/Holidays/australianpublicholidays-{}{}.csv".format(year, (year-2000)+1))
    # Keep rows whose 'Applicable To' mentions VIC or NAT. The .copy() makes the
    # filtered rows an independent frame, so the column assignment below does
    # not write into a slice of the frame that was read from disk.
    ph = ph[ph['Applicable To'].str.contains("VIC|NAT")].copy()
    ph['Date'] = pd.to_datetime(ph['Date'], format='%Y%m%d')
    # Every file except the 2017 one is trimmed to its own calendar year;
    # the 2017 file is kept whole.
    if year != 2017:
        ph = ph[ph['Date'].dt.year == year]
    ph_frames.append(ph[['Date', 'Holiday Name']])

# 2019-2022: files with a 'Jurisdiction' column; keep rows matching "vic".
for year in range(2019, 2023):
    ph = pd.read_csv("../../Data/Holidays/australian_public_holidays_{}.csv".format(year))
    ph = ph[ph['Jurisdiction'].str.contains("vic")].copy()
    ph['Date'] = pd.to_datetime(ph['Date'], format='%Y%m%d')
    ph_frames.append(ph[['Date', 'Holiday Name']])

# Stack all years row-wise; each file's original row index is preserved.
all_phs = pd.concat(ph_frames, axis=0)

### Format data

In [11]:
# Convert date to datetime
all_phs['datetime'] = pd.to_datetime(all_phs['Date'], format ='%Y-%m-%d' )
# Rename column to indicate it relates to public holidays, and set values to 1
all_phs.rename(columns={'Holiday Name':'public_holiday'}, inplace=True)
all_phs['public_holiday'] = 1
# Drop date column 
all_phs = all_phs.drop(['Date'], axis=1)

In [41]:
# Write the cleaned public-holiday table to disk, leaving out the row index.
public_holidays_csv = '../../Cleaned_data/HolidaysData/publicholidays.csv'
all_phs.to_csv(public_holidays_csv, index=False)

### School holidays
https://www.education.wa.edu.au/past-term-dates  
2012 from https://www.littleaussietravellers.com.au/australian-school-term-dates-2012/
Extra dates for the start of 2012 (otherwise the holidays at the start of the year, which belong to the previous school year, would be excluded) are from: https://www.abcdiamond.com.au/school-holidays-in-australia/

In [7]:
# Load the raw school-holiday table. The file is read without a header row,
# so pandas labels the columns with integers.
school_hols_csv = "../../Data/Holidays/SchoolHolidays.csv"
school_hols = pd.read_csv(school_hols_csv, header=None)
school_hols

Unnamed: 0,0,1,2,3
0,30/03/2012-15/04/2012,"Jun 30, 2012 - Jul 15, 2012","Sep 22, 2012 - Oct 7, 2012","Dec 22, 2012 - Jan 28, 2013"
1,29/03/2013-14/04/2013,"Jun 29, 2013 - Jul 14, 2013","Sep 21, 2013 - Oct 6, 2013","Dec 21, 2013 - Jan 27, 2014"
2,05/04/2014-21/04/2014,"Jun 28, 2014 - Jul 13, 2014","Sep 20, 2014 - Oct 5, 2014","Dec 20, 2014 - Jan 27, 2015"
3,28/03/2015-12/04/2015,"Jun 27, 2015 - Jul 12, 2015","Sep 19, 2015 - Oct 4, 2015","Dec 19, 2015 - Jan 26, 2016"
4,25/03/2016-10/04/2016,"Jun 25, 2016 - Jul 10, 2016","Sep 17, 2016 - Oct 2, 2016","Dec 21, 2016 - Jan 29, 2017"
5,01/04/2017-17/04/2017,"Jul 01, 2017 - Jul 16, 2017","Sep 23, 2017 - Oct 8, 2017","Dec 23, 2017 - Jan 29, 2018"
6,30/03/2018-15/04/2018,"Jun 30, 2018 - Jul 15, 2018","Sep 22, 2018 - Oct 7, 2018","Dec 22, 2018 - Jan 29, 2019"
7,06/04/2019-22/04/2019,"Jun 29, 2019 - Jul 14, 2019","Sep 21, 2019 - Oct 6, 2019","Dec 21, 2019 - Jan 28, 2020"
8,25/03/2020-14/04/2020,"Jun 27, 2020 - Jul 12, 2020","Sep 19, 2020 - Oct 4, 2020","Dec 19, 2020 - Jan 27, 2021"
9,02/04/2021-18/04/2021,"Jun 26, 2021 - Jul 11, 2021","Sep 18, 2021 - Oct 3, 2021","Dec 18, 2021 - Jan 30, 2022"


In [13]:
# Display the table of individual school-holiday dates.
# NOTE(review): in the part of the notebook visible here, all_holidays_df is
# only assigned in a later cell, so this cell would raise NameError on a fresh
# "Restart & Run All" - confirm whether it should be moved below that cell.
all_holidays_df

Unnamed: 0,Dates
976,2012-01-01
977,2012-01-02
978,2012-01-03
979,2012-01-04
980,2012-01-05
...,...
971,2023-01-25
972,2023-01-26
973,2023-01-27
974,2023-01-28


In [4]:
def date_range_list(start_date, end_date):
    """Collect each day from start_date through end_date, end points included.

    Both arguments must support ``<=`` comparison and addition of a
    ``timedelta``. The result is an ascending list with one entry per day;
    it is empty when start_date falls after end_date.
    """
    one_day = timedelta(days=1)
    days = []
    day = start_date
    while day <= end_date:
        days += [day]
        day = day + one_day
    return days

def split(row):
    """Expand a "start-end" date-range string into the list of days it covers.

    The string is cut on '-' and the first two pieces are taken as the start
    and end dates. If the string contains '/', both pieces are parsed as
    day/month/year; otherwise pandas is left to infer the format. The days are
    produced by date_range_list.
    """
    pieces = row.split('-')
    start_text, end_text = pieces[0], pieces[1]
    # Only slash-separated dates get an explicit format; anything else is
    # parsed with pandas' default inference.
    parse_options = {'format': '%d/%m/%Y'} if '/' in row else {}
    first_day = pd.to_datetime(start_text, **parse_options)
    last_day = pd.to_datetime(end_text, **parse_options)
    return date_range_list(first_day, last_day)

#### Create dataframe containing all the dates which are school holidays

In [5]:
# Expand every "start-end" cell of school_hols into its individual days,
# walking the table column by column and, within a column, row by row.
all_holidays = [
    day
    for column in school_hols.columns
    for days_in_period in school_hols[column].apply(split)
    for day in days_in_period
]
# Add extra dates from 2012
extras_2012 = date_range_list(pd.to_datetime('01/01/2012', format = '%d/%m/%Y'), pd.to_datetime('01/02/2012', format = '%d/%m/%Y'))
all_holidays += extras_2012
# Convert to dataframe and sort in date order
all_holidays_df = pd.DataFrame({'Dates': all_holidays}).sort_values(by='Dates')

In [6]:
# Sanity check: print how many school-holiday days fall in each calendar year.
holiday_years = all_holidays_df['Dates'].dt.year
for year in range(2012, 2022):
    print(year, int((holiday_years == year).sum()))

2012 91
2013 88
2014 88
2015 88
2016 86
2017 87
2018 88
2019 89
2020 94
2021 90


In [None]:
# all_holidays_df.to_csv('../Cleaned_data/HolidaysData/schoolholidays.csv', index = False)