## Vaccination data processing
This notebook shows the code that was run once to download and adapt the data on vaccination roll-out in Australia
used in the vaccination extension to the base model.

In [None]:
import re
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

from aust_covid.constants import set_project_base_path

# Look up the project's paths, passing the parent directory ("../") as the base path
project_paths = set_project_base_path("../")
# Entry stored under the "RUNS_PATH" key
# NOTE(review): presumably the folder holding model runs; confirm in aust_covid.constants
RUNS_PATH = project_paths["RUNS_PATH"]

In [None]:
# Scrape the collection page for links to the publication pages that host the vaccine data files
url = "https://www.health.gov.au/resources/collections/covid-19-vaccination-vaccination-data?language=en"
# Time out rather than hang indefinitely, and fail loudly on an HTTP error
# instead of silently parsing an error page.
reqs = requests.get(url, timeout=60)
reqs.raise_for_status()
soup = BeautifulSoup(reqs.text, "html.parser")
# Anchors without an href attribute yield None from .get("href"), which would
# make the substring test below raise TypeError, so only anchors carrying an
# href are collected.
urls = [link["href"] for link in soup.find_all("a", href=True)]
# urljoin resolves site-relative paths against the host and leaves hrefs that
# are already absolute untouched.
data_urls = [urljoin("https://www.health.gov.au", u) for u in urls if "/resources/publications" in u]

In [None]:
# Visit each publication page and collect the URL of the Excel file it offers for download
excel_urls = []
for d in data_urls:
    # Time out rather than hang, and stop on an HTTP error instead of parsing an error page
    data_reqs = requests.get(d, timeout=60)
    data_reqs.raise_for_status()
    data_soup = BeautifulSoup(data_reqs.text, "html.parser")
    # Restrict to anchors that carry an href: .get("href") returns None for the
    # others and the substring test below would raise TypeError.
    download_urls = [link["href"] for link in data_soup.find_all("a", href=True)]
    # urljoin handles both site-relative and already-absolute hrefs
    download = [urljoin("https://www.health.gov.au", u) for u in download_urls if "xlsx" in u]
    if not download:
        # Indexing an empty list here would abort the whole run with an IndexError;
        # report the page and carry on with the remaining ones.
        print(f"No xlsx link found on {d}, skipping")
        continue
    # Only the first spreadsheet link found on each page is used
    excel_urls.append(download[0])

In [None]:
# Keep only the download links whose URL contains none of the excluded keywords (here: 2023)
keywordfilter = {"2023"}
excel_urls_21_22 = [
    link
    for link in excel_urls
    if all(keyword not in link for keyword in keywordfilter)
]

In [None]:
# Read each spreadsheet, reshape it into a single row and tag it with the date taken from its URL
# Compiled once outside the loop: captures the text following "data-" up to a four-digit run
date_pattern = re.compile(r"(?<=data-).*?(?<=\d{4})")
frames = []
for excel_url in excel_urls_21_22:
    temp_df = pd.read_excel(excel_url)
    # The first two columns are treated as (variable name, value) pairs
    temp_df = temp_df.rename(columns={temp_df.columns[0]: "variable", temp_df.columns[1]: "value"})
    # Transpose so that every variable becomes a column of a one-row frame
    t_df = temp_df[["variable", "value"]].set_index("variable").T
    # Discard variables that have no value in this file
    t_df = t_df.dropna(axis=1)
    date = date_pattern.findall(excel_url)
    if not date:
        # Without this guard an empty match list surfaces as an opaque
        # length-mismatch ValueError on the column assignment.
        raise ValueError(f"No date could be extracted from URL: {excel_url}")
    # select second date when two dates exist
    t_df["date"] = pd.to_datetime(date[1] if len(date) > 1 else date[0])
    print(excel_url)
    frames.append(t_df)
# Concatenate once at the end; concatenating inside the loop recopies the
# accumulated frame on every iteration.
df = pd.concat(frames, axis=0, ignore_index=True) if frames else pd.DataFrame()