In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait

In [2]:
import os
# Make sure the output directory exists. exist_ok=True replaces the separate
# existence check, so there is no check-then-create race and the cell can be
# re-run safely.
os.makedirs("./docs", exist_ok=True)

In [31]:
from selenium.webdriver.chrome.options import Options

# Preferences that make Chrome save PDFs straight to disk without any dialogs
download_prefs = {
    # Machine-specific absolute path: change to your desired download directory
    "download.default_directory": "/Users/home/projects/selenium/docs/high_court_annual_reports",
    # Skip the save prompt so the file downloads automatically
    "download.prompt_for_download": False,
    "download.directory_upgrade": True,
    # Download PDFs instead of showing them directly in Chrome
    "plugins.always_open_pdf_externally": True,
}

# Attach the preferences to a fresh Options object for the driver
chrome_options = Options()
chrome_options.add_experimental_option("prefs", download_prefs)


In [35]:
# Set up the webdriver. This assumes you're using Chrome; adjust as necessary.
# The driver binary is deliberately not passed positionally: older Selenium
# releases already default to "chromedriver" on PATH, while current releases
# take `options` as the first positional parameter, so a positional string
# would collide with the `options=` keyword and raise a TypeError.
driver = webdriver.Chrome(options=chrome_options)

# Open the webpage
driver.get(
    "https://www.hcourt.gov.au/publications/annual-reports/annual-reports"
)

In [36]:
from selenium.webdriver.support import expected_conditions as EC
import time

# Locator for every <a> tag whose text begins with "PDF"
pdf_link_locator = (By.XPATH, "//a[starts-with(text(), 'PDF')]")

try:
    # The first lookup only establishes how many links have to be visited
    pdf_links = WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located(pdf_link_locator)
    )

    # Iterate over each link
    for i in range(len(pdf_links)):
        # Need to find the links again because the DOM might have changed
        pdf_links = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located(pdf_link_locator)
        )

        # Click the link
        pdf_links[i].click()

        # Wait for a bit to make sure the file is fully downloaded before the next iteration
        time.sleep(10)  # Adjust this sleep time based on your internet speed and file size
finally:
    # Close the browser even when a lookup times out or a click fails, so no
    # orphaned Chrome/chromedriver process is left behind
    driver.quit()

In [30]:
# End the WebDriver session created in the setup cell.
# NOTE(review): the download cell above already finishes with driver.quit();
# this looks like a manual cleanup cell - confirm it is still needed.
driver.quit()

In [5]:
from bs4 import BeautifulSoup
# Parse the HTML of the page currently loaded in the browser using Python's
# built-in "html.parser" backend.
# NOTE(review): needs a live `driver`; in top-to-bottom order the cells above
# have already called driver.quit() - confirm the intended execution order.
soup = BeautifulSoup(driver.page_source, "html.parser")

In [6]:
# Close the browser window now that the page source has been captured in `soup`.
driver.close()

In [7]:
# Echo the parsed document as the cell output for manual inspection.
soup

<html class="fonts supports-avif" lang="en-au" xmlns="http://www.w3.org/1999/xhtml"><head prefix="og: https://ogp.me/ns#"><style type="text/css">:root, :host {
  --fa-font-solid: normal 900 1em/1 "Font Awesome 6 Solid";
  --fa-font-regular: normal 400 1em/1 "Font Awesome 6 Regular";
  --fa-font-light: normal 300 1em/1 "Font Awesome 6 Light";
  --fa-font-thin: normal 100 1em/1 "Font Awesome 6 Thin";
  --fa-font-duotone: normal 900 1em/1 "Font Awesome 6 Duotone";
  --fa-font-sharp-solid: normal 900 1em/1 "Font Awesome 6 Sharp";
  --fa-font-sharp-regular: normal 400 1em/1 "Font Awesome 6 Sharp";
  --fa-font-sharp-light: normal 300 1em/1 "Font Awesome 6 Sharp";
  --fa-font-brands: normal 400 1em/1 "Font Awesome 6 Brands";
}

svg:not(:root).svg-inline--fa, svg:not(:host).svg-inline--fa {
  overflow: visible;
  box-sizing: content-box;
}

.svg-inline--fa {
  display: var(--fa-display, inline-block);
  height: 1em;
  overflow: visible;
  vertical-align: -0.125em;
}
.svg-inline--fa.fa-2xs {
  

In [10]:
# URL pattern of the annual-report PDFs; "20YY-YY" marks where the year span
# goes (e.g. "2019-20").
# NOTE(review): `template` is not referenced by any other cell shown here.
template = "https://www.hcourt.gov.au/assets/corporate/annual-reports/HCA_Annual_Report_20YY-YY.pdf"

In [11]:
import os
import requests

def download_file(url, folder, timeout=30):
    """Download ``url`` into ``folder`` and return the destination path.

    The file is named after the last ``/``-separated segment of the URL. The
    folder is created when it does not exist yet.

    Args:
        url: Address of the file to fetch.
        folder: Directory the file is written into.
        timeout: Seconds to wait for the server before giving up; without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The path the file is (or would have been) written to. The path is
        returned even when the server answers with a non-200 status, in which
        case a message is printed and nothing is written.
    """
    # Create the folder if needed; exist_ok avoids a check-then-create race
    os.makedirs(folder, exist_ok=True)

    # Get the file name from the url and build the complete file path
    file_name = url.split("/")[-1]
    file_path = os.path.join(folder, file_name)

    # Download the file
    response = requests.get(url, timeout=timeout)

    # If the download was successful, save the file
    if response.status_code == 200:
        with open(file_path, 'wb') as f:
            f.write(response.content)
    else:
        print(f"Failed to download file. Server responded with: {response.status_code}.")

    return file_path

# Try the helper on a single known report URL.
url = "https://www.hcourt.gov.au/assets/corporate/annual-reports/HCA_Annual_Report_2019-20.pdf"
# Destination is relative to the notebook's working directory.
folder = "../docs/high_court_annual_reports/"
download_file(url, folder)


'../docs/high_court_annual_reports/HCA_Annual_Report_2019-20.pdf'

In [20]:
import os
import asyncio
import aiohttp

async def download_file(session, url, folder):
    """Fetch ``url`` through ``session`` and store the body in ``folder``.

    The file is named after the last ``/``-separated segment of the URL. The
    folder is created when it does not exist yet, so a fresh checkout does not
    fail with ``FileNotFoundError`` on the first write.

    Args:
        session: Object whose ``get(url)`` returns an async context manager
            yielding a response with ``status`` and an awaitable ``read()``
            (an ``aiohttp.ClientSession`` in this notebook).
        url: Address of the file to fetch.
        folder: Directory the file is written into.

    Returns:
        The destination path. The path is returned even when the server
        answers with a non-200 status, in which case a message is printed and
        nothing is written.
    """
    # Make sure the destination exists before any write is attempted
    os.makedirs(folder, exist_ok=True)

    # Get the file name from the url and build the complete file path
    file_name = url.split("/")[-1]
    file_path = os.path.join(folder, file_name)

    async with session.get(url) as response:
        if response.status == 200:
            with open(file_path, 'wb') as f:
                f.write(await response.read())
        else:
            print(f"Failed to download file. Server responded with: {response.status}.")

        return file_path

async def download_all_reports(start_year, end_year, base_folder):
    """Download the annual report PDF for every year in ``[start_year, end_year)``.

    Two URL spellings are scheduled per year: the
    ``HCA_Annual_Report_<YYYY>-<YY>.pdf`` form and its all-lowercase variant.
    All downloads run concurrently over one ``aiohttp`` session via
    ``download_file``. A download that raises is reported on stdout and does
    not abort the remaining ones.

    Args:
        start_year: First starting year to fetch (inclusive).
        end_year: Starting year to stop at (exclusive).
        base_folder: Directory handed to ``download_file`` for every report.
    """
    base_url = "https://www.hcourt.gov.au/assets/corporate/annual-reports/HCA_Annual_Report_{}-{}.pdf"
    # NOTE(review): at least one report is known under a hyphenated lowercase
    # name (https://www.hcourt.gov.au/assets/corporate/annual-reports/hca-annual-report-2014-15.pdf),
    # which neither spelling below produces - confirm whether that variant
    # should be attempted as well.

    # (year, url) pairs, kept so that failures can be reported per download
    jobs = []
    for year in range(start_year, end_year):
        # Create the url; the second placeholder is the two-digit following year
        url = base_url.format(year, str(year + 1)[2:])
        print(f"Downloading file for the year {year}-{year+1}")
        print(f"URL: {url}")
        jobs.append((year, url))

        # The lowercase spelling is scheduled up front as well; it is not a
        # retry that depends on the outcome of the first request
        print(f"Trying again for downloading file for the year {year}-{year+1}")
        print(f"URL: {url.lower()}")
        jobs.append((year, url.lower()))

    async with aiohttp.ClientSession() as session:
        # Run all tasks concurrently. Errors only surface here (creating a
        # coroutine never raises), and return_exceptions keeps one failed
        # request from discarding the results of all the others.
        outcomes = await asyncio.gather(
            *(download_file(session, url, base_folder) for _, url in jobs),
            return_exceptions=True,
        )

    for (year, url), outcome in zip(jobs, outcomes):
        if isinstance(outcome, BaseException):
            print(f"Failed to download file for the year {year}-{year+1}. Error: {str(outcome)}")
            print(f"URL: {url}")


In [None]:
# Call the function
# Top-level `await` relies on the notebook's running event loop. The year
# range is half-open, so this covers 1997-98 through 2021-22.
await download_all_reports(1997, 2022, "../docs/high_court_annual_reports/")

https://www.hcourt.gov.au/assets/corporate/annual-reports/hca-annual-report-2014-15.pdf

https://www.hcourt.gov.au/assets/corporate/annual-reports/HCA_Annual_Report_2014-15.pdf


In [1]:
import os
import re

def rename_files_in_directory(directory_path):
    """Give report files the ``HCA_Annual_Report_<YYYY>-<YY>.pdf`` name.

    Every entry in ``directory_path`` whose name does not already start with
    ``HCA_Annual_Report`` is inspected: the first run of four digits in the
    name is taken as the starting year and the entry is renamed to
    ``HCA_Annual_Report_<year>-<two-digit following year>.pdf``. Entries
    without a four-digit run are left untouched.

    Args:
        directory_path: Directory whose entries are renamed in place.
    """
    for filename in os.listdir(directory_path):
        # Ignore files that already start with "HCA_Annual_Report"
        if filename.startswith("HCA_Annual_Report"):
            continue

        # Find the year in the filename
        match = re.search(r'\d{4}', filename)
        if match is None:
            continue

        year = match.group()
        # Two-digit form of the following year. The modulo wraps the century
        # boundary, so 1999 yields "00" rather than "100".
        next_year_short = f"{(int(year) + 1) % 100:02d}"
        # Construct the new filename
        new_filename = f"HCA_Annual_Report_{year}-{next_year_short}.pdf"
        # Rename the file
        os.rename(
            os.path.join(directory_path, filename),
            os.path.join(directory_path, new_filename),
        )



In [2]:
# Call the function on your directory
# Renames happen in place on disk, so keep a copy if the original names matter.
rename_files_in_directory("docs/high_court_annual_reports/")

In [3]:
import os
import re

def rename_files_in_directory(directory_path, first_year=1998, last_year=2011):
    """Shift the year span in report file names back by one year.

    Only entries whose name starts with ``HCA_Annual_Report`` and contains a
    ``YYYY-YY`` span with ``first_year <= YYYY <= last_year`` are touched; for
    those, e.g. ``2005-06`` becomes ``2004-05``.

    Files are processed in ascending year order so that each target name has
    already been vacated by the previous rename; in arbitrary directory order
    a shifted file could overwrite a neighbour that had not been moved yet.
    A rename whose target still exists is skipped with a message instead of
    replacing that file.

    Args:
        directory_path: Directory whose entries are renamed in place.
        first_year: Lowest starting year that is shifted (inclusive).
        last_year: Highest starting year that is shifted (inclusive).
    """
    # Collect (start year, file name, matched span) for every affected file
    to_shift = []
    for filename in os.listdir(directory_path):
        # Ignore files that do not start with "HCA_Annual_Report"
        if not filename.startswith("HCA_Annual_Report"):
            continue

        # Find the year span in the filename
        match = re.search(r'\d{4}-\d{2}', filename)
        if match is None:
            continue

        year = match.group()
        year_start = int(year.split('-')[0])
        if first_year <= year_start <= last_year:
            to_shift.append((year_start, filename, year))

    for year_start, filename, year in sorted(to_shift):
        # Decrement the year by one
        new_year_start = year_start - 1
        # Two-digit following year. The modulo wraps the century boundary,
        # so 1999 is paired with "00" rather than "100".
        new_year_end = (new_year_start + 1) % 100
        # Construct the new filename
        new_filename = filename.replace(year, f"{new_year_start}-{new_year_end:02d}")

        source_path = os.path.join(directory_path, filename)
        target_path = os.path.join(directory_path, new_filename)
        if os.path.exists(target_path):
            # Never replace a different report silently
            print(f"Skipping {filename}: {new_filename} already exists.")
            continue

        # Rename the file
        os.rename(source_path, target_path)




In [4]:
# Call the function on your directory
# NOTE: every call applies the one-year shift to whatever 1998-2011 files it
# finds, so a second run can shift files again - run this cell only once.
rename_files_in_directory("docs/high_court_annual_reports/")