In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait, Select

In [8]:
# Set up Chrome options
chrome_options = Options()
chrome_options.page_load_strategy = 'normal'

In [10]:
# Set up the Chrome driver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)
driver.set_page_load_timeout(30)

page_url = 'https://www.compare-school-performance.service.gov.uk/download-data'
try:
    driver.get(page_url)
except TimeoutException:
    print("The page took too long to load!")

# Step 1: Accept Cookies and Hide Message
try:
    # Wait for the "Accept Cookies" button to be clickable
    accept_cookies_button = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.ID, "acceptAnalyticsCookiesTrue"))
    )
    # Click the "Accept Cookies" button
    accept_cookies_button.click()
    print("Cookies accepted.")
    
    # Wait for the "Hide this message" button to be clickable
    hide_message_button = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.ID, "confirm-cookie-settings"))
    )
    # Click the "Hide this message" button
    hide_message_button.click()
    print("Message hidden.")
except TimeoutException:
    print("Cookie-related buttons took too long to appear or be clickable.")
except Exception as e:
    print(f"Error interacting with cookie-related buttons: {e}")

Cookies accepted.
Message hidden.


In [11]:
def process_year(year_text):
    """Process the scraping steps for a given year."""
    try:
        # Step 2: Select year from the dropdown
        dropdown_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "form-control"))
        )
        select = Select(dropdown_element)
        select.select_by_visible_text(year_text)
        print(f"Selected year: {year_text}")

        # Step 3: Click the "Continue" button
        continue_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CLASS_NAME, "button"))
        )
        continue_button.click()
        print("Continue button clicked.")

        # Step 4: Select Radio Button
        radio_button = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "AllEnglandRadio"))
        )
        radio_button.click()
        print("Radio button selected.")

        # Step 5: Click the "Continue" button again
        continue_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CLASS_NAME, "button"))
        )
        continue_button.click()
        print("Continue button clicked.")

        # Step 6: Select Checkbox
        checkbox = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "Datatype_gen_0"))
        )
        checkbox.click()
        print("Checkbox selected.")

        # Step 7: Click the "Continue" button again
        continue_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CLASS_NAME, "button"))
        )
        continue_button.click()
        print("Continue button clicked.")

        # Step 8: Click the Download Link
        download_link = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "download-file-format-by-csv-link"))
        )
        download_link.click()
        print("CSV download link clicked.")

        # Step 9: Click the "Back to download" Button
        back_to_download_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.LINK_TEXT, "Back to download"))
        )
        back_to_download_button.click()
        print("Back to download button clicked.")

    except TimeoutException as e:
        print(f"Timeout occurred during year {year_text}: {e}")
    except Exception as e:
        print(f"An error occurred during year {year_text}: {e}")


In [12]:
try:
   # Extract all year options
    dropdown_element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "form-control"))
    )
    select = Select(dropdown_element)
    year_options = [option.text for option in select.options]  # Extract year text

    # Iterate through all years
    for year in year_options:
        print(f"Starting process for year: {year}")
        process_year(year)
        driver.get(page_url)  # Navigate back to the initial page

except Exception as e:
    print(f"An error occurred: {e}")

Starting process for year: Please select
Selected year: Please select
Continue button clicked.
Timeout occurred during year Please select: Message: 
Stacktrace:
	GetHandleVerifier [0x0117FD53+23747]
	(No symbol) [0x01107D54]
	(No symbol) [0x00FDBE53]
	(No symbol) [0x0101FCA6]
	(No symbol) [0x0101FEEB]
	(No symbol) [0x0105D852]
	(No symbol) [0x01041E44]
	(No symbol) [0x0105B41E]
	(No symbol) [0x01041B96]
	(No symbol) [0x01013F3C]
	(No symbol) [0x01014EBD]
	GetHandleVerifier [0x0145AC73+3017699]
	GetHandleVerifier [0x0146B93B+3086507]
	GetHandleVerifier [0x014640F2+3055714]
	GetHandleVerifier [0x01215AF0+637536]
	(No symbol) [0x01110A5D]
	(No symbol) [0x0110DA28]
	(No symbol) [0x0110DBC5]
	(No symbol) [0x011007F0]
	BaseThreadInitThunk [0x7567FCC9+25]
	RtlGetAppContainerNamedObjectPath [0x7765809E+286]
	RtlGetAppContainerNamedObjectPath [0x7765806E+238]

Starting process for year: 2023 to 2024
Selected year: 2023 to 2024
Continue button clicked.
Radio button selected.
Continue button clic

KeyboardInterrupt: 

# Cleaning Data