In [None]:

### Script to scrape published data from NHSD webpages and output a compiled and procesed CSV
### of FTE days available and FTE days lost by staff group and organisation
### Takes up to 10 minutes to run

import pandas as pd
import requests
from datetime import datetime
from bs4 import BeautifulSoup

# Allows unverified SSLs
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

# The monthly publication of sickness absences sometimes vary in format (sometimes monthly "provisional", sometimes just monthly, sometimes quarterly)
# These three base URLs pick up on all variations. The code iterates through all possible URL formats for each month (and quarter). Where an invalid
# URL is created, the code will output "Failed to access [URL]", where it is valid it will output "Downloaded and parsed: [URL]". All data about 
# failed/successful access is outputted in a seperate CSV.

base_urls = [
    "https://digital.nhs.uk/data-and-information/publications/statistical/nhs-sickness-absence-rates/{month}-{year}-provisional-statistics",
    "https://digital.nhs.uk/data-and-information/publications/statistical/nhs-sickness-absence-rates/{month}-{year}",
    "https://digital.nhs.uk/data-and-information/publications/statistical/nhs-sickness-absence-rates/{month1}-{year1}-to-{month2}-{year2}-provisional-statistics"
]

accessed_data = []
dfs = []

## Function to get quarterly month values

def get_month_range_quarterly(month):
    quarters = {
        "january": ("january", "march"),
        "april": ("april", "june"),
        "july": ("july", "september"),
        "november": ("november", "december")
    }
    return quarters.get(month, (None, None))

## Iterates over years and months - specify years in range (remember need to +1 to upper range)

for year in range(2018, 2024):
    for month in range(1, 13):
        month_name = datetime(year, month, 1).strftime('%B').lower()

        for base_url in base_urls:
            ## Gets correct year for quarter
            if "{month1}-{year1}-to-{month2}-{year2}" in base_url:
                start_month, end_month = get_month_range_quarterly(month_name)
                if not start_month:
                    continue
                month1 = start_month
                month2 = end_month
                year1 = year
                year2 = year
                if end_month == "december":
                    year2 += 1
                url = base_url.format(month1=month1, year1=year1, month2=month2, year2=year2)
            else:
                url = base_url.format(month=month_name, year=year)

            # Download the monthly webpage
            response = requests.get(url)
            if response.status_code == 200:
                # Parse the HTML content of the monthly webpage
                soup = BeautifulSoup(response.content, "html.parser")
                # Find all the CSV links on the monthly webpage
                csv_links = soup.select('a[href$=".csv"]')

                # Append the accessed webpage to the accessed_data list
                accessed_data.append({"URL": url, "Status": "Accessed", "CSV Count": len(csv_links)})

                # Iterate over the CSV links and download the files
                for link in csv_links:
                    csv_url = link["href"]
                    # Download the CSV file
                    response_csv = requests.get(csv_url)
                    if response_csv.status_code == 200:
                        # Read the CSV data into a DataFrame
                        df = pd.read_csv(csv_url)
                        # Add a new column with the downloaded URL
                        df["Downloaded From"] = csv_url
                        # Append the downloaded data to the dfs list
                        dfs.append(df)
                        # Append the downloaded CSV URL to the downloaded_data list
                        accessed_data.append({"URL": csv_url, "Status": "Downloaded"})
                        print(f"Downloaded and parsed: {csv_url}")
                    else:
                        # Append the failed CSV URL to the downloaded_data list
                        accessed_data.append({"URL": csv_url, "Status": "Failed"})
                        print(f"Failed to download CSV from {csv_url}")
                break  # Exit the loop if CSV files were found and downloaded
            else:
                # Append the failed webpage to the accessed_data list
                accessed_data.append({"URL": url, "Status": "Failed", "CSV Count": 0})
                print(f"Failed to access webpage: {url}")

accessed_df = pd.DataFrame(accessed_data)
combined_df = pd.concat(dfs, ignore_index=True)

# Optional: Save the accessed_df to separate CSV file
accessed_df.to_csv("accessed_data.csv", index=False)


In [None]:
combined_df.info()

In [3]:
# Filter the 'Downloaded From' for cells containing "benchmarking" because that picks up on
# sickness absence benchmarking data which is in the format we want. 
absence_df = combined_df[combined_df['Downloaded From'].str.contains('benchmarking')].dropna(axis = 1, how = 'all').dropna(axis = 0, how = 'all').reset_index(drop=True)

In [4]:
# Filter the 'Downloaded From' for cells containing "REASON" because that picks up on
# sickness absence reason data. 
reason_df = combined_df[combined_df['Downloaded From'].str.contains('REASON')].dropna(axis = 1, how = 'all').dropna(axis = 0, how = 'all').reset_index(drop=True)

In [5]:
covid19_df = combined_df[combined_df['Downloaded From'].str.contains('COVID-19')].dropna(axis = 1, how = 'all').dropna(axis = 0, how = 'all').reset_index(drop=True)

In [6]:
covid19_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62613 entries, 0 to 62612
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Downloaded From      62613 non-null  object 
 1   DATE                 62613 non-null  object 
 2   NHSE_REGION_CODE     62613 non-null  object 
 3   NHSE_REGION_NAME     62613 non-null  object 
 4   ORG_CODE             62613 non-null  object 
 5   ORG_NAME             62613 non-null  object 
 6   FTE_DAYS_LOST        50158 non-null  float64
 7   FTE_DAYS_AVAILABLE   50419 non-null  float64
 8   STAFF_GROUP          62613 non-null  object 
 9   FTE_DAYS_LOST_COVID  49015 non-null  float64
 10  ICS_CODE             43648 non-null  object 
 11  ICS_NAME             43648 non-null  object 
dtypes: float64(3), object(9)
memory usage: 5.7+ MB


In [7]:
reason_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 815248 entries, 0 to 815247
Data columns (total 6 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   Downloaded From  815248 non-null  object 
 1   Month            815248 non-null  object 
 2   Staff group      815248 non-null  object 
 3   Type             815248 non-null  object 
 4   Reason           815248 non-null  object 
 5   FTE days         815248 non-null  float64
dtypes: float64(1), object(5)
memory usage: 37.3+ MB


In [8]:
reason_df.head()

Unnamed: 0,Downloaded From,Month,Staff group,Type,Reason,FTE days
0,https://files.digital.nhs.uk/5D/B325FA/MDS_ABS...,2015-01-31,All staff groups,FTE days available,All reasons,32986160.0
1,https://files.digital.nhs.uk/5D/B325FA/MDS_ABS...,2015-01-31,Professionally qualified clinical staff,FTE days available,All reasons,17788990.0
2,https://files.digital.nhs.uk/5D/B325FA/MDS_ABS...,2015-01-31,HCHS doctors,FTE days available,All reasons,3300960.0
3,https://files.digital.nhs.uk/5D/B325FA/MDS_ABS...,2015-01-31,Associate Specialist,FTE days available,All reasons,80383.44
4,https://files.digital.nhs.uk/5D/B325FA/MDS_ABS...,2015-01-31,Consultant,FTE days available,All reasons,1310032.0


In [9]:
# Convert the 'Month' column to datetime format
reason_df['Month'] = pd.to_datetime(reason_df['Month'], errors='coerce')

# Convert Month column dates to the format 'YYYY-MM-DD'
reason_df['Month'] = reason_df['Month'].dt.to_period('M').dt.to_timestamp()

#reason_df['Month'].unique()

In [10]:
reason_df.rename(columns={'Month': 'Date'}, inplace=True)


In [11]:
# Convert the 'Month' column to datetime format
absence_df['Month'] = pd.to_datetime(absence_df[ 'Month'], errors='coerce')

# Convert Month column dates to the format 'YYYY-MM-DD'
absence_df['Month'] = absence_df['Month'].dt.to_period('M').dt.to_timestamp()
#absence_df['Month'].unique()

  absence_df['Month'] = pd.to_datetime(absence_df[ 'Month'], errors='coerce')


In [12]:
# Convert the 'Date' column to datetime format
absence_df['DATE'] = pd.to_datetime(absence_df['DATE'], errors='coerce')

# Convert all dates to the format 'YYYY-MM-DD'
absence_df['DATE'] = absence_df['DATE'].dt.to_period('M').dt.to_timestamp()
absence_df['DATE'].unique()

  absence_df['DATE'] = pd.to_datetime(absence_df['DATE'], errors='coerce')


<DatetimeArray>
[                'NaT', '2022-04-01 00:00:00', '2022-05-01 00:00:00',
 '2022-07-01 00:00:00', '2022-08-01 00:00:00', '2022-09-01 00:00:00',
 '2022-10-01 00:00:00', '2022-11-01 00:00:00', '2022-12-01 00:00:00',
 '2023-01-01 00:00:00']
Length: 10, dtype: datetime64[ns]

In [13]:
# Merge 'Month' and 'DATE' columns into a single column 'Date'
absence_df['Date'] = absence_df['Month'].combine_first(absence_df['DATE'])

# Drop old dates columns
absence_df = absence_df.drop(['Month','DATE'], axis=1)


In [14]:
# Do the same thing for all columns containing same data category but different names
columns_to_merge = {
    'NHSE region code': 'NHSE_REGION_CODE',
    'NHSE region name': 'NHSE_REGION_NAME',
    'Org code': 'ORG_CODE',
    'Org name': 'ORG_NAME',
    'FTE days lost': 'FTE_DAYS_LOST',
    'FTE days available': 'FTE_DAYS_AVAILABLE',
    'Sickness absence rate (%)': 'SICKNESS_ABSENCE_RATE_PERCENT',
    'Staff group': 'STAFF_GROUP',
    'Cluster group': 'CLUSTER_GROUP',
    'Benchmark group': 'BENCHMARK_GROUP',
}


In [15]:
# Iterate over the columns to merge
for column, matching_column in columns_to_merge.items():
    # Check if both columns exist in the dataframe
    if column in absence_df.columns and matching_column in absence_df.columns:
        # Merge the columns by filling the missing values
        absence_df[column] = absence_df[column].fillna(absence_df[matching_column])

# Drop the matching columns so only the merged column remains
absence_df = absence_df.drop(columns_to_merge.values(), axis=1)


In [16]:
# clean up and simplify data frame
to_drop = ['Downloaded From','Tm End Date','ICS_CODE','ICS_NAME','HEE region code',
         'HEE region name','Sickness absence rate (%)']
replace_dict_region = {'South East of England':'South East',
                'South West of England':'South West'}
replace_dict_staff = {'All staff':'All staff groups',
                'HCHS Doctors':'HCHS doctors (exc. junior Drs)',
                'HCHS doctors':'HCHS doctors (exc. junior Drs)'}
df = absence_df.sort_values('Date')
df['NHSE region name'] = df['NHSE region name'].replace(replace_dict_region)
df['Staff group'] = df['Staff group'].replace(replace_dict_staff)
df.drop(to_drop,axis=1, inplace=True)
order = ['Date','Org code','Org name','NHSE region code','NHSE region name','Cluster group','Benchmark group',
         'Staff group','FTE days lost','FTE days available']
df = df.drop_duplicates()
df = df[order].reset_index(drop=True)

In [17]:
reason_df.head()

Unnamed: 0,Downloaded From,Date,Staff group,Type,Reason,FTE days
0,https://files.digital.nhs.uk/5D/B325FA/MDS_ABS...,2015-01-01,All staff groups,FTE days available,All reasons,32986160.0
1,https://files.digital.nhs.uk/5D/B325FA/MDS_ABS...,2015-01-01,Professionally qualified clinical staff,FTE days available,All reasons,17788990.0
2,https://files.digital.nhs.uk/5D/B325FA/MDS_ABS...,2015-01-01,HCHS doctors,FTE days available,All reasons,3300960.0
3,https://files.digital.nhs.uk/5D/B325FA/MDS_ABS...,2015-01-01,Associate Specialist,FTE days available,All reasons,80383.44
4,https://files.digital.nhs.uk/5D/B325FA/MDS_ABS...,2015-01-01,Consultant,FTE days available,All reasons,1310032.0


In [18]:
sorted(reason_df['Staff group'].unique())

['All staff groups',
 'Ambulance staff',
 'Associate Specialist',
 'Central functions',
 'Consultant',
 'Core Training',
 'Foundation Doctor Year 1',
 'Foundation Doctor Year 2',
 'HCHS doctors',
 'Hospital Practitioner / Clinical Assistant',
 'Hotel, property & estates',
 'Managers',
 'Midwives',
 'NHS infrastructure support',
 'Nurses & health visitors',
 'Other and Local HCHS Doctor Grades',
 'Other staff or those with unknown classification',
 'Professionally qualified clinical staff',
 'Scientific, therapeutic & technical staff',
 'Senior managers',
 'Specialty Doctor',
 'Specialty Registrar',
 'Staff Grade',
 'Support to ST&T staff',
 'Support to ambulance staff',
 'Support to clinical staff',
 'Support to doctors, nurses & midwives']

In [19]:
reason_df['Type'].unique()

array(['FTE days available', 'FTE days lost'], dtype=object)

In [20]:
reason_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 815248 entries, 0 to 815247
Data columns (total 6 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   Downloaded From  815248 non-null  object        
 1   Date             815248 non-null  datetime64[ns]
 2   Staff group      815248 non-null  object        
 3   Type             815248 non-null  object        
 4   Reason           815248 non-null  object        
 5   FTE days         815248 non-null  float64       
dtypes: datetime64[ns](1), float64(1), object(4)
memory usage: 37.3+ MB


In [24]:
reason_df.to_csv('sickness_absence_reason_unprocessed.csv', index=False)


In [21]:

# Pivot the 'Type' column using pivot_table
p_reason_df = pd.pivot_table(reason_df, index=['Date','Reason','Staff group'], columns=['Type'], values='FTE days', aggfunc='sum')

# Reset the index
p_reason_df.reset_index(inplace=True)


In [22]:
## FTE days available is only recorded for all reasons (not broken down by sickness absence reason)
p_reason_df.head()

Type,Date,Reason,Staff group,FTE days available,FTE days lost
0,2015-01-01,All reasons,All staff groups,494792400.0,23142840.0
1,2015-01-01,All reasons,Ambulance staff,8261049.0,624561.1
2,2015-01-01,All reasons,Associate Specialist,1205752.0,32911.03
3,2015-01-01,All reasons,Central functions,43294970.0,1597401.0
4,2015-01-01,All reasons,Consultant,19650470.0,237507.7


In [23]:
p_reason_df.to_csv('sickness_absence_reason_unprocessed.csv', index=False)


In [None]:
url = 'REF_SICK_ABSENCE_REASONS.csv'
df_ref = pd.read_csv(url)
df_ref.rename(columns={'Sick_Lv1_Reason':'Reason','Sick_Lv1_Description':'Description'},inplace=True) 
df_ref.info()

In [None]:
df_reason = pd.merge(p_reason_df, df_ref[['Reason','Description']], on='Reason',how='left')
#df_leaver_r2 = df_reason.dropna()
#df_leaver_r2.drop_duplicates(subset=merge_cols)
df_reason.info()

In [None]:
df_reason.tail()

In [None]:
df_reason.to_csv('sickness_absence_reason.csv', index=False)


In [None]:
df.to_csv('sickness_absence.csv', index=False)