## Sickness absence scraper from NHSD published stats
#### Load in data from websites

In [None]:

### Script to scrape published data from NHSD webpages and output a compiled and processed CSV
### of FTE days available and FTE days lost by staff group and organisation
### Takes up to 10 minutes to run

import pandas as pd
import requests
from datetime import datetime
from bs4 import BeautifulSoup

# Allows unverified SSLs
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

# The monthly sickness absence publications vary in format (sometimes monthly "provisional", sometimes just monthly, sometimes quarterly).
# These three base URLs pick up on all variations. The code iterates through all possible URL formats for each month (and quarter). Where an invalid
# URL is created, the code will output "Failed to access webpage: [URL]"; where it is valid it will output "Downloaded and parsed: [URL]" for each CSV.
# All data about failed/successful access is output to a separate CSV.

# Accumulators filled while scraping: one DataFrame per CSV that was read, and one
# log entry (dict) per page or CSV request that was attempted.
dfs = []
accessed_data = []

# URL templates for the publication naming schemes that are tried for every month:
# monthly "provisional", plain monthly, and quarterly "provisional".
base_urls = [
    "https://digital.nhs.uk/data-and-information/publications/statistical/nhs-sickness-absence-rates/{month}-{year}-provisional-statistics",
    "https://digital.nhs.uk/data-and-information/publications/statistical/nhs-sickness-absence-rates/{month}-{year}",
    "https://digital.nhs.uk/data-and-information/publications/statistical/nhs-sickness-absence-rates/{month1}-{year1}-to-{month2}-{year2}-provisional-statistics",
]

## Function to get quarterly month values

def get_month_range_quarterly(month):
    """Return the (first month, last month) of the calendar quarter that starts in *month*.

    Args:
        month: Lower-case English month name, e.g. ``"january"``.

    Returns:
        A ``(start_month, end_month)`` tuple of lower-case month names when
        *month* is the first month of a calendar quarter, otherwise
        ``(None, None)``.
    """
    # Keyed by the month that opens each quarter; the last quarter runs
    # October-December, so it is keyed by "october".
    quarters = {
        "january": ("january", "march"),
        "april": ("april", "june"),
        "july": ("july", "september"),
        "october": ("october", "december"),
    }
    return quarters.get(month, (None, None))

## Iterates over years and months - specify years in range (remember need to +1 to upper range)

# Local imports keep this cell self-contained: `io` lets pandas parse bytes that have
# already been downloaded, and `urljoin` resolves relative CSV links against the page URL.
import io
from urllib.parse import urljoin

# Seconds to wait on any single HTTP request before recording it as failed
REQUEST_TIMEOUT = 60

for year in range(2018, 2024):
    for month in range(1, 13):
        month_name = datetime(year, month, 1).strftime('%B').lower()

        for base_url in base_urls:
            if "{month1}-{year1}-to-{month2}-{year2}" in base_url:
                # Quarterly template: skipped unless the helper returns a range for this month
                start_month, end_month = get_month_range_quarterly(month_name)
                if not start_month:
                    continue
                # Both ends of the range fall in the same calendar year, so `year` is used twice
                url = base_url.format(month1=start_month, year1=year,
                                      month2=end_month, year2=year)
            else:
                url = base_url.format(month=month_name, year=year)

            # Download the publication page; a network error is logged like a bad status
            try:
                response = requests.get(url, timeout=REQUEST_TIMEOUT)
            except requests.RequestException as err:
                accessed_data.append({"URL": url, "Status": "Failed", "CSV Count": 0})
                print(f"Failed to access webpage: {url} ({err})")
                continue

            if response.status_code != 200:
                accessed_data.append({"URL": url, "Status": "Failed", "CSV Count": 0})
                print(f"Failed to access webpage: {url}")
                continue

            # Parse the page and collect every link whose href ends in ".csv"
            soup = BeautifulSoup(response.content, "html.parser")
            csv_links = soup.select('a[href$=".csv"]')

            # Log the successful page access together with the number of CSV links found
            accessed_data.append({"URL": url, "Status": "Accessed", "CSV Count": len(csv_links)})

            for link in csv_links:
                # Absolute hrefs pass through unchanged; relative ones are resolved against the page
                csv_url = urljoin(url, link["href"])
                try:
                    response_csv = requests.get(csv_url, timeout=REQUEST_TIMEOUT)
                except requests.RequestException:
                    response_csv = None

                if response_csv is not None and response_csv.status_code == 200:
                    # Parse the bytes already downloaded rather than fetching the file again
                    df = pd.read_csv(io.BytesIO(response_csv.content))
                    # Record where each row came from; later cells filter on this column
                    df["Downloaded From"] = csv_url
                    dfs.append(df)
                    accessed_data.append({"URL": csv_url, "Status": "Downloaded"})
                    print(f"Downloaded and parsed: {csv_url}")
                else:
                    accessed_data.append({"URL": csv_url, "Status": "Failed"})
                    print(f"Failed to download CSV from {csv_url}")

            # A page for this month has loaded, so the remaining URL templates are not tried
            break

# Local import so this cell stays self-contained
import os

# Save the access log first: it is the main diagnostic if nothing was downloaded,
# and pd.concat below raises when `dfs` is empty.
accessed_df = pd.DataFrame(accessed_data)
os.makedirs("tempdir", exist_ok=True)  # to_csv does not create missing directories
accessed_df.to_csv("tempdir/accessed_data.csv", index=False)

# Stack every downloaded CSV into one frame; columns that differ between files are
# aligned by name and padded with NaN.
combined_df = pd.concat(dfs, ignore_index=True)


In [None]:
# Summarise the combined scrape (columns, dtypes, non-null counts) as a sanity check
combined_df.info()

### Process and save sickness absence data

In [None]:
# Keep only rows read from files whose URL contains "benchmarking" (the sickness
# absence benchmarking data, which is in the format we want), then discard columns
# and rows that are entirely empty and renumber the index.
is_benchmarking = combined_df['Downloaded From'].str.contains('benchmarking')
absence_df = combined_df[is_benchmarking]
absence_df = absence_df.dropna(axis=1, how='all')
absence_df = absence_df.dropna(axis=0, how='all')
absence_df = absence_df.reset_index(drop=True)

In [None]:
# Build a single 'Date' column: take 'Month' where it is populated and fall back to
# 'DATE' otherwise, then remove the two source columns.
merged_dates = absence_df['Month'].combine_first(absence_df['DATE'])
absence_df['Date'] = merged_dates
absence_df = absence_df.drop(columns=['Month', 'DATE'])


In [None]:
# Parse the 'Date' column as datetimes; values that cannot be parsed become NaT
parsed_dates = pd.to_datetime(absence_df['Date'], errors='coerce')

# Snap every date to the first day of its month, then list the months present
absence_df['Date'] = parsed_dates.dt.to_period('M').dt.to_timestamp()
sorted(absence_df['Date'].unique())

In [None]:
# Same idea as the date merge, for every other column that appears under two names.
# Maps the column name that is kept (key) to the upper-case variant holding the
# same category of data (value), which is merged into it and then dropped.
columns_to_merge = {
    'NHSE region code': 'NHSE_REGION_CODE',
    'NHSE region name': 'NHSE_REGION_NAME',
    'Org code': 'ORG_CODE',
    'Org name': 'ORG_NAME',
    'FTE days lost': 'FTE_DAYS_LOST',
    'FTE days available': 'FTE_DAYS_AVAILABLE',
    'Sickness absence rate (%)': 'SICKNESS_ABSENCE_RATE_PERCENT',
    'Staff group': 'STAFF_GROUP',
    'Cluster group': 'CLUSTER_GROUP',
    'Benchmark group': 'BENCHMARK_GROUP',
}


In [None]:
# Fold each upper-case column into the column name that is kept
for column, matching_column in columns_to_merge.items():
    if matching_column not in absence_df.columns:
        # Nothing to merge for this pair
        continue
    if column in absence_df.columns:
        # Both variants exist: fill gaps in the kept column from the upper-case one
        absence_df[column] = absence_df[column].fillna(absence_df[matching_column])
    else:
        # Only the upper-case variant exists: carry its values over under the kept
        # name so they are not lost when the upper-case column is dropped below
        absence_df[column] = absence_df[matching_column]

# Drop the upper-case columns so only the merged column remains; errors='ignore'
# tolerates variants that never appeared in the scraped files
absence_df = absence_df.drop(columns=list(columns_to_merge.values()), errors='ignore')


In [None]:
# Tidy the absence table: harmonise region and staff-group labels, remove columns
# that are not needed, de-duplicate, and fix the column order.

# Columns removed before de-duplication
to_drop = ['Downloaded From','Tm End Date','ICS_CODE','ICS_NAME','HEE region code',
         'HEE region name','Sickness absence rate (%)']

# Final column selection and order
order = ['Date','Org code','Org name','NHSE region code','NHSE region name','Cluster group','Benchmark group',
         'Staff group','FTE days lost','FTE days available']

# Label variants mapped onto a single spelling
replace_dict_region = {
    'South East of England': 'South East',
    'South West of England': 'South West',
}
replace_dict_staff = {
    'All staff': 'All staff groups',
    'HCHS Doctors': 'HCHS doctors (exc. junior Drs)',
    'HCHS doctors': 'HCHS doctors (exc. junior Drs)',
}

df = absence_df.sort_values('Date')
df['NHSE region name'] = df['NHSE region name'].replace(replace_dict_region)
df['Staff group'] = df['Staff group'].replace(replace_dict_staff)
df = df.drop(columns=to_drop)
df = df.drop_duplicates()[order].reset_index(drop=True)

In [None]:
# Remove rows where 'FTE days available' is NaN; these are assumed to mean that no
# data was available for that row
df = df.dropna(subset=['FTE days available'])


In [None]:
# Preview the last rows of the cleaned absence table
df.tail()

Data for June 2022 is missing. To fill the missing values I will duplicate May 2022's valid data for June 2022 (2022-06-01).

In [None]:

# Fill the June 2022 gap by re-using May 2022's rows, but only when the table really
# has no June 2022 rows, so that real June data is never doubled up.
may_2022 = pd.to_datetime('2022-05-01')
june_2022 = pd.to_datetime('2022-06-01')

# Find the data corresponding to 2022-05-01
may_data = df[df['Date'] == may_2022]

# Create a copy of the data with the date changed to 2022-06-01
june_data = may_data.copy()
june_data['Date'] = june_2022

if (df['Date'] == june_2022).any():
    # June is already present: keep the table as it is (fresh index, like concat gives)
    df2 = df.reset_index(drop=True)
else:
    # Append the copied data to the original DataFrame
    df2 = pd.concat([df, june_data], ignore_index=True)

# Show the June 2022 rows to confirm the result
df2_check = df2[df2['Date'] == june_2022]
df2_check.head()

In [None]:
# Write the absence table (including the filled-in June 2022 rows) one directory up
df2.to_csv('../sickness_absence.csv', index=False)

### Process and save sickness absence *reason* data

Filter the 'Downloaded From' for cells containing "REASON" because that picks up on sickness absence reason data. Drop other columns that contain all NAs.

In [None]:
# Keep rows read from files whose URL contains "REASON", discard columns and rows
# that are entirely empty, renumber the index, and remove the provenance column.
from_reason_file = combined_df['Downloaded From'].str.contains('REASON')
reason_df = combined_df[from_reason_file]
reason_df = reason_df.dropna(axis=1, how='all')
reason_df = reason_df.dropna(axis=0, how='all')
reason_df = reason_df.reset_index(drop=True)

reason_df = reason_df.drop(columns=['Downloaded From'])

In [None]:
# Parse 'Month' as datetimes and list the distinct months covered by these files
reason_df['Month'] = pd.to_datetime(reason_df['Month'])
sorted(reason_df['Month'].unique())

Another approach to see if we can get later data - where REASON column is populated 

In [None]:
# Alternative selection: every row with a populated 'REASON' column, whichever file
# it came from; then discard all-empty columns and rows and renumber the index.
has_reason = combined_df['REASON'].notna()
reason_df2 = combined_df[has_reason]
reason_df2 = reason_df2.dropna(axis=1, how='all')
reason_df2 = reason_df2.dropna(axis=0, how='all')
reason_df2 = reason_df2.reset_index(drop=True)

In [None]:
# Remove the provenance column, parse 'DATE' as datetimes and list the months covered
reason_df2 = reason_df2.drop(columns=['Downloaded From'])
parsed_reason_dates = pd.to_datetime(reason_df2['DATE'])
reason_df2['DATE'] = parsed_reason_dates
sorted(reason_df2['DATE'].unique())

It looks like this is where the rest of the data are, so we need to stitch these dfs together. There is some overlap, so I will cut 2022-04 and 2022-05 from the first df, and then process both in the same way:

In [None]:
# Keep only months before April 2022 in the 'Month'-based frame; April 2022 onwards
# is taken from reason_df2 instead. Comparing against the first day of April makes
# the cut correct whether 'Month' holds month-start or month-end dates.
# .copy() gives an independent frame, so the later in-place edits of reason_df do
# not act on a slice (avoids pandas' SettingWithCopyWarning).
reason_df = reason_df[reason_df['Month'] < '2022-04-01'].copy()
#sorted(reason_df['Month'].unique())

In [None]:
# Snap every 'Month' value to the first day of its month
reason_df['Month'] = reason_df['Month'].dt.to_period('M').dt.to_timestamp()


In [None]:
# Preview the last rows of the 'Month'-based reason data
reason_df.tail()

In [None]:
# Snap every 'DATE' value to the first day of its month
reason_df2['DATE'] = reason_df2['DATE'].dt.to_period('M').dt.to_timestamp()


In [None]:
# Preview the first rows of the 'DATE'-based reason data
reason_df2.head()

In [None]:
# Use the shared column name 'Date' so the two reason frames can be concatenated later
reason_df.rename(columns={'Month': 'Date'}, inplace=True)

In [None]:
# Rename the upper-case columns to the names used in the pivoted 'Month'-based frame
reason_df2.rename(columns={'DATE': 'Date','FTE_DAYS_LOST':'FTE days lost',
                           'STAFF_GROUP':'Staff group','REASON':'Reason'}, inplace=True)

In [None]:
# List the distinct 'Type' values; these become column headers in the pivot below
reason_df['Type'].unique()

In [None]:
# Spread the 'Type' values into columns, summing 'FTE days' for each
# (Date, Reason, Staff group) combination, then turn the index levels back into
# ordinary columns.
p_reason_df = reason_df.pivot_table(
    index=['Date', 'Reason', 'Staff group'],
    columns=['Type'],
    values='FTE days',
    aggfunc='sum',
).reset_index()


FTE days available is only recorded for all reasons (not broken down by sickness absence reason), so we can drop that column


In [None]:
# 'FTE days available' is not broken down by reason, so remove it from the pivot
p_reason_df = p_reason_df.drop(['FTE days available'], axis=1)

In [None]:
# Same for the 'DATE'-based frame, where the column is named 'FTE_DAYS_AVAILABLE'
reason_df2 = reason_df2.drop(['FTE_DAYS_AVAILABLE'], axis=1)

In [None]:
# Preview the pivoted frame before stitching
p_reason_df.head()

In [None]:
# Preview the second frame to check its columns line up with the pivoted one
reason_df2.head()

In [None]:
# Stack the two reason frames with a fresh index; columns are aligned by name
cat_reason_df = pd.concat([p_reason_df, reason_df2], ignore_index=True)


In [None]:
# Check columns, dtypes and non-null counts of the stitched reason data
cat_reason_df.info()

Import sickness absence reference table to decode reason information into a new column

In [None]:
# Load the reason reference table from the working directory and rename its
# level-1 reason code/description columns to 'Reason'/'Description'.
url = 'REF_SICK_ABSENCE_REASONS.csv'
reference_renames = {'Sick_Lv1_Reason': 'Reason', 'Sick_Lv1_Description': 'Description'}
df_ref = pd.read_csv(url).rename(columns=reference_renames)
df_ref.info()

Add description information to main df

In [None]:
# Left-join the short reason label onto every row via 'Reason'; rows with no match
# in the reference table keep NaN in 'reason_short'.
reason_lookup = df_ref[['Reason', 'reason_short']]
df_reason = cat_reason_df.merge(reason_lookup, on='Reason', how='left')
df_reason.info()

In [None]:
# Remove rows where 'reason_short' is NaN (no match in the reference table), as this
# equates to no reason breakdown
df_reason = df_reason.dropna(subset=['reason_short'])

In [None]:
# Drop the 'Reason' column, as its information is now carried by 'reason_short'
df_reason = df_reason.drop(['Reason'], axis=1)

In [None]:
# De-duplicate: after the steps above there are many fully identical rows
df_reason = df_reason.drop_duplicates()


In [None]:
# Preview the last rows of the labelled reason data
df_reason.tail()

There are more staff groups in this data than in the independent variable data - need to compile HCHS doctors and then drop ones that don't match

In [None]:
# List the staff groups present in the reason data
sorted(df_reason['Staff group'].unique())

In [None]:
# Staff-group labels relating to doctor grades.
# NOTE(review): this list is not referenced again in the visible part of the
# notebook - confirm whether it is still needed.
clinical_grades = ['Specialty Doctor',
 'Specialty Registrar',
 'Staff Grade',
 'Professionally qualified clinical staff',
 'Other and Local HCHS Doctor Grades',
 'Hospital Practitioner / Clinical Assistant',
 'HCHS doctors',
 'Consultant',
 'Core Training',
 'Foundation Doctor Year 1',
 'Foundation Doctor Year 2',
 'Associate Specialist']


In [None]:
# Use the same 'HCHS doctors' label as the absence table so the staff groups match
df_reason['Staff group'] = df_reason['Staff group'].replace('HCHS doctors','HCHS doctors (exc. junior Drs)')

In [None]:
# Distinct staff groups in the reason data and in the absence table (`df`).
# Only sg_in_df_absence is used afterwards, to filter the reason data.
sg_in_df_reason = df_reason['Staff group'].unique()
sg_in_df_absence = df['Staff group'].unique()

In [None]:
# Keep only staff groups that also appear in the absence table. .copy() gives an
# independent frame, so the columns assigned to df_reason afterwards are not written
# to a slice (avoids pandas' SettingWithCopyWarning).
df_reason = df_reason[df_reason['Staff group'].isin(sg_in_df_absence)].copy()


Calculate the proportion of sickness absence due to each reason

In [None]:
# Total FTE days lost within each (staff group, date), repeated on every row of
# that group
group_totals = df_reason.groupby(['Staff group', 'Date'])['FTE days lost'].transform('sum')
df_reason['total_days_lost'] = group_totals

# Share of that total accounted for by each row; despite the column name this is
# a fraction of the total, not a value multiplied by 100
df_reason['percentage_days_lost'] = df_reason['FTE days lost'].div(df_reason['total_days_lost'])


In [None]:
# Preview the last rows with the new total and share columns
df_reason.tail()

I will create a new dataframe where the proportion of sickness absence days due to each reason is a separate column, format suitable for regression

In [None]:
# One row per (Date, Staff group) with one column per short reason label, holding
# the summed share of days lost for that reason
df_reason_as_cols = pd.pivot_table(
    df_reason,
    index=['Date', 'Staff group'],
    columns='reason_short',
    values='percentage_days_lost',
    aggfunc='sum',
)

In [None]:
# Turn 'Date' and 'Staff group' back into ordinary columns and preview the result
df_reason_as_cols.reset_index(inplace=True)
df_reason_as_cols.head()

June 2022 data is missing, as in sickness absence data. I will duplicate May 2022 data for June 2022. 

In [None]:
# Fill the June 2022 gap by re-using May 2022's rows, but only when the pivot really
# has no June 2022 rows, so that real June data is never doubled up.
may_2022 = pd.to_datetime('2022-05-01')
june_2022 = pd.to_datetime('2022-06-01')

# Find the data corresponding to 2022-05-01
may_data = df_reason_as_cols[df_reason_as_cols['Date'] == may_2022]

# Create a copy of the data with the date changed to 2022-06-01
june_data = may_data.copy()
june_data['Date'] = june_2022

if (df_reason_as_cols['Date'] == june_2022).any():
    # June is already present: keep the table as it is (fresh index, like concat gives)
    df_reason_as_cols2 = df_reason_as_cols.reset_index(drop=True)
else:
    # Append the copied data to the original DataFrame
    df_reason_as_cols2 = pd.concat([df_reason_as_cols, june_data], ignore_index=True)

# Show the June 2022 rows to confirm the result
check = df_reason_as_cols2[df_reason_as_cols2['Date'] == june_2022]
check.head()

In [None]:
# Write the one-column-per-reason table one directory up
df_reason_as_cols2.to_csv('../sickness_absence_reason_pivot.csv', index=False)

In [None]:
# List the staff groups present in the reason data at this point
sorted(df_reason['Staff group'].unique())

In [None]:
# Select the rows for the single staff group to be plotted
staff_group = 'All staff groups'
df_reason_sg = df_reason.loc[df_reason['Staff group'] == staff_group]

In [None]:
# Order rows from the smallest to the largest share of days lost
df_reason_sg = df_reason_sg.sort_values(by='percentage_days_lost', ascending=True)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Grouped bar chart of the share of days lost, one bar per reason, for the selected
# staff group
sns.set(style="whitegrid")
plt.figure(figsize=(8, 8))
ax = sns.barplot(data=df_reason_sg, x='Staff group', y='percentage_days_lost', hue='reason_short')
ax.set_title('Percentage of Days Lost by Reason and Staff Group')
ax.set_xlabel('Staff Group')
ax.set_ylabel('Percentage of Days Lost')
ax.legend(title='Reason')
plt.show()

In [None]:
#df_reason['Description']
# Group the data by 'reason' and calculate the total days lost for each reason
#reasons_totals = df_reason.groupby('Description')['FTE days lost'].sum().reset_index()

# Sort the DataFrame by 'days_lost' in descending order
#reasons_totals = reasons_totals.sort_values(by='FTE days lost', ascending=False)

#reasons_totals

In [None]:
# Write the long-format reason data (one row per date, staff group and reason) one directory up
df_reason.to_csv('../sickness_absence_reason.csv', index=False)


### Process and save sickness absence due to COVID data

Filter the 'Downloaded From' column for cells containing "COVID-19" because that picks up the COVID-19 sickness absence data.

In [None]:
# Keep rows read from files whose URL contains "COVID-19", then discard columns and
# rows that are entirely empty and renumber the index.
from_covid_file = combined_df['Downloaded From'].str.contains('COVID-19')
covid19_df = combined_df[from_covid_file]
covid19_df = covid19_df.dropna(axis=1, how='all')
covid19_df = covid19_df.dropna(axis=0, how='all')
covid19_df = covid19_df.reset_index(drop=True)

In [None]:
# Remove the provenance column now that the filter has been applied
covid19_df = covid19_df.drop(['Downloaded From'], axis=1)

In [None]:
# Remove rows that have no 'FTE_DAYS_AVAILABLE' value
covid19_df = covid19_df.dropna(subset=['FTE_DAYS_AVAILABLE'])


In [None]:
# Check columns, dtypes and non-null counts of the COVID-19 data
covid19_df.info()

In [None]:
# Preview the first rows of the COVID-19 data
covid19_df.head()

In [None]:
# List the distinct 'DATE' values covered by the COVID-19 data
covid19_df['DATE'].unique()

In [None]:
# Write the COVID-19 sickness absence data one directory up
covid19_df.to_csv('../covid-19_sickness_absence.csv', index=False)


In [None]:
# import pickle
# with open('objs.pkl', 'wb') as f:  # Python 3: open(..., 'wb')
#     pickle.dump([absence_df, combined_df, covid19_df, df, df_reason, df_ref,p_reason_df,
#                  reason_df], f)

In [None]:
#with open('objs.pkl') as f:  # Python 3: open(..., 'rb')
#    absence_df, combined_df, covid19_df, df, df_reason, df_ref,p_reason_df,reason_df = pickle.load(f)