In [1]:
import requests
from bs4 import BeautifulSoup
import re
import zipfile
import csv
import pandas as pd

In [2]:
# Publication to process: the month/year pair selects the page that is scraped
# and is reused in the local file names further down.
month, year = "february", "2023"

# Only zip links whose href contains this keyword are downloaded.
keyword = "benchmark"

url = f"https://digital.nhs.uk/data-and-information/publications/statistical/nhs-workforce-statistics/{month}-{year}"

In [3]:
def download_zip(url,month,year, timeout=30):
    """Download the zip at `url` into the working directory.

    The file is saved as ``turnover-{month}-{year}.zip``. Nothing is returned;
    the outcome is reported with a printed message.

    Parameters
    ----------
    url : str
        Address of the zip file to fetch.
    month, year : str
        Used only to build the local file name.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout,
        ``requests.get`` can block indefinitely on a stalled connection.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests.get`` (connection errors, timeouts, ...).
    """
    response = requests.get(url, timeout=timeout)

    # Anything other than a plain 200 is treated as a failed download.
    if response.status_code != 200:
        print(f"Failed to download the zip file from: {url}")
        return

    filename = f"turnover-{month}-{year}.zip"
    with open(filename, "wb") as file:
        file.write(response.content)
    print(f"Downloaded: {filename}")


In [4]:
def find_zip_files(url, keyword,month,year, timeout=30):
    """Scrape `url` for zip links containing `keyword` and download each one.

    Every ``<a>`` tag whose href contains ``.zip`` and the given keyword is
    passed to ``download_zip``. Note that ``download_zip`` names its output
    from `month` and `year` only, so if several links match, later downloads
    overwrite earlier ones.

    Parameters
    ----------
    url : str
        Page to scan for links.
    keyword : str
        Substring that must appear in a link's href for it to be downloaded.
    month, year : str
        Forwarded to ``download_zip`` to build the local file name.
    timeout : float, optional
        Seconds to wait for the page before giving up. Without a timeout,
        ``requests.get`` can block indefinitely on a stalled connection.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests.get`` (connection errors, timeouts, ...).
    """
    from urllib.parse import urljoin

    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        print("Failed to retrieve the webpage.")
        return

    soup = BeautifulSoup(response.content, "html.parser")
    # The dot is escaped so only a literal ".zip" matches; an unescaped dot
    # would also accept hrefs such as ".../gzip-notes".
    zip_links = soup.find_all("a", href=re.compile(r"\.zip"))

    for link in zip_links:
        href = link["href"]
        if keyword in href:
            # Resolve relative hrefs against the page address; absolute hrefs
            # pass through urljoin unchanged.
            download_url = urljoin(response.url, href)
            download_zip(download_url,month,year)



In [5]:
# Scrape the publication page and download the matching zip file.
find_zip_files(url=url, keyword=keyword, month=month, year=year)

Downloaded: turnover-february-2023.zip


In [6]:
def unzip_and_read_csv(zip_path):
    """Extract a zip archive and load its first 'Monthly' CSV.

    All members of the archive are extracted into the current working
    directory. The first member (in archive order) whose name ends with
    ``.csv`` and contains ``Monthly`` is then read with ``pd.read_csv``.

    Parameters
    ----------
    zip_path : str
        Path to the zip archive.

    Returns
    -------
    pandas.DataFrame or None
        The loaded CSV, or ``None`` when no member matches.
    """
    with zipfile.ZipFile(zip_path, 'r') as archive:
        # Unpack everything next to the notebook; the CSV is read from disk below.
        archive.extractall()
        members = archive.namelist()

    monthly_csv = next(
        (name for name in members if name.endswith('.csv') and 'Monthly' in name),
        None,
    )
    if monthly_csv is None:
        return None
    return pd.read_csv(monthly_csv)


In [7]:
# Load the monthly CSV from the zip downloaded above.
zip_path = f'turnover-{month}-{year}.zip'
df = unzip_and_read_csv(zip_path)

# unzip_and_read_csv returns None when the archive holds no 'Monthly' CSV.
# Fail here with a clear message rather than later with an AttributeError.
if df is None:
    raise FileNotFoundError(f"No 'Monthly' CSV file found inside {zip_path}")

In [8]:
# Preview the first five rows of the raw data.
df.head(n=5)

Unnamed: 0,Period,Type,Org code,Org name,NHSE region code,NHSE region name,ICS code,ICS name,Cluster group,Benchmark group,Staff group,HC,FTE
0,202112 to 202201,Leavers,RXX,Surrey and Borders Partnership NHS Foundation ...,Y59,South East,QXU,Surrey Heartlands,Mental Health,Mental Health and Learning Disability,All staff groups,46,41.09334
1,202112 to 202201,Leavers,RJ2,Lewisham and Greenwich NHS Trust,Y56,London,QKK,South East London,Acute,Acute - Large,All staff groups,112,101.00739
2,202112 to 202201,Leavers,RDR,Sussex Community NHS Foundation Trust,Y59,South East,QNX,Sussex,Community Provider Trust,Community Provider Trust,All staff groups,86,67.40329
3,202112 to 202201,Leavers,26A,NHS Norfolk and Waveney CCG,Y61,East of England,QMM,Norfolk and Waveney,Clinical Commissioning Group,Clinical Commissioning Group,All staff groups,9,8.2
4,202112 to 202201,Leavers,RWX,Berkshire Healthcare NHS Foundation Trust,Y59,South East,QNQ,Frimley,Mental Health,Mental Health and Learning Disability,All staff groups,73,60.87147


Fix the period formatting

In [9]:
# Leaver and joiner rows carry a Period such as "202112 to 202201".
# Keep those rows, force Period to text, and slice the leading YYYYMM into
# separate Year and Month helper columns.
leaver_joiner = (
    df.loc[df['Type'].isin(['Leavers', 'Joiners'])]
    .copy()
    .assign(Period=lambda frame: frame['Period'].astype(str))
    .assign(
        Year=lambda frame: frame['Period'].str.slice(0, 4),
        Month=lambda frame: frame['Period'].str.slice(4, 6),
    )
)


In [10]:

# Combine the Year and Month helpers into a first-of-month timestamp, then
# discard the helper columns and the original Period text.
leaver_joiner = (
    leaver_joiner
    .assign(
        Date=lambda frame: pd.to_datetime(
            frame['Year'].str.cat(frame['Month'], sep='-') + '-01'
        )
    )
    .drop(columns=['Year', 'Month', 'Period'])
)
leaver_joiner.head()

Unnamed: 0,Type,Org code,Org name,NHSE region code,NHSE region name,ICS code,ICS name,Cluster group,Benchmark group,Staff group,HC,FTE,Date
0,Leavers,RXX,Surrey and Borders Partnership NHS Foundation ...,Y59,South East,QXU,Surrey Heartlands,Mental Health,Mental Health and Learning Disability,All staff groups,46,41.09334,2021-12-01
1,Leavers,RJ2,Lewisham and Greenwich NHS Trust,Y56,London,QKK,South East London,Acute,Acute - Large,All staff groups,112,101.00739,2021-12-01
2,Leavers,RDR,Sussex Community NHS Foundation Trust,Y59,South East,QNX,Sussex,Community Provider Trust,Community Provider Trust,All staff groups,86,67.40329,2021-12-01
3,Leavers,26A,NHS Norfolk and Waveney CCG,Y61,East of England,QMM,Norfolk and Waveney,Clinical Commissioning Group,Clinical Commissioning Group,All staff groups,9,8.2,2021-12-01
4,Leavers,RWX,Berkshire Healthcare NHS Foundation Trust,Y59,South East,QNQ,Frimley,Mental Health,Mental Health and Learning Disability,All staff groups,73,60.87147,2021-12-01


In [31]:
# Denominator rows: select them and rename Period to Date so the column
# lines up with leaver_joiner when the two frames are concatenated.
denoms = (
    df.loc[df['Type'] == 'Denoms']
    .rename(columns={'Period': 'Date'})
)


In [32]:
# Denominator dates are parsed as day/month/year text.
denoms = denoms.assign(
    Date=pd.to_datetime(denoms['Date'], format='%d/%m/%Y')
)


In [33]:
# Snap every denominator date to the first day of its month so it matches the
# first-of-month dates built for leavers and joiners. Converting through a
# monthly period avoids formatting each date to a string and re-parsing it.
denoms['Date'] = denoms['Date'].dt.to_period('M').dt.to_timestamp()


In [50]:
# Stack denominators on top of leavers/joiners and drop the ICS columns.
merged_df = (
    pd.concat([denoms, leaver_joiner])
    .drop(columns=['ICS code', 'ICS name'])
)


In [51]:
# Inspect the last five rows of the stacked frame.
merged_df.tail(n=5)

Unnamed: 0,Date,Type,Org code,Org name,NHSE region code,NHSE region name,Cluster group,Benchmark group,Staff group,HC,FTE
442245,2023-01-01,Leavers,RTR,South Tees Hospitals NHS Foundation Trust,Y63,North East and Yorkshire,Acute,Acute - Teaching,Support to ST&T staff,7,5.58333
442246,2023-01-01,Leavers,RCF,Airedale NHS Foundation Trust,Y63,North East and Yorkshire,Acute,Acute - Small,Support to ST&T staff,2,0.72
442247,2023-01-01,Leavers,RGD,Leeds and York Partnership NHS Foundation Trust,Y63,North East and Yorkshire,Mental Health,Mental Health and Learning Disability,Support to ST&T staff,6,6.0
442248,2023-01-01,Leavers,RR8,Leeds Teaching Hospitals NHS Trust,Y63,North East and Yorkshire,Acute,Acute - Teaching,Support to ST&T staff,14,12.22
442249,2023-01-01,Leavers,RXF,Mid Yorkshire Hospitals NHS Trust,Y63,North East and Yorkshire,Acute,Acute - Large,Support to ST&T staff,6,5.23333


In [52]:
# Go from long to wide: one row per date / organisation / staff group, with
# the HC and FTE values spread into one column per Type.
df_p = merged_df.pivot(
    index=[
        'Date',
        'Org code',
        'Org name',
        'NHSE region code',
        'NHSE region name',
        'Cluster group',
        'Benchmark group',
        'Staff group',
    ],
    columns='Type',
    values=['HC', 'FTE'],
).reset_index()


In [53]:
# Flatten the column multi-index into single names such as 'HC_Leavers'.
# Key columns have an empty second level after reset_index and keep their
# plain name.
#
# The MultiIndex guard makes the cell safe to re-run: on already-flattened
# string columns the same comprehension would index into the characters of
# each name (e.g. 'Date' -> 'D_a') and silently corrupt the headers.
if isinstance(df_p.columns, pd.MultiIndex):
    df_p.columns = [f'{top}_{sub}' if sub else top for top, sub in df_p.columns]


In [59]:
# Inspect the last five rows of the pivoted frame.
df_p.tail(n=5)


Unnamed: 0,Date,Org code,Org name,NHSE region code,NHSE region name,Cluster group,Benchmark group,Staff group,HC_Denoms,HC_Joiners,HC_Leavers,FTE_Denoms,FTE_Joiners,FTE_Leavers
193665,2023-02-01,TAJ,Black Country Healthcare NHS Foundation Trust,Y60,Midlands,Mental Health,Mental Health and Learning Disability,Nurses & health visitors,1102.0,,,1007.473,,
193666,2023-02-01,TAJ,Black Country Healthcare NHS Foundation Trust,Y60,Midlands,Mental Health,Mental Health and Learning Disability,"Scientific, therapeutic & technical staff",494.0,,,429.24553,,
193667,2023-02-01,TAJ,Black Country Healthcare NHS Foundation Trust,Y60,Midlands,Mental Health,Mental Health and Learning Disability,Senior managers,56.0,,,55.6,,
193668,2023-02-01,TAJ,Black Country Healthcare NHS Foundation Trust,Y60,Midlands,Mental Health,Mental Health and Learning Disability,Support to ST&T staff,437.0,,,402.212,,
193669,2023-02-01,TAJ,Black Country Healthcare NHS Foundation Trust,Y60,Midlands,Mental Health,Mental Health and Learning Disability,"Support to doctors, nurses & midwives",999.0,,,871.02637,,


SIP data runs to a month later than the leaver data in monthly turnover! Remove the final month of data in the merged DF.

In [61]:
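# NOTE: this step is currently disabled, so the latest month is still present
# in the exported data. If it were re-enabled in this position it would not
# change df_p, because df_p was already built from merged_df in an earlier
# cell -- the filter would have to run before the pivot to take effect.

# Find the maximum date value in the 'Date' column
#max_date = merged_df['Date'].max()

# Remove data with the latest date value
#merged_df = merged_df[merged_df['Date'] < max_date]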

In [62]:
# Write the processed table to the data folder, without the row index.
df_p.to_csv(
    path_or_buf=f'../02_data/turnover-processed-{month}-{year}.csv',
    index=False,
)
