In [1]:
from datetime import datetime
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
path = "tempdir/benchmark"  # or unix / linux / mac path

# Get the files from the path provided in the OP
files = Path(path).glob('*.csv')

columns_to_merge = {
    'Date' : 'DATE',
    'Tm End Date' : 'DATE',
    'NHSE region code': 'NHSE_REGION_CODE',
    'NHSE region name': 'NHSE_REGION_NAME',
    'Org code': 'ORG_CODE',
    'Org name': 'ORG_NAME',
    'Org Code': 'ORG_CODE',
    'Org Name': 'ORG_NAME',
    'Org Type': 'ORG_TYPE',
    'FTE days lost': 'FTE_DAYS_LOST',
    'FTE Days Sick' : 'FTE_DAYS_LOST',
    'FTE days available': 'FTE_DAYS_AVAILABLE',
    'FTE Days Available' : 'FTE_DAYS_AVAILABLE',
    'Sickness absence rate (%)': 'SICKNESS_ABSENCE_RATE_PERCENT',
    'Staff group': 'STAFF_GROUP',
    'Cluster group': 'CLUSTER_GROUP',
    'Benchmark group': 'BENCHMARK_GROUP',
}

In [3]:
list(columns_to_merge.values())

['DATE',
 'DATE',
 'NHSE_REGION_CODE',
 'NHSE_REGION_NAME',
 'ORG_CODE',
 'ORG_NAME',
 'ORG_CODE',
 'ORG_NAME',
 'ORG_TYPE',
 'FTE_DAYS_LOST',
 'FTE_DAYS_LOST',
 'FTE_DAYS_AVAILABLE',
 'FTE_DAYS_AVAILABLE',
 'SICKNESS_ABSENCE_RATE_PERCENT',
 'STAFF_GROUP',
 'CLUSTER_GROUP',
 'BENCHMARK_GROUP']

In [4]:
dfs = list()
df = None

for f in files:
    #print(f)
    data = pd.read_csv(f)
    # .stem is method for pathlib objects to get the filename w/o the extension
    data['file_date'] = str(f)[18:28] # Varies depending on name
    data.columns = [columns_to_merge.get(k,k) for k in data.columns]
    list(data)
    dfs.append(data)

df = pd.concat(dfs, ignore_index=True)

  data = pd.read_csv(f)


In [5]:
list(df)

['Month',
 'ORG_CODE',
 'ORG_NAME',
 'HEE region code',
 'HEE region name',
 'CLUSTER_GROUP',
 'BENCHMARK_GROUP',
 'NHSE_REGION_CODE',
 'NHSE_REGION_NAME',
 'STAFF_GROUP',
 'FTE_DAYS_LOST',
 'FTE_DAYS_AVAILABLE',
 'SICKNESS_ABSENCE_RATE_PERCENT',
 'file_date',
 'DATE',
 'ICS_CODE',
 'ICS_NAME']

In [6]:
df1 = df[["file_date", "DATE", "ORG_CODE", "ORG_NAME", "NHSE_REGION_CODE", "NHSE_REGION_NAME", "CLUSTER_GROUP", "BENCHMARK_GROUP", "STAFF_GROUP", "FTE_DAYS_LOST", "FTE_DAYS_AVAILABLE"]]

In [7]:
df1.tail()

Unnamed: 0,file_date,DATE,ORG_CODE,ORG_NAME,NHSE_REGION_CODE,NHSE_REGION_NAME,CLUSTER_GROUP,BENCHMARK_GROUP,STAFF_GROUP,FTE_DAYS_LOST,FTE_DAYS_AVAILABLE
4456822,2023-06-01,30/06/2023,X80,National Institute for Health and Care Excellence,QZZ,Special Health Authorities and other statutory...,Others,Others,HCHS Doctors,,
4456823,2023-06-01,30/06/2023,X80,National Institute for Health and Care Excellence,QZZ,Special Health Authorities and other statutory...,Others,Others,Central functions,326.84667,13834.71392
4456824,2023-06-01,30/06/2023,X80,National Institute for Health and Care Excellence,QZZ,Special Health Authorities and other statutory...,Others,Others,Managers,133.13333,10320.0998
4456825,2023-06-01,30/06/2023,X80,National Institute for Health and Care Excellence,QZZ,Special Health Authorities and other statutory...,Others,Others,Senior managers,,
4456826,2023-06-01,30/06/2023,X80,National Institute for Health and Care Excellence,QZZ,Special Health Authorities and other statutory...,Others,Others,Other staff or those with unknown classification,,


In [9]:
# set date columns to date types
df1['DATE'] = pd.to_datetime(df1['DATE'], format='mixed')
df1['file_date'] = pd.to_datetime(df['file_date'], yearfirst = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['DATE'] = pd.to_datetime(df1['DATE'], format='mixed')


In [11]:
df1.sort_values(by='file_date').tail()

Unnamed: 0,file_date,DATE,ORG_CODE,ORG_NAME,NHSE_REGION_CODE,NHSE_REGION_NAME,CLUSTER_GROUP,BENCHMARK_GROUP,STAFF_GROUP,FTE_DAYS_LOST,FTE_DAYS_AVAILABLE
4454719,2023-06-01,2023-06-30,RDE,East Suffolk and North Essex NHS Foundation Trust,Y61,East of England,Acute,Acute - Large,Ambulance staff,0.53333,337.0002
4454720,2023-06-01,2023-06-30,RDE,East Suffolk and North Essex NHS Foundation Trust,Y61,East of England,Acute,Acute - Large,Central functions,594.77955,23631.70701
4454721,2023-06-01,2023-06-30,RDE,East Suffolk and North Essex NHS Foundation Trust,Y61,East of England,Acute,Acute - Large,"Hotel, property & estates",810.7531,15810.1336
4454723,2023-06-01,2023-06-30,RDE,East Suffolk and North Essex NHS Foundation Trust,Y61,East of England,Acute,Acute - Large,Midwives,464.96648,8175.9167
4456826,2023-06-01,2023-06-30,X80,National Institute for Health and Care Excellence,QZZ,Special Health Authorities and other statutory...,Others,Others,Other staff or those with unknown classification,,


In [12]:
replace_dict_region = {'South East of England':'South East',
                'South West of England':'South West'}
replace_dict_staff = {'All staff':'All staff groups',
                'HCHS Doctors':'HCHS doctors (exc. junior Drs)',
                'HCHS doctors':'HCHS doctors (exc. junior Drs)'}

df1['NHSE_REGION_NAME'] = df1['NHSE_REGION_NAME'].replace(replace_dict_region)
df1['STAFF_GROUP'] = df1['STAFF_GROUP'].replace(replace_dict_staff)

df1 = df1.drop_duplicates()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['NHSE_REGION_NAME'] = df1['NHSE_REGION_NAME'].replace(replace_dict_region)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['STAFF_GROUP'] = df1['STAFF_GROUP'].replace(replace_dict_staff)


In [13]:
df1 = df1.sort_values(by='file_date')

df1.tail()

Unnamed: 0,file_date,DATE,ORG_CODE,ORG_NAME,NHSE_REGION_CODE,NHSE_REGION_NAME,CLUSTER_GROUP,BENCHMARK_GROUP,STAFF_GROUP,FTE_DAYS_LOST,FTE_DAYS_AVAILABLE
4454719,2023-06-01,2023-06-30,RDE,East Suffolk and North Essex NHS Foundation Trust,Y61,East of England,Acute,Acute - Large,Ambulance staff,0.53333,337.0002
4454720,2023-06-01,2023-06-30,RDE,East Suffolk and North Essex NHS Foundation Trust,Y61,East of England,Acute,Acute - Large,Central functions,594.77955,23631.70701
4454721,2023-06-01,2023-06-30,RDE,East Suffolk and North Essex NHS Foundation Trust,Y61,East of England,Acute,Acute - Large,"Hotel, property & estates",810.7531,15810.1336
4454723,2023-06-01,2023-06-30,RDE,East Suffolk and North Essex NHS Foundation Trust,Y61,East of England,Acute,Acute - Large,Midwives,464.96648,8175.9167
4456826,2023-06-01,2023-06-30,X80,National Institute for Health and Care Excellence,QZZ,Special Health Authorities and other statutory...,Others,Others,Other staff or those with unknown classification,,


In [14]:
df2 = df1.sort_values(by='file_date', ascending = False).groupby(['DATE', 'ORG_NAME', 'STAFF_GROUP']).first().reset_index()

In [None]:
#df2[(df2['ORG_NAME'] == 'Liverpool University Hospitals NHS Foundation Trust') & (df2['STAFF_GROUP']=="All staff groups")].sort_values(by='DATE', ascending = False)

In [15]:
df2 = df2.sort_values(by=['NHSE_REGION_CODE', 'ORG_NAME', 'CLUSTER_GROUP','DATE'], ascending = False)

In [16]:
df2.head()


Unnamed: 0,DATE,ORG_NAME,STAFF_GROUP,file_date,ORG_CODE,NHSE_REGION_CODE,NHSE_REGION_NAME,CLUSTER_GROUP,BENCHMARK_GROUP,FTE_DAYS_LOST,FTE_DAYS_AVAILABLE
242973,2023-06-30,Yorkshire Ambulance Service NHS Trust,All staff groups,2023-06-01,RX8,Y63,North East and Yorkshire,Ambulance,Ambulance Trust,10350.38716,166395.39707
242974,2023-06-30,Yorkshire Ambulance Service NHS Trust,Ambulance staff,2023-06-01,RX8,Y63,North East and Yorkshire,Ambulance,Ambulance Trust,2439.18285,49324.60728
242975,2023-06-30,Yorkshire Ambulance Service NHS Trust,Central functions,2023-06-01,RX8,Y63,North East and Yorkshire,Ambulance,Ambulance Trust,783.82827,17358.73118
242976,2023-06-30,Yorkshire Ambulance Service NHS Trust,HCHS doctors (exc. junior Drs),2023-06-01,RX8,Y63,North East and Yorkshire,Ambulance,Ambulance Trust,,
242977,2023-06-30,Yorkshire Ambulance Service NHS Trust,"Hotel, property & estates",2023-06-01,RX8,Y63,North East and Yorkshire,Ambulance,Ambulance Trust,270.70678,6246.14884


In [17]:
df2.to_csv('../sickness_benchmarking.csv', index = False)