In [None]:
from datetime import datetime
import pandas as pd
import numpy as np
from pathlib import Path

In [None]:
# Directory holding the downloaded benchmark CSVs (relative to the notebook;
# forward slashes work on Windows, Linux and macOS).
path = "tempdir/benchmark"  # or unix / linux / mac path

# Collect every CSV in that directory.
# Path.glob() returns a one-shot generator, so it is materialised into a
# sorted list: the load cell can then be re-run without finding an exhausted
# iterator, and the files are always processed in the same order.
files = sorted(Path(path).glob('*.csv'))

# Maps the header variants seen across the different file layouts onto a
# single canonical column name, so all files can be concatenated.
columns_to_merge = {
    'Date' : 'DATE',
    'Tm End Date' : 'DATE',
    'NHSE region code': 'NHSE_REGION_CODE',
    'NHSE region name': 'NHSE_REGION_NAME',
    'Org code': 'ORG_CODE',
    'Org name': 'ORG_NAME',
    'Org Code': 'ORG_CODE',
    'Org Name': 'ORG_NAME',
    'Org Type': 'ORG_TYPE',
    'FTE days lost': 'FTE_DAYS_LOST',
    'FTE Days Sick' : 'FTE_DAYS_LOST',
    'FTE days available': 'FTE_DAYS_AVAILABLE',
    'FTE Days Available' : 'FTE_DAYS_AVAILABLE',
    'Sickness absence rate (%)': 'SICKNESS_ABSENCE_RATE_PERCENT',
    'Staff group': 'STAFF_GROUP',
    'Cluster group': 'CLUSTER_GROUP',
    'Benchmark group': 'BENCHMARK_GROUP',
}

In [None]:
# Show the canonical column name for each mapped header (duplicates included).
[*columns_to_merge.values()]

In [None]:
# Read every benchmark CSV, tag its rows with the date embedded in the file
# name, normalise the headers, and stack everything into one frame.
dfs = []

for f in files:
    data = pd.read_csv(f)
    # The first 10 characters of the file name are used as the file's date
    # (assumed to look like YYYY-MM-DD -- TODO confirm against the real file
    # names). Slicing the name rather than the full path string keeps this
    # correct if `path` is changed to a directory of a different length.
    data['file_date'] = f.name[:10]
    # Map known header variants to their canonical name; unknown headers
    # are kept unchanged.
    data.columns = [columns_to_merge.get(k, k) for k in data.columns]
    dfs.append(data)

if not dfs:
    # pd.concat would fail with a terse "No objects to concatenate"; say why.
    raise ValueError(
        f"No CSV files were loaded from {path!r}: check the directory, and "
        "re-run the cell that builds `files` if it has already been consumed."
    )

df = pd.concat(dfs, ignore_index=True)

In [None]:
# List the column labels of the combined frame.
df.columns.tolist()

In [None]:
# Keep only the columns used downstream.
# .copy() makes df1 an independent frame, so the column assignments in later
# cells do not raise SettingWithCopyWarning.
df1 = df[[
    "file_date",
    "DATE",
    "ORG_CODE",
    "ORG_NAME",
    "NHSE_REGION_CODE",
    "NHSE_REGION_NAME",
    "CLUSTER_GROUP",
    "BENCHMARK_GROUP",
    "STAFF_GROUP",
    "FTE_DAYS_LOST",
    "FTE_DAYS_AVAILABLE",
]].copy()

In [None]:
# Preview the first five rows of the selected columns.
df1.head(5)

In [None]:
# set date columns to date types
# Convert both date columns to datetime dtype.
# - `infer_datetime_format` is deprecated in pandas 2.x (strict format
#   inference is the default), so it is no longer passed.
# - assign() builds a new frame rather than writing into df1 column by
#   column, which avoids SettingWithCopyWarning when df1 is a slice of df.
df1 = df1.assign(
    DATE=pd.to_datetime(df1['DATE']),
    file_date=pd.to_datetime(df1['file_date'], yearfirst=True),
)

In [None]:
# Show the last five rows when ordered by file date (i.e. the newest files).
df1.sort_values('file_date', ascending=True).tail(5)

In [None]:
# Label variants that appear across files, mapped to a single spelling.
replace_dict_region = {
    'South East of England': 'South East',
    'South West of England': 'South West',
}
replace_dict_staff = {
    'All staff': 'All staff groups',
    'HCHS Doctors': 'HCHS doctors (exc. junior Drs)',
    'HCHS doctors': 'HCHS doctors (exc. junior Drs)',
}

# Harmonise the labels, then drop rows that are now exact duplicates.
# assign() returns a new frame instead of writing into df1 in place, so no
# SettingWithCopyWarning is raised when df1 originated as a slice.
df1 = (
    df1
    .assign(
        NHSE_REGION_NAME=lambda d: d['NHSE_REGION_NAME'].replace(replace_dict_region),
        STAFF_GROUP=lambda d: d['STAFF_GROUP'].replace(replace_dict_staff),
    )
    .drop_duplicates()
)

In [None]:
# Order rows from the oldest file to the newest, then look at the newest ones.
df1 = df1.sort_values('file_date', ascending=True)

df1.tail(5)

In [None]:
# Collapse to one row per (DATE, ORG_NAME, STAFF_GROUP), preferring values
# from the most recent file.
# Note: GroupBy.first() takes the first non-null value of each column
# separately, so a resulting row can combine values from more than one file
# when the newest file has gaps.
newest_first = df1.sort_values(by='file_date', ascending=False)
per_key = newest_first.groupby(['DATE', 'ORG_NAME', 'STAFF_GROUP'])
df2 = per_key.first().reset_index()

In [None]:
#df2[(df2['ORG_NAME'] == 'Liverpool University Hospitals NHS Foundation Trust') & (df2['STAFF_GROUP']=="All staff groups")].sort_values(by='DATE', ascending = False)

In [None]:
# Final ordering for the export: every key sorted descending.
sort_keys = ['NHSE_REGION_CODE', 'ORG_NAME', 'CLUSTER_GROUP', 'DATE']
df2 = df2.sort_values(by=sort_keys, ascending=False)

In [None]:
# Preview the first five rows of the final table.
df2.head(5)


In [None]:
# Write the final table one directory above the notebook, without the index.
output_csv = '../sickness_benchmarking.csv'
df2.to_csv(output_csv, index=False)