In [None]:
from datetime import datetime
import pandas as pd
import numpy as np
from pathlib import Path

In [None]:
path = "tempdir/turnover"  # directory holding the source CSVs (relative path; works on any OS)

# Collect the annual and monthly CSV files found directly under `path`.
# Path.glob() returns a one-shot generator, so the results are materialised as
# sorted lists: that way a cell that merely prints the file names does not leave
# the loading cell with nothing to read, and the load order is deterministic.
files_annual = sorted(Path(path).glob('*annual.csv'))
files_monthly = sorted(Path(path).glob('*monthly.csv'))

# Canonical column name -> every raw CSV header spelling that should map onto it.
_canonical_headers = {
    'DATE': ['Period'],
    'TYPE': ['Type'],
    'NHSE_REGION_CODE': ['NHSE region code'],
    'NHSE_REGION_NAME': ['NHSE region name'],
    'ORG_CODE': ['Org code', 'Org Code'],
    'ORG_NAME': ['Org name', 'Org Name'],
    'ORG_TYPE': ['Org Type'],
    'FTE_DAYS_LOST': ['FTE days lost', 'FTE Days Sick'],
    'FTE_DAYS_AVAILABLE': ['FTE days available', 'FTE Days Available'],
    'SICKNESS_ABSENCE_RATE_PERCENT': ['Sickness absence rate (%)'],
    'STAFF_GROUP': ['Staff group'],
    'CLUSTER_GROUP': ['Cluster group'],
    'BENCHMARK_GROUP': ['Benchmark group'],
    'HC': ['HC'],
    'FTE': ['FTE'],
}

# Lookup used when renaming: raw header -> canonical name.
columns_to_merge = {
    raw_header: canonical
    for canonical, raw_headers in _canonical_headers.items()
    for raw_header in raw_headers
}

In [None]:
# Show the annual files that were found, one POSIX-style path per line.
# NOTE(review): this iterates files_annual directly; if it is a one-shot
# iterator, nothing is left for any later cell that loops over it — confirm.
annual_listing = [found.as_posix() for found in files_annual]
for posix_path in annual_listing:
    print(posix_path)

In [None]:
# Load every annual CSV, tag its rows with the leading characters of the file
# name (stored as file_date), harmonise the column names, and stack the results
# into one frame, df.

# Number of leading file-name characters kept as file_date.
# NOTE(review): depends on the file naming scheme — adjust if the names change.
FILE_DATE_WIDTH = 9

# sorted() materialises the iterable and fixes the load order.
annual_paths = sorted(files_annual)
if not annual_paths:
    raise ValueError(
        "No annual CSV files to load: check `path`, or re-run the cell that "
        "defines `files_annual` if it has already been iterated over."
    )

dfs = list()
for f in annual_paths:
    data = pd.read_csv(f)
    # Slice the file name (not the full path string) so the result does not
    # depend on how long the directory part of `path` happens to be.
    data['file_date'] = f.name[:FILE_DATE_WIDTH]
    # Map known header variants onto canonical names; unknown headers pass through.
    data.columns = [columns_to_merge.get(k, k) for k in data.columns]
    dfs.append(data)

df = pd.concat(dfs, ignore_index=True)

In [None]:
# Column labels of the combined frame, as a plain list.
df.columns.tolist()

In [None]:
# Keep only the columns used downstream.
selected_columns = [
    "file_date", "DATE", "ORG_CODE", "ORG_NAME",
    "NHSE_REGION_CODE", "NHSE_REGION_NAME",
    "CLUSTER_GROUP", "BENCHMARK_GROUP", "STAFF_GROUP",
    "TYPE", "HC", "FTE",
]
# .copy() makes df1 an independent frame, so the column assignments made on it
# in later cells do not raise pandas' SettingWithCopyWarning.
df1 = df[selected_columns].copy()

In [None]:
# NEED TO REMOVE PERIOD ROWS

In [None]:
# Number of rows in df1.
df1.shape[0]

In [None]:
# Preview the last five rows of df1.
df1.tail(n=5)

In [None]:
# Collapse df1 to one row per (DATE, ORG_NAME, STAFF_GROUP, TYPE), reading rows
# in descending file_date order so values from later file_dates come first.
# NOTE(review): GroupBy.first() takes the first non-null value per column, so a
# result row can combine values from different files — confirm this is intended.
latest_keys = ['DATE', 'ORG_NAME', 'STAFF_GROUP', 'TYPE']
newest_first = df1.sort_values(by='file_date', ascending=False)
df2 = newest_first.groupby(latest_keys).first().reset_index()

In [None]:
# Number of rows in df2.
df2.shape[0]

In [None]:
# Convert both date columns to datetime dtype.
# TODO (left open in the original cell; what remains to be done is not stated)
# DATE values that cannot be parsed become NaT (errors='coerce') and those rows
# are then dropped; file_date is parsed year-first and raises on bad values.
# NOTE(review): df2 is rebuilt from df1 in a later cell, so this conversion does
# not carry through to that later df2 — confirm whether that is intended.
df2 = df2.assign(
    DATE=pd.to_datetime(df2['DATE'], errors='coerce'),
    file_date=pd.to_datetime(df2['file_date'], yearfirst=True),
).dropna(subset=['DATE'])

In [None]:
# Preview the five rows with the highest file_date (df1 itself is not modified).
rows_by_file_date = df1.sort_values(by='file_date', ascending=True)
rows_by_file_date.tail(n=5)

In [None]:
# Map variant spellings of region and staff-group labels onto a single label each.
replace_dict_region = {
    'South East of England': 'South East',
    'South West of England': 'South West',
}
replace_dict_staff = {
    'All staff': 'All staff groups',
    'HCHS Doctors': 'HCHS doctors (exc. junior Drs)',
    'HCHS doctors': 'HCHS doctors (exc. junior Drs)',
}

# Build a new frame with .assign() instead of writing into df1 column by column:
# df1 comes from a column selection of df, and item assignment on such a frame
# can trigger pandas' SettingWithCopyWarning. Exact duplicate rows created by the
# relabelling are then removed.
df1 = df1.assign(
    NHSE_REGION_NAME=df1['NHSE_REGION_NAME'].replace(replace_dict_region),
    STAFF_GROUP=df1['STAFF_GROUP'].replace(replace_dict_staff),
).drop_duplicates()

In [None]:
# Re-order df1 by ascending file_date, then preview its last five rows.
df1 = df1.sort_values(by='file_date', ascending=True)

df1.tail(n=5)

In [None]:
# Rebuild df2 from the relabelled df1: one row per (DATE, ORG_NAME, STAFF_GROUP),
# reading rows in descending file_date order. TYPE is no longer a grouping key
# here, so it is carried as an ordinary value column.
# NOTE(review): this overwrites the df2 whose date columns were converted in an
# earlier cell, so DATE / file_date here keep whatever dtype they have in df1.
# NOTE(review): GroupBy.first() takes the first non-null value per column, so a
# result row can combine values from different files — confirm this is intended.
group_keys = ['DATE', 'ORG_NAME', 'STAFF_GROUP']
newest_first = df1.sort_values(by='file_date', ascending=False)
df2 = newest_first.groupby(group_keys).first().reset_index()

In [None]:
#df2[(df2['ORG_NAME'] == 'Liverpool University Hospitals NHS Foundation Trust') & (df2['STAFF_GROUP']=="All staff groups")].sort_values(by='DATE', ascending = False)

In [None]:
# Order the output by region code, organisation, cluster group and date,
# all in descending order.
sort_columns = ['NHSE_REGION_CODE', 'ORG_NAME', 'CLUSTER_GROUP', 'DATE']
df2 = df2.sort_values(by=sort_columns, ascending=False)

In [None]:
# Preview the first five rows of the sorted output.
df2.head(n=5)


In [None]:
# Write the final frame to a CSV in the parent directory, without the index column.
output_csv = '../sickness_benchmarking.csv'
df2.to_csv(output_csv, index=False)