In [None]:
from datetime import datetime
import pandas as pd
import numpy as np
from pathlib import Path

In [None]:
# Directory that holds the turnover CSV extracts.
path = "tempdir/turnover"  # or unix / linux / mac path

# Path.glob() returns a one-shot generator. Materialise each set of matches
# into a sorted list so that later cells can loop over it more than once (a
# generator is empty after its first full iteration) and so that files are
# always visited in the same order.
files_annual = sorted(Path(path).glob('*annual.csv'))
files_monthly = sorted(Path(path).glob('*monthly.csv'))

# Maps the column headings found in the source CSVs to one standard name each.
# Several spellings of the same heading (e.g. 'Org code' / 'Org Code') map to
# the same target so files with different headings can be stacked together.
columns_to_merge = {
    'Period' : 'PERIOD',
    'Type' : 'TYPE',
    'NHSE region code': 'NHSE_REGION_CODE',
    'NHSE region name': 'NHSE_REGION_NAME',
    'Org code': 'ORG_CODE',
    'Org name': 'ORG_NAME',
    'Org Code': 'ORG_CODE',
    'Org Name': 'ORG_NAME',
    'Org Type': 'ORG_TYPE',
    'FTE days lost': 'FTE_DAYS_LOST',
    'FTE Days Sick' : 'FTE_DAYS_LOST',
    'FTE days available': 'FTE_DAYS_AVAILABLE',
    'FTE Days Available' : 'FTE_DAYS_AVAILABLE',
    'Sickness absence rate (%)': 'SICKNESS_ABSENCE_RATE_PERCENT',
    'Staff group': 'STAFF_GROUP',
    'Cluster group': 'CLUSTER_GROUP',
    'Benchmark group': 'BENCHMARK_GROUP',
    'HC':'HC',
    'FTE':'FTE'
}

In [None]:
# Preview the annual extracts that are about to be loaded.
# If `files_annual` is still the generator returned by Path.glob(), looping
# over it here would use it up and the loading loop further down would read no
# files. Store the matches in a list first so this preview does not consume them.
files_annual = list(files_annual)
for f in files_annual:
    print(f.as_posix())

In [None]:
# Read every annual extract, tag its rows with the date prefix of the file
# name, standardise the column names and stack everything into one frame.
dfs = list()
df = None

for f in files_annual:
    data = pd.read_csv(f.as_posix())
    # The first 10 characters of the file name are used as the file date.
    # Slicing f.name instead of a fixed offset into the full path string keeps
    # this correct when `path` points at a directory of a different length.
    data['file_date'] = f.name[:10]
    # Rename known source headings; headings not in the mapping pass through.
    data.columns = [columns_to_merge.get(k, k) for k in data.columns]
    dfs.append(data)

# pd.concat raises an unhelpful "No objects to concatenate" on an empty list;
# fail with a message that points at the likely causes instead.
if not dfs:
    raise ValueError(
        "No annual CSV files were read: check `path`, and that `files_annual` "
        "has not already been consumed by an earlier loop."
    )

df = pd.concat(dfs, ignore_index=True)

In [None]:
# Column labels of the stacked frame, as a plain list.
df.columns.tolist()

In [None]:
# Keep only the identifying columns plus the HC and FTE measures.
turnover_columns = [
    "file_date",
    "PERIOD",
    "ORG_CODE",
    "ORG_NAME",
    "NHSE_REGION_CODE",
    "NHSE_REGION_NAME",
    "CLUSTER_GROUP",
    "BENCHMARK_GROUP",
    "STAFF_GROUP",
    "TYPE",
    "HC",
    "FTE",
]
df1 = df.loc[:, turnover_columns]

In [None]:
# Collapse rows that share PERIOD / ORG_NAME / STAFF_GROUP / TYPE into one,
# preferring the row with the greatest file_date.
# Note: GroupBy.first() returns the first non-null value of each column, so a
# value that is missing in the newest row is taken from an older one.
dedup_keys = ['PERIOD', 'ORG_NAME', 'STAFF_GROUP', 'TYPE']
newest_first = df1.sort_values(by='file_date', ascending=False)
df2 = newest_first.groupby(dedup_keys).first().reset_index()

In [None]:
# Number of rows left after de-duplication.
df2.shape[0]

In [None]:
# Display the first rows of the de-duplicated frame.
df2.head()

In [None]:
# set date columns to date types
# TODO
def _period_to_date(row):
    """Derive a first-of-month datetime from a row's PERIOD, based on its TYPE.

    'Leavers' / 'Joiners': the text after ' to ' in PERIOD is parsed as
    '%Y%m' with day 01 appended.
    'Denoms': characters 3-9 of PERIOD are parsed as '%m/%Y' with day 01
    prepended.
    Any other TYPE gives None.
    """
    row_type = row['TYPE']
    if row_type in ('Leavers', 'Joiners'):
        period_end = row['PERIOD'].split(' to ')[1]
        return datetime.strptime(period_end + '01', '%Y%m%d')
    if row_type == 'Denoms':
        month_and_year = row['PERIOD'][3:10]
        return datetime.strptime('01/' + month_and_year, '%d/%m/%Y')
    return None


df2['DATE'] = df2.apply(_period_to_date, axis=1)
df2['file_date'] = pd.to_datetime(df2['file_date'], yearfirst=True)

In [None]:
def _first_measure(group, row_type, measure):
    """Return `measure` from the first row of `group` whose TYPE equals
    `row_type`, or None when the group has no such row."""
    matches = group.loc[group['TYPE'] == row_type, measure]
    return matches.values[0] if len(matches) else None


def _summarise_group(group):
    """Turn one region / benchmark group / organisation / staff group / date
    group into a single wide row: the row count plus HC and FTE for each of
    the Joiners, Leavers and Denoms row types."""
    # Every field selects rows with an exact TYPE match. The original used
    # str.contains for join_HC only, which disagreed with its own guard and
    # with the other five fields.
    return pd.Series({
        'n': group.shape[0],
        'join_HC': _first_measure(group, 'Joiners', 'HC'),
        'join_FTE': _first_measure(group, 'Joiners', 'FTE'),
        'leave_HC': _first_measure(group, 'Leavers', 'HC'),
        'leave_FTE': _first_measure(group, 'Leavers', 'FTE'),
        'denom_HC': _first_measure(group, 'Denoms', 'HC'),
        'denom_FTE': _first_measure(group, 'Denoms', 'FTE'),
    })


df3 = (
    df2.groupby(['NHSE_REGION_NAME', 'BENCHMARK_GROUP', 'ORG_NAME', 'STAFF_GROUP', 'DATE'])
    .apply(_summarise_group)
    .reset_index()
)


In [None]:
# Display the column labels of the summarised frame.
df3.columns

In [None]:
# Drop CCGs and ICBs
# A row is removed when its ORG_NAME matches the regex "CCG|ICB".
is_ccg_or_icb = df3['ORG_NAME'].str.contains("CCG|ICB")
df3 = df3.loc[is_ccg_or_icb.eq(False)]

In [None]:
# Display the column labels again after the CCG / ICB rows were removed.
df3.columns

In [None]:
# Continue on an independent copy so that changes to df4 do not touch df3.
df4 = df3.copy(deep=True)
df4.columns

In [None]:
# Index the frame by DATE while keeping DATE as a regular column too, so
# 'DATE' now names both the index and a column.
df4 = df4.set_index('DATE', drop=False)

In [None]:
# Display the first rows of the DATE-indexed frame.
df4.head()

In [None]:
from datetime import timedelta

# Width of the time-based rolling window.
one_year = timedelta(days=365)

# Rolling mean of the FTE denominator over a 365-day window on DATE, computed
# separately for each region / benchmark group / organisation / staff group.
rolling_keys = ['NHSE_REGION_NAME', 'BENCHMARK_GROUP', 'ORG_NAME', 'STAFF_GROUP']
fte_avg = (
    df4.groupby(rolling_keys)
    .rolling(one_year, on="DATE")['denom_FTE']
    .mean()
)

In [None]:
# Rolling mean of the headcount denominator over the `one_year` window on
# DATE, per region / benchmark group / organisation / staff group.
hc_windows = df4.groupby(
    ['NHSE_REGION_NAME', 'BENCHMARK_GROUP', 'ORG_NAME', 'STAFF_GROUP']
).rolling(one_year, on="DATE")
hc_avg = hc_windows['denom_HC'].mean()

In [None]:
# fte_avg / hc_avg are indexed by the four grouping columns plus DATE, whereas
# df4 is indexed by DATE alone. Assigning `avg.reset_index()[col]` (a Series
# on a fresh 0..n-1 index) gets aligned on index labels, matches nothing and
# leaves both columns entirely NaN. Look each row's average up by its full key
# instead and assign the resulting values positionally.
# Relies on each key + DATE combination being unique, which holds because df4
# derives from a groupby over exactly these five columns.
row_keys = pd.MultiIndex.from_frame(
    df4[['NHSE_REGION_NAME', 'BENCHMARK_GROUP', 'ORG_NAME', 'STAFF_GROUP', 'DATE']]
)
df4['AVG_FTE'] = fte_avg.reindex(row_keys).to_numpy()
df4['AVG_HC'] = hc_avg.reindex(row_keys).to_numpy()

In [None]:
# Display the first rows now that AVG_FTE and AVG_HC have been added.
df4.head()

In [None]:
# 'DATE' is both the index name and a column at this point, and sort_values
# refuses such a label as ambiguous (ValueError). Drop the index first: the
# DATE column carries the same values and the index is not written to the CSV.
df4 = (
    df4.reset_index(drop=True)
    .sort_values(
        by=['NHSE_REGION_NAME', 'ORG_NAME', 'BENCHMARK_GROUP', 'DATE'],
        ascending=False,
    )
)

In [None]:
# Write the final table one directory above the working directory; the frame's
# index is left out of the file.
annual_output = '../annual_turnover.csv'
df4.to_csv(annual_output, index=False)