In [None]:
from datetime import datetime
import pandas as pd
import numpy as np
from pathlib import Path

Inactive organisations

In [None]:
import json
from urllib.request import urlopen

# NHS Spine directory (ORD 2-0-0) query: organisations with Status=Inactive
# and Roles RO197/RO98, capped by the Limit=1000 parameter.
# NOTE(review): if more than 1000 organisations match, the remainder is
# presumably not returned by this single request - confirm whether paging is needed.
url = "https://directory.spineservices.nhs.uk/ORD/2-0-0/organisations?Status=Inactive&Roles=RO197,RO98&Limit=1000"

# Read the body inside a context manager so the HTTP connection is always
# closed, even if reading or JSON decoding raises.
with urlopen(url) as response:
    data_json = json.loads(response.read())

# Uncomment to inspect the raw payload
#print(data_json)

In [None]:
# Reduce every organisation record in the API payload to its name and code.
inactive_orgs = [
    {"ORG_NAME": org['Name'], "ORG_CODE": org['OrgId']}
    for org in data_json['Organisations']
]

In [None]:
# Cache the inactive-organisation records on disk as JSON (the file keeps a
# .txt suffix). The `with` block closes the handle on exit, so no explicit
# close() call is needed afterwards.
with open("./tempdir/inactive_organisations.txt", 'w') as f:
    json.dump(inactive_orgs, f)


In [None]:
# Reload the cached JSON as a DataFrame. Note that this rebinds
# `inactive_orgs` from the list of dicts built earlier to a pandas DataFrame.
inactive_orgs = pd.read_json('./tempdir/inactive_organisations.txt')

Load turnover data

In [None]:
path = "tempdir/turnover"  # or unix / linux / mac path

# Collect the annual and monthly CSVs under `path`.
# Path.glob returns a one-shot generator, so the matches are materialised as
# sorted lists: the file lists can then be iterated more than once (e.g. when
# a later cell is re-run) and are processed in a deterministic order.
files_annual = sorted(Path(path).glob('*annual.csv'))
files_monthly = sorted(Path(path).glob('*monthly.csv'))

# Maps column headers as they appear in the source CSVs to the upper-case
# names used in the rest of the notebook. Headers not listed are left as-is
# by the lookup that applies this mapping.
columns_to_merge = {
    'Period' : 'PERIOD',
    'Type' : 'TYPE',
    'Org code': 'ORG_CODE',
    'Org name': 'ORG_NAME',
    'NHSE region code': 'NHSE_REGION_CODE',
    'NHSE region name': 'NHSE_REGION_NAME',
    'ICS code': 'ICS_CODE',
    'ICS name': 'ICS_NAME',
    'Cluster group': 'CLUSTER_GROUP',
    'Benchmark group': 'BENCHMARK_GROUP',   
    'Staff group': 'STAFF_GROUP',
    'HC':'HC',
    'FTE':'FTE'
    # Inactive mappings kept for reference (not applied):
    # 'Org Code': 'ORG_CODE',
    # 'Org Name': 'ORG_NAME',
    # 'Org Type': 'ORG_TYPE',
    # 'FTE days lost': 'FTE_DAYS_LOST',
    # 'FTE Days Sick' : 'FTE_DAYS_LOST',
    # 'FTE days available': 'FTE_DAYS_AVAILABLE',
    # 'FTE Days Available' : 'FTE_DAYS_AVAILABLE',
    # 'Sickness absence rate (%)': 'SICKNESS_ABSENCE_RATE_PERCENT',

}

In [None]:
# for f in files_annual:
#     print(f.as_posix())

Concatenate the annual turnover files in tempdir

In [None]:
# Read every annual CSV, tag its rows with the file's date prefix, normalise
# the column names, and stack everything into a single frame `df`.
dfs = []

for f in files_annual:
    data = pd.read_csv(f)
    # The first 10 characters of the file name are used as the file date
    # (presumably a YYYY-MM-DD prefix - confirm against the naming convention).
    # f.name is used instead of slicing str(f) at fixed offsets, so the result
    # no longer depends on the length of the directory part of the path.
    data['file_date'] = f.name[:10]
    # Rename known headers; unknown headers (including file_date) pass through.
    data.columns = [columns_to_merge.get(k, k) for k in data.columns]
    dfs.append(data)

if not dfs:
    # pd.concat([]) would raise a much less helpful "No objects to concatenate".
    raise ValueError(
        "files_annual yielded no files - check the turnover folder, or re-run "
        "the cell that builds files_annual"
    )

df = pd.concat(dfs, ignore_index=True)

In [None]:
# Show the column labels of the concatenated frame.
df.columns.tolist()

In [None]:
# Organisation name, region, cluster and benchmark attributes are left out
# here; they are rejoined at a later stage from the latest reference table.
turnover_cols = [
    "file_date",
    "PERIOD",
    "ORG_CODE",
    "STAFF_GROUP",
    "TYPE",
    "HC",
    "FTE",
]
df1 = df[turnover_cols]

In [None]:
# Collapse to one row per (PERIOD, ORG_CODE, STAFF_GROUP, TYPE), ordering rows
# newest file first so the latest publication wins.
# NOTE(review): GroupBy.first() takes the first non-null entry per column, so
# a null HC/FTE in the newest file falls back to an older file's value -
# confirm this is the intended behaviour.
df2 = (
    df1
    .sort_values(by='file_date', ascending=False)
    .groupby(['PERIOD', 'ORG_CODE', 'STAFF_GROUP', 'TYPE'])
    .first()
    .reset_index()
)

In [None]:
def _period_to_date(row):
    """Turn one row's PERIOD string into a first-of-month datetime.

    - 'Leavers' / 'Joiners': PERIOD is split on ' to ' and the second part is
      parsed as YYYYMM with the day fixed to 01.
    - 'Denoms': characters 3-9 of PERIOD are parsed as MM/YYYY with the day
      fixed to 01.
    - any other TYPE: None.

    NOTE(review): the original comment describes using the "from" date, but
    index [1] selects the part after ' to ' - confirm which is intended.
    """
    if row['TYPE'] in ('Leavers', 'Joiners'):
        year_month = row['PERIOD'].split(' to ')[1]
        return datetime.strptime(year_month + '01', '%Y%m%d')
    if row['TYPE'] == 'Denoms':
        return datetime.strptime('01/' + row['PERIOD'][3:10], '%d/%m/%Y')
    return None


# Derive a first-of-month DATE per row, then convert file_date to a datetime.
df2['DATE'] = df2.apply(_period_to_date, axis=1)
df2['file_date'] = pd.to_datetime(df2['file_date'], yearfirst=True)

In [None]:
def _first_value(group, type_label, column):
    """Return `column` from the first row of `group` whose TYPE equals
    `type_label`, or None when the group has no such row."""
    matches = group.loc[group['TYPE'] == type_label, column]
    return matches.values[0] if len(matches) else None


def _summarise_group(group):
    """Spread one (ORG_CODE, STAFF_GROUP, DATE) group's Joiners / Leavers /
    Denoms rows into a single wide record, plus the group's row count `n`."""
    return pd.Series({
        'n': group.shape[0],
        # Exact TYPE equality is used for every field; join_HC previously
        # selected with str.contains while its guard used ==.
        'join_HC': _first_value(group, 'Joiners', 'HC'),
        'join_FTE': _first_value(group, 'Joiners', 'FTE'),
        'leave_HC': _first_value(group, 'Leavers', 'HC'),
        'leave_FTE': _first_value(group, 'Leavers', 'FTE'),
        'denom_HC': _first_value(group, 'Denoms', 'HC'),
        'denom_FTE': _first_value(group, 'Denoms', 'FTE'),
    })


# One output row per organisation / staff group / date.
df3 = (
    df2
    .groupby(['ORG_CODE', 'STAFF_GROUP', 'DATE'])
    .apply(_summarise_group)
    .reset_index()
)

# 10 min runtime MR work laptop

In [None]:
# org_ref = pd.read_csv('../REF_ORGANISATION.csv')

# # keep only first five columns
# org_ref = org_ref.iloc[:,0:5]

# # rename columns
# org_ref.columns = ['ORG_CODE', 'ORG_CODE_USE', 'ORG_NAME_LEGACY',
#                     'ORG_NAME', 'ORG_STATUS']

# # keep only org_code and org_status
# org_ref = org_ref[['ORG_CODE', 'ORG_STATUS']]

# org_ref.info()


In [None]:
# # join org_ref to df3
# df4 = pd.merge(df3, org_ref, how='left', on=['ORG_CODE'])

# df4.info()

In [None]:
# show unique org_status values
# df4['ORG_STATUS'].unique()

In [None]:
# drop organisations where org_status is not open
#df5 = df4[df4['ORG_STATUS'] == 'Open']

In [None]:
#inactive_orgs.info()

In [None]:
# Merge inactive dataframe
#df4 = pd.merge(df3, inactive_orgs, on='ORG_CODE', how='left')

In [None]:
#orgs_pre_filter = df4['ORG_CODE'].nunique()

In [None]:
# # # Filter out inactive organizations
# df4 = df4[~df4['ORG_NAME'].notna()].copy()
# df4.drop(columns='ORG_NAME', inplace=True)
# # how many orgs were filtered out?
# orgs_post_filter = df4['ORG_CODE'].nunique()
# n_orgs_filtered = orgs_pre_filter - orgs_post_filter
# print(f"Number of inactive organizations filtered out: {n_orgs_filtered}")

In [None]:
# Organisation reference table: load it, discard the columns that are not
# needed, and rename the remaining key/descriptive columns to the names used
# in the turnover data.
url_ref_org = '../REF_ORGANISATION.csv'

ref_org = (
    pd.read_csv(url_ref_org)
    .drop(columns=[
        'Org_Code_For_Join', 'Org_Open_Date',
        'Org_Region_Code',
        'Org_System_Code', 'Org_ICB_Name',
        'Org_Close_Date', 'Org_Name', 'Org_Type', 'Org_Post_Code',
        'Legacy_Org_Close_Date', 'UDALFileID', 'Org_System_Name',
        ' NHS Provider flag ', ' Total WTE recorded ',
    ])
    .rename(columns={
        'Org_Code_For_Use': 'ORG_CODE',
        'Org_Type_Grouped': 'ORG_TYPE',
        'Org_Name_For_Use': 'ORG_NAME',
        'Org_Region_Name': 'region_name',
    })
)

ref_org.info()


In [None]:
# Attach the organisation reference attributes to every turnover row.
# NOTE(review): a left join repeats turnover rows if ref_org holds more than
# one row per ORG_CODE - worth confirming the key is unique in the reference table.
df5 = df3.merge(ref_org, how='left', on=['ORG_CODE'])

In [None]:
# Preview the first rows of the merged frame.
df5.head()

In [None]:
# Keep a row only when all of the following hold:
#   - the organisation's status is 'Open'
#   - its org type is present and is not an Integrated Care Board
#   - denom_FTE is present and non-zero (otherwise there were no staff in
#     post for that group/org/period)
keep_rows = (
    (df5['Org_Status'] == 'Open')
    & ~df5['ORG_TYPE'].isin(['INTEGRATED CARE BOARD'])
    & df5['ORG_TYPE'].notna()
    & (df5['denom_FTE'] != 0)
    & df5['denom_FTE'].notna()
)

# The status column has served its purpose once the filter is applied.
df5 = df5[keep_rows].drop(columns=['Org_Status'])


In [None]:
# Summarise the filtered frame (columns, non-null counts, dtypes).
df5.info()

In [None]:
# Work on an independent copy so the derived columns added to df6 do not
# touch df5.
# The previous `df6.sort_values('DATE')` call discarded its result and so had
# no effect; it has been removed along with the redundant `df6 = None`.
df6 = df5.copy()
df6.columns

In [None]:
# Denominators from exactly 12 months before each row's DATE, per
# organisation and staff group.
# The lookup is matched on the date itself rather than by shifting 12 rows:
# a row-based shift silently pairs a row with the wrong period whenever a
# group is missing a month (for example one dropped by an earlier filter).
# Rows with no record 12 months earlier get NaN.
join_keys = ['ORG_CODE', 'STAFF_GROUP', 'DATE']

prior_year = (
    df6[join_keys + ['denom_FTE', 'denom_HC']]
    .rename(columns={'denom_FTE': 'denom_FTE_12', 'denom_HC': 'denom_HC_12'})
    # Re-date each record 12 months forward so it lines up with the row for
    # which it is the "year before" value.
    .assign(DATE=lambda d: d['DATE'] + pd.DateOffset(months=12))
    # Guarantee at most one match per key so the left merge keeps df6's length.
    .drop_duplicates(subset=join_keys)
)

# A left merge preserves the row order of df6[join_keys], so the results can
# be assigned back positionally.
matched = df6[join_keys].merge(prior_year, on=join_keys, how='left')
df6['denom_FTE_12'] = matched['denom_FTE_12'].to_numpy()
df6['denom_HC_12'] = matched['denom_HC_12'].to_numpy()

In [None]:
# Average each current denominator with its 12-period-earlier counterpart.
# DataFrame.mean skips NaN by default, so a row lacking the earlier value
# averages over the current value alone.
mean_inputs = {
    'denom_FTE_mean': ['denom_FTE', 'denom_FTE_12'],
    'denom_HC_mean': ['denom_HC', 'denom_HC_12'],
}
for mean_col, source_cols in mean_inputs.items():
    df6[mean_col] = df6[source_cols].mean(axis=1)

In [None]:
# Preview the first rows, including the newly derived denominator columns.
df6.head()

In [None]:
# DATE is published under the name month_year.
df6 = df6.rename(columns={'DATE': 'month_year'})

In [None]:
#df5[(df5['ORG_NAME']=='Yorkshire Ambulance Service NHS Trust') & (df5['STAFF_GROUP']=='All staff groups') & (df5['DATE'].dt.month == 7)]

In [None]:
# Independent copy used for the final ordering and export.
df7 = df6.copy()

In [None]:
# Order the output by organisation code, then month, both descending.
df7 = df7.sort_values(['ORG_CODE', 'month_year'], ascending=False)

In [None]:
# Write the final table one directory above the notebook, without the index column.
df7.to_csv('../annual_turnover.csv', index = False)