In [1]:
from datetime import datetime
import pandas as pd
import numpy as np
from pathlib import Path

Inactive orgs

In [2]:
import json
from urllib.request import urlopen

# Query the NHS ORD directory API for organisations with Status=Inactive,
# filtered on the role ids RO197,RO98.
# NOTE(review): Limit=1000 looks like a single-page cap — if more than 1000
# organisations match, the rest are presumably not returned; confirm against
# the API's paging rules.
url = "https://directory.spineservices.nhs.uk/ORD/2-0-0/organisations?Status=Inactive&Roles=RO197,RO98&Limit=1000"

# Use the response as a context manager so the HTTP connection is released
# as soon as the body has been read, even if JSON parsing fails.
with urlopen(url) as response:
    data_json = json.loads(response.read())

In [3]:
# Reduce every organisation record in the API payload to the two fields used
# later: its 'Name' (as ORG_NAME) and its 'OrgId' (as ORG_CODE).
inactive_orgs = [
    {"ORG_NAME": org['Name'], "ORG_CODE": org['OrgId']}
    for org in data_json['Organisations']
]

In [4]:
# Persist the inactive-organisation list to disk as JSON.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open("./tempdir/inactive_organisations.txt", 'w') as f:
    json.dump(inactive_orgs, f)


In [5]:
# Reload the saved inactive-organisation list as a DataFrame
# (columns ORG_NAME and ORG_CODE, as written by the JSON dump).
inactive_orgs_file = './tempdir/inactive_organisations.txt'
inactive_orgs = pd.read_json(inactive_orgs_file)

Load turnover data

In [6]:
# Folder holding the downloaded turnover extracts (relative to the notebook).
path = "tempdir/turnover"  # or unix / linux / mac path

# Path.glob() returns a one-shot generator: once a loop has walked it, any
# re-run of a downstream cell would see no files at all. Materialise the
# matches as sorted lists so they can be iterated repeatedly and always in
# the same order.
files_annual = sorted(Path(path).glob('*annual.csv'))
files_monthly = sorted(Path(path).glob('*monthly.csv'))

# Maps the column headers found in the source CSVs to the upper-snake-case
# names used throughout the rest of the notebook. Headers not listed here
# are left unchanged by the rename step.
columns_to_merge = {
    'Period': 'PERIOD',
    'Type': 'TYPE',
    'Org code': 'ORG_CODE',
    'Org name': 'ORG_NAME',
    'NHSE region code': 'NHSE_REGION_CODE',
    'NHSE region name': 'NHSE_REGION_NAME',
    'ICS code': 'ICS_CODE',
    'ICS name': 'ICS_NAME',
    'Cluster group': 'CLUSTER_GROUP',
    'Benchmark group': 'BENCHMARK_GROUP',
    'Staff group': 'STAFF_GROUP',
    'HC': 'HC',
    'FTE': 'FTE',
}

In [7]:
# for f in files_annual:
#     print(f.as_posix())

Concatenate the annual turnover files in tempdir

In [8]:
# Read every annual turnover extract, tag its rows with the date prefix of the
# filename, harmonise the column names, then stack everything into one frame.
dfs = []

for f in files_annual:
    data = pd.read_csv(f)
    # The first 10 characters of the filename are used as the file date
    # (presumably YYYY-MM-DD — confirm against the files in tempdir/turnover).
    # Taking them from f.name, rather than slicing str(f) at a fixed offset,
    # keeps this correct if the folder in `path` is ever changed.
    data['file_date'] = f.name[:10]
    data.columns = [columns_to_merge.get(k, k) for k in data.columns]
    dfs.append(data)

if not dfs:
    # pd.concat would raise a less helpful "No objects to concatenate".
    raise ValueError(
        "No '*annual.csv' files were read: the folder is empty or "
        "files_annual has already been consumed - re-run the cell that defines it."
    )

df = pd.concat(dfs, ignore_index=True)

In [9]:
# Show the column labels of the combined frame.
df.columns.tolist()

['PERIOD',
 'TYPE',
 'ORG_CODE',
 'ORG_NAME',
 'NHSE_REGION_CODE',
 'NHSE_REGION_NAME',
 'CLUSTER_GROUP',
 'BENCHMARK_GROUP',
 'STAFF_GROUP',
 'HC',
 'FTE',
 'Unnamed: 11',
 'Unnamed: 12',
 'Unnamed: 13',
 'Unnamed: 14',
 'Unnamed: 15',
 'Unnamed: 16',
 'Unnamed: 17',
 'Unnamed: 18',
 'Unnamed: 19',
 'Unnamed: 20',
 'Unnamed: 21',
 'Unnamed: 22',
 'file_date',
 'ICS_CODE',
 'ICS_NAME']

In [10]:
# Keep only the identifying keys and the two measures. Region, cluster and
# benchmark attributes are dropped here and re-attached later from the
# organisation reference table.
keep_columns = ["file_date", "PERIOD", "ORG_CODE", "STAFF_GROUP",
                "TYPE", "HC", "FTE"]
df1 = df.loc[:, keep_columns]

In [11]:
# The same PERIOD / ORG_CODE / STAFF_GROUP / TYPE combination can appear in
# several files; order rows newest file first and collapse each combination
# to a single row.
# NOTE(review): GroupBy.first() returns the first non-null value per column,
# so a missing HC or FTE in the newest file would be filled from an older
# file — confirm this is the intended behaviour.
dedupe_keys = ['PERIOD', 'ORG_CODE', 'STAFF_GROUP', 'TYPE']
newest_first = df1.sort_values(by='file_date', ascending=False)
df2 = newest_first.groupby(dedupe_keys).first().reset_index()

In [12]:
def _period_to_date(row):
    """Derive a first-of-month datetime from a row's PERIOD, based on its TYPE.

    - 'Leavers' / 'Joiners': the text after ' to ' in PERIOD, with '01'
      appended, is parsed as %Y%m%d (i.e. the end of the period range).
    - 'Denoms': characters 3-9 of PERIOD, prefixed with '01/', are parsed
      as %d/%m/%Y.
    - any other TYPE: None.
    """
    period = row['PERIOD']
    kind = row['TYPE']
    if kind in ('Leavers', 'Joiners'):
        return datetime.strptime(period.split(' to ')[1] + '01', '%Y%m%d')
    if kind == 'Denoms':
        return datetime.strptime('01/' + period[3:10], '%d/%m/%Y')
    return None


# Add a proper datetime column that always falls on the first of the month.
df2['DATE'] = df2.apply(_period_to_date, axis=1)

# Convert the filename-derived date string to a datetime (year comes first).
df2['file_date'] = pd.to_datetime(df2['file_date'], yearfirst=True)

In [13]:
def _first_value(group, type_name, column):
    """Return `column` from the first row of `group` whose TYPE equals
    `type_name`, or None when the group has no such row."""
    matches = group.loc[group['TYPE'] == type_name, column]
    return matches.values[0] if len(matches) else None


def _summarise_group(group):
    """Spread one (ORG_CODE, STAFF_GROUP, DATE) group's Joiners / Leavers /
    Denoms rows into a single record of HC and FTE values.

    'n' is the number of source rows in the group. Every TYPE is matched by
    exact equality; join_HC previously used str.contains('Joiners') while its
    guard and all other fields used ==, so it could pick a different row.
    """
    return pd.Series({
        'n': group.shape[0],
        'join_HC': _first_value(group, 'Joiners', 'HC'),
        'join_FTE': _first_value(group, 'Joiners', 'FTE'),
        'leave_HC': _first_value(group, 'Leavers', 'HC'),
        'leave_FTE': _first_value(group, 'Leavers', 'FTE'),
        'denom_HC': _first_value(group, 'Denoms', 'HC'),
        'denom_FTE': _first_value(group, 'Denoms', 'FTE'),
    })


# One output row per organisation / staff group / month.
df3 = (
    df2.groupby(['ORG_CODE', 'STAFF_GROUP', 'DATE'])
       .apply(_summarise_group)
       .reset_index()
)


In [34]:
# Left-join the inactive-organisation list onto the turnover rows; ORG_NAME
# ends up populated only for organisation codes present in that list.
df4 = df3.merge(inactive_orgs, how='left', on='ORG_CODE')

In [35]:
# Keep only rows that found no match in the inactive list (ORG_NAME missing),
# then discard the helper column brought in by the merge.
is_active = df4['ORG_NAME'].isna()
df4 = df4.loc[is_active].drop(columns='ORG_NAME')

In [36]:
# Load the organisation reference table, keeping only the join key and the
# two attributes used downstream. Selecting the wanted columns (instead of
# dropping every unwanted column by name) means this cell no longer raises a
# KeyError when the extract gains, loses or renames an unrelated column.
url_ref_org = '../REF_ORGANISATION.csv'

# source column name -> name used in this notebook
ref_org_columns = {
    'Org_Code_For_Join': 'ORG_CODE',
    'Org_Region_Name': 'region_name',
    'Org_Type_Grouped': 'ORG_TYPE',
}

ref_org = (
    pd.read_csv(url_ref_org, usecols=list(ref_org_columns))
      .rename(columns=ref_org_columns)
)

ref_org.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21813 entries, 0 to 21812
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ORG_CODE     21813 non-null  object
 1   region_name  21647 non-null  object
 2   ORG_TYPE     21813 non-null  object
dtypes: object(3)
memory usage: 511.4+ KB


In [37]:
# Attach region and organisation type from the reference table.
# NOTE(review): if ORG_CODE is not unique in ref_org this left join will
# duplicate turnover rows — confirm the reference table has one row per code.
df5 = df4.merge(ref_org, how='left', on='ORG_CODE')

In [38]:
# Preview the first five merged rows.
df5.head(n=5)

Unnamed: 0,ORG_CODE,STAFF_GROUP,DATE,n,join_HC,join_FTE,leave_HC,leave_FTE,denom_HC,denom_FTE,region_name,ORG_TYPE
0,00P,All staff groups,2017-04-01,1.0,,,,,13.0,10.75333,NORTH EAST AND YORKSHIRE,INTEGRATED CARE BOARD
1,00P,All staff groups,2017-05-01,1.0,,,,,13.0,10.75333,NORTH EAST AND YORKSHIRE,INTEGRATED CARE BOARD
2,00P,All staff groups,2017-06-01,1.0,,,,,13.0,11.25333,NORTH EAST AND YORKSHIRE,INTEGRATED CARE BOARD
3,00P,All staff groups,2017-07-01,1.0,,,,,12.0,10.25333,NORTH EAST AND YORKSHIRE,INTEGRATED CARE BOARD
4,00P,All staff groups,2017-08-01,1.0,,,,,12.0,10.25333,NORTH EAST AND YORKSHIRE,INTEGRATED CARE BOARD


In [39]:
# Keep rows that have an organisation type and are not Integrated Care Boards.
org_type = df5['ORG_TYPE']
keep_rows = org_type.notna() & (org_type != 'INTEGRATED CARE BOARD')
df5 = df5.loc[keep_rows]


In [40]:
# List the organisation types that remain after filtering.
df5.loc[:, 'ORG_TYPE'].unique()

array(['ACUTE', 'COMMUNITY', 'MENTAL HEALTH AND LEARNING DISABILITY',
       'AMBULANCE', 'CARE TRUST'], dtype=object)

In [41]:
# Work on an independent copy so the columns added below do not touch df5.
# The former bare `df6.sort_values('DATE')` call was removed: sort_values
# returns a new frame, so its discarded result never changed df6. The date
# ordering that matters is applied where the 12-row lag is computed.
df6 = df5.copy()
df6.columns

Index(['ORG_CODE', 'STAFF_GROUP', 'DATE', 'n', 'join_HC', 'join_FTE',
       'leave_HC', 'leave_FTE', 'denom_HC', 'denom_FTE', 'region_name',
       'ORG_TYPE'],
      dtype='object')

In [42]:
# For each organisation / staff group, in DATE order, take the denominator
# from 12 rows earlier. Assignment realigns on the original index, so df6's
# own row order is unchanged.
# NOTE(review): shift(12) is positional — it equals "12 months earlier" only
# if every group has one row per consecutive month; confirm there are no gaps.
by_org_staff = df6.sort_values(by=['DATE']).groupby(['ORG_CODE', 'STAFF_GROUP'])
for denom_col in ('denom_FTE', 'denom_HC'):
    df6[f'{denom_col}_12'] = by_org_staff[denom_col].shift(12)

In [43]:
# Average the current denominator with its 12-row-lagged value.
# mean(axis=1) skips NaN, so where the lagged value is missing the result is
# simply the current denominator.
for measure in ('FTE', 'HC'):
    current_col = f'denom_{measure}'
    lagged_col = f'denom_{measure}_12'
    df6[f'denom_{measure}_mean'] = df6[[current_col, lagged_col]].mean(axis=1)

In [44]:
# Preview the first five rows with the new denominator columns.
df6.head(n=5)

Unnamed: 0,ORG_CODE,STAFF_GROUP,DATE,n,join_HC,join_FTE,leave_HC,leave_FTE,denom_HC,denom_FTE,region_name,ORG_TYPE,denom_FTE_12,denom_HC_12,denom_FTE_mean,denom_HC_mean
49086,R0A,All staff groups,2017-10-01,1.0,,,,,19130.0,17095.95832,NORTH WEST,ACUTE,,,17095.95832,19130.0
49087,R0A,All staff groups,2017-11-01,1.0,,,,,19211.0,17171.25859,NORTH WEST,ACUTE,,,17171.25859,19211.0
49088,R0A,All staff groups,2017-12-01,1.0,,,,,19214.0,17170.86679,NORTH WEST,ACUTE,,,17170.86679,19214.0
49089,R0A,All staff groups,2018-01-01,1.0,,,,,19295.0,17240.694,NORTH WEST,ACUTE,,,17240.694,19295.0
49090,R0A,All staff groups,2018-02-01,1.0,,,,,19326.0,17271.63829,NORTH WEST,ACUTE,,,17271.63829,19326.0


In [45]:
# Expose the DATE column under the name month_year.
df6 = df6.rename(columns={'DATE': 'month_year'})

In [25]:
#df5[(df5['ORG_NAME']=='Yorkshire Ambulance Service NHS Trust') & (df5['STAFF_GROUP']=='All staff groups') & (df5['DATE'].dt.month == 7)]

In [46]:
# Take an independent (deep) copy to use as the export frame.
df7 = df6.copy(deep=True)

In [48]:
# Order the export by organisation code, then month, both descending.
sort_keys = ['ORG_CODE', 'month_year']
df7 = df7.sort_values(by=sort_keys, ascending=[False, False])

In [49]:
# Write the final table one directory above the notebook, without the index.
output_csv = '../annual_turnover.csv'
df7.to_csv(output_csv, index=False)