# Data processing for model

## Import libraries

In [141]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

#import matplotlib.pyplot as plt
#import seaborn as sns

#from sklearn.preprocessing import MinMaxScaler
#import geopandas as gpd

## Read in and process dependent variable - turnover

Read turnover data from csv into a dataframe.

In [142]:
# annual and monthly data
#annual_url = '../01_data/processed_annual_turnover.csv'
annual_url = '../01_data/annual_turnover.csv'

# monthly_url = '../01_data/processed_monthly_turnover.csv'

# staff group ref table
ref_sg = pd.read_csv('../01_data/ref_sg_grouped.csv')

annual_df = pd.read_csv(annual_url, parse_dates=['month_year'])
annual_df = annual_df.drop(['n'],axis=1)

annual_df.rename(columns={'ORG_CODE':'org_code',
                        'STAFF_GROUP':'staff_group'}, inplace=True)

# drop unneeded columns
annual_df = annual_df.drop(['ORG_TYPE','join_HC','leave_HC','denom_HC','denom_FTE_12',
                            'ORG_NAME','denom_HC_12','denom_HC_mean'],axis=1)

# group staff groups with mapping table
annual_df = pd.merge(annual_df, ref_sg, on='staff_group', how='left')

annual_df.info()

# monthly_df = pd.read_csv(monthly_url, parse_dates=['month_year'])
# monthly_df = monthly_df.drop(['n'],axis=1)
# monthly_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 223188 entries, 0 to 223187
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   org_code        223188 non-null  object        
 1   staff_group     223188 non-null  object        
 2   month_year      223188 non-null  datetime64[ns]
 3   join_FTE        178819 non-null  float64       
 4   leave_FTE       175224 non-null  float64       
 5   denom_FTE       223188 non-null  float64       
 6   region_name     223188 non-null  object        
 7   denom_FTE_mean  223188 non-null  float64       
 8   grouped_sg      223188 non-null  object        
dtypes: datetime64[ns](1), float64(4), object(4)
memory usage: 15.3+ MB


clean and process DV dataframes

In [143]:
# add leaver and joiner rates column
annual_df['leaver_rate'] = annual_df['leave_FTE']/annual_df['denom_FTE_mean']
annual_df['joiner_rate'] = annual_df['join_FTE']/annual_df['denom_FTE_mean']

In [144]:
annual_df1 = annual_df.copy()

annual_df1.head()

Unnamed: 0,org_code,staff_group,month_year,join_FTE,leave_FTE,denom_FTE,region_name,denom_FTE_mean,grouped_sg,leaver_rate,joiner_rate
0,TAJ,All staff groups,2023-07-01,614.16006,346.48324,3453.54378,MIDLANDS,3324.193505,all,0.104231,0.184755
1,TAJ,Central functions,2023-07-01,42.25333,35.56669,278.88667,MIDLANDS,273.472425,non-clinical,0.130056,0.154507
2,TAJ,HCHS doctors (exc. junior Drs),2023-07-01,4.8,5.6,122.66591,MIDLANDS,120.94091,clinical,0.046304,0.039689
3,TAJ,"Hotel, property & estates",2023-07-01,22.14668,15.36,181.20004,MIDLANDS,180.306715,non-clinical,0.085188,0.122828
4,TAJ,Managers,2023-07-01,21.8,11.05333,93.59334,MIDLANDS,78.290005,non-clinical,0.141184,0.278452


In [145]:
# drop 'all staff' group
annual_df1 = annual_df1[annual_df1['grouped_sg'] != 'all']

In [146]:
# Assuming 'df' is your dataframe
# Group by 'org', 'date', and 'staff_group' and sum the 'denom_FTE'
grouped_df = annual_df1.groupby(['org_code', 'month_year', 'grouped_sg'])['denom_FTE'].sum().reset_index()

# Calculate the total 'denom_FTE' for each 'org' and 'date'
total_denom_FTE = grouped_df.groupby(['org_code', 'month_year'])['denom_FTE'].transform('sum')

# Create new columns for the proportion of each staff group within each organization and date
grouped_df['prop_staff_group'] = (grouped_df['denom_FTE'] / total_denom_FTE)

# Pivot the dataframe to have columns for each staff group
pivot_df = grouped_df.pivot_table(index=['org_code', 'month_year'], columns='grouped_sg', values='prop_staff_group', 
                                  fill_value=0).reset_index()

# Rename the columns to include the prefix '%_'
#pivot_df.columns = [f'%_{col}_staff' if col != ('org', 'date') else col for col in pivot_df.columns]

# Merge the pivot table back to the original dataframe
annual_df2 = pd.merge(annual_df1, pivot_df, on=['org_code', 'month_year'], how='left')

# If there are missing values, you may want to fill them with 0
annual_df2.fillna(0, inplace=True)

In [147]:
annual_df2.head()

Unnamed: 0,org_code,staff_group,month_year,join_FTE,leave_FTE,denom_FTE,region_name,denom_FTE_mean,grouped_sg,leaver_rate,joiner_rate,clinical,non-clinical
0,TAJ,Central functions,2023-07-01,42.25333,35.56669,278.88667,MIDLANDS,273.472425,non-clinical,0.130056,0.154507,0.577655,0.422345
1,TAJ,HCHS doctors (exc. junior Drs),2023-07-01,4.8,5.6,122.66591,MIDLANDS,120.94091,clinical,0.046304,0.039689,0.577655,0.422345
2,TAJ,"Hotel, property & estates",2023-07-01,22.14668,15.36,181.20004,MIDLANDS,180.306715,non-clinical,0.085188,0.122828,0.577655,0.422345
3,TAJ,Managers,2023-07-01,21.8,11.05333,93.59334,MIDLANDS,78.290005,non-clinical,0.141184,0.278452,0.577655,0.422345
4,TAJ,Nurses & health visitors,2023-07-01,155.0763,108.6144,1007.00718,MIDLANDS,986.729345,clinical,0.110075,0.157162,0.577655,0.422345


In [148]:
## Calculate a regional sip FTE by staff group column for later use
# staff in post by staff group for each region by month
df_sip_region = annual_df2.groupby(['month_year','staff_group','region_name'])['denom_FTE'].sum().reset_index()
# # # merge the df_sip_region DataFrame  
annual_df3 = pd.merge(annual_df2, df_sip_region, on=['month_year','region_name','staff_group'], suffixes=('', '_region'))

In [149]:
annual_df3.rename(columns={'denom_FTE_total': 'total_sip_FTE','denom_FTE_region':'sip_FTE_region'}, inplace=True)
annual_df3.head()

Unnamed: 0,org_code,staff_group,month_year,join_FTE,leave_FTE,denom_FTE,region_name,denom_FTE_mean,grouped_sg,leaver_rate,joiner_rate,clinical,non-clinical,sip_FTE_region
0,TAJ,Central functions,2023-07-01,42.25333,35.56669,278.88667,MIDLANDS,273.472425,non-clinical,0.130056,0.154507,0.577655,0.422345,21026.73633
1,RYW,Central functions,2023-07-01,60.07332,63.52,339.09167,MIDLANDS,483.5315,non-clinical,0.131367,0.124239,0.599197,0.400803,21026.73633
2,RYK,Central functions,2023-07-01,6.0,5.0,22.70667,MIDLANDS,20.58667,non-clinical,0.242876,0.291451,0.509536,0.490464,21026.73633
3,RYG,Central functions,2023-07-01,42.77333,40.58667,228.20186,MIDLANDS,233.11593,non-clinical,0.174105,0.183485,0.601556,0.398444,21026.73633
4,RYA,Central functions,2023-07-01,21.3,31.54667,229.50426,MIDLANDS,222.14666,non-clinical,0.142008,0.095883,0.904498,0.095502,21026.73633


In [150]:
# replace inf values with nan (can happen with rate calcs)
# df4.replace([np.inf, -np.inf], np.nan, inplace=True)
# # transform nans to zeros
# df4.fillna(0, inplace=True)

# Add a small constant to avoid taking the log of zero
# small_constant = 1e-5
    
# # log scale the total_SIP_FTE column to be in line with other variables. proxy for size of organisation
# annual_df3['log_total_sip_FTE'] = np.log(annual_df3['total_sip_FTE'] + small_constant)

# # drop unused columns (keep total SIP FTE for calculating vacancy rates later)
# annual_df3.drop(['join_FTE','leave_FTE','denom_FTE','total_sip_FTE'], axis=1, inplace=True)


In [151]:
annual_df3.head()

Unnamed: 0,org_code,staff_group,month_year,join_FTE,leave_FTE,denom_FTE,region_name,denom_FTE_mean,grouped_sg,leaver_rate,joiner_rate,clinical,non-clinical,sip_FTE_region
0,TAJ,Central functions,2023-07-01,42.25333,35.56669,278.88667,MIDLANDS,273.472425,non-clinical,0.130056,0.154507,0.577655,0.422345,21026.73633
1,RYW,Central functions,2023-07-01,60.07332,63.52,339.09167,MIDLANDS,483.5315,non-clinical,0.131367,0.124239,0.599197,0.400803,21026.73633
2,RYK,Central functions,2023-07-01,6.0,5.0,22.70667,MIDLANDS,20.58667,non-clinical,0.242876,0.291451,0.509536,0.490464,21026.73633
3,RYG,Central functions,2023-07-01,42.77333,40.58667,228.20186,MIDLANDS,233.11593,non-clinical,0.174105,0.183485,0.601556,0.398444,21026.73633
4,RYA,Central functions,2023-07-01,21.3,31.54667,229.50426,MIDLANDS,222.14666,non-clinical,0.142008,0.095883,0.904498,0.095502,21026.73633


The data show the full time equivalent (FTE) number of leavers by organisation and staff group for the previous 12-month period from the date. It also shows the number of staff in post (SIP) FTE averaged over the 12-month period to date.

In [152]:
sorted(annual_df1['staff_group'].unique())

['Ambulance staff',
 'Central functions',
 'HCHS doctors (exc. junior Drs)',
 'Hotel, property & estates',
 'Managers',
 'Midwives',
 'Nurses & health visitors',
 'Other staff or those with unknown classification',
 'Scientific, therapeutic & technical staff',
 'Senior managers',
 'Support to ST&T staff',
 'Support to ambulance staff',
 'Support to doctors, nurses & midwives']

## Load independent variable 1 - local unemployment

Load data about local unemployment so we can use it as a regressor

In [153]:
url_r1 = '../01_data/ONS_localunemployment_monthly.csv'
df_r1 = pd.read_csv(url_r1, parse_dates=['Date'])

df_r1.rename(columns={'%':'local_unemployment','Date':'month_year',
                      'NHSE region name':'region_name'},inplace=True)
df_r1 = df_r1.sort_values('month_year')
df_r1.tail()

Unnamed: 0,month_year,region_name,local_unemployment
2619,2023-06-01,London,0.048
2620,2023-06-01,Midlands,0.0455
2621,2023-06-01,North East,0.052
2622,2023-06-01,North West,0.053
2624,2023-06-01,South West,0.036


## Load IV 2 - sickness absence

Load data about sickness absence to use as second regressor

In [154]:
url_r2 = '../01_data/sickness_benchmarking.csv'
df_r2 = pd.read_csv(url_r2, parse_dates=['DATE'])
trust_types_todrop = ['Clinical Commissioning Group','Integrated Care Board']
df_r2 = df_r2[~df_r2['CLUSTER_GROUP'].isin(trust_types_todrop)]
df_r2 = df_r2.drop(['BENCHMARK_GROUP','ORG_NAME',
                     'NHSE_REGION_CODE','CLUSTER_GROUP'],axis=1)
df_r2.rename(columns={'ORG_CODE':'org_code','DATE':'month_year',
                       'NHSE_REGION_NAME':'region_name','STAFF_GROUP':'staff_group',
                       'FTE_DAYS_LOST':'fte_days_lost','FTE_DAYS_AVAILABLE':'fte_days_available'},inplace=True)
merge_cols = ['month_year', 'org_code','region_name','staff_group']
df_r2['sickness_absence'] = df_r2['fte_days_lost']/df_r2['fte_days_available']
df_r2 = df_r2.reset_index(drop=True)
df_r2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 192606 entries, 0 to 192605
Data columns (total 8 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   month_year          192606 non-null  datetime64[ns]
 1   staff_group         192606 non-null  object        
 2   file_date           192606 non-null  object        
 3   org_code            192606 non-null  object        
 4   region_name         192606 non-null  object        
 5   fte_days_lost       166612 non-null  float64       
 6   fte_days_available  166612 non-null  float64       
 7   sickness_absence    166358 non-null  float64       
dtypes: datetime64[ns](1), float64(3), object(4)
memory usage: 11.8+ MB


Create 12-month rolling sickness absence column for use with annual turnover data

In [155]:
df_r2['month_year'] = pd.to_datetime(df_r2['month_year'])

# Sort the DataFrame by organisation, staff_group, and month
df_r2.sort_values(by=['month_year'], inplace=True)

# # Calculate the rolling sums for days lost and days available
# df_r2['rolling_days_lost'] = df_r2.groupby(['org_code', 
#                         'staff_group'])['fte_days_lost'].rolling(window=12, min_periods=1).sum().reset_index(level=[0, 1], drop=True)

# df_r2['rolling_days_available'] = df_r2.groupby(['org_code', 
#                         'staff_group'])['fte_days_available'].rolling(window=12, min_periods=1).sum().reset_index(level=[0, 1], drop=True)

# # Calculate the rolling sickness absence rate
# df_r2['annual_sickness_absence'] = df_r2['rolling_days_lost'] / df_r2['rolling_days_available']

# drop fte_days_lost fte_days_available, rolling_days_available and rolling_days_lost columns
df_r2.drop(columns=['fte_days_lost', 'fte_days_available','file_date','region_name'], inplace=True)
#, 'rolling_days_available', 'rolling_days_lost'], inplace=True)

df_r2.head()

Unnamed: 0,month_year,staff_group,org_code,sickness_absence
192605,2018-04-01,Other staff or those with unknown classification,8JX76,
167398,2018-04-01,Nurses & health visitors,RAT,0.051255
167399,2018-04-01,Other staff or those with unknown classification,RAT,
167400,2018-04-01,"Scientific, therapeutic & technical staff",RAT,0.02221
167401,2018-04-01,Senior managers,RAT,0.010011


## Load IV 3 - reasons for sickness absence

Add data about reasons for sickness absence

In [156]:
url_r3 = '../01_data/sickness_absence_reason_pivot.csv'
df_r3 = pd.read_csv(url_r3, parse_dates=['Date'])
#df_r3 = df_r3.drop(['FTE days lost'],axis=1)
df_r3.rename(columns={'Date':'month_year','Staff group':'staff_group'},inplace=True)
#df_r2 = df_r2.reset_index(drop=True)

# drop least frequent reasons for absence
df_r3 = df_r3.drop(['substance_abus','asthma',
                    'dental','blood_disorder','endocrine',
                    'eye','skin_disorders','nervous_system',
                    'gynaecological','unknown','pregnancy_related',
                    'other'],axis=1)

# Replace NaN values with 0 
df_r3 = df_r3.fillna(0)
df_r3.info()
# national level data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1413 entries, 0 to 1412
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   month_year      1413 non-null   datetime64[ns]
 1   staff_group     1413 non-null   object        
 2   anxiety_stress  1413 non-null   float64       
 3   back_problems   1413 non-null   float64       
 4   gastro          1413 non-null   float64       
 5   headache_mig    1413 non-null   float64       
 6   infectious_dis  1413 non-null   float64       
 7   other_msk       1413 non-null   float64       
 8   respiratory     1413 non-null   float64       
dtypes: datetime64[ns](1), float64(7), object(1)
memory usage: 99.5+ KB


## Load IV 4 and 5 - staff vacancies

In [157]:
url_sg_ref = '../01_data/ref_sg_vacancy.csv'
df_sg_ref = pd.read_csv(url_sg_ref)
df_sg_ref.head()

Unnamed: 0,staff_group,vacancy_sg,all
0,Ambulance staff,Allied Health Professionals,All staff groups
1,Central functions,Administrative and Clerical,All staff groups
2,HCHS doctors (exc. junior Drs),Medical and Dental,All staff groups
3,"Hotel, property & estates",Estates and Ancillary,All staff groups
4,Managers,Administrative and Clerical,All staff groups


In [158]:
url_r4 = '../01_data/vacancy_ESR.csv'
df_r4 = pd.read_csv(url_r4,parse_dates=['month_year'],dayfirst=True)

df_r4 = df_r4.drop(['Published month','Published quarter','England'],axis=1)
df_r4.rename(columns={'NWD Staff Group':'vacancy_sg','NHS England region':'region_name',
                        'Vacancy Wte':'vacancy_FTE'},inplace=True)

df_r4 = df_r4.fillna(0)

# Remove code in brackets
df_r4['region_name'] = df_r4['region_name'].str[:-6].str.rstrip()

# Add staff groupings to match other datasets
df_r4 = pd.merge(df_r4, df_sg_ref, on='vacancy_sg',how='left')

df_r4 = df_r4.drop(['all'],axis=1)

df_r4.info()

# regional level

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8387 entries, 0 to 8386
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   month_year   8387 non-null   datetime64[ns]
 1   vacancy_sg   8387 non-null   object        
 2   region_name  8387 non-null   object        
 3   vacancy_FTE  8387 non-null   float64       
 4   staff_group  7948 non-null   object        
dtypes: datetime64[ns](1), float64(1), object(3)
memory usage: 327.7+ KB


In [159]:
url_r5 = '../01_data/vacancy_TRAC.csv'
df_r5 = pd.read_csv(url_r5,parse_dates=['month_year'],dayfirst=True)

df_r5 = df_r5.drop(['Published month','Published quarter','England'],axis=1)

df_r5.rename(columns={'NWD Staff Group':'vacancy_sg','NHS England region':'region_name',
                        'Advertised FTE':'advertised_FTE'},inplace=True)

df_r5 = df_r5.fillna(0)

# Remove region code in brackets
df_r5['region_name'] = df_r5['region_name'].str[:-6].str.rstrip()

# Add staff groupings to match other datasets
df_r5 = pd.merge(df_r5, df_sg_ref, on='vacancy_sg',how='left')

df_r5 = df_r5.drop(['all'],axis=1)

# regional level

df_r5.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4226 entries, 0 to 4225
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   month_year      4226 non-null   datetime64[ns]
 1   vacancy_sg      4226 non-null   object        
 2   region_name     4226 non-null   object        
 3   advertised_FTE  4226 non-null   float64       
 4   staff_group     3999 non-null   object        
dtypes: datetime64[ns](1), float64(1), object(3)
memory usage: 165.2+ KB


## Load IV 6 - Reasons for leaving

In [160]:
url_r6 = '../01_data/rfl_jun23.csv'
df_r6_a = pd.read_csv(url_r6,parse_dates=['month_year'],dayfirst=True)


In [161]:
df_r6_a.head()

Unnamed: 0,quarter,month_year,rfl_death_in_service,rfl_dismissal,rfl_end_of_ft,rfl_flexibility,rfl_health,rfl_incompat_relations,rfl_other,rfl_pay_reward,rfl_pregnancy,rfl_progression_cpd,rfl_relocation,rfl_retirement,rfl_unknown,rfl_work_life_balance,rfl_workforce_transform,rfl_neg_RFL
0,Q1-2016-17,2016-04-01,0.01,0.03,0.09,0.02,0.02,0.01,0.0,0.11,0,0.03,0.12,0.16,0.21,0.1,0.09,0.33
1,Q1-2016-17,2016-05-01,0.01,0.03,0.09,0.02,0.02,0.01,0.0,0.11,0,0.03,0.12,0.16,0.21,0.1,0.09,0.33
2,Q1-2016-17,2016-06-01,0.01,0.03,0.09,0.02,0.02,0.01,0.0,0.11,0,0.03,0.12,0.16,0.21,0.1,0.09,0.33
3,Q2-2016-17,2016-07-01,0.0,0.02,0.34,0.01,0.01,0.01,0.0,0.08,0,0.05,0.09,0.1,0.15,0.06,0.08,0.24
4,Q2-2016-17,2016-08-01,0.0,0.02,0.34,0.01,0.01,0.01,0.0,0.08,0,0.05,0.09,0.1,0.15,0.06,0.08,0.24


In [162]:
# to_keep = ['month_year','rfl_dismissal','rfl_end_of_ft',
#             'rfl_flexibility','rfl_health','rfl_pay_reward','rfl_progression_cpd',
#             'rfl_relocation','rfl_retirement','rfl_work_life_balance','rfl_workforce_transform']
to_keep = ['month_year','rfl_neg_RFL']
df_r6 = df_r6_a[to_keep]
#df_r6 = df_r6_a
df_r6.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87 entries, 0 to 86
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   month_year   87 non-null     datetime64[ns]
 1   rfl_neg_RFL  87 non-null     float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 1.5 KB


## Merge IV dfs to main df

In [163]:
annual_df3.head()

Unnamed: 0,org_code,staff_group,month_year,join_FTE,leave_FTE,denom_FTE,region_name,denom_FTE_mean,grouped_sg,leaver_rate,joiner_rate,clinical,non-clinical,sip_FTE_region
0,TAJ,Central functions,2023-07-01,42.25333,35.56669,278.88667,MIDLANDS,273.472425,non-clinical,0.130056,0.154507,0.577655,0.422345,21026.73633
1,RYW,Central functions,2023-07-01,60.07332,63.52,339.09167,MIDLANDS,483.5315,non-clinical,0.131367,0.124239,0.599197,0.400803,21026.73633
2,RYK,Central functions,2023-07-01,6.0,5.0,22.70667,MIDLANDS,20.58667,non-clinical,0.242876,0.291451,0.509536,0.490464,21026.73633
3,RYG,Central functions,2023-07-01,42.77333,40.58667,228.20186,MIDLANDS,233.11593,non-clinical,0.174105,0.183485,0.601556,0.398444,21026.73633
4,RYA,Central functions,2023-07-01,21.3,31.54667,229.50426,MIDLANDS,222.14666,non-clinical,0.142008,0.095883,0.904498,0.095502,21026.73633


In [164]:
sorted(annual_df3['staff_group'].unique())

['Ambulance staff',
 'Central functions',
 'HCHS doctors (exc. junior Drs)',
 'Hotel, property & estates',
 'Managers',
 'Midwives',
 'Nurses & health visitors',
 'Other staff or those with unknown classification',
 'Scientific, therapeutic & technical staff',
 'Senior managers',
 'Support to ST&T staff',
 'Support to ambulance staff',
 'Support to doctors, nurses & midwives']

In [165]:
# # ref table with org information
# url_ref_org = '../01_data/REF_ORGANISATION.csv'
# ref_org = pd.read_csv(url_ref_org)

# ref_org = ref_org.drop(['Org_Open_Date','Org_Close_Date', 'Org_Name','Org_Type',
#                         'Org_Code_For_Use','Org_Name_For_Use','Org_Status',
#                         'Legacy_Org_Close_Date','UDALFileID','Org_System_Name',
#                         ' NHS Provider flag ',' Total WTE recorded '],axis=1)

# ref_org.rename(columns={'Org_Code_For_Join':'org_code','Org_Type_Grouped':'org_type'},inplace=True)

# ref_org.info()


In [166]:
#def merge_ivs(df, ref_org, df_r1, df_r2, df_r3,df_r4,df_r5,df_r6):
    # org reference data
#df1 = pd.merge(annual_df3, ref_org, on=['org_code'],how='left')

In [167]:
# convert region_name to all upper case
df_r1['region_name'] = df_r1['region_name'].str.upper()
df_r1['region_name'].unique()

array(['EAST OF ENGLAND', 'LONDON', 'MIDLANDS', 'NORTH EAST',
       'NORTH WEST', 'SOUTH EAST', 'SOUTH WEST'], dtype=object)

In [168]:
# local unemployment rate
df1 = pd.merge(annual_df3, df_r1, on=['month_year', 'region_name'],how='left')
df1 = df1.sort_values('month_year')

In [169]:
# sickness absence
r2_merge_cols = ['month_year', 'org_code','staff_group']
df2 = pd.merge(df1, df_r2, on=r2_merge_cols,how='left')

In [170]:
df2.tail()

Unnamed: 0,org_code,staff_group,month_year,join_FTE,leave_FTE,denom_FTE,region_name,denom_FTE_mean,grouped_sg,leaver_rate,joiner_rate,clinical,non-clinical,sip_FTE_region,local_unemployment,sickness_absence
212351,RR7,"Support to doctors, nurses & midwives",2023-07-01,210.19465,141.23079,1138.84478,NORTH EAST AND YORKSHIRE,1115.71059,clinical,0.126584,0.188395,0.592948,0.407052,58157.37682,,0.06905
212352,RP5,"Support to doctors, nurses & midwives",2023-07-01,270.01464,122.50665,1325.36765,NORTH EAST AND YORKSHIRE,1268.898895,clinical,0.096546,0.212794,0.582598,0.417402,58157.37682,,0.076689
212353,RNN,"Support to doctors, nurses & midwives",2023-07-01,421.44452,138.48668,1678.37333,NORTH EAST AND YORKSHIRE,1630.333475,clinical,0.084944,0.258502,0.642758,0.357242,58157.37682,,0.054363
212354,RTF,"Support to doctors, nurses & midwives",2023-07-01,309.41073,203.43067,1910.07865,NORTH EAST AND YORKSHIRE,1881.29756,clinical,0.108133,0.164467,0.588138,0.411862,58157.37682,,0.068247
212355,TAJ,Central functions,2023-07-01,42.25333,35.56669,278.88667,MIDLANDS,273.472425,non-clinical,0.130056,0.154507,0.577655,0.422345,21026.73633,,0.023288


In [171]:
# reason for sickness absence
r3_merge_cols = ['month_year','staff_group']
df3 = pd.merge(df2, df_r3, on=r3_merge_cols,how='left')

In [172]:
df3.head()

Unnamed: 0,org_code,staff_group,month_year,join_FTE,leave_FTE,denom_FTE,region_name,denom_FTE_mean,grouped_sg,leaver_rate,...,sip_FTE_region,local_unemployment,sickness_absence,anxiety_stress,back_problems,gastro,headache_mig,infectious_dis,other_msk,respiratory
0,R1H,Midwives,2017-04-01,0.0,0.0,549.96616,LONDON,549.96616,clinical,0.0,...,5512.35267,0.054,,0.286536,0.067674,0.082077,0.023262,0.0138,0.143418,0.03331
1,RWW,Ambulance staff,2017-04-01,0.0,0.0,1.0,NORTH WEST,1.0,clinical,0.0,...,3225.55901,0.043,,0.257249,0.12452,0.085541,0.01672,0.00223,0.119982,0.033138
2,RWJ,Ambulance staff,2017-04-01,0.0,0.0,1.0,NORTH WEST,1.0,clinical,0.0,...,3225.55901,0.043,,0.257249,0.12452,0.085541,0.01672,0.00223,0.119982,0.033138
3,RM3,Ambulance staff,2017-04-01,0.0,0.0,1.53333,NORTH WEST,1.53333,clinical,0.0,...,3225.55901,0.043,,0.257249,0.12452,0.085541,0.01672,0.00223,0.119982,0.033138
4,RM3,Ambulance staff,2017-04-01,0.0,0.0,1.53333,NORTH WEST,1.53333,clinical,0.0,...,3225.55901,0.043,,0.257249,0.12452,0.085541,0.01672,0.00223,0.119982,0.033138


In [173]:
# vacancy - need to calculate rate at regional level here
# use sip_FTE_region calculated earlier
r4_merge_cols = ['month_year','region_name','staff_group']

df_r4['region_name'] = df_r4['region_name'].str.upper()

df4 = pd.merge(df3, df_r4, on=r4_merge_cols,how='left')

df4.drop(columns=['vacancy_sg'], inplace=True)

df4['vacancy_rate'] = df4['vacancy_FTE'] / df4['sip_FTE_region']


In [174]:
df4.drop(columns=['sip_FTE_region','vacancy_FTE'], inplace=True)

In [285]:
# reasons for leaving
df6 = pd.merge(df4, df_r6, on='month_year',how='left')

In [286]:
# drop pre-2019 data - missing for most variables and messes up date dummies
df6 = df6[df6['month_year'] > '2018-12-31']

In [287]:
# add region, year and quarter as dummy variables

# add region as dummy variable
df6 = pd.get_dummies(df6, columns=['region_name'], drop_first=True)

# Extract year and quarter from month_year
df6['year'] = df6['month_year'].dt.year
df6['quarter'] = df6['month_year'].dt.quarter

df6 = pd.get_dummies(df6, columns=['year'], prefix='year', drop_first=True)
df6 = pd.get_dummies(df6, columns=['quarter'], prefix='q', drop_first=True)

# Convert True/False dummy variable categories to integer 0/1
bool_columns = df6.select_dtypes(include='bool').columns
df6[bool_columns] = df6[bool_columns].astype(int)

In [288]:
# drop duplicates
#df6.drop_duplicates(inplace=True)

# replaces nans with zeros
df6.fillna(0, inplace=True)

In [289]:
df6.info()

<class 'pandas.core.frame.DataFrame'>
Index: 154195 entries, 58161 to 212355
Data columns (total 36 columns):
 #   Column                                Non-Null Count   Dtype         
---  ------                                --------------   -----         
 0   org_code                              154195 non-null  object        
 1   staff_group                           154195 non-null  object        
 2   month_year                            154195 non-null  datetime64[ns]
 3   join_FTE                              154195 non-null  float64       
 4   leave_FTE                             154195 non-null  float64       
 5   denom_FTE                             154195 non-null  float64       
 6   denom_FTE_mean                        154195 non-null  float64       
 7   grouped_sg                            154195 non-null  object        
 8   leaver_rate                           154195 non-null  float64       
 9   joiner_rate                           154195 non-null  float

In [290]:
# Need to cut dataframe to earliest and latest data available for all fields. 
# Do this by cutting rows where all values for key variables are zero
df7 = df6[~((df6['leaver_rate'] == 0) | (df6['joiner_rate'] == 0) | (df6['sickness_absence'] == 0))]

In [291]:
df7 = df7[~((df7['rfl_neg_RFL'] == 0))]

df7 = df7[~((df7['anxiety_stress'] == 0)| (df7['vacancy_rate'] == 0)
            | (df7['respiratory'] == 0))]

In [292]:
df7.info()

<class 'pandas.core.frame.DataFrame'>
Index: 116193 entries, 58161 to 201499
Data columns (total 36 columns):
 #   Column                                Non-Null Count   Dtype         
---  ------                                --------------   -----         
 0   org_code                              116193 non-null  object        
 1   staff_group                           116193 non-null  object        
 2   month_year                            116193 non-null  datetime64[ns]
 3   join_FTE                              116193 non-null  float64       
 4   leave_FTE                             116193 non-null  float64       
 5   denom_FTE                             116193 non-null  float64       
 6   denom_FTE_mean                        116193 non-null  float64       
 7   grouped_sg                            116193 non-null  object        
 8   leaver_rate                           116193 non-null  float64       
 9   joiner_rate                           116193 non-null  float

In [293]:
#annual_df_ivs = merge_ivs(annual_df3,ref_org,df_r1, df_r2, df_r3,df_r4,df_r5,df_r6)
#monthly_df_ivs = merge_ivs(monthly_df1,df_r1, df_r2, df_r3,df_r4,df_r5,df_r6)

In [294]:
# create copy of df7 and name annual_df_ivs
annual_df_ivs = df7.copy()

In [295]:
annual_df_ivs.info()

<class 'pandas.core.frame.DataFrame'>
Index: 116193 entries, 58161 to 201499
Data columns (total 36 columns):
 #   Column                                Non-Null Count   Dtype         
---  ------                                --------------   -----         
 0   org_code                              116193 non-null  object        
 1   staff_group                           116193 non-null  object        
 2   month_year                            116193 non-null  datetime64[ns]
 3   join_FTE                              116193 non-null  float64       
 4   leave_FTE                             116193 non-null  float64       
 5   denom_FTE                             116193 non-null  float64       
 6   denom_FTE_mean                        116193 non-null  float64       
 7   grouped_sg                            116193 non-null  object        
 8   leaver_rate                           116193 non-null  float64       
 9   joiner_rate                           116193 non-null  float

In [296]:
annual_df_ivs.to_csv(f'annual_modelling_data.csv', index=False)

In [297]:
#monthly_df_ivs.to_csv(f'monthly_modelling_data.csv', index=False)

## OLS multiple regression

### Annual data

Specify the dependent variable (dv). All other fields to be dropped. 

In [298]:
dv = 'leaver_rate'
to_drop = ['join_FTE','leave_FTE','month_year','denom_FTE','org_code',
           'denom_FTE_mean','staff_group','grouped_sg','non-clinical',                          
           dv]


Define the design matrix (X) and the dependent variable (y)


In [299]:
annual_df_ivs.reset_index(drop = True)
X = annual_df_ivs.drop(to_drop, axis=1)
y = annual_df_ivs[dv]

y.head()


58161    0.117382
58162    0.130728
58163    0.118449
58164    0.072889
58165    0.412151
Name: leaver_rate, dtype: float64

In [300]:
# Add a constant column to the design matrix
X = sm.add_constant(X)

X.info()


<class 'pandas.core.frame.DataFrame'>
Index: 116193 entries, 58161 to 201499
Data columns (total 27 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   const                                 116193 non-null  float64
 1   joiner_rate                           116193 non-null  float64
 2   clinical                              116193 non-null  float64
 3   local_unemployment                    116193 non-null  float64
 4   sickness_absence                      116193 non-null  float64
 5   anxiety_stress                        116193 non-null  float64
 6   back_problems                         116193 non-null  float64
 7   gastro                                116193 non-null  float64
 8   headache_mig                          116193 non-null  float64
 9   infectious_dis                        116193 non-null  float64
 10  other_msk                             116193 non-null  float64
 11  r

In [301]:
# Fit the regression model
model = sm.OLS(y, X)
results = model.fit()

# Print the regression results
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:            leaver_rate   R-squared:                       0.137
Model:                            OLS   Adj. R-squared:                  0.136
Method:                 Least Squares   F-statistic:                     706.4
Date:                Mon, 27 Nov 2023   Prob (F-statistic):               0.00
Time:                        12:11:02   Log-Likelihood:             1.7033e+05
No. Observations:              116193   AIC:                        -3.406e+05
Df Residuals:                  116166   BIC:                        -3.404e+05
Df Model:                          26                                         
Covariance Type:            nonrobust                                         
                                           coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------


In [302]:
# Convert the regression results summary to a DataFrame
results_df = pd.read_html(results.summary().tables[1].as_html(), header=0, index_col=0)[0]

# Export the DataFrame to a CSV file
results_df.to_csv("annual_regression_results.csv")

PermissionError: [Errno 13] Permission denied: 'annual_regression_results.csv'

In [303]:
results_df.sort_values(by='coef')

Unnamed: 0,coef,std err,t,P>|t|,[0.025,0.975]
local_unemployment,-0.2655,0.039,-6.817,0.0,-0.342,-0.189
const,-0.1332,0.006,-21.558,0.0,-0.145,-0.121
back_problems,-0.0468,0.02,-2.377,0.017,-0.085,-0.008
region_name_NORTH EAST AND YORKSHIRE,-0.0263,0.002,-17.31,0.0,-0.029,-0.023
clinical,-0.0218,0.002,-8.987,0.0,-0.027,-0.017
year_2021,-0.0218,0.001,-27.183,0.0,-0.023,-0.02
region_name_SOUTH WEST,-0.0135,0.001,-19.065,0.0,-0.015,-0.012
year_2023,-0.0111,0.001,-8.986,0.0,-0.013,-0.009
region_name_MIDLANDS,-0.0111,0.001,-15.366,0.0,-0.013,-0.01
q_4,-0.0106,0.001,-13.514,0.0,-0.012,-0.009


### Monthly data

In [190]:
# dv = 'leaver_rate'
# to_drop = ['month_year','org_code','staff_group','annual_sickness_absence',
#            dv]

# monthly_df_ivs.reset_index(drop = True)
# X = monthly_df_ivs.drop(to_drop, axis=1)
# y = monthly_df_ivs[dv]

# #y = y.dropna()

# y.head()


In [191]:
# # Add a constant column to the design matrix
# X = sm.add_constant(X)

# X.tail()


In [192]:
# Fit the regression model
model = sm.OLS(y, X)
monthly_results = model.fit()

# Print the regression results
print(monthly_results.summary())

                            OLS Regression Results                            
Dep. Variable:            leaver_rate   R-squared:                       0.130
Model:                            OLS   Adj. R-squared:                  0.130
Method:                 Least Squares   F-statistic:                     544.1
Date:                Mon, 27 Nov 2023   Prob (F-statistic):               0.00
Time:                        11:45:35   Log-Likelihood:             1.1298e+05
No. Observations:              127535   AIC:                        -2.259e+05
Df Residuals:                  127499   BIC:                        -2.255e+05
Df Model:                          35                                         
Covariance Type:            nonrobust                                         
                                           coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------


In [193]:
# Convert the regression results summary to a DataFrame
monthly_results_df = pd.read_html(monthly_results.summary().tables[1].as_html(), header=0, index_col=0)[0]

# Export the DataFrame to a CSV file
monthly_results_df.to_csv("monthly_regression_results.csv")