# Data processing for model and dashboard

## Import libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns


#from sklearn.preprocessing import MinMaxScaler
#import geopandas as gpd

## Read in and process dependent variable - turnover

Read turnover data from csv into a dataframe.

In [2]:
# annual and monthly data

annual_url = '../01_data/processed_annual_turnover.csv'
monthly_url = '../01_data/processed_monthly_turnover.csv'

annual_df = pd.read_csv(annual_url, parse_dates=['month_year'])
annual_df = annual_df.drop(['n'],axis=1)
#annual_df.info()

monthly_df = pd.read_csv(monthly_url, parse_dates=['month_year'])
monthly_df = monthly_df.drop(['n'],axis=1)
monthly_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147107 entries, 0 to 147106
Data columns (total 11 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   org_code     147107 non-null  object        
 1   region_name  147107 non-null  object        
 2   org_type     147107 non-null  object        
 3   staff_group  147107 non-null  object        
 4   month_year   147107 non-null  datetime64[ns]
 5   join_HC      109986 non-null  float64       
 6   join_FTE     109986 non-null  float64       
 7   leave_HC     110122 non-null  float64       
 8   leave_FTE    110122 non-null  float64       
 9   denom_HC     146797 non-null  float64       
 10  denom_FTE    146797 non-null  float64       
dtypes: datetime64[ns](1), float64(6), object(4)
memory usage: 12.3+ MB


Function to clean and process DV dataframes

In [3]:
def clean_dv(df):
    # add leaver and joiner rates column
    df['leaver_rate'] = df['leave_FTE']/df['denom_FTE']
    df['joiner_rate'] = df['join_FTE']/df['denom_FTE']

    # drop unneeded HC columns
    df = df.drop(['join_HC','leave_HC','denom_HC'],axis=1)
    # drop 'All staff groups' - use sum of all others instead when wanting all staff.
    df.drop(df[df['staff_group'] == 'All staff groups'].index, inplace = True)

    ## calculate a columns with % of staff groups by organisation and date
    # first, calculate a total staff in post (SIP; all staff) FTE dataframe for each organisation by month.
    df_sip_org = df.groupby(['month_year', 'org_code', 'staff_group','region_name'])['denom_FTE'].sum().reset_index()
    # second, group by 'month_year' and 'org_code' and sum the 'FTE' values for each group
    df_total_sip_FTE = df_sip_org.groupby(['month_year', 'org_code','region_name'])['denom_FTE'].sum().reset_index()
    # third, merge the total_sip_FTE DataFrame back into the original DataFrame 
    df2 = pd.merge(df, df_total_sip_FTE, on=['month_year', 'org_code','region_name'], suffixes=('', '_total'))

    ## (also calculate here a regional sip FTE by staff group column for later use)
    # staff in post by staff group for each region by month
    df_sip_region = df.groupby(['month_year','staff_group','region_name'])['denom_FTE'].sum().reset_index()
    # # # merge the df_sip_region DataFrame  
    df2 = pd.merge(df2, df_sip_region, on=['month_year','region_name','staff_group'], suffixes=('', '_region'))

    df2.rename(columns={'denom_FTE_total': 'total_sip_FTE','denom_FTE_region':'sip_FTE_region'}, inplace=True)

    # fourth, calculate the percentage of staff group FTE by organization and date
    df2['%_FTE'] = df2['denom_FTE'] / df2['total_sip_FTE']

    # fifth, pivot the DataFrame to get staff groups as new columns with % values
    df3 = df2.pivot(index=['month_year', 'org_code','region_name'], columns='staff_group', values='%_FTE').reset_index()

    # finally, merge the pivot DataFrame with the original DataFrame
    df4 = pd.merge(df2, df3, on=['month_year', 'org_code','region_name'])

    # make the new staff group name columns friendlier
    df4.rename(columns={'Ambulance staff': '%_amb_staff','Central functions':'%_cent_funct','HCHS doctors (exc. junior Drs)': '%_senior_docs',
    'Hotel, property & estates': '%_estates', 'Managers':'%_managers','Nurses & health visitors': '%_nurses_hv',
    'Other staff or those with unknown classification':'%_unknown','Scientific, therapeutic & technical staff':'%_sci_tech_staff',
    'Senior managers':'%senior_managers','Support to ST&T staff': '%_supp_sci_tech','Support to doctors, nurses & midwives': '%_supp_doc_nur_mid',
    'Midwives':'%_midwives','Support to ambulance staff': '%_supp_amb_staff'}, inplace=True)

    # replace inf values with nan (can happen with rate calcs)
    df4.replace([np.inf, -np.inf], np.nan, inplace=True)
    # transform nans to zeros
    df4.fillna(0, inplace=True)

    # Add a small constant to avoid taking the log of zero
    small_constant = 1e-5
    
    # log scale the total_SIP_FTE column to be in line with other variables. proxy for size of organisation
    df4['log_total_sip_FTE'] = np.log(df4['total_sip_FTE'] + small_constant)

    # drop unused columns (keep total SIP FTE for calculating vacancy rates later)
    df4.drop(['join_FTE','leave_FTE','%_FTE','denom_FTE','total_sip_FTE'], axis=1, inplace=True)

    return df4

In [4]:
annual_df = clean_dv(annual_df)
monthly_df = clean_dv(monthly_df)

In [5]:
annual_df.head()

Unnamed: 0,org_code,region_name,org_type,staff_group,month_year,leaver_rate,joiner_rate,sip_FTE_region,%_amb_staff,%_cent_funct,...,%_managers,%_midwives,%_nurses_hv,%_unknown,%_sci_tech_staff,%senior_managers,%_supp_sci_tech,%_supp_amb_staff,%_supp_doc_nur_mid,log_total_sip_FTE
0,R0A,North West,Acute - Teaching,Ambulance staff,2020-02-01,0.0,0.0,2349.4823,5e-05,0.083106,...,0.009468,0.024753,0.315781,0.000501,0.161369,0.007363,0.071334,0.0,0.244138,9.901994
1,R0A,North West,Acute - Teaching,Central functions,2020-02-01,0.113644,0.207552,14709.9266,5e-05,0.083106,...,0.009468,0.024753,0.315781,0.000501,0.161369,0.007363,0.071334,0.0,0.244138,9.901994
2,R0A,North West,Acute - Teaching,HCHS doctors (exc. junior Drs),2020-02-01,0.054796,0.081947,8418.31775,5e-05,0.083106,...,0.009468,0.024753,0.315781,0.000501,0.161369,0.007363,0.071334,0.0,0.244138,9.901994
3,R0A,North West,Acute - Teaching,"Hotel, property & estates",2020-02-01,0.058028,0.151142,9362.758,5e-05,0.083106,...,0.009468,0.024753,0.315781,0.000501,0.161369,0.007363,0.071334,0.0,0.244138,9.901994
4,R0A,North West,Acute - Teaching,Managers,2020-02-01,0.081874,0.204227,2537.68919,5e-05,0.083106,...,0.009468,0.024753,0.315781,0.000501,0.161369,0.007363,0.071334,0.0,0.244138,9.901994


The data show the full time equivalent (FTE) number of leavers by organisation and staff group for the previous 12-month period from the date. It also shows the number of staff in post (SIP) FTE averaged over the 12-month period to date.

In [6]:
annual_df['staff_group'].unique()

array(['Ambulance staff', 'Central functions',
       'HCHS doctors (exc. junior Drs)', 'Hotel, property & estates',
       'Managers', 'Midwives', 'Nurses & health visitors',
       'Other staff or those with unknown classification',
       'Scientific, therapeutic & technical staff', 'Senior managers',
       'Support to ST&T staff', 'Support to doctors, nurses & midwives',
       'Support to ambulance staff'], dtype=object)

## Load independent variable 1 - local unemployment

Load data about local unemployment so we can use it as a regressor

In [7]:
url_r1 = '../01_data/ONS_localunemployment_monthly.csv'
df_r1 = pd.read_csv(url_r1, parse_dates=['Date'])

df_r1.drop(['Region','thousands'],axis=1,inplace=True)
df_r1.rename(columns={'%':'local_unemployment','Date':'month_year',
                      'NHSE region name':'region_name'},inplace=True)
df_r1 = df_r1.sort_values('month_year')
df_r1.head()

Unnamed: 0,month_year,region_name,local_unemployment
0,1992-04-01,Midlands,0.087
1,1992-04-01,East of England,0.079
2,1992-04-01,London,0.123
3,1992-04-01,North East,0.122
4,1992-04-01,North West,0.101


## Load IV 2 - sickness absence

Load data about sickness absence to use as second regressor

In [8]:
url_r2 = '../01_data/sickness_absence.csv'
df_r2 = pd.read_csv(url_r2, parse_dates=['Date'])
trust_types_todrop = ['Clinical Commissioning Group','Integrated Care Board']
df_r2 = df_r2[~df_r2['Cluster group'].isin(trust_types_todrop)]
df_r2 = df_r2.drop(['Benchmark group','Org name',
                    'NHSE region code','Cluster group'],axis=1)
df_r2.rename(columns={'Org code':'org_code','Date':'month_year',
                      'NHSE region name':'region_name','Staff group':'staff_group',
                      'FTE days lost':'fte_days_lost','FTE days available':'fte_days_available'},inplace=True)
merge_cols = ['month_year', 'org_code','region_name','staff_group']
df_r2['sickness_absence'] = df_r2['fte_days_lost']/df_r2['fte_days_available']
df_r2 = df_r2.reset_index(drop=True)
df_r2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 243626 entries, 0 to 243625
Data columns (total 7 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   month_year          243626 non-null  datetime64[ns]
 1   org_code            243626 non-null  object        
 2   region_name         243626 non-null  object        
 3   staff_group         243626 non-null  object        
 4   fte_days_lost       243626 non-null  float64       
 5   fte_days_available  243626 non-null  float64       
 6   sickness_absence    243372 non-null  float64       
dtypes: datetime64[ns](1), float64(3), object(3)
memory usage: 13.0+ MB


Create 12-month rolling sickness absence column for use with annual turnover data

In [9]:
df_r2['month_year'] = pd.to_datetime(df_r2['month_year'])

# Sort the DataFrame by organisation, staff_group, and month
df_r2.sort_values(by=['org_code', 'staff_group', 'month_year'], inplace=True)

# Calculate the rolling sums for days lost and days available
df_r2['rolling_days_lost'] = df_r2.groupby(['org_code', 
                        'staff_group'])['fte_days_lost'].rolling(window=12, min_periods=1).sum().reset_index(level=[0, 1], drop=True)

df_r2['rolling_days_available'] = df_r2.groupby(['org_code', 
                        'staff_group'])['fte_days_available'].rolling(window=12, min_periods=1).sum().reset_index(level=[0, 1], drop=True)

# Calculate the rolling sickness absence rate
df_r2['annual_sickness_absence'] = df_r2['rolling_days_lost'] / df_r2['rolling_days_available']

# drop fte_days_lost fte_days_available, rolling_days_available and rolling_days_lost columns
df_r2.drop(columns=['fte_days_lost', 'fte_days_available', 'rolling_days_available', 'rolling_days_lost'], inplace=True)

df_r2.head()

Unnamed: 0,month_year,org_code,region_name,staff_group,sickness_absence,annual_sickness_absence
523,2018-03-01,0AR,Special Health Authorities and other statutory...,All staff groups,0.030707,0.030707
7395,2018-04-01,0AR,Special Health Authorities and other statutory...,All staff groups,0.038024,0.034318
12211,2018-05-01,0AR,Special Health Authorities and other statutory...,All staff groups,0.03973,0.036162
18374,2018-06-01,0AR,Special Health Authorities and other statutory...,All staff groups,0.033094,0.035407
22005,2018-07-01,0AR,Special Health Authorities and other statutory...,All staff groups,0.031692,0.034648


## Load IV 3 - reasons for sickness absence

Add data about reasons for sickness absence

In [10]:
url_r3 = '../01_data/sickness_absence_reason_pivot.csv'
df_r3 = pd.read_csv(url_r3, parse_dates=['Date'])
#df_r3 = df_r3.drop(['FTE days lost'],axis=1)
df_r3.rename(columns={'Date':'month_year','Staff group':'staff_group'},inplace=True)
#df_r2 = df_r2.reset_index(drop=True)

# drop least frequent reasons for absence
df_r3 = df_r3.drop(['substance_abus','asthma',
                    'dental','blood_disorder','endocrine',
                    'eye','skin_disorders','nervous_system'],axis=1)

# Replace NaN values with 0 
df_r3 = df_r3.fillna(0)
df_r3.info()
# national level data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1387 entries, 0 to 1386
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   month_year         1387 non-null   datetime64[ns]
 1   staff_group        1387 non-null   object        
 2   anxiety_stress     1387 non-null   float64       
 3   back_problems      1387 non-null   float64       
 4   gastro             1387 non-null   float64       
 5   gynaecological     1387 non-null   float64       
 6   headache_mig       1387 non-null   float64       
 7   infectious_dis     1387 non-null   float64       
 8   other              1387 non-null   float64       
 9   other_msk          1387 non-null   float64       
 10  pregnancy_related  1387 non-null   float64       
 11  respiratory        1387 non-null   float64       
 12  unknown            1387 non-null   float64       
dtypes: datetime64[ns](1), float64(11), object(1)
memory usage: 141.

## Load IV 4 and 5 - staff vacancies

In [11]:
url_sg_ref = '../01_data/ref_sg_vacancy.csv'
df_sg_ref = pd.read_csv(url_sg_ref)
df_sg_ref.head()

Unnamed: 0,staff_group,vacancy_sg,all
0,Ambulance staff,Allied Health Professionals,All staff groups
1,Central functions,Administrative and Clerical,All staff groups
2,HCHS doctors (exc. junior Drs),Medical and Dental,All staff groups
3,"Hotel, property & estates",Estates and Ancillary,All staff groups
4,Managers,Administrative and Clerical,All staff groups


In [12]:
url_r4 = '../01_data/vacancy_ESR.csv'
df_r4 = pd.read_csv(url_r4,parse_dates=['month_year'],dayfirst=True)

df_r4 = df_r4.drop(['Published month','Published quarter','England'],axis=1)
df_r4.rename(columns={'NWD Staff Group':'vacancy_sg','NHS England region':'region_name',
                        'Vacancy Wte':'vacancy_FTE'},inplace=True)

df_r4 = df_r4.fillna(0)

# Remove code in brackets
df_r4['region_name'] = df_r4['region_name'].str[:-6].str.rstrip()

# Add staff groupings to match other datasets
df_r4 = pd.merge(df_r4, df_sg_ref, on='vacancy_sg',how='left')

df_r4 = df_r4.drop(['all'],axis=1)

df_r4.info()

# regional level

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8387 entries, 0 to 8386
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   month_year   8387 non-null   datetime64[ns]
 1   vacancy_sg   8387 non-null   object        
 2   region_name  8387 non-null   object        
 3   vacancy_FTE  8387 non-null   float64       
 4   staff_group  7948 non-null   object        
dtypes: datetime64[ns](1), float64(1), object(3)
memory usage: 327.7+ KB


In [13]:
url_r5 = '../01_data/vacancy_TRAC.csv'
df_r5 = pd.read_csv(url_r5,parse_dates=['month_year'],dayfirst=True)

df_r5 = df_r5.drop(['Published month','Published quarter','England'],axis=1)

df_r5.rename(columns={'NWD Staff Group':'vacancy_sg','NHS England region':'region_name',
                        'Advertised FTE':'advertised_FTE'},inplace=True)

df_r5 = df_r5.fillna(0)

# Remove region code in brackets
df_r5['region_name'] = df_r5['region_name'].str[:-6].str.rstrip()

# Add staff groupings to match other datasets
df_r5 = pd.merge(df_r5, df_sg_ref, on='vacancy_sg',how='left')

df_r5 = df_r5.drop(['all'],axis=1)

# regional level

df_r5.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4226 entries, 0 to 4225
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   month_year      4226 non-null   datetime64[ns]
 1   vacancy_sg      4226 non-null   object        
 2   region_name     4226 non-null   object        
 3   advertised_FTE  4226 non-null   float64       
 4   staff_group     3999 non-null   object        
dtypes: datetime64[ns](1), float64(1), object(3)
memory usage: 165.2+ KB


## Load IV 6 - Reasons for leaving

In [14]:
url_r6 = '../01_data/rfl_dec22.csv'
df_r6 = pd.read_csv(url_r6,parse_dates=['month_year'],dayfirst=True)

#df_r6 = df_r6.drop(['financial_year','quarter'],axis=1)

# df_r5.rename(columns={'NWD Staff Group':'staff_group','NHS England region':'region_name',
#                         'Advertised FTE':'advertised_FTE'},inplace=True)

# df_r5 = df_r5.fillna(0)

# # Remove region code in brackets
# df_r5['region_name'] = df_r5['region_name'].str[:-6].str.rstrip()
# # regional level

# shortern column names and add %_ at beginning
df_r6.rename(columns={'Death in service':'%_death_in_service','Dismissal':'%_dismissal',
                        'End of fixed term':'%_end_of_ft','Flexibility':'%_flexibility',
                        'Health':'%_health','Incompatible working relationships':'%_incompat_relations',
                        'Other':'%_other', 'Pay/Reward':'%_pay_reward', 'Pregnancy':'%_pregnancy',
                        'Progression/CPD':'%_progression_cpd','Relocation':'%_relocation',
                        'Retirement':'%_retirement','Unknown':'%_unknown','Work/Life Balance':'%_work_life_balance',
                        'Workforce Transformation':'%_workforce_transform'},inplace=True)

df_r6['%_other'] = pd.to_numeric(df_r6['%_other'], errors='coerce')
df_r6['%_unknown'] = pd.to_numeric(df_r6['%_unknown'], errors='coerce')
df_r6['%_workforce_transform'] = pd.to_numeric(df_r6['%_workforce_transform'], errors='coerce')


df_r6.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   month_year             20 non-null     datetime64[ns]
 1   %_death_in_service     20 non-null     float64       
 2   %_dismissal            20 non-null     float64       
 3   %_end_of_ft            20 non-null     float64       
 4   %_flexibility          20 non-null     float64       
 5   %_health               20 non-null     float64       
 6   %_incompat_relations   20 non-null     float64       
 7   %_other                17 non-null     float64       
 8   %_pay_reward           20 non-null     float64       
 9   %_pregnancy            20 non-null     int64         
 10  %_progression_cpd      20 non-null     float64       
 11  %_relocation           20 non-null     float64       
 12  %_retirement           20 non-null     float64       
 13  %_unkno

## Merge IV dfs to main df

In [15]:
annual_df.head()

Unnamed: 0,org_code,region_name,org_type,staff_group,month_year,leaver_rate,joiner_rate,sip_FTE_region,%_amb_staff,%_cent_funct,...,%_managers,%_midwives,%_nurses_hv,%_unknown,%_sci_tech_staff,%senior_managers,%_supp_sci_tech,%_supp_amb_staff,%_supp_doc_nur_mid,log_total_sip_FTE
0,R0A,North West,Acute - Teaching,Ambulance staff,2020-02-01,0.0,0.0,2349.4823,5e-05,0.083106,...,0.009468,0.024753,0.315781,0.000501,0.161369,0.007363,0.071334,0.0,0.244138,9.901994
1,R0A,North West,Acute - Teaching,Central functions,2020-02-01,0.113644,0.207552,14709.9266,5e-05,0.083106,...,0.009468,0.024753,0.315781,0.000501,0.161369,0.007363,0.071334,0.0,0.244138,9.901994
2,R0A,North West,Acute - Teaching,HCHS doctors (exc. junior Drs),2020-02-01,0.054796,0.081947,8418.31775,5e-05,0.083106,...,0.009468,0.024753,0.315781,0.000501,0.161369,0.007363,0.071334,0.0,0.244138,9.901994
3,R0A,North West,Acute - Teaching,"Hotel, property & estates",2020-02-01,0.058028,0.151142,9362.758,5e-05,0.083106,...,0.009468,0.024753,0.315781,0.000501,0.161369,0.007363,0.071334,0.0,0.244138,9.901994
4,R0A,North West,Acute - Teaching,Managers,2020-02-01,0.081874,0.204227,2537.68919,5e-05,0.083106,...,0.009468,0.024753,0.315781,0.000501,0.161369,0.007363,0.071334,0.0,0.244138,9.901994


In [16]:
def merge_ivs(df, df_r1, df_r2, df_r3,df_r4,df_r5,df_r6):
    # local unemployment rate
    df1 = pd.merge(df, df_r1, on=['month_year', 'region_name'],how='left')
    df1 = df1.sort_values('month_year')

    # sickness absence
    r2_merge_cols = ['month_year', 'org_code','region_name','staff_group']
    df2 = pd.merge(df1, df_r2, on=r2_merge_cols,how='left')
    df2.drop_duplicates(subset=r2_merge_cols)

    # reason for sickness absence
    r3_merge_cols = ['month_year','staff_group']
    df3 = pd.merge(df2, df_r3, on=r3_merge_cols,how='left')

    # vacancy - need to calculate rate at regional level here
    # use sip_FTE_region calculated earlier
    r4_merge_cols = ['month_year','region_name','staff_group']

    df4 = pd.merge(df3, df_r4, on=r4_merge_cols,how='left')

    df4.drop(columns=['vacancy_sg'], inplace=True)

    df4['vacancy_rate'] = df4['vacancy_FTE'] / df4['sip_FTE_region']

    df5 = pd.merge(df4, df_r5, on=r4_merge_cols,how='left')

    df5.drop(columns=['vacancy_sg'], inplace=True)

    df5['advertised_rate'] = df5['advertised_FTE'] / df5['sip_FTE_region']

    df5.drop(columns=['sip_FTE_region','advertised_FTE','vacancy_FTE'], inplace=True)

    # reasons for leaving
    df6 = pd.merge(df5, df_r6, on='month_year',how='left')

    # add region as dummy variable
    df6 = pd.get_dummies(df6, columns=['region_name'], drop_first=True)

    # add org type as dummy variable
    df6 = pd.get_dummies(df6, columns=['org_type'], drop_first=True)

    # Convert True/False dummy variable categories to integer 0/1
    bool_columns = df6.select_dtypes(include='bool').columns
    df6[bool_columns] = df6[bool_columns].astype(int)

    # Need to cut dataframe to earliest and latest data available for all fields. Do this by cutting rows where all values for key variables are zero
    df6 = df6[~((df5['leaver_rate'] == 0) | (df6['joiner_rate'] == 0) | (df6['sickness_absence'] == 0))]

    # transform nans into 0s
    df6 = df6.fillna(0)

    # drop duplicates
    df6.drop_duplicates(inplace=True)
    
    return df6

In [17]:
annual_df_ivs = merge_ivs(annual_df,df_r1, df_r2, df_r3,df_r4,df_r5,df_r6)
monthly_df_ivs = merge_ivs(monthly_df,df_r1, df_r2, df_r3,df_r4,df_r5,df_r6)


In [18]:
annual_df_ivs.head()

Unnamed: 0,org_code,staff_group,month_year,leaver_rate,joiner_rate,%_amb_staff,%_cent_funct,%_senior_docs,%_estates,%_managers,...,region_name_South West,org_type_Acute - Medium,org_type_Acute - Multi-Service,org_type_Acute - Small,org_type_Acute - Specialist,org_type_Acute - Teaching,org_type_Ambulance Trust,org_type_Care Trust,org_type_Community Provider Trust,org_type_Mental Health and Learning Disability
48754,RBA,"Hotel, property & estates",2018-08-01,0.053293,0.155154,0.0,0.081927,0.073878,0.094267,0.014461,...,1,1,0,0,0,0,0,0,0,0
48755,RBA,"Hotel, property & estates",2018-08-01,0.053293,0.155154,0.0,0.081927,0.073878,0.094267,0.014461,...,1,1,0,0,0,0,0,0,0,0
48756,RGN,Senior managers,2018-08-01,0.276243,0.165746,0.000594,0.069098,0.071875,0.034894,0.019698,...,0,0,0,0,0,0,0,0,0,0
48757,RGN,Senior managers,2018-08-01,0.276243,0.165746,0.000594,0.069098,0.071875,0.034894,0.019698,...,0,0,0,0,0,0,0,0,0,0
48758,RGN,Senior managers,2018-08-01,0.276243,0.165746,0.000594,0.069098,0.071875,0.034894,0.019698,...,0,0,0,0,0,0,0,0,0,0


In [19]:
annual_df_ivs.info()

<class 'pandas.core.frame.DataFrame'>
Index: 206646 entries, 48754 to 285180
Data columns (total 65 columns):
 #   Column                                          Non-Null Count   Dtype         
---  ------                                          --------------   -----         
 0   org_code                                        206646 non-null  object        
 1   staff_group                                     206646 non-null  object        
 2   month_year                                      206646 non-null  datetime64[ns]
 3   leaver_rate                                     206646 non-null  float64       
 4   joiner_rate                                     206646 non-null  float64       
 5   %_amb_staff                                     206646 non-null  float64       
 6   %_cent_funct                                    206646 non-null  float64       
 7   %_senior_docs                                   206646 non-null  float64       
 8   %_estates                          

In [20]:
annual_df_ivs.to_csv(f'annual_modelling_data.csv', index=False)

In [21]:
monthly_df_ivs.to_csv(f'monthly_modelling_data.csv', index=False)

## OLS multiple regression

### Annual data

Specify the dependent variable (dv). All other fields to be dropped. 

In [22]:
dv = 'leaver_rate'
to_drop = ['month_year','org_code','staff_group','sickness_absence','%_unknown_x',
           dv]

Define the design matrix (X) and the dependent variable (y)


In [23]:
annual_df_ivs.reset_index(drop = True)
X = annual_df_ivs.drop(to_drop, axis=1)
y = annual_df_ivs[dv]

y.head()


48754    0.053293
48755    0.053293
48756    0.276243
48757    0.276243
48758    0.276243
Name: leaver_rate, dtype: float64

In [24]:
# Add a constant column to the design matrix
X = sm.add_constant(X)

X.tail()


Unnamed: 0,const,joiner_rate,%_amb_staff,%_cent_funct,%_senior_docs,%_estates,%_managers,%_midwives,%_nurses_hv,%_sci_tech_staff,...,region_name_South West,org_type_Acute - Medium,org_type_Acute - Multi-Service,org_type_Acute - Small,org_type_Acute - Specialist,org_type_Acute - Teaching,org_type_Ambulance Trust,org_type_Care Trust,org_type_Community Provider Trust,org_type_Mental Health and Learning Disability
285176,1.0,0.118626,0.0,0.128167,0.072705,0.107805,0.010009,0.0,0.200123,0.140594,...,0,0,0,0,1,0,0,0,0,0
285177,1.0,0.107421,0.0,0.076539,0.069032,0.038682,0.013728,0.023599,0.293687,0.128069,...,0,0,0,0,0,1,0,0,0,0
285178,1.0,0.107421,0.0,0.076539,0.069032,0.038682,0.013728,0.023599,0.293687,0.128069,...,0,0,0,0,0,1,0,0,0,0
285179,1.0,0.200625,0.0,0.076539,0.069032,0.038682,0.013728,0.023599,0.293687,0.128069,...,0,0,0,0,0,1,0,0,0,0
285180,1.0,0.248881,0.0,0.076539,0.069032,0.038682,0.013728,0.023599,0.293687,0.128069,...,0,0,0,0,0,1,0,0,0,0


In [25]:
# Fit the regression model
model = sm.OLS(y, X)
results = model.fit()

# Print the regression results
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:            leaver_rate   R-squared:                       0.092
Model:                            OLS   Adj. R-squared:                  0.092
Method:                 Least Squares   F-statistic:                     362.2
Date:                Tue, 10 Oct 2023   Prob (F-statistic):               0.00
Time:                        12:52:43   Log-Likelihood:                -14915.
No. Observations:              206646   AIC:                         2.995e+04
Df Residuals:                  206587   BIC:                         3.055e+04
Df Model:                          58                                         
Covariance Type:            nonrobust                                         
                                                     coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------

In [26]:
results_summary = results.summary()

# Note that tables is a list. The table at index 1 is the "core" table. Additionally, read_html puts dfs in a list, so we want index 0
results_as_html = results_summary.tables[1].as_html()
pd.read_html(results_as_html, header=0, index_col=0)[0]

Unnamed: 0,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.8559,0.178,-4.81,0.0,-1.205,-0.507
joiner_rate,0.5306,0.005,101.698,0.0,0.52,0.541
%_amb_staff,0.9841,0.191,5.145,0.0,0.609,1.359
%_cent_funct,1.0492,0.179,5.865,0.0,0.699,1.4
%_senior_docs,1.18,0.196,6.022,0.0,0.796,1.564
%_estates,0.9187,0.176,5.222,0.0,0.574,1.264
%_managers,1.1194,0.196,5.71,0.0,0.735,1.504
%_midwives,1.2043,0.18,6.7,0.0,0.852,1.557
%_nurses_hv,1.0261,0.178,5.756,0.0,0.677,1.376
%_sci_tech_staff,0.9424,0.178,5.289,0.0,0.593,1.292


In [27]:
# annual_results_text = results.as_text()

# import csv
# resultFile = open("table.csv",'w')
# resultFile.write(annual_results_text)
# resultFile.close()

# results.tables[0].to_csv("annual_results_text")

# pd.read_csv("annual_results_text")

import lxml

# Convert the regression results summary to a DataFrame
results_df = pd.read_html(results.summary().tables[1].as_html(), header=0, index_col=0)[0]

# Export the DataFrame to a CSV file
results_df.to_csv("annual_regression_results.csv")

### Monthly data

In [28]:
dv = 'leaver_rate'
to_drop = ['month_year','org_code','staff_group','annual_sickness_absence',
           dv]

monthly_df_ivs.reset_index(drop = True)
X = monthly_df_ivs.drop(to_drop, axis=1)
y = monthly_df_ivs[dv]

#y = y.dropna()

y.head()


5560    0.030204
5561    0.030204
5562    0.005962
5563    0.005962
5566    0.002877
Name: leaver_rate, dtype: float64

In [29]:
# Add a constant column to the design matrix
X = sm.add_constant(X)

X.tail()


Unnamed: 0,const,joiner_rate,%_amb_staff,%_cent_funct,%_senior_docs,%_estates,%_managers,%_midwives,%_nurses_hv,%_unknown_x,...,region_name_South West,org_type_Acute - Medium,org_type_Acute - Multi-Service,org_type_Acute - Small,org_type_Acute - Specialist,org_type_Acute - Teaching,org_type_Ambulance Trust,org_type_Care Trust,org_type_Community Provider Trust,org_type_Mental Health and Learning Disability
237608,1.0,0.024402,0.0,0.100655,0.041793,0.063877,0.005879,0.0,0.27911,0.00312,...,0,0,0,0,0,0,0,0,0,1
237609,1.0,0.024402,0.0,0.100655,0.041793,0.063877,0.005879,0.0,0.27911,0.00312,...,0,0,0,0,0,0,0,0,0,1
237610,1.0,0.024343,0.0,0.100655,0.041793,0.063877,0.005879,0.0,0.27911,0.00312,...,0,0,0,0,0,0,0,0,0,1
237611,1.0,0.024343,0.0,0.100655,0.041793,0.063877,0.005879,0.0,0.27911,0.00312,...,0,0,0,0,0,0,0,0,0,1
237613,1.0,0.021979,0.0,0.110338,0.029508,0.032491,0.01503,0.0,0.234538,0.000184,...,1,0,0,0,0,0,0,0,0,1


In [30]:
# Fit the regression model
model = sm.OLS(y, X)
results = model.fit()

# Print the regression results
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:            leaver_rate   R-squared:                       0.134
Model:                            OLS   Adj. R-squared:                  0.134
Method:                 Least Squares   F-statistic:                     416.1
Date:                Tue, 10 Oct 2023   Prob (F-statistic):               0.00
Time:                        12:52:46   Log-Likelihood:             3.9492e+05
No. Observations:              155559   AIC:                        -7.897e+05
Df Residuals:                  155500   BIC:                        -7.891e+05
Df Model:                          58                                         
Covariance Type:            nonrobust                                         
                                                     coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------