# Data processing for model

## Import libraries

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

#import matplotlib.pyplot as plt
#import seaborn as sns

#from sklearn.preprocessing import MinMaxScaler
#import geopandas as gpd

## Read in and process dependent variable - turnover

Read turnover data from csv into a dataframe.

In [2]:
# annual and monthly data
annual_url = '../01_data/processed_annual_turnover.csv'
# monthly_url = '../01_data/processed_monthly_turnover.csv'

# staff group ref table
ref_sg = pd.read_csv('../01_data/ref_sg_grouped.csv')

annual_df = pd.read_csv(annual_url, parse_dates=['month_year'])
annual_df = annual_df.drop(['n'],axis=1)
annual_df.info()

# monthly_df = pd.read_csv(monthly_url, parse_dates=['month_year'])
# monthly_df = monthly_df.drop(['n'],axis=1)
# monthly_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 191090 entries, 0 to 191089
Data columns (total 16 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   org_code           191090 non-null  object        
 1   region_name        191090 non-null  object        
 2   org_type           191090 non-null  object        
 3   staff_group        191090 non-null  object        
 4   month_year         191090 non-null  datetime64[ns]
 5   join_HC            145550 non-null  float64       
 6   join_FTE           145550 non-null  float64       
 7   leave_HC           147697 non-null  float64       
 8   leave_FTE          147697 non-null  float64       
 9   denom_HC           186640 non-null  float64       
 10  denom_FTE          186640 non-null  float64       
 11  start_date         191090 non-null  object        
 12  denom_FTE_start    155299 non-null  float64       
 13  denom_HC_start     155299 non-null  float64 

Function to clean and process DV dataframes

In [3]:
def clean_dv(df, ref_sg):
    # drop org_type because it has historic variation which is creating duplicates when mapping later
    df.drop('org_type', axis=1, inplace=True) 
    # add leaver and joiner rates column
    df['leaver_rate'] = df['leave_FTE']/df['denom_FTE_average']
    df['joiner_rate'] = df['join_FTE']/df['denom_FTE_average']

    # drop unneeded HC columns
    df = df.drop(['join_HC','leave_HC','denom_HC','denom_FTE_start',
                    'denom_HC_start','denom_FTE_average','denom_HC_average'],axis=1)
    # drop 'All staff groups' - use sum of all others instead when wanting all staff.
    #df.drop(df[df['staff_group'] == 'All staff groups'].index, inplace = True)

    # group staff groups with mapping table
    df = pd.merge(df, ref_sg, on='staff_group', how='left')

    ## calculate columns with % of staff groups by organisation and date for use in model
    # first, calculate a total staff in post (SIP; all staff) FTE dataframe for each organisation by month.
    df_sip_org = df.groupby(['month_year', 'org_code', 'staff_group','region_name'])['denom_FTE'].sum().reset_index()
    # second, group by 'month_year' and 'org_code' and sum the 'FTE' values for each group
    df_total_sip_FTE = df_sip_org.groupby(['month_year', 'org_code','region_name'])['denom_FTE'].sum().reset_index()
    # third, merge the total_sip_FTE DataFrame back into the original DataFrame 
    df2 = pd.merge(df, df_total_sip_FTE, on=['month_year', 'org_code','region_name'], suffixes=('', '_total'))

    ## (also calculate here a regional sip FTE by staff group column for later use)
    # staff in post by staff group for each region by month
    df_sip_region = df.groupby(['month_year','staff_group','region_name'])['denom_FTE'].sum().reset_index()
    # # # merge the df_sip_region DataFrame  
    df2 = pd.merge(df2, df_sip_region, on=['month_year','region_name','staff_group'], suffixes=('', '_region'))

    df2.rename(columns={'denom_FTE_total': 'total_sip_FTE','denom_FTE_region':'sip_FTE_region'}, inplace=True)

    # fourth, calculate the percentage of staff group FTE by organization and date
    df2['%_FTE'] = df2['denom_FTE'] / df2['total_sip_FTE']

    # fifth, pivot the DataFrame to get staff groups as new columns with % values
    df3 = df2.pivot(index=['month_year', 'org_code','region_name'], 
                    columns='staff_group', values='%_FTE').reset_index()

    # finally, merge the pivot DataFrame with the original DataFrame
    df4 = pd.merge(df2, df3, on=['month_year', 'org_code','region_name'])

    # # make the new staff group name columns friendlier
    df4.rename(columns={'Ambulance staff': 'amb_staff','Central functions':'cent_funct',
    'HCHS doctors (exc. junior Drs)': 'senior_docs',
    'Hotel, property & estates': 'estates', 'Managers':'managers','Nurses & health visitors': 'nurses_hv',
    'Other staff or those with unknown classification':'unknown',
    'Scientific, therapeutic & technical staff':'sci_tech_staff',
    'Senior managers':'senior_managers','Support to ST&T staff': 'supp_sci_tech',
    'Support to doctors, nurses & midwives': 'supp_doc_nur_mid',
    'Midwives':'midwives','Support to ambulance staff': 'supp_amb_staff'}, inplace=True)

    # drop small staff groups & non sig in regression:
    df4.drop(['All staff groups','supp_amb_staff','senior_managers','managers',
                'amb_staff','unknown','cent_funct'], axis=1, inplace=True) 

    # replace inf values with nan (can happen with rate calcs)
    df4.replace([np.inf, -np.inf], np.nan, inplace=True)
    # transform nans to zeros
    df4.fillna(0, inplace=True)

    # Add a small constant to avoid taking the log of zero
    small_constant = 1e-5
    
    # log scale the total_SIP_FTE column to be in line with other variables. proxy for size of organisation
    df4['log_total_sip_FTE'] = np.log(df4['total_sip_FTE'] + small_constant)

    # drop unused columns (keep total SIP FTE for calculating vacancy rates later)
    df4.drop(['join_FTE','leave_FTE','%_FTE','denom_FTE','total_sip_FTE'], axis=1, inplace=True)

    return df4

In [4]:
annual_df1 = clean_dv(annual_df, ref_sg)
#monthly_df1 = clean_dv(monthly_df)

In [5]:
annual_df1.head()

Unnamed: 0,org_code,region_name,staff_group,month_year,start_date,leaver_rate,joiner_rate,grouped_sg,sip_FTE_region,senior_docs,estates,midwives,nurses_hv,sci_tech_staff,supp_sci_tech,supp_doc_nur_mid,log_total_sip_FTE
0,R0A,North West,All staff groups,2017-10-01,2016-10-01,0.0,0.0,all,154513.34292,0.033198,0.006438,0.014217,0.160195,0.075306,0.038975,0.123831,10.439745
1,R0A,North West,Central functions,2017-10-01,2016-10-01,0.0,0.0,admin,12964.56304,0.033198,0.006438,0.014217,0.160195,0.075306,0.038975,0.123831,10.439745
2,R0A,North West,HCHS doctors (exc. junior Drs),2017-10-01,2016-10-01,0.0,0.0,medical,7793.01807,0.033198,0.006438,0.014217,0.160195,0.075306,0.038975,0.123831,10.439745
3,R0A,North West,"Hotel, property & estates",2017-10-01,2016-10-01,0.0,0.0,estates,8643.61861,0.033198,0.006438,0.014217,0.160195,0.075306,0.038975,0.123831,10.439745
4,R0A,North West,Managers,2017-10-01,2016-10-01,0.0,0.0,admin,2506.86746,0.033198,0.006438,0.014217,0.160195,0.075306,0.038975,0.123831,10.439745


The data show the full time equivalent (FTE) number of leavers by organisation and staff group for the previous 12-month period from the date. It also shows the number of staff in post (SIP) FTE averaged over the 12-month period to date.

In [6]:
sorted(annual_df1['staff_group'].unique())

['All staff groups',
 'Ambulance staff',
 'Central functions',
 'HCHS doctors (exc. junior Drs)',
 'Hotel, property & estates',
 'Managers',
 'Midwives',
 'Nurses & health visitors',
 'Other staff or those with unknown classification',
 'Scientific, therapeutic & technical staff',
 'Senior managers',
 'Support to ST&T staff',
 'Support to ambulance staff',
 'Support to doctors, nurses & midwives']

## Load independent variable 1 - local unemployment

Load data about local unemployment so we can use it as a regressor

In [7]:
url_r1 = '../01_data/ONS_localunemployment_monthly.csv'
df_r1 = pd.read_csv(url_r1, parse_dates=['Date'])

df_r1.rename(columns={'%':'local_unemployment','Date':'month_year',
                      'NHSE region name':'region_name'},inplace=True)
df_r1 = df_r1.sort_values('month_year')
df_r1.tail()

Unnamed: 0,month_year,region_name,local_unemployment
2619,2023-06-01,London,0.048
2620,2023-06-01,Midlands,0.0455
2621,2023-06-01,North East,0.052
2622,2023-06-01,North West,0.053
2624,2023-06-01,South West,0.036


## Load IV 2 - sickness absence

Load data about sickness absence to use as second regressor

In [8]:
url_r2 = '../01_data/sickness_benchmarking.csv'
df_r2 = pd.read_csv(url_r2, parse_dates=['DATE'])
trust_types_todrop = ['Clinical Commissioning Group','Integrated Care Board']
df_r2 = df_r2[~df_r2['CLUSTER_GROUP'].isin(trust_types_todrop)]
df_r2 = df_r2.drop(['BENCHMARK_GROUP','ORG_NAME',
                     'NHSE_REGION_CODE','CLUSTER_GROUP'],axis=1)
df_r2.rename(columns={'ORG_CODE':'org_code','DATE':'month_year',
                       'NHSE_REGION_NAME':'region_name','STAFF_GROUP':'staff_group',
                       'FTE_DAYS_LOST':'fte_days_lost','FTE_DAYS_AVAILABLE':'fte_days_available'},inplace=True)
merge_cols = ['month_year', 'org_code','region_name','staff_group']
df_r2['sickness_absence'] = df_r2['fte_days_lost']/df_r2['fte_days_available']
df_r2 = df_r2.reset_index(drop=True)
df_r2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 192606 entries, 0 to 192605
Data columns (total 8 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   month_year          192606 non-null  datetime64[ns]
 1   staff_group         192606 non-null  object        
 2   file_date           192606 non-null  object        
 3   org_code            192606 non-null  object        
 4   region_name         192606 non-null  object        
 5   fte_days_lost       166612 non-null  float64       
 6   fte_days_available  166612 non-null  float64       
 7   sickness_absence    166358 non-null  float64       
dtypes: datetime64[ns](1), float64(3), object(4)
memory usage: 11.8+ MB


Create 12-month rolling sickness absence column for use with annual turnover data

In [9]:
df_r2['month_year'] = pd.to_datetime(df_r2['month_year'])

# Sort the DataFrame by organisation, staff_group, and month
df_r2.sort_values(by=['month_year'], inplace=True)

# # Calculate the rolling sums for days lost and days available
# df_r2['rolling_days_lost'] = df_r2.groupby(['org_code', 
#                         'staff_group'])['fte_days_lost'].rolling(window=12, min_periods=1).sum().reset_index(level=[0, 1], drop=True)

# df_r2['rolling_days_available'] = df_r2.groupby(['org_code', 
#                         'staff_group'])['fte_days_available'].rolling(window=12, min_periods=1).sum().reset_index(level=[0, 1], drop=True)

# # Calculate the rolling sickness absence rate
# df_r2['annual_sickness_absence'] = df_r2['rolling_days_lost'] / df_r2['rolling_days_available']

# drop fte_days_lost fte_days_available, rolling_days_available and rolling_days_lost columns
df_r2.drop(columns=['fte_days_lost', 'fte_days_available'], inplace=True)
#, 'rolling_days_available', 'rolling_days_lost'], inplace=True)

df_r2.tail()

Unnamed: 0,month_year,staff_group,file_date,org_code,region_name,sickness_absence
106839,2023-07-01,All staff groups,2023-06-01,RQ3,Midlands,0.04554
106840,2023-07-01,Central functions,2023-06-01,RQ3,Midlands,0.037741
106841,2023-07-01,HCHS doctors (exc. junior Drs),2023-06-01,RQ3,Midlands,0.006321
92403,2023-07-01,Nurses & health visitors,2023-06-01,NR3,Midlands,0.056169
0,2023-07-01,All staff groups,2023-06-01,RX8,North East and Yorkshire,0.062204


## Load IV 3 - reasons for sickness absence

Add data about reasons for sickness absence

In [10]:
url_r3 = '../01_data/sickness_absence_reason_pivot.csv'
df_r3 = pd.read_csv(url_r3, parse_dates=['Date'])
#df_r3 = df_r3.drop(['FTE days lost'],axis=1)
df_r3.rename(columns={'Date':'month_year','Staff group':'staff_group'},inplace=True)
#df_r2 = df_r2.reset_index(drop=True)

# drop least frequent reasons for absence
df_r3 = df_r3.drop(['substance_abus','asthma',
                    'dental','blood_disorder','endocrine',
                    'eye','skin_disorders','nervous_system',
                    'gynaecological','unknown','pregnancy_related',
                    'other'],axis=1)

# Replace NaN values with 0 
df_r3 = df_r3.fillna(0)
df_r3.info()
# national level data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1413 entries, 0 to 1412
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   month_year      1413 non-null   datetime64[ns]
 1   staff_group     1413 non-null   object        
 2   anxiety_stress  1413 non-null   float64       
 3   back_problems   1413 non-null   float64       
 4   gastro          1413 non-null   float64       
 5   headache_mig    1413 non-null   float64       
 6   infectious_dis  1413 non-null   float64       
 7   other_msk       1413 non-null   float64       
 8   respiratory     1413 non-null   float64       
dtypes: datetime64[ns](1), float64(7), object(1)
memory usage: 99.5+ KB


## Load IV 4 and 5 - staff vacancies

In [11]:
url_sg_ref = '../01_data/ref_sg_vacancy.csv'
df_sg_ref = pd.read_csv(url_sg_ref)
df_sg_ref.head()

Unnamed: 0,staff_group,vacancy_sg,all
0,Ambulance staff,Allied Health Professionals,All staff groups
1,Central functions,Administrative and Clerical,All staff groups
2,HCHS doctors (exc. junior Drs),Medical and Dental,All staff groups
3,"Hotel, property & estates",Estates and Ancillary,All staff groups
4,Managers,Administrative and Clerical,All staff groups


In [12]:
url_r4 = '../01_data/vacancy_ESR.csv'
df_r4 = pd.read_csv(url_r4,parse_dates=['month_year'],dayfirst=True)

df_r4 = df_r4.drop(['Published month','Published quarter','England'],axis=1)
df_r4.rename(columns={'NWD Staff Group':'vacancy_sg','NHS England region':'region_name',
                        'Vacancy Wte':'vacancy_FTE'},inplace=True)

df_r4 = df_r4.fillna(0)

# Remove code in brackets
df_r4['region_name'] = df_r4['region_name'].str[:-6].str.rstrip()

# Add staff groupings to match other datasets
df_r4 = pd.merge(df_r4, df_sg_ref, on='vacancy_sg',how='left')

df_r4 = df_r4.drop(['all'],axis=1)

df_r4.info()

# regional level

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8387 entries, 0 to 8386
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   month_year   8387 non-null   datetime64[ns]
 1   vacancy_sg   8387 non-null   object        
 2   region_name  8387 non-null   object        
 3   vacancy_FTE  8387 non-null   float64       
 4   staff_group  7948 non-null   object        
dtypes: datetime64[ns](1), float64(1), object(3)
memory usage: 327.7+ KB


In [13]:
url_r5 = '../01_data/vacancy_TRAC.csv'
df_r5 = pd.read_csv(url_r5,parse_dates=['month_year'],dayfirst=True)

df_r5 = df_r5.drop(['Published month','Published quarter','England'],axis=1)

df_r5.rename(columns={'NWD Staff Group':'vacancy_sg','NHS England region':'region_name',
                        'Advertised FTE':'advertised_FTE'},inplace=True)

df_r5 = df_r5.fillna(0)

# Remove region code in brackets
df_r5['region_name'] = df_r5['region_name'].str[:-6].str.rstrip()

# Add staff groupings to match other datasets
df_r5 = pd.merge(df_r5, df_sg_ref, on='vacancy_sg',how='left')

df_r5 = df_r5.drop(['all'],axis=1)

# regional level

df_r5.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4226 entries, 0 to 4225
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   month_year      4226 non-null   datetime64[ns]
 1   vacancy_sg      4226 non-null   object        
 2   region_name     4226 non-null   object        
 3   advertised_FTE  4226 non-null   float64       
 4   staff_group     3999 non-null   object        
dtypes: datetime64[ns](1), float64(1), object(3)
memory usage: 165.2+ KB


## Load IV 6 - Reasons for leaving

In [14]:
url_r6 = '../01_data/rfl_jun23.csv'
df_r6_a = pd.read_csv(url_r6,parse_dates=['month_year'],dayfirst=True)


In [15]:
df_r6_a.head()

Unnamed: 0,quarter,month_year,%_death_in_service,%_dismissal,%_end_of_ft,%_flexibility,%_health,%_incompat_relations,%_other,%_pay_reward,%_pregnancy,%_progression_cpd,%_relocation,%_retirement,%_unknown,%_work_life_balance,%_workforce_transform,%_neg_RFL
0,Q1-2016-17,2016-04-01,0.01,0.03,0.09,0.02,0.02,0.01,0.0,0.11,0,0.03,0.12,0.16,0.21,0.1,0.09,0.33
1,Q1-2016-17,2016-05-01,0.01,0.03,0.09,0.02,0.02,0.01,0.0,0.11,0,0.03,0.12,0.16,0.21,0.1,0.09,0.33
2,Q1-2016-17,2016-06-01,0.01,0.03,0.09,0.02,0.02,0.01,0.0,0.11,0,0.03,0.12,0.16,0.21,0.1,0.09,0.33
3,Q2-2016-17,2016-07-01,0.0,0.02,0.34,0.01,0.01,0.01,0.0,0.08,0,0.05,0.09,0.1,0.15,0.06,0.08,0.24
4,Q2-2016-17,2016-08-01,0.0,0.02,0.34,0.01,0.01,0.01,0.0,0.08,0,0.05,0.09,0.1,0.15,0.06,0.08,0.24


In [16]:
to_keep = ['month_year','%_neg_RFL','%_dismissal','%_end_of_ft',
            '%_flexibility','%_health','%_pay_reward','%_progression_cpd',
            '%_relocation','%_retirement','%_work_life_balance','%_workforce_transform']
df_r6 = df_r6_a[to_keep]
#df_r6 = df_r6_a
df_r6.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87 entries, 0 to 86
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   month_year             87 non-null     datetime64[ns]
 1   %_neg_RFL              87 non-null     float64       
 2   %_dismissal            87 non-null     float64       
 3   %_end_of_ft            87 non-null     float64       
 4   %_flexibility          87 non-null     float64       
 5   %_health               87 non-null     float64       
 6   %_pay_reward           87 non-null     float64       
 7   %_progression_cpd      87 non-null     float64       
 8   %_relocation           87 non-null     float64       
 9   %_retirement           87 non-null     float64       
 10  %_work_life_balance    87 non-null     float64       
 11  %_workforce_transform  87 non-null     float64       
dtypes: datetime64[ns](1), float64(11)
memory usage: 8.3 KB


## Merge IV dfs to main df

In [17]:
annual_df1.head()

Unnamed: 0,org_code,region_name,staff_group,month_year,start_date,leaver_rate,joiner_rate,grouped_sg,sip_FTE_region,senior_docs,estates,midwives,nurses_hv,sci_tech_staff,supp_sci_tech,supp_doc_nur_mid,log_total_sip_FTE
0,R0A,North West,All staff groups,2017-10-01,2016-10-01,0.0,0.0,all,154513.34292,0.033198,0.006438,0.014217,0.160195,0.075306,0.038975,0.123831,10.439745
1,R0A,North West,Central functions,2017-10-01,2016-10-01,0.0,0.0,admin,12964.56304,0.033198,0.006438,0.014217,0.160195,0.075306,0.038975,0.123831,10.439745
2,R0A,North West,HCHS doctors (exc. junior Drs),2017-10-01,2016-10-01,0.0,0.0,medical,7793.01807,0.033198,0.006438,0.014217,0.160195,0.075306,0.038975,0.123831,10.439745
3,R0A,North West,"Hotel, property & estates",2017-10-01,2016-10-01,0.0,0.0,estates,8643.61861,0.033198,0.006438,0.014217,0.160195,0.075306,0.038975,0.123831,10.439745
4,R0A,North West,Managers,2017-10-01,2016-10-01,0.0,0.0,admin,2506.86746,0.033198,0.006438,0.014217,0.160195,0.075306,0.038975,0.123831,10.439745


In [18]:
df_r2.tail()

Unnamed: 0,month_year,staff_group,file_date,org_code,region_name,sickness_absence
106839,2023-07-01,All staff groups,2023-06-01,RQ3,Midlands,0.04554
106840,2023-07-01,Central functions,2023-06-01,RQ3,Midlands,0.037741
106841,2023-07-01,HCHS doctors (exc. junior Drs),2023-06-01,RQ3,Midlands,0.006321
92403,2023-07-01,Nurses & health visitors,2023-06-01,NR3,Midlands,0.056169
0,2023-07-01,All staff groups,2023-06-01,RX8,North East and Yorkshire,0.062204


In [19]:
sorted(annual_df1['staff_group'].unique())

['All staff groups',
 'Ambulance staff',
 'Central functions',
 'HCHS doctors (exc. junior Drs)',
 'Hotel, property & estates',
 'Managers',
 'Midwives',
 'Nurses & health visitors',
 'Other staff or those with unknown classification',
 'Scientific, therapeutic & technical staff',
 'Senior managers',
 'Support to ST&T staff',
 'Support to ambulance staff',
 'Support to doctors, nurses & midwives']

In [20]:
def merge_ivs(df, df_r1, df_r2, df_r3,df_r4,df_r5,df_r6):
    # local unemployment rate
    df1 = pd.merge(df, df_r1, on=['month_year', 'region_name'],how='left')
    df1 = df1.sort_values('month_year')

    # sickness absence
    r2_merge_cols = ['month_year', 'org_code','region_name','staff_group']
    df2 = pd.merge(df1, df_r2, on=r2_merge_cols,how='left')
    #df2.drop_duplicates(subset=r2_merge_cols)

    # reason for sickness absence
    r3_merge_cols = ['month_year','staff_group']
    df3 = pd.merge(df2, df_r3, on=r3_merge_cols,how='left')

    # vacancy - need to calculate rate at regional level here
    # use sip_FTE_region calculated earlier
    r4_merge_cols = ['month_year','region_name','staff_group']

    df4 = pd.merge(df3, df_r4, on=r4_merge_cols,how='left')

    df4.drop(columns=['vacancy_sg'], inplace=True)

    df4['vacancy_rate'] = df4['vacancy_FTE'] / df4['sip_FTE_region']

    df5 = pd.merge(df4, df_r5, on=r4_merge_cols,how='left')

    df5.drop(columns=['vacancy_sg'], inplace=True)

    df5['advertised_rate'] = df5['advertised_FTE'] / df5['sip_FTE_region']

    df5.drop(columns=['sip_FTE_region','advertised_FTE','vacancy_FTE'], inplace=True)

    # reasons for leaving
    df6 = pd.merge(df5, df_r6, on='month_year',how='left')

    # add region as dummy variable
    df6 = pd.get_dummies(df6, columns=['region_name'], drop_first=True)

    # add org type as dummy variable
#    df6 = pd.get_dummies(df6, columns=['org_type'], drop_first=True)

    # Convert True/False dummy variable categories to integer 0/1
    bool_columns = df6.select_dtypes(include='bool').columns
    df6[bool_columns] = df6[bool_columns].astype(int)

    # Need to cut dataframe to earliest and latest data available for all fields. Do this by cutting rows where all values for key variables are zero
    df6 = df6[~((df5['leaver_rate'] == 0) | (df6['joiner_rate'] == 0) | (df6['sickness_absence'] == 0))]

    # transform nans into 0s
    df6 = df6.fillna(0)

    # drop duplicates
    df6.drop_duplicates(inplace=True)
    
    return df6

In [21]:
annual_df_ivs = merge_ivs(annual_df1,df_r1, df_r2, df_r3,df_r4,df_r5,df_r6)
#monthly_df_ivs = merge_ivs(monthly_df1,df_r1, df_r2, df_r3,df_r4,df_r5,df_r6)


In [22]:
annual_df_ivs.head()

Unnamed: 0,org_code,staff_group,month_year,start_date,leaver_rate,joiner_rate,grouped_sg,senior_docs,estates,midwives,...,%_relocation,%_retirement,%_work_life_balance,%_workforce_transform,region_name_London,region_name_Midlands,region_name_North East and Yorkshire,region_name_North West,region_name_South East,region_name_South West
33401,RXK,Other staff or those with unknown classification,2018-08-01,2017-08-01,0.339723,0.845264,unknown,0.030216,0.045563,0.01757,...,0.09,0.09,0.07,0.11,0,1,0,0,0,0
33402,RXK,Nurses & health visitors,2018-08-01,2017-08-01,0.120171,0.116056,nurses_midwives,0.030216,0.045563,0.01757,...,0.09,0.09,0.07,0.11,0,1,0,0,0,0
33403,RXK,Managers,2018-08-01,2017-08-01,0.137695,0.114422,admin,0.030216,0.045563,0.01757,...,0.09,0.09,0.07,0.11,0,1,0,0,0,0
33404,RXK,"Hotel, property & estates",2018-08-01,2017-08-01,0.054907,0.041009,estates,0.030216,0.045563,0.01757,...,0.09,0.09,0.07,0.11,0,1,0,0,0,0
33405,RXK,HCHS doctors (exc. junior Drs),2018-08-01,2017-08-01,0.11821,0.09556,medical,0.030216,0.045563,0.01757,...,0.09,0.09,0.07,0.11,0,1,0,0,0,0


In [23]:
annual_df_ivs.info()

<class 'pandas.core.frame.DataFrame'>
Index: 141024 entries, 33401 to 196507
Data columns (total 44 columns):
 #   Column                                Non-Null Count   Dtype         
---  ------                                --------------   -----         
 0   org_code                              141024 non-null  object        
 1   staff_group                           141024 non-null  object        
 2   month_year                            141024 non-null  datetime64[ns]
 3   start_date                            141024 non-null  object        
 4   leaver_rate                           141024 non-null  float64       
 5   joiner_rate                           141024 non-null  float64       
 6   grouped_sg                            141024 non-null  object        
 7   senior_docs                           141024 non-null  float64       
 8   estates                               141024 non-null  float64       
 9   midwives                              141024 non-null  float

In [24]:
annual_df_ivs.to_csv(f'annual_modelling_data.csv', index=False)

In [25]:
#monthly_df_ivs.to_csv(f'monthly_modelling_data.csv', index=False)

## OLS multiple regression

### Annual data

Specify the dependent variable (dv). All other fields to be dropped. 

In [26]:
dv = 'leaver_rate'
to_drop = ['month_year','org_code','staff_group',
            'start_date','file_date','grouped_sg',                          
           dv]

Define the design matrix (X) and the dependent variable (y)


In [27]:
annual_df_ivs.reset_index(drop = True)
X = annual_df_ivs.drop(to_drop, axis=1)
y = annual_df_ivs[dv]

y.head()


33401    0.339723
33402    0.120171
33403    0.137695
33404    0.054907
33405    0.118210
Name: leaver_rate, dtype: float64

In [28]:
# Add a constant column to the design matrix
X = sm.add_constant(X)

X.info()


<class 'pandas.core.frame.DataFrame'>
Index: 141024 entries, 33401 to 196507
Data columns (total 38 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   const                                 141024 non-null  float64
 1   joiner_rate                           141024 non-null  float64
 2   senior_docs                           141024 non-null  float64
 3   estates                               141024 non-null  float64
 4   midwives                              141024 non-null  float64
 5   nurses_hv                             141024 non-null  float64
 6   sci_tech_staff                        141024 non-null  float64
 7   supp_sci_tech                         141024 non-null  float64
 8   supp_doc_nur_mid                      141024 non-null  float64
 9   log_total_sip_FTE                     141024 non-null  float64
 10  local_unemployment                    141024 non-null  float64
 11  s

In [29]:
# Fit the regression model
model = sm.OLS(y, X)
results = model.fit()

# Print the regression results
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:            leaver_rate   R-squared:                       0.159
Model:                            OLS   Adj. R-squared:                  0.159
Method:                 Least Squares   F-statistic:                     720.6
Date:                Tue, 14 Nov 2023   Prob (F-statistic):               0.00
Time:                        17:03:08   Log-Likelihood:             1.5489e+05
No. Observations:              141024   AIC:                        -3.097e+05
Df Residuals:                  140986   BIC:                        -3.093e+05
Df Model:                          37                                         
Covariance Type:            nonrobust                                         
                                           coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------


In [30]:
# Convert the regression results summary to a DataFrame
results_df = pd.read_html(results.summary().tables[1].as_html(), header=0, index_col=0)[0]

# Export the DataFrame to a CSV file
results_df.to_csv("annual_regression_results.csv")

In [33]:
results_df.sort_values(by='coef')

Unnamed: 0,coef,std err,t,P>|t|,[0.025,0.975]
%_relocation,-1.1287,0.053,-21.114,0.0,-1.233,-1.024
sickness_absence,-0.7388,0.011,-67.05,0.0,-0.76,-0.717
local_unemployment,-0.415,0.044,-9.329,0.0,-0.502,-0.328
senior_docs,-0.3189,0.025,-12.666,0.0,-0.368,-0.27
%_retirement,-0.2864,0.019,-15.297,0.0,-0.323,-0.25
%_neg_RFL,-0.1952,0.137,-1.422,0.155,-0.464,0.074
estates,-0.1524,0.015,-9.946,0.0,-0.182,-0.122
%_end_of_ft,-0.1293,0.011,-11.904,0.0,-0.151,-0.108
supp_sci_tech,-0.1064,0.016,-6.678,0.0,-0.138,-0.075
%_workforce_transform,-0.0819,0.01,-8.465,0.0,-0.101,-0.063


### Monthly data

In [32]:
dv = 'leaver_rate'
to_drop = ['month_year','org_code','staff_group','annual_sickness_absence',
           dv]

monthly_df_ivs.reset_index(drop = True)
X = monthly_df_ivs.drop(to_drop, axis=1)
y = monthly_df_ivs[dv]

#y = y.dropna()

y.head()


NameError: name 'monthly_df_ivs' is not defined

In [None]:
# Add a constant column to the design matrix
X = sm.add_constant(X)

X.tail()


Unnamed: 0,const,joiner_rate,%_amb_staff,%_cent_funct,%_senior_docs,%_estates,%_managers,%_midwives,%_nurses_hv,%_unknown_x,...,region_name_South West,org_type_Acute - Medium,org_type_Acute - Multi-Service,org_type_Acute - Small,org_type_Acute - Specialist,org_type_Acute - Teaching,org_type_Ambulance Trust,org_type_Care Trust,org_type_Community Provider Trust,org_type_Mental Health and Learning Disability
243043,1.0,0.011086,0.0,0.09866,0.025805,0.072696,0.023059,0.0,0.219519,0.0,...,0,0,0,0,0,0,0,1,0,0
243045,1.0,0.01634,0.0,0.09866,0.025805,0.072696,0.023059,0.0,0.219519,0.0,...,0,0,0,0,0,0,0,1,0,0
243047,1.0,0.011452,0.0,0.09866,0.025805,0.072696,0.023059,0.0,0.219519,0.0,...,0,0,0,0,0,0,0,1,0,0
243049,1.0,0.019148,0.0,0.107179,0.024261,0.04033,0.014776,0.0,0.310333,0.001028,...,0,0,0,0,0,0,0,1,0,0
243051,1.0,0.010044,0.0,0.109833,0.028598,0.033367,0.016183,0.0,0.232586,0.0,...,1,0,0,0,0,0,0,0,0,1


In [None]:
# Fit the regression model
model = sm.OLS(y, X)
monthly_results = model.fit()

# Print the regression results
print(monthly_results.summary())

                            OLS Regression Results                            
Dep. Variable:            leaver_rate   R-squared:                       0.132
Model:                            OLS   Adj. R-squared:                  0.132
Method:                 Least Squares   F-statistic:                     417.9
Date:                Tue, 10 Oct 2023   Prob (F-statistic):               0.00
Time:                        16:19:21   Log-Likelihood:             4.0457e+05
No. Observations:              159231   AIC:                        -8.090e+05
Df Residuals:                  159172   BIC:                        -8.084e+05
Df Model:                          58                                         
Covariance Type:            nonrobust                                         
                                                     coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------

In [None]:
# Convert the regression results summary to a DataFrame
monthly_results_df = pd.read_html(monthly_results.summary().tables[1].as_html(), header=0, index_col=0)[0]

# Export the DataFrame to a CSV file
monthly_results_df.to_csv("monthly_regression_results.csv")