In [None]:
import re
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd

Load sickness absence reason data

In [None]:
path = "tempdir/reason"  # or unix / linux / mac path

# Collect the input files once, as sorted lists. ``Path.glob`` returns a one-shot
# generator: if the raw generator were kept, re-running a loading cell would
# iterate over nothing and ``pd.concat`` would fail on an empty list.
files_monthly_mds = sorted(Path(path).glob('*-reason-mds.csv'))  # These have days lost and available on separate rows
files_monthly = sorted(Path(path).glob('*-reason.csv'))

# Source header -> canonical column name. The mapping is applied to every file
# that is loaded (MDS and non-MDS); headers without an entry are kept unchanged.
# Presumably only the MDS files carry the mixed-case headers — TODO confirm.
columns_to_merge = {
    'DATE': 'DATE',
    'Month': 'DATE',
    'Type': 'TYPE',
    'Staff group': 'STAFF_GROUP',
    'Reason': 'REASON',
    'FTE days': 'FTE_DAYS',
}

Concatenate the MDS files in `tempdir` first, then combine them with the other monthly data

In [None]:
# Read every MDS file, tag its rows with the date prefix of the file name,
# harmonise the headers, then stack everything into one frame.
dfs = []

for f in files_monthly_mds:
    data = pd.read_csv(f)
    # The first 10 characters of the file name are used as the file date
    # (presumably YYYY-MM-DD — TODO confirm). Slicing ``f.name`` instead of the
    # full path string keeps this correct if the directory in ``path`` changes.
    data['FILE_DATE'] = f.name[:10]
    data.columns = [columns_to_merge.get(k, k) for k in data.columns]
    dfs.append(data)

if not dfs:
    # pd.concat would raise an opaque "No objects to concatenate" here.
    raise ValueError(
        f"No '*-reason-mds.csv' files to load from {path!r} "
        "(directory empty, or the file iterator was already consumed)"
    )

df = pd.concat(dfs, ignore_index=True)

In [None]:
# Show the column labels of the concatenated MDS frame (iterating a DataFrame yields its column labels).
list(df)

Keep a single row per DATE / TYPE / STAFF_GROUP / REASON combination, taken from the most up-to-date file

In [None]:
# Keep, for every key combination, the complete row coming from the newest file.
# GroupBy.first() is deliberately avoided: it returns the first *non-null* value
# per column, so a missing value in the newest file would silently be replaced
# by a value from an older file while FILE_DATE still pointed at the newest one.
mds_keys = ['DATE', 'TYPE', 'STAFF_GROUP', 'REASON']
df1 = (
    df.dropna(subset=mds_keys)  # groupby() also ignored rows with a missing key
    .sort_values(by='FILE_DATE', ascending=False, kind='mergesort')  # stable: ties keep load order
    .drop_duplicates(subset=mds_keys, keep='first')
    .sort_values(by=mds_keys, kind='mergesort')  # same row order as the grouped result
    .reset_index(drop=True)
)
# Key columns first, remaining columns in their original order.
df1 = df1[mds_keys + [c for c in df1.columns if c not in mds_keys]]

In [None]:
# Preview the first rows of the de-duplicated MDS frame.
df1.head()

In [None]:
# Parse both date columns and lower-case the reason labels in a single step.
df1 = df1.assign(
    DATE=pd.to_datetime(df1['DATE'], yearfirst=True),
    FILE_DATE=pd.to_datetime(df1['FILE_DATE'], yearfirst=True),
    REASON=df1['REASON'].str.lower(),
)

In [None]:
# Reshape from one row per TYPE to one row per (DATE, STAFF_GROUP, REASON) with
# the two measures side by side. This is a vectorised replacement for a
# per-group Python lambda in groupby().apply(), which is very slow on many groups.
group_keys = ['DATE', 'STAFF_GROUP', 'REASON']
keyed = df1.dropna(subset=group_keys)  # groupby() also ignored rows with a missing key

# First row of every group, in key order: supplies FILE_DATE and the output index.
first_rows = keyed.drop_duplicates(subset=group_keys).set_index(group_keys).sort_index()

reshaped = {'FILE_DATE': first_rows['FILE_DATE']}
for column, type_label in (
    ('FTE_DAYS_LOST', 'FTE days lost'),
    ('FTE_DAYS_AVAILABLE', 'FTE days available'),
):
    # FTE_DAYS of the first row of this TYPE within each group ...
    per_key = (
        keyed.loc[keyed['TYPE'] == type_label]
        .drop_duplicates(subset=group_keys)
        .set_index(group_keys)['FTE_DAYS']
    )
    # ... and 0 for groups that have no row of this TYPE at all. fill_value only
    # fills absent keys, so a genuine NaN in the data is left untouched.
    reshaped[column] = per_key.reindex(first_rows.index, fill_value=0)

df2 = pd.DataFrame(reshaped).reset_index()

# XX min runtime MR work laptop

In [None]:
# Preview the reshaped MDS frame (FTE_DAYS_LOST and FTE_DAYS_AVAILABLE as columns).
df2.head()

# Fetch monthly (non-MDS) data

In [None]:
# Read every monthly (non-MDS) file, tag its rows with the date prefix of the
# file name, harmonise the headers, then stack everything into one frame.
dfsm = []

for f in files_monthly:
    data = pd.read_csv(f)
    # The first 10 characters of the file name are used as the file date
    # (presumably YYYY-MM-DD — TODO confirm). Slicing ``f.name`` instead of the
    # full path string keeps this correct if the directory in ``path`` changes.
    data['FILE_DATE'] = f.name[:10]
    data.columns = [columns_to_merge.get(k, k) for k in data.columns]
    dfsm.append(data)

if not dfsm:
    # pd.concat would raise an opaque "No objects to concatenate" here.
    raise ValueError(
        f"No '*-reason.csv' files to load from {path!r} "
        "(directory empty, or the file iterator was already consumed)"
    )

dfm = pd.concat(dfsm, ignore_index=True)

In [None]:
# Preview the first rows of the concatenated monthly (non-MDS) frame.
dfm.head()

In [None]:
# Show the column labels of the concatenated monthly frame.
list(dfm)

In [None]:
# Keep, for every key combination, the complete row coming from the newest file.
# GroupBy.first() is deliberately avoided: it returns the first *non-null* value
# per column, so a missing value in the newest file would silently be replaced
# by a value from an older file while FILE_DATE still pointed at the newest one.
monthly_keys = ['DATE', 'STAFF_GROUP', 'REASON']
df1m = (
    dfm.dropna(subset=monthly_keys)  # groupby() also ignored rows with a missing key
    .sort_values(by='FILE_DATE', ascending=False, kind='mergesort')  # stable: ties keep load order
    .drop_duplicates(subset=monthly_keys, keep='first')
    .sort_values(by=monthly_keys, kind='mergesort')  # same row order as the grouped result
    .reset_index(drop=True)
)
# Key columns first, remaining columns in their original order.
df1m = df1m[monthly_keys + [c for c in df1m.columns if c not in monthly_keys]]

In [None]:
# Parse both date columns (DATE is day/month/year in these files) and
# lower-case the reason labels in a single step.
df1m = df1m.assign(
    DATE=pd.to_datetime(df1m['DATE'], yearfirst=False, format="%d/%m/%Y"),
    FILE_DATE=pd.to_datetime(df1m['FILE_DATE'], yearfirst=True),
    REASON=df1m['REASON'].str.lower(),
)

In [None]:
# Preview the first rows of the de-duplicated monthly frame after type conversion.
df1m.head()

In [None]:
# Stack the reshaped MDS rows on top of the monthly rows. ignore_index=True
# gives the result a fresh 0..n-1 index instead of repeating the labels of both
# inputs, matching the other concat calls in this notebook.
df_combo = pd.concat([df2, df1m], ignore_index=True)

In [None]:
# Preview the first rows of the stacked MDS + monthly frame.
df_combo.head()

In [None]:
# Where the same key appears in both sources, keep the complete row with the
# newest FILE_DATE. GroupBy.first() is deliberately avoided: it returns the first
# *non-null* value per column, which would blend columns from different sources
# into a single row.
# NOTE(review): if coalescing columns across the two sources was intended,
# confirm before relying on the stricter behaviour.
combo_keys = ['DATE', 'STAFF_GROUP', 'REASON']
df_combo1 = (
    df_combo.dropna(subset=combo_keys)  # groupby() also ignored rows with a missing key
    .sort_values(by='FILE_DATE', ascending=False, kind='mergesort')  # stable: ties keep concat order
    .drop_duplicates(subset=combo_keys, keep='first')
    .sort_values(by=combo_keys, kind='mergesort')  # same row order as the grouped result
    .reset_index(drop=True)
)
# Key columns first, remaining columns in their original order.
df_combo1 = df_combo1[combo_keys + [c for c in df_combo1.columns if c not in combo_keys]]

In [None]:
# Preview the first rows of the final de-duplicated table.
df_combo1.head()

In [None]:
# Write the final table to CSV without the index column.
# NOTE(review): the following cell reads '../../03_dashboard/data/sickness_reasons.csv',
# which is a different location from the one written here — confirm whether the file
# is copied there manually or whether one of the two paths is out of date.
df_combo1.to_csv('../sickness_reasons.csv', index = False)

In [None]:
# Load the dashboard copy of the table, lower-case every column label and
# parse the date column.
df_r2 = (
    pd.read_csv('../../03_dashboard/data/sickness_reasons.csv')
    .rename(columns=str.lower)
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)

In [None]:
# Preview the first rows of the reloaded table (lower-cased column labels).
df_r2.head()

In [None]:
# Sorted distinct values of the two categorical columns.
staff_group_list, sickness_reason_list = (
    sorted(df_r2[column].unique()) for column in ('staff_group', 'reason')
)

In [None]:
# Display the sorted list of distinct sickness reasons.
sickness_reason_list

In [None]:
# Reasons whose label starts with an "s" followed by at least one digit
# (e.g. "s12 other musculoskeletal problems"). ``re`` is imported in the
# notebook's top import cell.
reason_code_pattern = re.compile(r'^s[0-9]+')
[reason for reason in sickness_reason_list if reason_code_pattern.match(reason)]

In [None]:

# Select a single staff group and sickness reason.
# NOTE: this rebinds ``df1``; from here on it no longer refers to the MDS frame
# built earlier in the notebook.
selection = (
    df_r2['staff_group'].isin(["Nurses & health visitors"])
    & df_r2['reason'].isin(["s12 other musculoskeletal problems"])
)
# .copy() makes the selection an independent frame, so adding a column below
# does not raise pandas' SettingWithCopyWarning.
df1 = df_r2.loc[selection].copy()
# Sum of fte_days_lost over all rows sharing the same date and reason, repeated on every row.
df1['total_days_lost'] = df1.groupby(['date', 'reason'])['fte_days_lost'].transform('sum')

In [None]:
# Preview the filtered frame, including the added total_days_lost column.
df1.head()