In [10]:
import pandas as pd
import os, re
import json
import ast
from datetime import datetime

In [11]:
os.listdir("./tempdir")

['Annual turnover from organisation benchmarking source data, February 2023.csv',
 'inactive_organisations.txt',
 'Monthly turnover from organisation benchmarking source data, February 2023.csv',
 'turnover.zip']

In [12]:
annual_csv = [f for f in os.listdir('./tempdir') if re.match(r"(?i)^annual", f)]
monthly_csv = [f for f in os.listdir('./tempdir') if re.match(r"(?i)^monthly", f)]

In [13]:
annual_csv

['Annual turnover from organisation benchmarking source data, February 2023.csv']

In [14]:
monthly_csv

['Monthly turnover from organisation benchmarking source data, February 2023.csv']

In [15]:
with open('./tempdir/inactive_organisations.txt', 'r') as file:
    # Read the contents of the file
    data = json.load(file)
    js_df = converted_dict = {row['org_id']: row['name'] for row in data}

file.close()


In [16]:
annual_df = pd.read_csv(f"./tempdir/{annual_csv[0]}")
monthly_df = pd.read_csv(f"./tempdir/{monthly_csv[0]}")

In [17]:
annual_df.head()

Unnamed: 0,Period,Type,Org code,Org name,NHSE region code,NHSE region name,ICS code,ICS name,Cluster group,Benchmark group,Staff group,HC,FTE
0,31/01/2022,Denoms,00Q,NHS Blackburn with Darwen CCG,Y62,North West,QE1,Lancashire and South Cumbria,Clinical Commissioning Group,Clinical Commissioning Group,Nurses & health visitors,4,3.1
1,31/01/2022,Denoms,00Q,NHS Blackburn with Darwen CCG,Y62,North West,QE1,Lancashire and South Cumbria,Clinical Commissioning Group,Clinical Commissioning Group,"Scientific, therapeutic & technical staff",6,5.33334
2,31/01/2022,Denoms,00Q,NHS Blackburn with Darwen CCG,Y62,North West,QE1,Lancashire and South Cumbria,Clinical Commissioning Group,Clinical Commissioning Group,Senior managers,11,7.07254
3,31/01/2022,Denoms,00R,NHS Blackpool CCG,Y62,North West,QE1,Lancashire and South Cumbria,Clinical Commissioning Group,Clinical Commissioning Group,Managers,24,22.18
4,31/01/2022,Denoms,00T,NHS Bolton CCG,Y62,North West,QOP,Greater Manchester,Clinical Commissioning Group,Clinical Commissioning Group,Nurses & health visitors,16,14.55333


In [18]:
monthly_df.head()

Unnamed: 0,Period,Type,Org code,Org name,NHSE region code,NHSE region name,ICS code,ICS name,Cluster group,Benchmark group,Staff group,HC,FTE
0,202112 to 202201,Leavers,RXX,Surrey and Borders Partnership NHS Foundation ...,Y59,South East,QXU,Surrey Heartlands,Mental Health,Mental Health and Learning Disability,All staff groups,46,41.09334
1,202112 to 202201,Leavers,RJ2,Lewisham and Greenwich NHS Trust,Y56,London,QKK,South East London,Acute,Acute - Large,All staff groups,112,101.00739
2,202112 to 202201,Leavers,RDR,Sussex Community NHS Foundation Trust,Y59,South East,QNX,Sussex,Community Provider Trust,Community Provider Trust,All staff groups,86,67.40329
3,202112 to 202201,Leavers,26A,NHS Norfolk and Waveney CCG,Y61,East of England,QMM,Norfolk and Waveney,Clinical Commissioning Group,Clinical Commissioning Group,All staff groups,9,8.2
4,202112 to 202201,Leavers,RWX,Berkshire Healthcare NHS Foundation Trust,Y59,South East,QNQ,Frimley,Mental Health,Mental Health and Learning Disability,All staff groups,73,60.87147


In [19]:
def process_turnover_df(df):
    df1 = pd.DataFrame(df)
    
    # df1['date_string'] = df1['Period'].apply(lambda x: x.split(' to ')[-1]+'01' if x.endswith(('Leavers', 'Joiners')) else ('01/'+x[3:10] if x.startswith('Denoms') else None))
    #df1['thedate'] = pd.to_datetime(datetime.now())

    df1['thedate'] = df1.apply(lambda x: datetime.strptime(x['Period'].split(' to ')[0]+'01', '%Y%m%d') if x['Type'] in ('Leavers', 'Joiners') else (datetime.strptime('01/'+x['Period'][3:10], '%d/%m/%Y') if x['Type'] == 'Denoms' else None), axis = 1)   
    
    df2 = df1.groupby(['Org code', 'Staff group', 'thedate']).apply(lambda x: 
        pd.Series({
        'n': x.shape[0],
        'join_HC': x.loc[x['Type'].str.contains('Joiners'), 'HC'].values[0] if any(x['Type'] == 'Joiners') else None,
        'join_FTE': x.loc[x['Type'] == 'Joiners', 'FTE'].values[0] if any(x['Type'] == 'Joiners') else None,
        'leave_HC': x.loc[x['Type'] == 'Leavers', 'HC'].values[0] if any(x['Type'] == 'Leavers') else None,
        'leave_FTE': x.loc[x['Type'] == 'Leavers', 'FTE'].values[0] if any(x['Type'] == 'Leavers') else None,
        'denom_HC': x.loc[x['Type'] == 'Denoms', 'HC'].values[0] if any(x['Type'] == 'Denoms') else None,
        'denom_FTE': x.loc[x['Type'] == 'Denoms', 'FTE'].values[0] if any(x['Type'] == 'Denoms') else None,
    })).reset_index().rename(columns={
        'Org code': 'org_code',
        'Staff group': 'staff_group',
        'thedate': 'month_year'
    }).drop_duplicates()

    return df2


In [20]:
processed_annual_df = process_turnover_df(annual_df) 
processed_annual_df.shape[0]

246144

In [21]:
processed_annual_df.head()


Unnamed: 0,org_code,staff_group,month_year,n,join_HC,join_FTE,leave_HC,leave_FTE,denom_HC,denom_FTE
0,00D,All staff groups,2017-08-01,2.0,,,1.0,0.1,1.0,0.1
1,00D,All staff groups,2017-09-01,2.0,,,1.0,0.1,1.0,0.1
2,00D,All staff groups,2017-10-01,2.0,,,1.0,0.1,1.0,0.1
3,00D,All staff groups,2017-11-01,2.0,,,1.0,0.1,1.0,0.1
4,00D,All staff groups,2017-12-01,2.0,,,1.0,0.1,1.0,0.1


In [22]:
processed_annual_df.loc[processed_annual_df['org_code'] == '00Q'].sort_values(by=['n', 'month_year'], ascending=False).head()


Unnamed: 0,org_code,staff_group,month_year,n,join_HC,join_FTE,leave_HC,leave_FTE,denom_HC,denom_FTE
635,00Q,All staff groups,2021-05-01,3.0,2.0,2.0,6.0,5.69334,46.0,31.76955
693,00Q,Central functions,2021-05-01,3.0,1.0,1.0,2.0,1.86667,6.0,5.02667
969,00Q,"Scientific, therapeutic & technical staff",2021-05-01,3.0,1.0,1.0,2.0,1.82667,6.0,5.33334
634,00Q,All staff groups,2021-04-01,3.0,2.0,2.0,6.0,6.29334,47.0,32.66952
692,00Q,Central functions,2021-04-01,3.0,1.0,1.0,2.0,1.86667,6.0,5.02667


In [23]:
processed_monthly_df = process_turnover_df(monthly_df) 
processed_monthly_df.shape[0]

193650

In [24]:
processed_annual_df.to_csv('./tempdir/processed_annual_data.csv', index=False)
processed_monthly_df.to_csv('./tempdir/processed_monthly_data.csv', index=False)

In [58]:
#data_dir = 
processed_monthly_df.to_csv('../01_data/processed_monthly_turnover.csv', index=False)

OSError: Cannot save file into a non-existent directory: '..\01_data'

In [26]:
'03_data_wrangling'('python_data_wrangling'(file))

In [56]:
#os.path.abspath(os.path.join(os.path.dirname('./python_data_wrangling'),".."))
path = os.path.abspath('python')


TypeError: abspath() missing 1 required positional argument: 'path'

In [57]:
os.getcwd()

'c:\\Users\\MarieRogers\\NHS England\\Workforce Insights and Analytics - General\\Working_Folders\\Marie R\\HSMAproject2023\\03_data_wrangling\\python_data_wrangling'

In [55]:
path

'c:\\Users\\MarieRogers\\NHS England\\Workforce Insights and Analytics - General\\Working_Folders\\Marie R\\HSMAproject2023\\03_data_wrangling\\03_data_wrangling'