In [2]:
import pandas as pd
# Relative folder that holds the raw input files.
data_path = '../raw_data/'
# Read the HCP case-study workbook from that folder into `df`.
df = pd.read_excel(f'{data_path}HCP Dataset for Case Study.xlsx')

## Background Research
- HCP (Hospital Casemix Protocol)
    - standardised format of recording hospital episode data
    - provides demographic, financial and clinical information
    - casemix funding - hospitals funded based on type and complexity of the cases that they treat, rather than a fixed flat rate.
    - hospitals provide to health insurers
    - health insurers provide to Department of Health, Disability and Ageing

- episode
    - period between admission and separation (formal or statistical) that a person spends in the hospital
    - formal: administrative process used by the hospital to record the commencement/end of undertaking, treating, or caring for a patient
    - statistical: administrative process used by the hospital to record the commencement/end of a new episode (typically by another department or institution, or when a different type of care is required)
    - essential date and time
    - Great resource with examples in ../resources/info-sheet-adm-sep-v1.0.pdf


## Initial Questions

- Can we determine which features are linked with readmissions within 28 days -> improvements in these areas could decrease readmissions, which is an indicator of quality care and helps avoid Avoidable Hospital Readmissions (AHR)
- Can the complexity of the episode be measured as an interpretable index -> overall funding from department could then be predicted from a collective measurement involving this index
- Can the funding be predicted using the HCP data, and which features contribute the most to this funding? We can simulate many "what-if" scenarios, e.g. if we find that more older patients are being admitted, can we predict how much funding we would receive from their episodes and, based on that, pivot into ways we can accommodate this increase?
- Can the data be used to find which time periods are the busiest and pivot into ways we can accommodate these periods? E.g. more rostered staff, ensuring enough allocatable rooms/beds.


In [5]:
# Preview the first five rows of the loaded dataset.
df.head(n=5)

Unnamed: 0,InsurerIdentifier,EpisodeIdentifier,DateOfBirth,Postcode,Sex,AdmissionDate,SeparationDate,HospitalType,ICU_Days,ICU_Hours,...,BundledCharges,HIH_Charges,SCN_Charges,CCU_Charges,SCN_Hours,CCU_Hours,SCN_Days,CCU_Days,QualifiedDaysNewborns,PalliativeCareDays
0,INS1,1624122,1012000,6280,1,1012023,2012023,2,0,0,...,316700,0,0,0,0,0,0,0,0,0
1,INS5,1624177,1011937,6233,2,1012023,6012023,2,0,0,...,696100,0,0,0,0,0,0,0,0,0
2,INS9,1624113,1011968,6225,2,1012023,9012023,2,0,0,...,0,0,0,0,0,0,0,0,0,0
3,,1624005,1011955,6281,1,1022023,1022023,2,0,0,...,0,0,0,0,0,0,0,0,0,0
4,,1625135,1011963,6230,1,1022023,1022023,2,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Show every column name as a plain Python list.
df.columns.tolist()

['InsurerIdentifier',
 'EpisodeIdentifier',
 'DateOfBirth',
 'Postcode',
 'Sex',
 'AdmissionDate',
 'SeparationDate',
 'HospitalType',
 'ICU_Days',
 'ICU_Hours',
 'TotalPyschCareDays',
 'DRG',
 'DRG_Version',
 'AdmissionTime',
 'UrgencyOfAdmission',
 'TransferInProviderNumber',
 'CareType',
 'SourceOfReferral',
 'DischargeIntention',
 'InterHospitalContracted',
 'MentalHealthLegalStatus',
 'PalliativeCareStatus',
 'Readmission28Days',
 'UnplannedTheatreVisit',
 'InfantWeight',
 'HoursMechVentilation',
 'ModeOfSeparation',
 'SeparationTime',
 'TotalLeaveDays',
 'TransferOutProviderNumber',
 'NonCertifiedDays',
 'HIH_Days',
 'PrincipalDiagnosis',
 'AdditionalDiagnosis1',
 'AdditionalDiagnosis2',
 'AdditionalDiagnosis3',
 'AdditionalDiagnosis4',
 'AdditionalDiagnosis5',
 'AdditionalDiagnosis6',
 'AdditionalDiagnosis7',
 'AdditionalDiagnosis8',
 'AdditionalDiagnosis9',
 'AdditionalDiagnosis10',
 'AdditionalDiagnosis11',
 'AdditionalDiagnosis12',
 'AdditionalDiagnosis13',
 'AdditionalDiagno

In [None]:
# Frequency of each Readmission28Days code across all episodes.
df.loc[:, 'Readmission28Days'].value_counts()

# Nearly every episode carries the same code, so the readmission field has too
# little variation to support any analysis on or involving readmissions,
# including predicting readmissions with discriminative predictive models.

Readmission28Days
8    30613
1        1
3        1
Name: count, dtype: int64

In [10]:
# Summary statistics for every column whose name contains "charges" (case-insensitive).
charge_cols = [col for col in df.columns if 'charges' in col.lower()]
df[charge_cols].describe()

Unnamed: 0,OtherCharges,BundledCharges,HIH_Charges,SCN_Charges,CCU_Charges
count,30615.0,30615.0,30615.0,30615.0,30615.0
mean,430.1548,167218.4,7.172954,9.312429,309.7694
std,10406.64,316807.3,782.648622,1180.205679,15064.97
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0
75%,0.0,199200.0,0.0,0.0,0.0
max,1079500.0,5707700.0,109800.0,174200.0,1481400.0


In [13]:
# Diagnosis code columns: any column whose name contains "diag" (case-insensitive);
# this also matches every name that contains "diagnosis".
diag_cols = [col for col in df.columns if 'diag' in col.lower()]
# Count each distinct combination of diagnosis codes across those columns.
df[diag_cols].value_counts().reset_index()


Unnamed: 0,PrincipalDiagnosis,AdditionalDiagnosis1,AdditionalDiagnosis2,AdditionalDiagnosis3,AdditionalDiagnosis4,AdditionalDiagnosis5,AdditionalDiagnosis6,AdditionalDiagnosis7,AdditionalDiagnosis8,AdditionalDiagnosis9,...,AdditionalDiagnosis41,AdditionalDiagnosis42,AdditionalDiagnosis43,AdditionalDiagnosis44,AdditionalDiagnosis45,AdditionalDiagnosis46,AdditionalDiagnosis47,AdditionalDiagnosis48,AdditionalDiagnosis49,count
0,2Z491,,,,,,,,,,...,,,,,,,,,,9144
1,2H269,,,,,,,,,,...,,,,,,,,,,323
2,2Z511,2C9000,,,,,,,,,...,,,,,,,,,,279
3,2Z511,2C509,,,,,,,,,...,,,,,,,,,,263
4,2Z380,,,,,,,,,,...,,,,,,,,,,181
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10959,2K20,,,,,,,,,,...,,,,,,,,,,1
10960,2K20,2K222,2K449,2Z8643,2U073,,,,,,...,,,,,,,,,,1
10961,2K20,2K2950,2Z720,2U781,,,,,,,...,,,,,,,,,,1
10962,2K20,2K2960,2K449,2Z980,2Z8643,2U073,2U793,2U823,,,...,,,,,,,,,,1


In [18]:
# First rows of the episodes whose principal diagnosis equals '2Z491'
# once surrounding whitespace is stripped from the code.
principal_dx = df['PrincipalDiagnosis'].str.strip()
df.loc[principal_dx.eq('2Z491')].head()

Unnamed: 0,InsurerIdentifier,EpisodeIdentifier,DateOfBirth,Postcode,Sex,AdmissionDate,SeparationDate,HospitalType,ICU_Days,ICU_Hours,...,BundledCharges,HIH_Charges,SCN_Charges,CCU_Charges,SCN_Hours,CCU_Hours,SCN_Days,CCU_Days,QualifiedDaysNewborns,PalliativeCareDays
7,,1628222,1011939,6230,1,1022023,1022023,2,0,0,...,0,0,0,0,0,0,0,0,0,0
8,,1628226,1011959,6236,1,1022023,1022023,2,0,0,...,0,0,0,0,0,0,0,0,0,0
9,,1628229,1011965,6220,1,1022023,1022023,2,0,0,...,0,0,0,0,0,0,0,0,0,0
10,,1628233,1011953,6225,2,1022023,1022023,2,0,0,...,0,0,0,0,0,0,0,0,0,0
11,,1628237,1011953,6233,2,1022023,1022023,2,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
# Build a two-column frame of admission date/time. AdmissionDate is rendered as
# text, left-padded with zeros to 8 characters, then parsed as day-month-year.
df_times = df[['AdmissionDate', 'AdmissionTime']].assign(
    AdmissionDate=lambda frame: pd.to_datetime(
        frame['AdmissionDate'].astype(str).str.zfill(8),
        format='%d%m%Y',
    )
)
df_times

Unnamed: 0,AdmissionDate,AdmissionTime
0,2023-01-01,850
1,2023-01-01,1330
2,2023-01-01,1155
3,2023-02-01,1037
4,2023-02-01,1307
...,...,...
30610,2022-12-31,1500
30611,2022-12-31,1630
30612,2022-12-31,1325
30613,2022-12-31,1140


In [28]:
# Summarise the parsed admission dates (count, mean, min, quartiles, max).
df_times.loc[:, 'AdmissionDate'].describe()

count                            30615
mean     2022-12-31 18:35:04.615384832
min                2022-04-14 00:00:00
25%                2022-10-04 00:00:00
50%                2023-01-03 00:00:00
75%                2023-04-03 00:00:00
max                2023-06-30 00:00:00
Name: AdmissionDate, dtype: object