In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer

from matplotlib import pyplot as plt

pd.set_option('display.max_rows', None)

In [3]:
#Open csv file.

data = pd.read_csv("/content/drive/MyDrive/TQP-MOST/combined_data_impact.csv", index_col=0)
data.head()

Unnamed: 0,AGEYEARS,SEX,WHITE,ASIAN,BLACK,AMERICANINDIAN,PACIFICISLANDER,RACEOTHER,ETHNICITY,WEIGHT,...,HC_PNEUMONIA,HC_PRESSUREULCER,HC_RESPIRATORY,HC_RETURNOR,HC_SEPSIS,HC_STROKECVA,HC_SUPERFICIALSSI,HC_UNPLANNEDICU,HC_UTI,HC_VAPNEUMONIA
190026915265,40.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,68.0,...,No,No,No,No,No,No,No,No,No,No
190026915423,56.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,84.0,...,No,No,No,No,No,No,No,No,No,No
190026915490,48.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,70.0,...,No,No,No,No,No,No,No,No,No,No
190026915495,60.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,60.0,...,No,No,No,No,No,No,No,No,No,No
190026915499,45.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,97.0,...,No,No,No,No,No,No,No,No,No,No


In [4]:
#See all columns.

print(list(data.columns))

['AGEYEARS', 'SEX', 'WHITE', 'ASIAN', 'BLACK', 'AMERICANINDIAN', 'PACIFICISLANDER', 'RACEOTHER', 'ETHNICITY', 'WEIGHT', 'HEIGHT', 'SBP', 'PULSERATE', 'SUPPLEMENTALOXYGEN', 'PULSEOXIMETRY', 'RESPIRATORYASSISTANCE', 'RESPIRATORYRATE', 'TEMPERATURE', 'PREHOSPITALCARDIACARREST', 'GCSEYE', 'GCSVERBAL', 'GCSMOTOR', 'TOTALGCS', 'TBIPUPILLARYRESPONSE', 'TBIMIDLINESHIFT', 'CC_SUBSTANCEABUSE', 'CC_DIABETES', 'CC_HYPERTENSION', 'CC_CHF', 'CC_MI', 'CC_ANGINAPECTORIS', 'CC_CVA', 'CC_PAD', 'CC_COPD', 'CC_RENAL', 'CC_CIRRHOSIS', 'CC_BLEEDING', 'CC_DISCANCER', 'CC_CHEMO', 'CC_DEMENTIA', 'CC_ADHD', 'CC_MENTALPERSONALITY', 'CC_FUNCTIONAL', 'CC_PREGNANCY', 'CC_ANTICOAGULANT', 'CC_STEROID', 'CC_ADLC', 'HOSPITALARRIVALDAYS', 'TRANSPORTMODE', 'INTERFACILITYTRANSFER', 'TRAUMATYPE', 'INTENT', 'MECHANISM', 'PROTDEV_AIRBAG_PRESENT', 'PROTDEV_CHILD_RESTRAINT', 'PROTDEV_EYE_PROTECT', 'PROTDEV_HELMET', 'PROTDEV_LAP_BELT', 'PROTDEV_NONE', 'PROTDEV_OTHER', 'PROTDEV_PER_FLOAT', 'PROTDEV_PROTECT_CLOTH', 'PROTDEV_PROTE

#Preparing Predictor Variables

In [5]:
data['TOTALGCS'] = data['GCSMOTOR'] + data['GCSVERBAL'] + data['GCSEYE']

In [6]:
#One-hot encoding for 'GCSMOTOR'.

data.loc[data['GCSMOTOR'] == 1, 'GCSMOTOR_NONE'] = 1
data.loc[data['GCSMOTOR_NONE'] != 1, 'GCSMOTOR_NONE'] = 0

data.loc[data['GCSMOTOR'] == 2, 'GCSMOTOR_EXTENSION'] = 1
data.loc[data['GCSMOTOR_EXTENSION'] != 1, 'GCSMOTOR_EXTENSION'] = 0

data.loc[data['GCSMOTOR'] == 3, 'GCSMOTOR_ABNORMALFLEXION'] = 1
data.loc[data['GCSMOTOR_ABNORMALFLEXION'] != 1, 'GCSMOTOR_ABNORMALFLEXION'] = 0

data.loc[data['GCSMOTOR'] == 4, 'GCSMOTOR_NORMALFLEXION'] = 1
data.loc[data['GCSMOTOR_NORMALFLEXION'] != 1, 'GCSMOTOR_NORMALFLEXION'] = 0

data.loc[data['GCSMOTOR'] == 5, 'GCSMOTOR_LOCALIZING'] = 1
data.loc[data['GCSMOTOR_LOCALIZING'] != 1, 'GCSMOTOR_LOCALIZING'] = 0

data.loc[data['GCSMOTOR'] == 6, 'GCSMOTOR_OBEYSCOMMANDS'] = 1
data.loc[data['GCSMOTOR_OBEYSCOMMANDS'] != 1, 'GCSMOTOR_OBEYSCOMMANDS'] = 0

In [7]:
#Change response values to strings for 'GCSMOTOR'.

data.loc[data['GCSMOTOR'] == 1, 'GCSMOTOR'] = 'None'
data.loc[data['GCSMOTOR'] == 2, 'GCSMOTOR'] = 'Extension'
data.loc[data['GCSMOTOR'] == 3, 'GCSMOTOR'] = 'Abnormal flexion'
data.loc[data['GCSMOTOR'] == 4, 'GCSMOTOR'] = 'Normal flexion'
data.loc[data['GCSMOTOR'] == 5, 'GCSMOTOR'] = 'Localizing'
data.loc[data['GCSMOTOR'] == 6, 'GCSMOTOR'] = 'Obeys commands'

data['GCSMOTOR'].value_counts(normalize=False, dropna=False)

Localizing          8938
None                7875
Normal flexion      5138
Obeys commands      3353
Abnormal flexion     990
Extension            631
Name: GCSMOTOR, dtype: int64

In [8]:
#Change response values to strings for 'GCSVERBAL'.

data.loc[data['GCSVERBAL'] == 1, 'GCSVERBAL'] = 'None'
data.loc[data['GCSVERBAL'] == 2, 'GCSVERBAL'] = 'Sounds'
data.loc[data['GCSVERBAL'] == 3, 'GCSVERBAL'] = 'Words'
data.loc[data['GCSVERBAL'] == 4, 'GCSVERBAL'] = 'Confused'
data.loc[data['GCSVERBAL'] == 5, 'GCSVERBAL'] = 'Oriented'

data['GCSVERBAL'].value_counts(normalize=False, dropna=False)

None        14097
Sounds       5527
Confused     3615
Words        3344
Oriented      342
Name: GCSVERBAL, dtype: int64

In [9]:
#Change response values to strings for 'GCSEYE'.

data.loc[data['GCSEYE'] == 1, 'GCSEYE'] = 'None'
data.loc[data['GCSEYE'] == 2, 'GCSEYE'] = 'To pressure'
data.loc[data['GCSEYE'] == 3, 'GCSEYE'] = 'To sound'
data.loc[data['GCSEYE'] == 4, 'GCSEYE'] = 'Spontaneous'

data['GCSEYE'].value_counts(normalize=False, dropna=False)

None           11619
Spontaneous     6213
To sound        4728
To pressure     4365
Name: GCSEYE, dtype: int64

In [10]:
#One-hot encoding for 'TBIPUPILLARYRESPONSE'.

data.loc[data['TBIPUPILLARYRESPONSE'] == 1, 'TBIPUPILLARYRESPONSE_BOTH'] = 1
data.loc[data['TBIPUPILLARYRESPONSE_BOTH'] != 1, 'TBIPUPILLARYRESPONSE_BOTH'] = 0

data.loc[data['TBIPUPILLARYRESPONSE'] == 2, 'TBIPUPILLARYRESPONSE_ONE'] = 1
data.loc[data['TBIPUPILLARYRESPONSE_ONE'] != 1, 'TBIPUPILLARYRESPONSE_ONE'] = 0

data.loc[data['TBIPUPILLARYRESPONSE'] == 3, 'TBIPUPILLARYRESPONSE_NEITHER'] = 1
data.loc[data['TBIPUPILLARYRESPONSE_NEITHER'] != 1, 'TBIPUPILLARYRESPONSE_NEITHER'] = 0

In [11]:
#Change response values to strings for 'TBIPUPILLARYRESPONSE'.

data.loc[data['TBIPUPILLARYRESPONSE'] == 1, 'TBIPUPILLARYRESPONSE'] = 'Both reactive'
data.loc[data['TBIPUPILLARYRESPONSE'] == 2, 'TBIPUPILLARYRESPONSE'] = 'One reactive'
data.loc[data['TBIPUPILLARYRESPONSE'] == 3, 'TBIPUPILLARYRESPONSE'] = 'Neither reactive'

data['TBIPUPILLARYRESPONSE'].value_counts(normalize=False, dropna=False)

Both reactive       23671
Neither reactive     2438
One reactive          816
Name: TBIPUPILLARYRESPONSE, dtype: int64

In [12]:
#Calculate IMPACT score.

data['IMPACT'] = -3.109 + data['AGEYEARS']*0.034 + data['GCSMOTOR_NONE']*1.447 + data['GCSMOTOR_EXTENSION']*1.397 + data['GCSMOTOR_ABNORMALFLEXION']*0.797 + data['GCSMOTOR_NORMALFLEXION']*0.390 + data['TBIPUPILLARYRESPONSE_ONE']*0.514 + data['TBIPUPILLARYRESPONSE_NEITHER']*1.239

In [13]:
drop = ['TOTALGCS', 'GCSMOTOR_NONE', 'GCSMOTOR_EXTENSION', 'GCSMOTOR_ABNORMALFLEXION', 'GCSMOTOR_NORMALFLEXION', 'GCSMOTOR_LOCALIZING', 'GCSMOTOR_OBEYSCOMMANDS', 'TBIPUPILLARYRESPONSE_BOTH', 'TBIPUPILLARYRESPONSE_ONE', 'TBIPUPILLARYRESPONSE_NEITHER']

data = data.drop(drop, axis = 1)

#Renaming and Merging Response Values

In [14]:
#Change response values to strings for 'SEX'.

data.loc[data['SEX'] == 1, 'SEX'] = 'Male'
data.loc[data['SEX'] == 2, 'SEX'] = 'Female'
data.loc[data['SEX'] == 3, 'SEX'] = 'Non-Binary'

data['SEX'].value_counts(normalize=False, dropna=False)

Male          19187
Female         7665
NaN              71
Non-Binary        2
Name: SEX, dtype: int64

In [15]:
#Merge race columns and drop former columns.

data.loc[data['AMERICANINDIAN'] == 1, 'RACE'] = 'American Indian'
data.loc[data['ASIAN'] == 1, 'RACE'] = 'Asian'
data.loc[data['BLACK'] == 1, 'RACE'] = 'Black'
data.loc[data['PACIFICISLANDER'] == 1, 'RACE'] = 'Pacific Islander'
data.loc[data['RACEOTHER'] == 1, 'RACE'] = 'Other/unknown'
data.loc[data['WHITE'] == 1, 'RACE'] = 'White'
data['RACE'] = data['RACE'].fillna('Other/unknown')

data = data.drop(columns=['AMERICANINDIAN', 'ASIAN', 'BLACK', 'PACIFICISLANDER', 'RACEOTHER', 'WHITE'])

data['RACE'].value_counts(normalize=False, dropna=False)

White               17240
Black                4514
Other/unknown        3831
Asian                 806
American Indian       438
Pacific Islander       96
Name: RACE, dtype: int64

In [16]:
#Change response values to strings for 'ETHNICITY'.

data.loc[data['ETHNICITY'] == 1, 'ETHNICITY'] = 'Hispanic or Latino'
data.loc[data['ETHNICITY'] == 2, 'ETHNICITY'] = 'Not Hispanic or Latino'

data['ETHNICITY'].value_counts(normalize=False, dropna=False)

Not Hispanic or Latino    21322
Hispanic or Latino         4236
NaN                        1367
Name: ETHNICITY, dtype: int64

In [17]:
#Change response values to strings for 'RESPIRATORYASSISTANCE'.

data.loc[data['RESPIRATORYASSISTANCE'] == 1, 'RESPIRATORYASSISTANCE'] = 'Unassisted respiratory rate'
data.loc[data['RESPIRATORYASSISTANCE'] == 2, 'RESPIRATORYASSISTANCE'] = 'Assisted respiratory rate'

data['RESPIRATORYASSISTANCE'].value_counts(normalize=False, dropna=False)

Unassisted respiratory rate    18581
Assisted respiratory rate       7206
NaN                             1138
Name: RESPIRATORYASSISTANCE, dtype: int64

In [18]:
#Change response values to strings for 'SUPPLEMENTALOXYGEN'.

data.loc[data['SUPPLEMENTALOXYGEN'] == 1, 'SUPPLEMENTALOXYGEN'] = 'No supplemental oxygen'
data.loc[data['SUPPLEMENTALOXYGEN'] == 2, 'SUPPLEMENTALOXYGEN'] = 'Supplemental oxygen'

data['SUPPLEMENTALOXYGEN'].value_counts(normalize=False, dropna=False)

No supplemental oxygen    13760
Supplemental oxygen       11830
NaN                        1335
Name: SUPPLEMENTALOXYGEN, dtype: int64

In [19]:
#Change response values to strings for 'PREHOSPITALCARDIACARREST'.

data.loc[data['PREHOSPITALCARDIACARREST'] == 1, 'PREHOSPITALCARDIACARREST'] = 'Yes'
data.loc[data['PREHOSPITALCARDIACARREST'] == 2, 'PREHOSPITALCARDIACARREST'] = 'No'

data['PREHOSPITALCARDIACARREST'].value_counts(normalize=False, dropna=False)

No     26280
Yes      471
NaN      174
Name: PREHOSPITALCARDIACARREST, dtype: int64

In [20]:
#Change response values to strings for 'TBIMIDLINESHIFT'.

data.loc[data['TBIMIDLINESHIFT'] == 1, 'TBIMIDLINESHIFT'] = 'Yes'
data.loc[data['TBIMIDLINESHIFT'] == 2, 'TBIMIDLINESHIFT'] = 'No'
data.loc[data['TBIMIDLINESHIFT'] == 3, 'TBIMIDLINESHIFT'] = 'Not imaged/unknown'
data['TBIMIDLINESHIFT'] = data['TBIMIDLINESHIFT'].fillna('Not imaged/unknown')


data['TBIMIDLINESHIFT'].value_counts(normalize=False, dropna=False)

No                    22189
Yes                    4300
Not imaged/unknown      436
Name: TBIMIDLINESHIFT, dtype: int64

In [21]:
#Assign male patients' pregnancy response values as 'Not applicable (male patient)

data.loc[data['SEX'] == 'Male', 'CC_PREGNANCY'] = 'Not applicable (male patient)'

data['CC_PREGNANCY'].value_counts(normalize=False, dropna=False)

Not applicable (male patient)    19187
No                                7612
Unknown                            103
Yes                                 23
Name: CC_PREGNANCY, dtype: int64

In [22]:
#Change response values to strings for 'TRANSPORTMODE'.

data.loc[data['TRANSPORTMODE'] == 1, 'TRANSPORTMODE'] = 'Ground ambulance'
data.loc[data['TRANSPORTMODE'] == 2, 'TRANSPORTMODE'] = 'Air ambulance'
data.loc[data['TRANSPORTMODE'] == 3, 'TRANSPORTMODE'] = 'Air ambulance'
data.loc[data['TRANSPORTMODE'] == 4, 'TRANSPORTMODE'] = 'Private/public vehicle/walk-in'
data.loc[data['TRANSPORTMODE'] == 5, 'TRANSPORTMODE'] = 'Other/police/unknown/etc.'
data.loc[data['TRANSPORTMODE'] == 6, 'TRANSPORTMODE'] = 'Other/police/unknown/etc.'
data['TRANSPORTMODE'] = data['TRANSPORTMODE'].fillna('Other/police/unknown/etc.')


data['TRANSPORTMODE'].value_counts(normalize=False, dropna=False)

Ground ambulance                  21365
Air ambulance                      4828
Private/public vehicle/walk-in      507
Other/police/unknown/etc.           225
Name: TRANSPORTMODE, dtype: int64

In [23]:
#Change response values to strings for 'INTERFACILITYTRANSFER'.

data.loc[data['INTERFACILITYTRANSFER'] == 1, 'INTERFACILITYTRANSFER'] = 'Yes'
data.loc[data['INTERFACILITYTRANSFER'] == 2, 'INTERFACILITYTRANSFER'] = 'No'

data['INTERFACILITYTRANSFER'].value_counts(normalize=False, dropna=False)

No     20209
Yes     6715
NaN        1
Name: INTERFACILITYTRANSFER, dtype: int64

In [24]:
#Change response values to strings for 'TRAUMATYPE'.

data.loc[data['TRAUMATYPE'] == 1, 'TRAUMATYPE'] = 'Blunt'
data.loc[data['TRAUMATYPE'] == 2, 'TRAUMATYPE'] = 'Penetrating'
data.loc[data['TRAUMATYPE'] == 3, 'TRAUMATYPE'] = 'Other/unknown'
data.loc[data['TRAUMATYPE'] == 4, 'TRAUMATYPE'] = 'Other/unknown'
data.loc[data['TRAUMATYPE'] == 9, 'TRAUMATYPE'] = 'Other/unknown'

data['TRAUMATYPE'].value_counts(normalize=False, dropna=False)

Blunt            24241
Penetrating       1408
NaN                683
Other/unknown      593
Name: TRAUMATYPE, dtype: int64

In [25]:
#Change response values to strings for 'INTENT'.

data.loc[data['INTENT'] == 1, 'INTENT'] = 'Unintentional'
data.loc[data['INTENT'] == 2, 'INTENT'] = 'Self-inflicted'
data.loc[data['INTENT'] == 3, 'INTENT'] = 'Assault'
data.loc[data['INTENT'] == 4, 'INTENT'] = 'Other/unknown'
data.loc[data['INTENT'] == 5, 'INTENT'] = 'Other/unknown'
data['INTENT'] = data['INTENT'].fillna('Other/unknown')

data['INTENT'].value_counts(normalize=False, dropna=False)

Unintentional     22165
Assault            3158
Self-inflicted      803
Other/unknown       799
Name: INTENT, dtype: int64

In [26]:
#Change response values to strings for 'MECHANISM'.

data.loc[data['MECHANISM'] == 1, 'MECHANISM'] = 'Cut/Pierce'
data.loc[data['MECHANISM'] == 2, 'MECHANISM'] = 'Drowning/Submersion'
data.loc[data['MECHANISM'] == 3, 'MECHANISM'] = 'Fall'
data.loc[data['MECHANISM'] == 4, 'MECHANISM'] = 'Fire/Flame'
data.loc[data['MECHANISM'] == 5, 'MECHANISM'] = 'Hot Object/Substance'
data.loc[data['MECHANISM'] == 6, 'MECHANISM'] = 'Firearm'
data.loc[data['MECHANISM'] == 7, 'MECHANISM'] = 'Machinery'
data.loc[data['MECHANISM'] == 8, 'MECHANISM'] = 'MVT Occupant'
data.loc[data['MECHANISM'] == 9, 'MECHANISM'] = 'MVT Motorcyclist'
data.loc[data['MECHANISM'] == 10, 'MECHANISM'] = 'MVT Pedal Cyclist'
data.loc[data['MECHANISM'] == 11, 'MECHANISM'] = 'MVT Pedestrian'
data.loc[data['MECHANISM'] == 12, 'MECHANISM'] = 'MVT Unspecified'
data.loc[data['MECHANISM'] == 13, 'MECHANISM'] = 'MVT Other'
data.loc[data['MECHANISM'] == 14, 'MECHANISM'] = 'Pedal Cyclist, Other'
data.loc[data['MECHANISM'] == 15, 'MECHANISM'] = 'Pedestrian, Other'
data.loc[data['MECHANISM'] == 16, 'MECHANISM'] = 'Transport, Other'
data.loc[data['MECHANISM'] == 17, 'MECHANISM'] = 'Natural/Environmental, Bites and Stings'
data.loc[data['MECHANISM'] == 18, 'MECHANISM'] = 'Natural/Environmental, Other'
data.loc[data['MECHANISM'] == 19, 'MECHANISM'] = 'Overexertion'
data.loc[data['MECHANISM'] == 20, 'MECHANISM'] = 'Poisoning'
data.loc[data['MECHANISM'] == 21, 'MECHANISM'] = 'Struck By/Against'
data.loc[data['MECHANISM'] == 22, 'MECHANISM'] = 'Suffocation'
data.loc[data['MECHANISM'] == 23, 'MECHANISM'] = 'Other Specified and Classifiable'
data.loc[data['MECHANISM'] == 24, 'MECHANISM'] = 'Other Specified, Not Elsewhere Classifiable'
data.loc[data['MECHANISM'] == 25, 'MECHANISM'] = 'Unspecified'
data.loc[data['MECHANISM'] == 26, 'MECHANISM'] = 'Adverse Effects, Medical Care'
data.loc[data['MECHANISM'] == 27, 'MECHANISM'] = 'Adverse Effects, Drugs'
data.loc[data['MECHANISM'] == 30, 'MECHANISM'] = 'Pedestrian Pedal'
data.loc[data['MECHANISM'] == 31, 'MECHANISM'] = 'MVT Occupant and Others'
data.loc[data['MECHANISM'] == 32, 'MECHANISM'] = 'Others'
data.loc[data['MECHANISM'] == 'Other Specified, Not Elsewhere Classifiable', 'MECHANISM'] = 'Other/unspecified/unknown'
data.loc[data['MECHANISM'] == 'Other Specified and Classifiable', 'MECHANISM'] = 'Other/unspecified/unknown'
data.loc[data['MECHANISM'] == 'Unspecified', 'MECHANISM'] = 'Other/unspecified/unknown'
data.loc[data['MECHANISM'] == 'Unknown', 'MECHANISM'] = 'Other/unspecified/unknown'
data.loc[data['MECHANISM'] == 'Transport, Other', 'MECHANISM'] = 'Other transport'
data.loc[data['MECHANISM'] == 'MVT Other', 'MECHANISM'] = 'Other MVT'
data.loc[data['MECHANISM'] == 'MVT Unspecified', 'MECHANISM'] = 'Other MVT'
data.loc[data['MECHANISM'] == 'Pedal Cyclist, Other', 'MECHANISM'] = 'Other pedal cyclist'
data.loc[data['MECHANISM'] == 'Pedestrian, Other', 'MECHANISM'] = 'Other pedestrian'
data.loc[data['MECHANISM'] == 'Natural/Environmental, Other', 'MECHANISM'] = 'Natural/environmental'
data.loc[data['MECHANISM'] == 'Struck By/Against', 'MECHANISM'] = 'Struck by or against'
data.loc[data['MECHANISM'] == 'MVT Occupant', 'MECHANISM'] = 'MVT occupant'
data.loc[data['MECHANISM'] == 'MVT Motorcyclist', 'MECHANISM'] = 'MVT motorcyclist'
data.loc[data['MECHANISM'] == 'MVT Pedestrian', 'MECHANISM'] = 'MVT pedestrian'
data.loc[data['MECHANISM'] == 'MVT Pedal Cyclist', 'MECHANISM'] = 'MVT pedal cyclist'
data.loc[data['MECHANISM'] == 'Cut/Pierce', 'MECHANISM'] = 'Cut/pierce'

data['MECHANISM'].value_counts(normalize=False, dropna=False)

Fall                                       13292
MVT occupant                                4085
Struck by or against                        2571
Firearm                                     1264
MVT pedestrian                              1008
MVT motorcyclist                             989
Other MVT                                    689
Other/unspecified/unknown                    671
Other transport                              633
NaN                                          507
Other pedal cyclist                          353
MVT pedal cyclist                            289
Other pedestrian                             169
Natural/environmental                        167
Cut/pierce                                   142
Suffocation                                   39
Machinery                                     26
Overexertion                                   9
Natural/Environmental, Bites and Stings        6
Fire/Flame                                     5
Poisoning           

In [27]:
#Merge protective device columns and drop former columns.

data.loc[data['PROTDEV_AIRBAG_PRESENT'] == 1, 'PROTDEV'] = 'Airbag present'
data.loc[data['PROTDEV_CHILD_RESTRAINT'] == 1, 'PROTDEV'] = 'Child restraint (booster seat or child car seat)'
data.loc[data['PROTDEV_EYE_PROTECT'] == 1, 'PROTDEV'] = 'Eye protection'
data.loc[data['PROTDEV_HELMET'] == 1, 'PROTDEV'] = 'Helmet'
data.loc[data['PROTDEV_LAP_BELT'] == 1, 'PROTDEV'] = 'Belt'
data.loc[data['PROTDEV_PER_FLOAT'] == 1, 'PROTDEV'] = 'Personal floatation device'
data.loc[data['PROTDEV_PROTECT_CLOTH'] == 1, 'PROTDEV'] = 'Protective clothing'
data.loc[data['PROTDEV_PROTECT_GEAR'] == 1, 'PROTDEV'] = 'Protective non-clothing gear'
data.loc[data['PROTDEV_SHOULDER_BELT'] == 1, 'PROTDEV'] = 'Belt'
data.loc[data['PROTDEV_OTHER'] == 1, 'PROTDEV'] = 'Other'
data.loc[data['PROTDEV_NONE'] == 1, 'PROTDEV'] = 'None'

data = data.drop(columns=['PROTDEV_AIRBAG_PRESENT', 'PROTDEV_CHILD_RESTRAINT', 'PROTDEV_EYE_PROTECT', 'PROTDEV_HELMET', 'PROTDEV_LAP_BELT', 'PROTDEV_PER_FLOAT', 'PROTDEV_PROTECT_CLOTH', 'PROTDEV_PROTECT_GEAR', 'PROTDEV_SHOULDER_BELT', 'PROTDEV_OTHER', 'PROTDEV_NONE'])

data['PROTDEV'].value_counts(normalize=False, dropna=False)

None                                                22701
Belt                                                 1548
Airbag present                                       1355
Helmet                                                598
NaN                                                   585
Protective clothing                                    84
Protective non-clothing gear                           31
Other                                                  16
Personal floatation device                              3
Child restraint (booster seat or child car seat)        3
Eye protection                                          1
Name: PROTDEV, dtype: int64

In [28]:
#Merge 'PATIENTSOCCUPATION' and 'WORKRELATED' and change response values to strings for.

data.loc[data['WORKRELATED'] == 2, 'WORKRELATED'] = 'No'
data.loc[data['WORKRELATED'] == 1, 'WORKRELATED'] = 'Yes'

data['WORKRELATED'].value_counts(normalize=False, dropna=False)

No     26077
Yes      611
NaN      237
Name: WORKRELATED, dtype: int64

In [29]:
#Merge ICP columns and drop former columns.

data.loc[data['ICPEVDRAIN'] == 1, 'ICP'] = 'Intraventricular drain/catheter'
data.loc[data['ICPJVBULB'] == 1, 'ICP'] = 'Jugular venous bulb'
data.loc[data['ICPO2MONITOR'] == 1, 'ICP'] = 'Intraparenchymal oxygen/pressure monitor'
data.loc[data['ICPPARENCH'] == 1, 'ICP'] = 'Intraparenchymal oxygen/pressure monitor'
data.loc[data['ICPNONE'] == 1, 'ICP'] = 'None'

data = data.drop(columns=['ICPEVDRAIN', 'ICPJVBULB', 'ICPO2MONITOR', 'ICPPARENCH', 'ICPNONE'])

data['ICP'].value_counts(normalize=False, dropna=False)

None                                        25214
Intraparenchymal oxygen/pressure monitor      895
Intraventricular drain/catheter               794
Jugular venous bulb                            14
NaN                                             8
Name: ICP, dtype: int64

In [30]:
#Change response values to strings for 'ALCOHOLSCREEN'.

data.loc[data['ALCOHOLSCREEN'] == 1, 'ALCOHOLSCREEN'] = 'Yes'
data.loc[data['ALCOHOLSCREEN'] == 2, 'ALCOHOLSCREEN'] = 'No'

data['ALCOHOLSCREEN'].value_counts(normalize=False, dropna=False)

Yes    20764
No      6113
NaN       48
Name: ALCOHOLSCREEN, dtype: int64

In [31]:
#Assign '0' for patients who were not screened for alcohol.

data['ALCOHOLSCREENRESULT'] = data['ALCOHOLSCREENRESULT'].fillna(0)

In [32]:
#Change response values to strings for 'ANTIBIOTICTHERAPY'.

data.loc[data['ANTIBIOTICTHERAPY'] == 1, 'ANTIBIOTICTHERAPY'] = 'Yes'
data.loc[data['ANTIBIOTICTHERAPY'] == 2, 'ANTIBIOTICTHERAPY'] = 'No'

data['ANTIBIOTICTHERAPY'].value_counts(normalize=False, dropna=False)

NaN    25051
Yes     1251
No       623
Name: ANTIBIOTICTHERAPY, dtype: int64

In [33]:
#Change response values to strings for 'DRGSCR_AMPHETAMINE'.

data.loc[data['DRGSCR_AMPHETAMINE'] == 1, 'DRGSCR_AMPHETAMINE'] = 'Yes'
data.loc[data['DRGSCR_AMPHETAMINE'] == 0, 'DRGSCR_AMPHETAMINE'] = 'No'
data.loc[data['DRGSCR_NOTTESTED'] == 1, 'DRGSCR_AMPHETAMINE'] = 'Not tested'
data['DRGSCR_AMPHETAMINE'] = data['DRGSCR_AMPHETAMINE'].fillna('Not tested')

data['DRGSCR_AMPHETAMINE'].value_counts(normalize=False, dropna=False)

No            14909
Not tested    10006
Yes            2010
Name: DRGSCR_AMPHETAMINE, dtype: int64

In [34]:
#Change response values to strings for 'DRGSCR_BARBITURATE'.

data.loc[data['DRGSCR_BARBITURATE'] == 1, 'DRGSCR_BARBITURATE'] = 'Yes'
data.loc[data['DRGSCR_BARBITURATE'] == 0, 'DRGSCR_BARBITURATE'] = 'No'
data.loc[data['DRGSCR_NOTTESTED'] == 1, 'DRGSCR_BARBITURATE'] = 'Not tested'
data['DRGSCR_BARBITURATE'] = data['DRGSCR_BARBITURATE'].fillna('Not tested')

data['DRGSCR_BARBITURATE'].value_counts(normalize=False, dropna=False)

No            16638
Not tested    10006
Yes             281
Name: DRGSCR_BARBITURATE, dtype: int64

In [35]:
#Change response values to strings for 'DRGSCR_BENZODIAZEPINES'.

data.loc[data['DRGSCR_BENZODIAZEPINES'] == 1, 'DRGSCR_BENZODIAZEPINES'] = 'Yes'
data.loc[data['DRGSCR_BENZODIAZEPINES'] == 0, 'DRGSCR_BENZODIAZEPINES'] = 'No'
data.loc[data['DRGSCR_NOTTESTED'] == 1, 'DRGSCR_BENZODIAZEPINES'] = 'Not tested'
data['DRGSCR_BENZODIAZEPINES'] = data['DRGSCR_BENZODIAZEPINES'].fillna('Not tested')

data['DRGSCR_BENZODIAZEPINES'].value_counts(normalize=False, dropna=False)

No            15132
Not tested    10006
Yes            1787
Name: DRGSCR_BENZODIAZEPINES, dtype: int64

In [36]:
#Change response values to strings for 'DRGSCR_CANNABINOID'.

data.loc[data['DRGSCR_CANNABINOID'] == 1, 'DRGSCR_CANNABINOID'] = 'Yes'
data.loc[data['DRGSCR_CANNABINOID'] == 0, 'DRGSCR_CANNABINOID'] = 'No'
data.loc[data['DRGSCR_NOTTESTED'] == 1, 'DRGSCR_CANNABINOID'] = 'Not tested'
data['DRGSCR_CANNABINOID'] = data['DRGSCR_CANNABINOID'].fillna('Not tested')

data['DRGSCR_CANNABINOID'].value_counts(normalize=False, dropna=False)

No            12442
Not tested    10006
Yes            4477
Name: DRGSCR_CANNABINOID, dtype: int64

In [37]:
#Change response values to strings for 'DRGSCR_ECSTASY'.

data.loc[data['DRGSCR_ECSTASY'] == 1, 'DRGSCR_ECSTASY'] = 'Yes'
data.loc[data['DRGSCR_ECSTASY'] == 0, 'DRGSCR_ECSTASY'] = 'No'
data.loc[data['DRGSCR_NOTTESTED'] == 1, 'DRGSCR_ECSTASY'] = 'Not tested'
data['DRGSCR_ECSTASY'] = data['DRGSCR_ECSTASY'].fillna('Not tested')

data['DRGSCR_ECSTASY'].value_counts(normalize=False, dropna=False)

No            16719
Not tested    10006
Yes             200
Name: DRGSCR_ECSTASY, dtype: int64

In [38]:
#Change response values to strings for 'DRGSCR_METHADONE'.

data.loc[data['DRGSCR_METHADONE'] == 1, 'DRGSCR_METHADONE'] = 'Yes'
data.loc[data['DRGSCR_METHADONE'] == 0, 'DRGSCR_METHADONE'] = 'No'
data.loc[data['DRGSCR_NOTTESTED'] == 1, 'DRGSCR_METHADONE'] = 'Not tested'
data['DRGSCR_METHADONE'] = data['DRGSCR_METHADONE'].fillna('Not tested')

data['DRGSCR_METHADONE'].value_counts(normalize=False, dropna=False)

No            16809
Not tested    10006
Yes             110
Name: DRGSCR_METHADONE, dtype: int64

In [39]:
#Change response values to strings for 'DRGSCR_METHAMPHETAMINE'.

data.loc[data['DRGSCR_METHAMPHETAMINE'] == 1, 'DRGSCR_METHAMPHETAMINE'] = 'Yes'
data.loc[data['DRGSCR_METHAMPHETAMINE'] == 0, 'DRGSCR_METHAMPHETAMINE'] = 'No'
data.loc[data['DRGSCR_NOTTESTED'] == 1, 'DRGSCR_METHAMPHETAMINE'] = 'Not tested'
data['DRGSCR_METHAMPHETAMINE'] = data['DRGSCR_METHAMPHETAMINE'].fillna('Not tested')

data['DRGSCR_METHAMPHETAMINE'].value_counts(normalize=False, dropna=False)

No            16290
Not tested    10006
Yes             629
Name: DRGSCR_METHAMPHETAMINE, dtype: int64

In [40]:
#Change response values to strings for 'DRGSCR_OPIOID'.

data.loc[data['DRGSCR_OPIOID'] == 1, 'DRGSCR_OPIOID'] = 'Yes'
data.loc[data['DRGSCR_OPIOID'] == 0, 'DRGSCR_OPIOID'] = 'No'
data.loc[data['DRGSCR_NOTTESTED'] == 1, 'DRGSCR_OPIOID'] = 'Not tested'
data['DRGSCR_OPIOID'] = data['DRGSCR_OPIOID'].fillna('Not tested')

data['DRGSCR_OPIOID'].value_counts(normalize=False, dropna=False)

No            16323
Not tested    10006
Yes             596
Name: DRGSCR_OPIOID, dtype: int64

In [41]:
#Change response values to strings for 'DRGSCR_OXYCODONE'.

data.loc[data['DRGSCR_OXYCODONE'] == 1, 'DRGSCR_OXYCODONE'] = 'Yes'
data.loc[data['DRGSCR_OXYCODONE'] == 0, 'DRGSCR_OXYCODONE'] = 'No'
data.loc[data['DRGSCR_NOTTESTED'] == 1, 'DRGSCR_OXYCODONE'] = 'Not tested'
data['DRGSCR_OXYCODONE'] = data['DRGSCR_OXYCODONE'].fillna('Not tested')

data['DRGSCR_OXYCODONE'].value_counts(normalize=False, dropna=False)

No            16803
Not tested    10006
Yes             116
Name: DRGSCR_OXYCODONE, dtype: int64

In [42]:
#Change response values to strings for 'DRGSCR_PHENCYCLIDINE'.

data.loc[data['DRGSCR_PHENCYCLIDINE'] == 1, 'DRGSCR_PHENCYCLIDINE'] = 'Yes'
data.loc[data['DRGSCR_PHENCYCLIDINE'] == 0, 'DRGSCR_PHENCYCLIDINE'] = 'No'
data.loc[data['DRGSCR_NOTTESTED'] == 1, 'DRGSCR_PHENCYCLIDINE'] = 'Not tested'
data['DRGSCR_PHENCYCLIDINE'] = data['DRGSCR_PHENCYCLIDINE'].fillna('Not tested')

data['DRGSCR_PHENCYCLIDINE'].value_counts(normalize=False, dropna=False)

No            16733
Not tested    10006
Yes             186
Name: DRGSCR_PHENCYCLIDINE, dtype: int64

In [43]:
#Change response values to strings for 'DRGSCR_TRICYCLICDEPRESS'.

data.loc[data['DRGSCR_TRICYCLICDEPRESS'] == 1, 'DRGSCR_TRICYCLICDEPRESS'] = 'Yes'
data.loc[data['DRGSCR_TRICYCLICDEPRESS'] == 0, 'DRGSCR_TRICYCLICDEPRESS'] = 'No'
data.loc[data['DRGSCR_NOTTESTED'] == 1, 'DRGSCR_TRICYCLICDEPRESS'] = 'Not tested'
data['DRGSCR_TRICYCLICDEPRESS'] = data['DRGSCR_TRICYCLICDEPRESS'].fillna('Not tested')

data['DRGSCR_TRICYCLICDEPRESS'].value_counts(normalize=False, dropna=False)

No            16858
Not tested    10006
Yes              61
Name: DRGSCR_TRICYCLICDEPRESS, dtype: int64

In [44]:
#Drop 'DRGSCR_NOTTESTED' since it will not be utilized from this point.

data = data.drop(columns=['DRGSCR_NOTTESTED'])

In [45]:
#Change response values to strings for 'VERIFICATIONLEVEL'.

data.loc[data['VERIFICATIONLEVEL'] == 1, 'VERIFICATIONLEVEL'] = 'Level I Trauma Center'
data.loc[data['VERIFICATIONLEVEL'] == 2, 'VERIFICATIONLEVEL'] = 'Level II Trauma Center'
data.loc[data['VERIFICATIONLEVEL'] == 3, 'VERIFICATIONLEVEL'] = 'Level III Trauma Center'

data['VERIFICATIONLEVEL'].value_counts(normalize=False, dropna=False)

Level I Trauma Center     17342
Level II Trauma Center     9583
Name: VERIFICATIONLEVEL, dtype: int64

In [46]:
#Change response values to strings for 'HOSPITALTYPE'.

data.loc[data['HOSPITALTYPE'] == 1, 'HOSPITALTYPE'] = 'For profit'
data.loc[data['HOSPITALTYPE'] == 2, 'HOSPITALTYPE'] = 'Non-profit'
data.loc[data['HOSPITALTYPE'] == 3, 'HOSPITALTYPE'] = 'Government'

data['HOSPITALTYPE'].value_counts(normalize=False, dropna=False)

Non-profit    23840
For profit     2963
Government      122
Name: HOSPITALTYPE, dtype: int64

In [47]:
#Change response values to strings for 'BEDSIZE'.

data.loc[data['BEDSIZE'] == 1, 'BEDSIZE'] = '200 or fewer'
data.loc[data['BEDSIZE'] == 2, 'BEDSIZE'] = '201 to 400'
data.loc[data['BEDSIZE'] == 3, 'BEDSIZE'] = '401 to 600'
data.loc[data['BEDSIZE'] == 4, 'BEDSIZE'] = 'More than 600'
data = data.dropna(subset=['BEDSIZE'])

data['BEDSIZE'].value_counts(normalize=False, dropna=False)

More than 600    9899
401 to 600       8304
201 to 400       7462
200 or fewer     1260
Name: BEDSIZE, dtype: int64

In [48]:
#Change response values to strings for 'PRIMARYMETHODPAYMENT'.

data.loc[data['PRIMARYMETHODPAYMENT'] == 1, 'PRIMARYMETHODPAYMENT'] = 'Medicaid'
data.loc[data['PRIMARYMETHODPAYMENT'] == 2, 'PRIMARYMETHODPAYMENT'] = 'Not billed '
data.loc[data['PRIMARYMETHODPAYMENT'] == 3, 'PRIMARYMETHODPAYMENT'] = 'Self-pay'
data.loc[data['PRIMARYMETHODPAYMENT'] == 4, 'PRIMARYMETHODPAYMENT'] = 'Private/commercial insurance'
data.loc[data['PRIMARYMETHODPAYMENT'] == 6, 'PRIMARYMETHODPAYMENT'] = 'Medicare'
data.loc[data['PRIMARYMETHODPAYMENT'] == 7, 'PRIMARYMETHODPAYMENT'] = 'Other/unknown'
data.loc[data['PRIMARYMETHODPAYMENT'] == 10, 'PRIMARYMETHODPAYMENT'] = 'Other/unknown'
data['PRIMARYMETHODPAYMENT'] = data['PRIMARYMETHODPAYMENT'].fillna('Other/unknown')

data['PRIMARYMETHODPAYMENT'].value_counts(normalize=False, dropna=False)

Private/commercial insurance    8308
Medicare                        7019
Medicaid                        5851
Self-pay                        3669
Other/unknown                   2032
Not billed                        46
Name: PRIMARYMETHODPAYMENT, dtype: int64

In [49]:
#Change response values to strings for 'HOSPDISCHARGEDISPOSITION'.

data.loc[data['HOSPDISCHARGEDISPOSITION'] == 1, 'HOSPDISCHARGEDISPOSITION'] = 'Discharged/Transferred to a short-term general hospital for inpatient care'
data.loc[data['HOSPDISCHARGEDISPOSITION'] == 2, 'HOSPDISCHARGEDISPOSITION'] = 'Discharged/Transferred to an Intermediate Care Facility (ICF)'
data.loc[data['HOSPDISCHARGEDISPOSITION'] == 3, 'HOSPDISCHARGEDISPOSITION'] = 'Discharged/Transferred to home under care of organized home health service'
data.loc[data['HOSPDISCHARGEDISPOSITION'] == 4, 'HOSPDISCHARGEDISPOSITION'] = 'Left against medical advice or discontinued care'
data.loc[data['HOSPDISCHARGEDISPOSITION'] == 5, 'HOSPDISCHARGEDISPOSITION'] = 'Deceased/Expired'
data.loc[data['HOSPDISCHARGEDISPOSITION'] == 6, 'HOSPDISCHARGEDISPOSITION'] = 'Discharged to home or self-care (routine discharge)'
data.loc[data['HOSPDISCHARGEDISPOSITION'] == 7, 'HOSPDISCHARGEDISPOSITION'] = 'Discharged/Transferred to Skilled Nursing Facility (SNF)'
data.loc[data['HOSPDISCHARGEDISPOSITION'] == 8, 'HOSPDISCHARGEDISPOSITION'] = 'Discharged/Transferred to hospice care'
data.loc[data['HOSPDISCHARGEDISPOSITION'] == 10, 'HOSPDISCHARGEDISPOSITION'] = 'Discharged/Transferred to court/law enforcement.'
data.loc[data['HOSPDISCHARGEDISPOSITION'] == 11, 'HOSPDISCHARGEDISPOSITION'] = 'Discharged/Transferred to inpatient rehab or designated unit'
data.loc[data['HOSPDISCHARGEDISPOSITION'] == 12, 'HOSPDISCHARGEDISPOSITION'] = 'Discharged/Transferred to Long Term Care Hospital (LTCH)'
data.loc[data['HOSPDISCHARGEDISPOSITION'] == 13, 'HOSPDISCHARGEDISPOSITION'] = 'Discharged/Transferred to a psychiatric hospital or psychiatric distinct part unit of a hospital'
data.loc[data['HOSPDISCHARGEDISPOSITION'] == 14, 'HOSPDISCHARGEDISPOSITION'] = 'Discharged/Transferred to another type of institution not defined elsewhere'
data.loc[data['HOSPDISCHARGEDISPOSITION'] == 14, 'HOSPDISCHARGEDISPOSITION'] = 'Discharged/Transferred to another type of institution not defined elsewhere'

data['HOSPDISCHARGEDISPOSITION'].value_counts(normalize=False, dropna=False)

Discharged to home or self-care (routine discharge)                                                 12042
Discharged/Transferred to inpatient rehab or designated unit                                         3658
Deceased/Expired                                                                                     2796
Discharged/Transferred to Skilled Nursing Facility (SNF)                                             2237
Discharged/Transferred to home under care of organized home health service                           1458
NaN                                                                                                  1240
Left against medical advice or discontinued care                                                      900
Discharged/Transferred to hospice care                                                                892
Discharged/Transferred to Long Term Care Hospital (LTCH)                                              468
Discharged/Transferred to a short-term general

In [50]:
#Check data shape.

data.shape

(26925, 114)

#Imputation

In [51]:
#Define numerical and categorical columns.

num_cols = list(data.select_dtypes('number').columns)
print('Numerical columns: {}'.format(num_cols), '\n')

cat_cols = list(data.select_dtypes('object').columns)
print('Categorical columns: {}'.format(cat_cols))

Numerical columns: ['AGEYEARS', 'WEIGHT', 'HEIGHT', 'SBP', 'PULSERATE', 'PULSEOXIMETRY', 'RESPIRATORYRATE', 'TEMPERATURE', 'HOSPITALARRIVALDAYS', 'AISSEVERITY1', 'AISSEVERITY2', 'AISSEVERITY3', 'AISSEVERITY4', 'AISSEVERITY5', 'AISSEVERITY6', 'AISSEVERITY7', 'AISSEVERITY8', 'AISSEVERITY9', 'ISS', 'BLOOD4HOURS', 'PLASMA4HOURS', 'PLATELETS4HOURS', 'CRYOPRECIPITATE4HOURS', 'ALCOHOLSCREENRESULT', 'DRGSCR_COCAINE', 'FINALDISCHARGEDAYS', 'TOTALICULOS', 'IMPACT'] 

Categorical columns: ['SEX', 'ETHNICITY', 'SUPPLEMENTALOXYGEN', 'RESPIRATORYASSISTANCE', 'PREHOSPITALCARDIACARREST', 'GCSEYE', 'GCSVERBAL', 'GCSMOTOR', 'TBIPUPILLARYRESPONSE', 'TBIMIDLINESHIFT', 'CC_SUBSTANCEABUSE', 'CC_DIABETES', 'CC_HYPERTENSION', 'CC_CHF', 'CC_MI', 'CC_ANGINAPECTORIS', 'CC_CVA', 'CC_PAD', 'CC_COPD', 'CC_RENAL', 'CC_CIRRHOSIS', 'CC_BLEEDING', 'CC_DISCANCER', 'CC_CHEMO', 'CC_DEMENTIA', 'CC_ADHD', 'CC_MENTALPERSONALITY', 'CC_FUNCTIONAL', 'CC_PREGNANCY', 'CC_ANTICOAGULANT', 'CC_STEROID', 'CC_ADLC', 'TRANSPORTMODE', '

In [52]:
#Remove outcomes.

num_cols_remove = ['FINALDISCHARGEDAYS', 'TOTALICULOS']
cat_cols_remove = ['HC_CARDARREST', 'HC_CAUTI', 'HC_CLABSI', 'HC_CRBSI', 'HC_DEEPSSI', 'HC_DELIRIUM', 'HC_DRUGALCOHOLWITHDRAWAL', 'HC_DVTHROMBOSIS', 'HC_EMBOLISM', 'HC_EXTREMITYCS', 'HC_GRAFTFAIL', 'HC_INTUBATION', 'HC_KIDNEY', 'HC_MI', 'HC_ORGANSPACESSI', 'HC_OSTEOMYELITIS', 'HC_PNEUMONIA', 'HC_PRESSUREULCER', 'HC_RESPIRATORY', 'HC_RETURNOR', 'HC_SEPSIS', 'HC_STROKECVA', 'HC_SUPERFICIALSSI', 'HC_UNPLANNEDICU', 'HC_UTI', 'HC_VAPNEUMONIA', 'HOSPDISCHARGEDISPOSITION']

num_cols = [i for i in num_cols if i not in num_cols_remove]
cat_cols = [i for i in cat_cols if i not in cat_cols_remove]

In [53]:
#Check missing values for numerical columns.

missing_num = data[num_cols].isnull().mean().round(4).mul(100).sort_values(ascending=False)

print(missing_num, '\n')

missing_num = pd.DataFrame(missing_num)

missing_num.columns = ['Value']

missing_num = missing_num[missing_num['Value'] > 0]

print('Numerical variables with missing values: ', list(missing_num.index), '\n')

print('Number of numerical variables with missing values: ', len(list(missing_num.index)), '\n')

missing_num = missing_num[missing_num['Value'] > 25]

missing_num = list(missing_num.index)

print('Excluded numerical variables: ', missing_num)

CRYOPRECIPITATE4HOURS    32.16
PLASMA4HOURS             32.16
PLATELETS4HOURS          32.16
TEMPERATURE              14.07
HEIGHT                   11.03
WEIGHT                    4.21
RESPIRATORYRATE           3.00
PULSEOXIMETRY             1.88
SBP                       1.25
PULSERATE                 0.98
HOSPITALARRIVALDAYS       0.81
ISS                       0.22
BLOOD4HOURS               0.07
ALCOHOLSCREENRESULT       0.00
DRGSCR_COCAINE            0.00
AISSEVERITY9              0.00
AGEYEARS                  0.00
AISSEVERITY5              0.00
AISSEVERITY8              0.00
AISSEVERITY7              0.00
AISSEVERITY6              0.00
AISSEVERITY4              0.00
AISSEVERITY3              0.00
AISSEVERITY2              0.00
AISSEVERITY1              0.00
IMPACT                    0.00
dtype: float64 

Numerical variables with missing values:  ['CRYOPRECIPITATE4HOURS', 'PLASMA4HOURS', 'PLATELETS4HOURS', 'TEMPERATURE', 'HEIGHT', 'WEIGHT', 'RESPIRATORYRATE', 'PULSEOXIMETRY', 'SB

In [54]:
#Drop numerical columns with missing values over 25%.

data.drop(missing_num, axis=1, inplace=True)

In [55]:
#Define new numerical columns.

num_cols = [x for x in num_cols if x not in missing_num]

In [56]:
#Impute missing numerical values.

num_imputer = KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean')
data[num_cols] = num_imputer.fit_transform(data[num_cols])

In [57]:
#Check missing values for categorical columns.

missing_cat = data[cat_cols].isnull().mean().round(4).mul(100).sort_values(ascending=False)

print(missing_cat, '\n')

missing_cat = pd.DataFrame(missing_cat)

missing_cat.columns = ['Value']

missing_cat = missing_cat[missing_cat['Value'] > 0]

print('Categorical variables with missing values: ', list(missing_cat.index), '\n')

print('Number of categorical variables with missing values: ', len(list(missing_cat.index)), '\n')

missing_cat = missing_cat[missing_cat['Value'] > 25]

missing_cat = list(missing_cat.index)

print('Excluded categorical variables: ', missing_cat)

ANTIBIOTICTHERAPY           93.04
ETHNICITY                    5.08
SUPPLEMENTALOXYGEN           4.96
RESPIRATORYASSISTANCE        4.23
TRAUMATYPE                   2.54
PROTDEV                      2.17
MECHANISM                    1.88
WORKRELATED                  0.88
PREHOSPITALCARDIACARREST     0.65
SEX                          0.26
ALCOHOLSCREEN                0.18
ICP                          0.03
CC_HYPERTENSION              0.00
DRGSCR_METHAMPHETAMINE       0.00
INTERVENTION                 0.00
GCSMOTOR                     0.00
GCSVERBAL                    0.00
DRGSCR_AMPHETAMINE           0.00
DRGSCR_BARBITURATE           0.00
DRGSCR_BENZODIAZEPINES       0.00
DRGSCR_CANNABINOID           0.00
DRGSCR_ECSTASY               0.00
DRGSCR_METHADONE             0.00
DRGSCR_OPIOID                0.00
TBIMIDLINESHIFT              0.00
DRGSCR_OXYCODONE             0.00
DRGSCR_PHENCYCLIDINE         0.00
DRGSCR_TRICYCLICDEPRESS      0.00
VERIFICATIONLEVEL            0.00
HOSPITALTYPE  

In [58]:
#Drop categorical columns with missing values over 25%.

data.drop(missing_cat, axis=1, inplace=True)

In [59]:
#Define new categorical columns.

cat_cols = [x for x in cat_cols if x not in missing_cat]

In [60]:
#Replace missing categorical values with 'Unknown'.

for col in cat_cols:
    data[col].fillna(value='Unknown', inplace=True)

In [61]:
#Change variable names to field names.

data_dictionary = pd.read_csv("/content/drive/MyDrive/TQP-MOST/Modified Data Dictionary.csv", encoding = 'latin1', index_col = None, low_memory = False)
FieldNames = dict(zip(data_dictionary['Variable'], data_dictionary['Field Name']))
data.columns = data.columns.map(FieldNames)

In [62]:
#Save imputed data.

data.to_csv('/content/drive/MyDrive/TQP-MOST/imputed_data_impact.csv')

#Prepare Outcome Data

In [63]:
#Drop patients with unknown outcome of interest.

before = data.shape[0]
data = data[data['Discharge Disposition'].notna()]
data = data[data['Total Length of Stay'].notna()]

after = data.shape[0]

excluded = before - after
print('Number of patients before exclusion: ', before)
print('Number of patients after exclusion: ', after)
print('Number of patients excluded with this criteria: ', excluded)

Number of patients before exclusion:  26925
Number of patients after exclusion:  25381
Number of patients excluded with this criteria:  1544


In [64]:
#Define outcome of interest (overall in-hospital mortality).

data.loc[data['Discharge Disposition'] == 'Deceased/Expired', 'label'] = 1
data.loc[data['label'] != 1, 'label'] = 0

data['label'].value_counts(normalize=False, dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.loc[data['Discharge Disposition'] == 'Deceased/Expired', 'label'] = 1


0.0    22769
1.0     2612
Name: label, dtype: int64

In [65]:
#Split data into training, validation and test sets.

training, validation, test = \
              np.split(data.sample(frac=1, random_state=31),
                       [int(.6*len(data)), int(.8*len(data))])

training.to_csv('/content/drive/MyDrive/TQP-MOST/final_data_training_impact.csv')
validation.to_csv('/content/drive/MyDrive/TQP-MOST/final_data_validation_impact.csv')
test.to_csv('/content/drive/MyDrive/TQP-MOST/final_data_test_impact.csv')

training['Dataset'] = 'Training'
validation['Dataset'] = 'Validation'
test['Dataset'] = 'Test'

In [66]:
#Save final data.

data = pd.concat([training, validation, test], axis=0)

data.to_csv('/content/drive/MyDrive/TQP-MOST/final_data_impact.csv')

data['Dataset'].value_counts()

Training      15228
Test           5077
Validation     5076
Name: Dataset, dtype: int64