In [None]:
#Mount Google Drive at /content/drive (uses the google.colab package, so this cell only runs on Colab).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer

from matplotlib import pyplot as plt

#Show every row when a Series/DataFrame is displayed (no truncation).
pd.options.display.max_rows = None

In [None]:
#Load the source dataset from Google Drive; the first CSV column is used as the index.

data = pd.read_csv(
    "/content/drive/MyDrive/TQP-atSDH/isolated_atSDH.csv",
    index_col=0,
)
data.shape

# Renaming and Merging Response Values




In [None]:
#List every column name in the loaded data.

print(data.columns.tolist())

In [None]:
#Change response values to strings for 'SEX'.

sex_labels = {1: 'Male', 2: 'Female', 3: 'Non-Binary'}

#Cast to object first: writing strings into a numeric column through .loc relies on
#silent upcasting, which pandas has deprecated since 2.1.
data['SEX'] = data['SEX'].astype(object)
for code, label in sex_labels.items():
    data.loc[data['SEX'] == code, 'SEX'] = label

#Codes without a label and missing values are left unchanged.
data['SEX'].value_counts(normalize=False, dropna=False)

In [None]:
#Collapse the race indicator columns into a single 'RACE' column, then drop the indicators.
#Assignments run in list order, so when several indicators equal 1 the last match wins
#(e.g. 'White' overrides every earlier flag).
#NOTE(review): confirm this precedence is intended for patients with more than one flag set.

race_flags = [
    ('AMERICANINDIAN', 'American Indian'),
    ('ASIAN', 'Asian'),
    ('BLACK', 'Black'),
    ('PACIFICISLANDER', 'Pacific Islander'),
    ('RACEOTHER', 'Other/unknown'),
    ('WHITE', 'White'),
]
for flag_col, race_label in race_flags:
    data.loc[data[flag_col] == 1, 'RACE'] = race_label

#Rows where no indicator equals 1 are grouped with 'Other/unknown'.
data['RACE'] = data['RACE'].fillna('Other/unknown')

data = data.drop(columns=[flag_col for flag_col, _ in race_flags])

data['RACE'].value_counts(normalize=False, dropna=False)

In [None]:
#Change response values to strings for 'ETHNICITY'.

ethnicity_labels = {1: 'Hispanic or Latino', 2: 'Not Hispanic or Latino'}

#Cast to object first: writing strings into a numeric column through .loc relies on
#silent upcasting, which pandas has deprecated since 2.1.
data['ETHNICITY'] = data['ETHNICITY'].astype(object)
for code, label in ethnicity_labels.items():
    data.loc[data['ETHNICITY'] == code, 'ETHNICITY'] = label

#Missing values are left as NaN at this stage.
data['ETHNICITY'].value_counts(normalize=False, dropna=False)

In [None]:
#Change response values to strings for 'RESPIRATORYASSISTANCE'.

respiratory_labels = {1: 'Unassisted respiratory rate', 2: 'Assisted respiratory rate'}

#Cast to object first: writing strings into a numeric column through .loc relies on
#silent upcasting, which pandas has deprecated since 2.1.
data['RESPIRATORYASSISTANCE'] = data['RESPIRATORYASSISTANCE'].astype(object)
for code, label in respiratory_labels.items():
    data.loc[data['RESPIRATORYASSISTANCE'] == code, 'RESPIRATORYASSISTANCE'] = label

#Missing values are left as NaN at this stage.
data['RESPIRATORYASSISTANCE'].value_counts(normalize=False, dropna=False)

In [None]:
#Change response values to strings for 'SUPPLEMENTALOXYGEN'.

oxygen_labels = {1: 'No supplemental oxygen', 2: 'Supplemental oxygen'}

#Cast to object first: writing strings into a numeric column through .loc relies on
#silent upcasting, which pandas has deprecated since 2.1.
data['SUPPLEMENTALOXYGEN'] = data['SUPPLEMENTALOXYGEN'].astype(object)
for code, label in oxygen_labels.items():
    data.loc[data['SUPPLEMENTALOXYGEN'] == code, 'SUPPLEMENTALOXYGEN'] = label

#Missing values are left as NaN at this stage.
data['SUPPLEMENTALOXYGEN'].value_counts(normalize=False, dropna=False)

In [None]:
#Change response values to strings for 'PREHOSPITALCARDIACARREST'.

arrest_labels = {1: 'Yes', 2: 'No'}

#Cast to object first: writing strings into a numeric column through .loc relies on
#silent upcasting, which pandas has deprecated since 2.1.
data['PREHOSPITALCARDIACARREST'] = data['PREHOSPITALCARDIACARREST'].astype(object)
for code, label in arrest_labels.items():
    data.loc[data['PREHOSPITALCARDIACARREST'] == code, 'PREHOSPITALCARDIACARREST'] = label

#Missing values are left as NaN at this stage.
data['PREHOSPITALCARDIACARREST'].value_counts(normalize=False, dropna=False)

In [None]:
#Change response values to strings for 'TBIMIDLINESHIFT'.

midline_labels = {1: 'Yes', 2: 'No', 3: 'Not imaged/unknown'}

#Cast to object first: writing strings into a numeric column through .loc relies on
#silent upcasting, which pandas has deprecated since 2.1.
data['TBIMIDLINESHIFT'] = data['TBIMIDLINESHIFT'].astype(object)
for code, label in midline_labels.items():
    data.loc[data['TBIMIDLINESHIFT'] == code, 'TBIMIDLINESHIFT'] = label

#Missing responses are grouped with code 3.
data['TBIMIDLINESHIFT'] = data['TBIMIDLINESHIFT'].fillna('Not imaged/unknown')

data['TBIMIDLINESHIFT'].value_counts(normalize=False, dropna=False)

In [None]:
#Change response values to strings for 'TBIPUPILLARYRESPONSE'.

pupil_labels = {1: 'Both reactive', 2: 'One reactive', 3: 'Neither reactive'}

#Cast to object first: writing strings into a numeric column through .loc relies on
#silent upcasting, which pandas has deprecated since 2.1.
data['TBIPUPILLARYRESPONSE'] = data['TBIPUPILLARYRESPONSE'].astype(object)
for code, label in pupil_labels.items():
    data.loc[data['TBIPUPILLARYRESPONSE'] == code, 'TBIPUPILLARYRESPONSE'] = label

#Missing values are left as NaN at this stage.
data['TBIPUPILLARYRESPONSE'].value_counts(normalize=False, dropna=False)

In [None]:
#Label missing bleeding localization and size as 'Unknown', then show the resulting counts.

bleeding_cols = ('LOCALIZATION', 'SIZE')

for bleed_col in bleeding_cols:
    data[bleed_col] = data[bleed_col].fillna('Unknown')

for bleed_col in bleeding_cols:
    print(data[bleed_col].value_counts(normalize=False, dropna=False))

In [None]:
#Assign male patients' pregnancy response values as 'Not applicable (male patient)'.
#Depends on 'SEX' having already been recoded to strings.

is_male = data['SEX'] == 'Male'
data.loc[is_male, 'CC_PREGNANCY'] = 'Not applicable (male patient)'

data['CC_PREGNANCY'].value_counts(normalize=False, dropna=False)

In [None]:
#Change response values to strings for 'TRANSPORTMODE' (codes 2/3 and 5/6 are merged).

transport_labels = {
    1: 'Ground ambulance',
    2: 'Air ambulance',
    3: 'Air ambulance',
    4: 'Private/public vehicle/walk-in',
    5: 'Other/police/unknown/etc.',
    6: 'Other/police/unknown/etc.',
}

#Cast to object first: writing strings into a numeric column through .loc relies on
#silent upcasting, which pandas has deprecated since 2.1.
data['TRANSPORTMODE'] = data['TRANSPORTMODE'].astype(object)
for code, label in transport_labels.items():
    data.loc[data['TRANSPORTMODE'] == code, 'TRANSPORTMODE'] = label

#Missing responses are grouped with the catch-all category.
data['TRANSPORTMODE'] = data['TRANSPORTMODE'].fillna('Other/police/unknown/etc.')

data['TRANSPORTMODE'].value_counts(normalize=False, dropna=False)

In [None]:
#Change response values to strings for 'INTERFACILITYTRANSFER'.

transfer_labels = {1: 'Yes', 2: 'No'}

#Cast to object first: writing strings into a numeric column through .loc relies on
#silent upcasting, which pandas has deprecated since 2.1.
data['INTERFACILITYTRANSFER'] = data['INTERFACILITYTRANSFER'].astype(object)
for code, label in transfer_labels.items():
    data.loc[data['INTERFACILITYTRANSFER'] == code, 'INTERFACILITYTRANSFER'] = label

#Missing values are left as NaN at this stage.
data['INTERFACILITYTRANSFER'].value_counts(normalize=False, dropna=False)

In [None]:
#Change response values to strings for 'TRAUMATYPE' (codes 3, 4 and 9 are merged).

trauma_labels = {
    1: 'Blunt',
    2: 'Penetrating',
    3: 'Other/unknown',
    4: 'Other/unknown',
    9: 'Other/unknown',
}

#Cast to object first: writing strings into a numeric column through .loc relies on
#silent upcasting, which pandas has deprecated since 2.1.
data['TRAUMATYPE'] = data['TRAUMATYPE'].astype(object)
for code, label in trauma_labels.items():
    data.loc[data['TRAUMATYPE'] == code, 'TRAUMATYPE'] = label

#Missing responses are grouped with 'Other/unknown'.
data['TRAUMATYPE'] = data['TRAUMATYPE'].fillna('Other/unknown')

data['TRAUMATYPE'].value_counts(normalize=False, dropna=False)

In [None]:
#Change response values to strings for 'INTENT' (codes 4 and 5 are merged).

intent_labels = {
    1: 'Unintentional',
    2: 'Self-inflicted',
    3: 'Assault',
    4: 'Other/unknown',
    5: 'Other/unknown',
}

#Cast to object first: writing strings into a numeric column through .loc relies on
#silent upcasting, which pandas has deprecated since 2.1.
data['INTENT'] = data['INTENT'].astype(object)
for code, label in intent_labels.items():
    data.loc[data['INTENT'] == code, 'INTENT'] = label

#Missing responses are grouped with 'Other/unknown'.
data['INTENT'] = data['INTENT'].fillna('Other/unknown')

data['INTENT'].value_counts(normalize=False, dropna=False)

In [None]:
#Change response values to strings for 'MECHANISM', merging several codes into broader groups.
#Codes 28 and 29 have no label here and would stay numeric if present in the data.

mechanism_other = 'Other/unspecified/unknown'
mechanism_labels = {
    1: 'Cut/pierce',
    2: mechanism_other,
    3: 'Fall',
    4: mechanism_other,
    5: mechanism_other,
    6: 'Firearm',
    7: 'Machinery',
    8: 'MVT occupant',
    9: 'MVT motorcyclist',
    10: 'MVT pedal cyclist',
    11: 'MVT pedestrian',
    12: 'Other transport or MVT',
    13: 'Other transport or MVT',
    14: 'Other pedal cyclist',
    15: 'Other pedestrian',
    16: 'Other transport or MVT',
    17: 'Natural or environmental',
    18: 'Natural or environmental',
    19: mechanism_other,
    20: mechanism_other,
    21: 'Struck by or against',
    22: mechanism_other,
    23: mechanism_other,
    24: mechanism_other,
    25: mechanism_other,
    26: mechanism_other,
    27: mechanism_other,
    30: 'Other pedestrian',
    31: mechanism_other,
    32: mechanism_other,
}

#Cast to object first: writing strings into a numeric column through .loc relies on
#silent upcasting, which pandas has deprecated since 2.1.
data['MECHANISM'] = data['MECHANISM'].astype(object)
for code, label in mechanism_labels.items():
    data.loc[data['MECHANISM'] == code, 'MECHANISM'] = label

#Missing responses are grouped with the catch-all category.
data['MECHANISM'] = data['MECHANISM'].fillna(mechanism_other)

data['MECHANISM'].value_counts(normalize=False, dropna=False)

In [None]:
#Change response values to strings for 'PROTDEV_AIRBAG_PRESENT'.

col = 'PROTDEV_AIRBAG_PRESENT'

#Cast to object first: writing strings into a numeric column through .loc relies on
#silent upcasting, which pandas has deprecated since 2.1.
data[col] = data[col].astype(object)
for code, label in {1: 'Present', 0: 'Non-present/non-MVT injury'}.items():
    data.loc[data[col] == code, col] = label

#Missing values are treated the same as code 0.
data[col] = data[col].fillna('Non-present/non-MVT injury')

data[col].value_counts(normalize=False, dropna=False)

In [None]:
#Change response values to strings for 'PROTDEV_CHILD_RESTRAINT'.

col = 'PROTDEV_CHILD_RESTRAINT'

#Cast to object first: writing strings into a numeric column through .loc relies on
#silent upcasting, which pandas has deprecated since 2.1.
data[col] = data[col].astype(object)
for code, label in {1: 'Present', 0: 'Non-present/non-MVT injury'}.items():
    data.loc[data[col] == code, col] = label

#Missing values are treated the same as code 0.
data[col] = data[col].fillna('Non-present/non-MVT injury')

data[col].value_counts(normalize=False, dropna=False)

In [None]:
#Change response values to strings for 'PROTDEV_LAP_BELT'.

col = 'PROTDEV_LAP_BELT'

#Cast to object first: writing strings into a numeric column through .loc relies on
#silent upcasting, which pandas has deprecated since 2.1.
data[col] = data[col].astype(object)
for code, label in {1: 'Present', 0: 'Non-present/non-MVT injury'}.items():
    data.loc[data[col] == code, col] = label

#Missing values are treated the same as code 0.
data[col] = data[col].fillna('Non-present/non-MVT injury')

data[col].value_counts(normalize=False, dropna=False)

In [None]:
#Change response values to strings for 'PROTDEV_SHOULDER_BELT'.

col = 'PROTDEV_SHOULDER_BELT'

#Cast to object first: writing strings into a numeric column through .loc relies on
#silent upcasting, which pandas has deprecated since 2.1.
data[col] = data[col].astype(object)
for code, label in {1: 'Present', 0: 'Non-present/non-MVT injury'}.items():
    data.loc[data[col] == code, col] = label

#Missing values are treated the same as code 0.
data[col] = data[col].fillna('Non-present/non-MVT injury')

data[col].value_counts(normalize=False, dropna=False)

In [None]:
#Change response values to strings for 'PROTDEV_HELMET'.

col = 'PROTDEV_HELMET'

#Cast to object first: writing strings into a numeric column through .loc relies on
#silent upcasting, which pandas has deprecated since 2.1.
data[col] = data[col].astype(object)
for code, label in {1: 'Present', 0: 'Non-present/non-MVT injury'}.items():
    data.loc[data[col] == code, col] = label

#Missing values are treated the same as code 0.
data[col] = data[col].fillna('Non-present/non-MVT injury')

data[col].value_counts(normalize=False, dropna=False)

In [None]:
#Change response values to strings for 'WORKRELATED'.
#NOTE(review): the previous header mentioned merging with 'PATIENTSOCCUPATION', but this cell only recodes 'WORKRELATED'.

work_labels = {2: 'No', 1: 'Yes'}

#Cast to object first: writing strings into a numeric column through .loc relies on
#silent upcasting, which pandas has deprecated since 2.1.
data['WORKRELATED'] = data['WORKRELATED'].astype(object)
for code, label in work_labels.items():
    data.loc[data['WORKRELATED'] == code, 'WORKRELATED'] = label

#Missing values are left as NaN at this stage.
data['WORKRELATED'].value_counts(normalize=False, dropna=False)

In [None]:
#Collapse the ICP monitor indicator columns into a single 'ICP' column, then drop the indicators.
#Assignments run in list order, so a later indicator overrides an earlier one
#(in particular 'ICPNONE' == 1 overrides every other flag).
#Rows where no indicator equals 1 keep NaN in 'ICP'.

icp_flags = [
    ('ICPEVDRAIN', 'Intraventricular drain/catheter'),
    ('ICPJVBULB', 'Jugular venous bulb'),
    ('ICPO2MONITOR', 'Intraparenchymal oxygen/pressure monitor'),
    ('ICPPARENCH', 'Intraparenchymal oxygen/pressure monitor'),
    ('ICPNONE', 'None'),
]
for flag_col, icp_label in icp_flags:
    data.loc[data[flag_col] == 1, 'ICP'] = icp_label

data = data.drop(columns=[flag_col for flag_col, _ in icp_flags])

data['ICP'].value_counts(normalize=False, dropna=False)

In [None]:
#Change response values to strings for 'ANTIBIOTICTHERAPY'.

antibiotic_labels = {1: 'Yes', 2: 'No'}

#Cast to object first: writing strings into a numeric column through .loc relies on
#silent upcasting, which pandas has deprecated since 2.1.
data['ANTIBIOTICTHERAPY'] = data['ANTIBIOTICTHERAPY'].astype(object)
for code, label in antibiotic_labels.items():
    data.loc[data['ANTIBIOTICTHERAPY'] == code, 'ANTIBIOTICTHERAPY'] = label

#Missing values are left as NaN at this stage.
data['ANTIBIOTICTHERAPY'].value_counts(normalize=False, dropna=False)

In [None]:
#Change response values to strings for 'ALCOHOLSCREEN'.

alcohol_labels = {1: 'Yes', 2: 'No'}

#Cast to object first: writing strings into a numeric column through .loc relies on
#silent upcasting, which pandas has deprecated since 2.1.
data['ALCOHOLSCREEN'] = data['ALCOHOLSCREEN'].astype(object)
for code, label in alcohol_labels.items():
    data.loc[data['ALCOHOLSCREEN'] == code, 'ALCOHOLSCREEN'] = label

#Missing values are left as NaN at this stage.
data['ALCOHOLSCREEN'].value_counts(normalize=False, dropna=False)

In [None]:
#Assign 0 to missing alcohol screen results.
#NOTE(review): every NaN is filled, not only rows where 'ALCOHOLSCREEN' is 'No' -
#confirm that screened patients with an unrecorded result should also become 0.

alcohol_result = data['ALCOHOLSCREENRESULT']
data['ALCOHOLSCREENRESULT'] = alcohol_result.fillna(value=0)

In [None]:
#Change response values to strings for 'DRGSCR_AMPHETAMINE'.

col = 'DRGSCR_AMPHETAMINE'

#Cast to object first: writing strings into a numeric column through .loc relies on
#silent upcasting, which pandas has deprecated since 2.1.
data[col] = data[col].astype(object)
for code, label in {1: 'Yes', 0: 'No'}.items():
    data.loc[data[col] == code, col] = label

#Rows flagged in 'DRGSCR_NOTTESTED', and any remaining missing values, become 'Not tested'.
data.loc[data['DRGSCR_NOTTESTED'] == 1, col] = 'Not tested'
data[col] = data[col].fillna('Not tested')

data[col].value_counts(normalize=False, dropna=False)

In [None]:
#Change response values to strings for 'DRGSCR_BARBITURATE'.

col = 'DRGSCR_BARBITURATE'

#Cast to object first: writing strings into a numeric column through .loc relies on
#silent upcasting, which pandas has deprecated since 2.1.
data[col] = data[col].astype(object)
for code, label in {1: 'Yes', 0: 'No'}.items():
    data.loc[data[col] == code, col] = label

#Rows flagged in 'DRGSCR_NOTTESTED', and any remaining missing values, become 'Not tested'.
data.loc[data['DRGSCR_NOTTESTED'] == 1, col] = 'Not tested'
data[col] = data[col].fillna('Not tested')

data[col].value_counts(normalize=False, dropna=False)

In [None]:
#Change response values to strings for 'DRGSCR_BENZODIAZEPINES'.

col = 'DRGSCR_BENZODIAZEPINES'

#Cast to object first: writing strings into a numeric column through .loc relies on
#silent upcasting, which pandas has deprecated since 2.1.
data[col] = data[col].astype(object)
for code, label in {1: 'Yes', 0: 'No'}.items():
    data.loc[data[col] == code, col] = label

#Rows flagged in 'DRGSCR_NOTTESTED', and any remaining missing values, become 'Not tested'.
data.loc[data['DRGSCR_NOTTESTED'] == 1, col] = 'Not tested'
data[col] = data[col].fillna('Not tested')

data[col].value_counts(normalize=False, dropna=False)

In [None]:
#Change response values to strings for 'DRGSCR_CANNABINOID'.

col = 'DRGSCR_CANNABINOID'

#Cast to object first: writing strings into a numeric column through .loc relies on
#silent upcasting, which pandas has deprecated since 2.1.
data[col] = data[col].astype(object)
for code, label in {1: 'Yes', 0: 'No'}.items():
    data.loc[data[col] == code, col] = label

#Rows flagged in 'DRGSCR_NOTTESTED', and any remaining missing values, become 'Not tested'.
data.loc[data['DRGSCR_NOTTESTED'] == 1, col] = 'Not tested'
data[col] = data[col].fillna('Not tested')

data[col].value_counts(normalize=False, dropna=False)

In [None]:
#Change response values to strings for 'DRGSCR_COCAINE'.

col = 'DRGSCR_COCAINE'

#Cast to object first: writing strings into a numeric column through .loc relies on
#silent upcasting, which pandas has deprecated since 2.1.
data[col] = data[col].astype(object)
for code, label in {1: 'Yes', 0: 'No'}.items():
    data.loc[data[col] == code, col] = label

#Rows flagged in 'DRGSCR_NOTTESTED', and any remaining missing values, become 'Not tested'.
data.loc[data['DRGSCR_NOTTESTED'] == 1, col] = 'Not tested'
data[col] = data[col].fillna('Not tested')

data[col].value_counts(normalize=False, dropna=False)

In [None]:
#Change response values to strings for 'DRGSCR_ECSTASY'.

col = 'DRGSCR_ECSTASY'

#Cast to object first: writing strings into a numeric column through .loc relies on
#silent upcasting, which pandas has deprecated since 2.1.
data[col] = data[col].astype(object)
for code, label in {1: 'Yes', 0: 'No'}.items():
    data.loc[data[col] == code, col] = label

#Rows flagged in 'DRGSCR_NOTTESTED', and any remaining missing values, become 'Not tested'.
data.loc[data['DRGSCR_NOTTESTED'] == 1, col] = 'Not tested'
data[col] = data[col].fillna('Not tested')

data[col].value_counts(normalize=False, dropna=False)

In [None]:
#Change response values to strings for 'DRGSCR_METHADONE'.

col = 'DRGSCR_METHADONE'

#Cast to object first: writing strings into a numeric column through .loc relies on
#silent upcasting, which pandas has deprecated since 2.1.
data[col] = data[col].astype(object)
for code, label in {1: 'Yes', 0: 'No'}.items():
    data.loc[data[col] == code, col] = label

#Rows flagged in 'DRGSCR_NOTTESTED', and any remaining missing values, become 'Not tested'.
data.loc[data['DRGSCR_NOTTESTED'] == 1, col] = 'Not tested'
data[col] = data[col].fillna('Not tested')

data[col].value_counts(normalize=False, dropna=False)

In [None]:
#Change response values to strings for 'DRGSCR_METHAMPHETAMINE'.

col = 'DRGSCR_METHAMPHETAMINE'

#Cast to object first: writing strings into a numeric column through .loc relies on
#silent upcasting, which pandas has deprecated since 2.1.
data[col] = data[col].astype(object)
for code, label in {1: 'Yes', 0: 'No'}.items():
    data.loc[data[col] == code, col] = label

#Rows flagged in 'DRGSCR_NOTTESTED', and any remaining missing values, become 'Not tested'.
data.loc[data['DRGSCR_NOTTESTED'] == 1, col] = 'Not tested'
data[col] = data[col].fillna('Not tested')

data[col].value_counts(normalize=False, dropna=False)

In [None]:
#Change response values to strings for 'DRGSCR_OPIOID'.

col = 'DRGSCR_OPIOID'

#Cast to object first: writing strings into a numeric column through .loc relies on
#silent upcasting, which pandas has deprecated since 2.1.
data[col] = data[col].astype(object)
for code, label in {1: 'Yes', 0: 'No'}.items():
    data.loc[data[col] == code, col] = label

#Rows flagged in 'DRGSCR_NOTTESTED', and any remaining missing values, become 'Not tested'.
data.loc[data['DRGSCR_NOTTESTED'] == 1, col] = 'Not tested'
data[col] = data[col].fillna('Not tested')

data[col].value_counts(normalize=False, dropna=False)

In [None]:
#Change response values to strings for 'DRGSCR_OXYCODONE'.

col = 'DRGSCR_OXYCODONE'

#Cast to object first: writing strings into a numeric column through .loc relies on
#silent upcasting, which pandas has deprecated since 2.1.
data[col] = data[col].astype(object)
for code, label in {1: 'Yes', 0: 'No'}.items():
    data.loc[data[col] == code, col] = label

#Rows flagged in 'DRGSCR_NOTTESTED', and any remaining missing values, become 'Not tested'.
data.loc[data['DRGSCR_NOTTESTED'] == 1, col] = 'Not tested'
data[col] = data[col].fillna('Not tested')

data[col].value_counts(normalize=False, dropna=False)

In [None]:
#Change response values to strings for 'DRGSCR_PHENCYCLIDINE'.

col = 'DRGSCR_PHENCYCLIDINE'

#Cast to object first: writing strings into a numeric column through .loc relies on
#silent upcasting, which pandas has deprecated since 2.1.
data[col] = data[col].astype(object)
for code, label in {1: 'Yes', 0: 'No'}.items():
    data.loc[data[col] == code, col] = label

#Rows flagged in 'DRGSCR_NOTTESTED', and any remaining missing values, become 'Not tested'.
data.loc[data['DRGSCR_NOTTESTED'] == 1, col] = 'Not tested'
data[col] = data[col].fillna('Not tested')

data[col].value_counts(normalize=False, dropna=False)

In [None]:
#Change response values to strings for 'DRGSCR_TRICYCLICDEPRESS'.

col = 'DRGSCR_TRICYCLICDEPRESS'

#Cast to object first: writing strings into a numeric column through .loc relies on
#silent upcasting, which pandas has deprecated since 2.1.
data[col] = data[col].astype(object)
for code, label in {1: 'Yes', 0: 'No'}.items():
    data.loc[data[col] == code, col] = label

#Rows flagged in 'DRGSCR_NOTTESTED', and any remaining missing values, become 'Not tested'.
data.loc[data['DRGSCR_NOTTESTED'] == 1, col] = 'Not tested'
data[col] = data[col].fillna('Not tested')

data[col].value_counts(normalize=False, dropna=False)

In [None]:
#'DRGSCR_NOTTESTED' has been folded into the individual drug-screen columns, so remove it.

data = data.drop('DRGSCR_NOTTESTED', axis=1)

In [None]:
#Change response values to strings for 'VERIFICATIONLEVEL'.

verification_labels = {
    1: 'Level I Trauma Center',
    2: 'Level II Trauma Center',
    3: 'Level III Trauma Center',
}

#Cast to object first: writing strings into a numeric column through .loc relies on
#silent upcasting, which pandas has deprecated since 2.1.
data['VERIFICATIONLEVEL'] = data['VERIFICATIONLEVEL'].astype(object)
for code, label in verification_labels.items():
    data.loc[data['VERIFICATIONLEVEL'] == code, 'VERIFICATIONLEVEL'] = label

#Missing verification levels are labelled 'Unknown'.
data['VERIFICATIONLEVEL'] = data['VERIFICATIONLEVEL'].fillna('Unknown')

data['VERIFICATIONLEVEL'].value_counts(normalize=False, dropna=False)

In [None]:
#Change response values to strings for 'HOSPITALTYPE'.

hospital_type_labels = {1: 'For profit', 2: 'Non-profit', 3: 'Government'}

#Cast to object first: writing strings into a numeric column through .loc relies on
#silent upcasting, which pandas has deprecated since 2.1.
data['HOSPITALTYPE'] = data['HOSPITALTYPE'].astype(object)
for code, label in hospital_type_labels.items():
    data.loc[data['HOSPITALTYPE'] == code, 'HOSPITALTYPE'] = label

#Missing values are left as NaN at this stage.
data['HOSPITALTYPE'].value_counts(normalize=False, dropna=False)

In [None]:
#Change response values to strings for 'BEDSIZE'.

bedsize_labels = {
    1: '200 or fewer',
    2: '201 to 400',
    3: '401 to 600',
    4: 'More than 600',
}

#Cast to object first: writing strings into a numeric column through .loc relies on
#silent upcasting, which pandas has deprecated since 2.1.
data['BEDSIZE'] = data['BEDSIZE'].astype(object)
for code, label in bedsize_labels.items():
    data.loc[data['BEDSIZE'] == code, 'BEDSIZE'] = label

#NOTE(review): missing bed sizes are assigned to the smallest category rather than 'Unknown' - confirm this is intended.
data['BEDSIZE'] = data['BEDSIZE'].fillna('200 or fewer')

data['BEDSIZE'].value_counts(normalize=False, dropna=False)

In [None]:
#Change response values to strings for 'PRIMARYMETHODPAYMENT' (codes 7 and 10 are merged).
#Codes 5, 8 and 9 have no label here and would stay numeric if present in the data.

payment_labels = {
    1: 'Medicaid',
    2: 'Not billed',
    3: 'Self-pay',
    4: 'Private/commercial insurance',
    6: 'Medicare',
    7: 'Other/unknown',
    10: 'Other/unknown',
}

#Cast to object first: writing strings into a numeric column through .loc relies on
#silent upcasting, which pandas has deprecated since 2.1.
data['PRIMARYMETHODPAYMENT'] = data['PRIMARYMETHODPAYMENT'].astype(object)
for code, label in payment_labels.items():
    data.loc[data['PRIMARYMETHODPAYMENT'] == code, 'PRIMARYMETHODPAYMENT'] = label

#Missing responses are grouped with 'Other/unknown'.
data['PRIMARYMETHODPAYMENT'] = data['PRIMARYMETHODPAYMENT'].fillna('Other/unknown')

data['PRIMARYMETHODPAYMENT'].value_counts(normalize=False, dropna=False)

In [None]:
#Change response values to strings for 'HOSPDISCHARGEDISPOSITION'.
#Code 9 has no label here and would stay numeric if present in the data.

disposition_labels = {
    1: 'Discharged/Transferred to a short-term general hospital for inpatient care',
    2: 'Discharged/Transferred to an Intermediate Care Facility (ICF)',
    3: 'Discharged/Transferred to home under care of organized home health service',
    4: 'Left against medical advice or discontinued care',
    5: 'Deceased/Expired',
    6: 'Discharged to home or self-care (routine discharge)',
    7: 'Discharged/Transferred to Skilled Nursing Facility (SNF)',
    8: 'Discharged/Transferred to hospice care',
    10: 'Discharged/Transferred to court/law enforcement.',
    11: 'Discharged/Transferred to inpatient rehab or designated unit',
    12: 'Discharged/Transferred to Long Term Care Hospital (LTCH)',
    13: 'Discharged/Transferred to a psychiatric hospital or psychiatric distinct part unit of a hospital',
    14: 'Discharged/Transferred to another type of institution not defined elsewhere',
}

#Cast to object first: writing strings into a numeric column through .loc relies on
#silent upcasting, which pandas has deprecated since 2.1.
data['HOSPDISCHARGEDISPOSITION'] = data['HOSPDISCHARGEDISPOSITION'].astype(object)
for code, label in disposition_labels.items():
    data.loc[data['HOSPDISCHARGEDISPOSITION'] == code, 'HOSPDISCHARGEDISPOSITION'] = label

#Missing values are left as NaN at this stage.
data['HOSPDISCHARGEDISPOSITION'].value_counts(normalize=False, dropna=False)

In [None]:
#Check data shape (rows, columns) after the recoding and merging steps above.

data.shape

# Exclusion Criteria

In [None]:
#Exclude pediatric patients: keep only rows with 'AGEYEARS' of 18 or more.
#Rows with a missing 'AGEYEARS' fail the comparison and are removed as well.

before = len(data)
keep_rows = data['AGEYEARS'].ge(18)
data = data.loc[keep_rows]
after = len(data)
excluded = before - after

print('Number of patients before exclusion: ', before)
print('Number of patients after exclusion: ', after)
print('Number of patients excluded with this criteria: ', excluded)

In [None]:
#Drop the AIS severity scores for body regions 1 and 2, which are not used as exclusion criteria.
#NOTE(review): the earlier comment called these "head and neck", but the next cell treats region 3
#as the neck - presumably regions 1 and 2 are head and face; confirm against the data dictionary.

data = data.drop(columns=['AISSEVERITY1', 'AISSEVERITY2'])

In [None]:
#Exclude AIS injury severity score ≥ 3 to neck, then drop the score column.
#NOTE(review): `< 3` is False for NaN, so rows with a missing 'AISSEVERITY3' are removed too - confirm this is intended.

before = len(data)
keep_rows = data['AISSEVERITY3'].lt(3)
data = data.loc[keep_rows].drop(columns=['AISSEVERITY3'])
after = len(data)
excluded = before - after

print('Number of patients before exclusion: ', before)
print('Number of patients after exclusion: ', after)
print('Number of patients excluded with this criteria: ', excluded)

In [None]:
#Exclude AIS injury severity score ≥ 3 to thorax, then drop the score column.
#NOTE(review): `< 3` is False for NaN, so rows with a missing 'AISSEVERITY4' are removed too - confirm this is intended.

before = len(data)
keep_rows = data['AISSEVERITY4'].lt(3)
data = data.loc[keep_rows].drop(columns=['AISSEVERITY4'])
after = len(data)
excluded = before - after

print('Number of patients before exclusion: ', before)
print('Number of patients after exclusion: ', after)
print('Number of patients excluded with this criteria: ', excluded)

In [None]:
#Exclude AIS injury severity score ≥ 3 to abdomen, then drop the score column.
#NOTE(review): `< 3` is False for NaN, so rows with a missing 'AISSEVERITY5' are removed too - confirm this is intended.

before = len(data)
keep_rows = data['AISSEVERITY5'].lt(3)
data = data.loc[keep_rows].drop(columns=['AISSEVERITY5'])
after = len(data)
excluded = before - after

print('Number of patients before exclusion: ', before)
print('Number of patients after exclusion: ', after)
print('Number of patients excluded with this criteria: ', excluded)

In [None]:
#Exclude AIS injury severity score ≥ 3 to spine, then drop the score column.
#NOTE(review): `< 3` is False for NaN, so rows with a missing 'AISSEVERITY6' are removed too - confirm this is intended.

before = len(data)
keep_rows = data['AISSEVERITY6'].lt(3)
data = data.loc[keep_rows].drop(columns=['AISSEVERITY6'])
after = len(data)
excluded = before - after

print('Number of patients before exclusion: ', before)
print('Number of patients after exclusion: ', after)
print('Number of patients excluded with this criteria: ', excluded)

In [None]:
#Exclude AIS injury severity score ≥ 3 to upper extremity, then drop the score column.
#NOTE(review): `< 3` is False for NaN, so rows with a missing 'AISSEVERITY7' are removed too - confirm this is intended.

before = len(data)
keep_rows = data['AISSEVERITY7'].lt(3)
data = data.loc[keep_rows].drop(columns=['AISSEVERITY7'])
after = len(data)
excluded = before - after

print('Number of patients before exclusion: ', before)
print('Number of patients after exclusion: ', after)
print('Number of patients excluded with this criteria: ', excluded)

In [None]:
#Exclude AIS injury severity score ≥ 3 to lower extremity, then drop the score column.
#NOTE(review): `< 3` is False for NaN, so rows with a missing 'AISSEVERITY8' are removed too - confirm this is intended.

before = len(data)
keep_rows = data['AISSEVERITY8'].lt(3)
data = data.loc[keep_rows].drop(columns=['AISSEVERITY8'])
after = len(data)
excluded = before - after

print('Number of patients before exclusion: ', before)
print('Number of patients after exclusion: ', after)
print('Number of patients excluded with this criteria: ', excluded)

In [None]:
#Exclude AIS injury severity score ≥ 3 to unspecified body regions, then drop the score column.
#NOTE(review): `< 3` is False for NaN, so rows with a missing 'AISSEVERITY9' are removed too - confirm this is intended.

before = len(data)
keep_rows = data['AISSEVERITY9'].lt(3)
data = data.loc[keep_rows].drop(columns=['AISSEVERITY9'])
after = len(data)
excluded = before - after

print('Number of patients before exclusion: ', before)
print('Number of patients after exclusion: ', after)
print('Number of patients excluded with this criteria: ', excluded)

In [None]:
#Exclude major polytrauma patients: keep only rows with 'ISS' below 27, then drop the column.
#Rows with a missing 'ISS' fail the comparison and are removed as well.

before = len(data)
keep_rows = data['ISS'].lt(27)
data = data.loc[keep_rows].drop(columns=['ISS'])
after = len(data)
excluded = before - after

print('Number of patients before exclusion: ', before)
print('Number of patients after exclusion: ', after)
print('Number of patients excluded with this criteria: ', excluded)

In [None]:
#Exclude patients with ADLC: keep only rows where 'CC_ADLC' is exactly 'No', then drop the column.
#Any other value, including a missing one, is excluded.

before = len(data)
keep_rows = data['CC_ADLC'].eq('No')
data = data.loc[keep_rows].drop(columns=['CC_ADLC'])
after = len(data)
excluded = before - after

print('Number of patients before exclusion: ', before)
print('Number of patients after exclusion: ', after)
print('Number of patients excluded with this criteria: ', excluded)

In [None]:
#Exclude patients with prehospital cardiac arrest: keep only rows where
#'PREHOSPITALCARDIACARREST' is exactly 'No', then drop the column.
#Any other value, including a missing one, is excluded.

before = len(data)
keep_rows = data['PREHOSPITALCARDIACARREST'].eq('No')
data = data.loc[keep_rows].drop(columns=['PREHOSPITALCARDIACARREST'])
after = len(data)
excluded = before - after

print('Number of patients before exclusion: ', before)
print('Number of patients after exclusion: ', after)
print('Number of patients excluded with this criteria: ', excluded)

In [None]:
#Exclude patients dead on arrival (SBP=0). Rows with a missing 'SBP' are kept,
#and the 'SBP' column itself stays in the data.

before = len(data)
keep_rows = data['SBP'].ne(0)
data = data.loc[keep_rows]
after = len(data)
excluded = before - after

print('Number of patients before exclusion: ', before)
print('Number of patients after exclusion: ', after)
print('Number of patients excluded with this criteria: ', excluded)

In [None]:
#Replace out-of-range measurements with NaN.
#Each entry maps a column to its accepted (minimum, maximum); values strictly outside that range are blanked.

plausible_ranges = {
    'WEIGHT': (30, 250),
    'HEIGHT': (100, 220),
    'SBP': (50, 250),
    'PULSERATE': (20, 220),
    'PULSEOXIMETRY': (70, 100),
    'RESPIRATORYRATE': (4, 50),
    'TEMPERATURE': (32, 44),
    'HOSPITALARRIVALDAYS': (0, 14),
}

for range_col, (lowest, highest) in plausible_ranges.items():
    out_of_range = (data[range_col] > highest) | (data[range_col] < lowest)
    data.loc[out_of_range, range_col] = np.nan

In [None]:
#Write the cleaned (not yet imputed) data to Google Drive.

data.to_csv(path_or_buf='/content/drive/MyDrive/TQP-atSDH/clean_data.csv')

# Imputation

In [None]:
#Split the columns by dtype: numeric columns versus object (string) columns.

num_cols = data.select_dtypes(include='number').columns.tolist()
print('Numerical columns: {}'.format(num_cols), '\n')

cat_cols = data.select_dtypes(include='object').columns.tolist()
print('Categorical columns: {}'.format(cat_cols))

In [None]:
#Remove outcome variables from the column lists so they are left out of the missingness checks and imputation.

num_cols_remove = ['FINALDISCHARGEDAYS', 'TOTALICULOS']
cat_cols_remove = [
    'HC_CARDARREST', 'HC_CAUTI', 'HC_CLABSI', 'HC_CRBSI', 'HC_DEEPSSI', 'HC_DELIRIUM',
    'HC_DRUGALCOHOLWITHDRAWAL', 'HC_DVTHROMBOSIS', 'HC_EMBOLISM', 'HC_EXTREMITYCS',
    'HC_GRAFTFAIL', 'HC_INTUBATION', 'HC_KIDNEY', 'HC_MI', 'HC_ORGANSPACESSI',
    'HC_OSTEOMYELITIS', 'HC_PNEUMONIA', 'HC_PRESSUREULCER', 'HC_RESPIRATORY',
    'HC_RETURNOR', 'HC_SEPSIS', 'HC_STROKECVA', 'HC_SUPERFICIALSSI', 'HC_UNPLANNEDICU',
    'HC_UTI', 'HC_VAPNEUMONIA', 'HOSPDISCHARGEDISPOSITION',
]

#Filter with comprehensions so the original column order is preserved.
num_cols = [name for name in num_cols if name not in num_cols_remove]
cat_cols = [name for name in cat_cols if name not in cat_cols_remove]

In [None]:
#Check missing values for numerical columns.

#Percentage of missing values per column, highest first.
missing_num_pct = data[num_cols].isnull().mean().round(4).mul(100).sort_values(ascending=False)

print(missing_num_pct, '\n')

#Columns with at least one missing value.
num_with_missing = missing_num_pct[missing_num_pct > 0].index.tolist()

print('Numerical variables with missing values: ', num_with_missing, '\n')

print('Number of numerical variables with missing values: ', len(num_with_missing), '\n')

#Columns missing in more than 25% of rows are marked for exclusion.
missing_num = missing_num_pct[missing_num_pct > 25].index.tolist()

print('Excluded numerical variables: ', missing_num)

In [None]:
#Remove the numerical columns listed in `missing_num` (more than 25% missing).

data = data.drop(columns=missing_num)

In [None]:
#Keep only the numerical columns that were not dropped for missingness.

num_cols = [name for name in num_cols if name not in missing_num]

In [None]:
#Impute missing numerical values with a 5-nearest-neighbour imputer (uniform weights, NaN-aware Euclidean distance).
#NOTE(review): the columns are passed to the imputer unscaled - confirm whether they should be scaled first.

num_imputer = KNNImputer(metric='nan_euclidean', weights='uniform', n_neighbors=5)
imputed_values = num_imputer.fit_transform(data[num_cols])
data[num_cols] = imputed_values

In [None]:
#Check missing values for categorical columns.

#Percentage of missing values per column, highest first.
missing_cat_pct = data[cat_cols].isnull().mean().round(4).mul(100).sort_values(ascending=False)

print(missing_cat_pct, '\n')

#Columns with at least one missing value.
cat_with_missing = missing_cat_pct[missing_cat_pct > 0].index.tolist()

print('Categorical variables with missing values: ', cat_with_missing, '\n')

print('Number of categorical variables with missing values: ', len(cat_with_missing), '\n')

#Columns missing in more than 25% of rows are marked for exclusion.
missing_cat = missing_cat_pct[missing_cat_pct > 25].index.tolist()

print('Excluded categorical variables: ', missing_cat)

In [None]:
#Remove the categorical columns listed in `missing_cat` (more than 25% missing).

data = data.drop(columns=missing_cat)

In [None]:
#Keep only the categorical columns that were not dropped for missingness.

cat_cols = [name for name in cat_cols if name not in missing_cat]

In [None]:
#Replace missing categorical values with 'Unknown'.

#Assign the filled column back instead of calling fillna(inplace=True) on data[col]:
#the chained in-place form acts on an intermediate object and does not update
#`data` when pandas Copy-on-Write is enabled.
for col in cat_cols:
    data[col] = data[col].fillna('Unknown')

# Final Touches

In [None]:
#Change variable names to field names using the data dictionary ('Variable' -> 'Field Name').

data_dictionary = pd.read_csv("/content/drive/MyDrive/TQP-atSDH/Modified Data Dictionary (Selected).csv", encoding = 'latin1', index_col = None, low_memory = False)
FieldNames = dict(zip(data_dictionary['Variable'], data_dictionary['Field Name']))

#rename() leaves columns that are absent from the dictionary untouched, whereas
#Index.map() would turn every unmatched column label into NaN.
data = data.rename(columns=FieldNames)

In [None]:
#Write the imputed, renamed data to Google Drive.

data.to_csv(path_or_buf='/content/drive/MyDrive/TQP-atSDH/imputed_data.csv')

In [None]:
#Manual label encoding.
#Every categorical column is converted from its text label to an integer code by
#overwriting, one label at a time, only the matching rows of that column.
#Labels that are not listed for a column are left unchanged.

#Code tables shared by several columns.
NO_YES_UNKNOWN = {'No': 0, 'Yes': 1, 'Unknown': 2}
NO_YES_NOT_TESTED = {'No': 0, 'Yes': 1, 'Not tested': 2}
PROTECTIVE_DEVICE = {'Non-present/non-MVT injury': 0, 'Present': 1}

#Column name -> {text label: integer code}.
LABEL_CODES = {
    'Sex': {'Male': 0, 'Female': 1, 'Non-Binary': 2, 'Unknown': 3},
    'Ethnicity': {'Not Hispanic or Latino': 0, 'Hispanic or Latino': 1, 'Unknown': 2},
    'Supplemental Oxygen': {'No supplemental oxygen': 0, 'Supplemental oxygen': 1, 'Unknown': 2},
    'Respiratory Assistance': {'Unassisted respiratory rate': 0, 'Assisted respiratory rate': 1, 'Unknown': 2},
    'Pupillary Response': {'Both reactive': 0, 'One reactive': 1, 'Neither reactive': 2, 'Unknown': 3},
    'Midline Shift': {'No': 0, 'Yes': 1, 'Not imaged/unknown': 2},
    'Bleeding Localization': {'Supratentorial': 0, 'Infratentorial': 1, 'Unknown': 2},
    'Bleeding Size': {
        'Tiny (less than 0.6cm thick)': 0,
        'Small or moderate (0.6-1cm thick)': 1,
        'Large or massive (more than 1cm thick)': 2,
        'Bilateral large or massive (both sides more than 1cm thick)': 3,
        'Bilateral small or moderate (both sides 0.6-1cm thick)': 4,
        'Unknown': 5,
    },
    'Current Smoker': NO_YES_UNKNOWN,
    'Alcohol Use Disorder': NO_YES_UNKNOWN,
    'Substance Abuse Disorder': NO_YES_UNKNOWN,
    'Diabetes Mellitus': NO_YES_UNKNOWN,
    'Hypertension': NO_YES_UNKNOWN,
    'Congestive Heart Failure': NO_YES_UNKNOWN,
    'History of Myocardial Infarction': NO_YES_UNKNOWN,
    'Angina Pectoris': NO_YES_UNKNOWN,
    'History of Cerebrovascular Accident': NO_YES_UNKNOWN,
    'Peripheral Arterial Disease': NO_YES_UNKNOWN,
    'Chronic Obstructive Pulmonary Disease': NO_YES_UNKNOWN,
    'Chronic Renal Failure': NO_YES_UNKNOWN,
    'Cirrhosis': NO_YES_UNKNOWN,
    'Bleeding Disorder': NO_YES_UNKNOWN,
    'Disseminated Cancer': NO_YES_UNKNOWN,
    'Currently Receiving Chemotherapy for Cancer': NO_YES_UNKNOWN,
    'Dementia': NO_YES_UNKNOWN,
    'Attention Deficit Disorder or Attention Deficit Hyperactivity Disorder': NO_YES_UNKNOWN,
    'Mental or Personality Disorder': NO_YES_UNKNOWN,
    'Ability to Complete Age-Appropriate ADL': NO_YES_UNKNOWN,
    'Pregnancy': {'Not applicable (male patient)': 0, 'No': 1, 'Yes': 2, 'Unknown': 3},
    'Anticoagulant Therapy': NO_YES_UNKNOWN,
    'Steroid Use': NO_YES_UNKNOWN,
    'Transport Mode': {
        'Ground ambulance': 0,
        'Private/public vehicle/walk-in': 1,
        'Air ambulance': 2,
        'Other/police/unknown/etc.': 3,
    },
    'Inter-Facility Transfer': NO_YES_UNKNOWN,
    'Trauma Type': {'Blunt': 0, 'Penetrating': 1, 'Other/unknown': 2},
    'Injury Intent': {'Unintentional': 0, 'Assault': 1, 'Self-inflicted': 2, 'Other/unknown': 3},
    'Mechanism of Injury': {
        'Fall': 0,
        'MVT occupant': 1,
        'Struck by or against': 2,
        'Other transport or MVT': 3,
        'MVT motorcyclist': 4,
        'MVT pedestrian': 5,
        'Other pedestrian': 6,
        'Other pedal cyclist': 7,
        'Firearm': 8,
        'MVT pedal cyclist': 9,
        'Natural or environmental': 10,
        'Cut/pierce': 11,
        'Machinery': 12,
        'Other/unspecified/unknown': 13,
    },
    'Protective Device - Airbag': PROTECTIVE_DEVICE,
    'Protective Device - Child Restraint (booster seat or child car seat)': PROTECTIVE_DEVICE,
    'Protective Device - Lap Belt': PROTECTIVE_DEVICE,
    'Protective Device - Shoulder Belt': PROTECTIVE_DEVICE,
    'Protective Device - Helmet': PROTECTIVE_DEVICE,
    'Work-Related': NO_YES_UNKNOWN,
    'Neurosurgical Intervention': {'No': 0, 'Yes': 1},
    'Alcohol Screen': NO_YES_UNKNOWN,
    'Drug Screen - Amphetamine': NO_YES_NOT_TESTED,
    'Drug Screen - Barbiturate': NO_YES_NOT_TESTED,
    'Drug Screen - Benzodiazepines': NO_YES_NOT_TESTED,
    'Drug Screen - Cannabinoid': NO_YES_NOT_TESTED,
    'Drug Screen - Cocaine': NO_YES_NOT_TESTED,
    'Drug Screen - MDMA or Ecstasy': NO_YES_NOT_TESTED,
    'Drug Screen - Methadone': NO_YES_NOT_TESTED,
    'Drug Screen - Methamphetamine': NO_YES_NOT_TESTED,
    'Drug Screen - Opioid': NO_YES_NOT_TESTED,
    'Drug Screen - Oxycodone': NO_YES_NOT_TESTED,
    'Drug Screen - Phencyclidine': NO_YES_NOT_TESTED,
    'Drug Screen - Tricyclic Antidepressant': NO_YES_NOT_TESTED,
    'ACS Verification Level': {
        'Level I Trauma Center': 0,
        'Level II Trauma Center': 1,
        'Level III Trauma Center': 2,
        'Unknown': 3,
    },
    'Hospital Type': {'Non-profit': 0, 'For profit': 1, 'Government': 2, 'Unknown': 3},
    'Facility Bed Size': {'More than 600': 0, '401 to 600': 1, '201 to 400': 2, '200 or fewer': 3},
    'Primary Method of Payment': {
        'Medicare': 0,
        'Private/commercial insurance': 1,
        'Medicaid': 2,
        'Self-pay': 3,
        'Not billed': 4,
        'Other/unknown': 5,
    },
    'Race': {
        'White': 0,
        'Black': 1,
        'Asian': 2,
        'American Indian': 3,
        'Pacific Islander': 4,
        'Other/unknown': 5,
    },
    'Cerebral Monitoring': {
        'None': 0,
        'Intraventricular drain/catheter': 1,
        'Intraparenchymal oxygen/pressure monitor': 2,
        'Jugular venous bulb': 3,
        'Unknown': 4,
    },
}

for column, codes in LABEL_CODES.items():
    for label, code in codes.items():
        #The column must always be passed to .loc: leaving it out would overwrite
        #every column of the matching rows with the code, not just this one.
        data.loc[data[column] == label, column] = code

In [None]:
#Write the label-encoded table to Drive as CSV.

data.to_csv(path_or_buf='/content/drive/MyDrive/TQP-atSDH/final_data.csv')