In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer

from matplotlib import pyplot as plt

pd.set_option('display.max_rows', None)

In [3]:
#Open csv file.

data = pd.read_csv("/content/drive/MyDrive/ACDF/acdf_combined.csv", index_col=0, na_values = -99)
data.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,PUFYEAR,SEX,RACE_NEW,ETHNICITY_HISPANIC,PRNCPTX,CPT,WORKRVU,INOUT,TRANST,AGE,...,READMRELICD105,READMUNRELICD95,READMUNRELICD105,WOUND_CLOSURE,PODIAG_OTHER,PODIAG_OTHER10,ANESTHES_OTHER,OTHCDIFF,NOTHCDIFF,DOTHCDIFF
5090996,2016,male,White,No,ARTHRD ANT INTERBODY DECOMPRESS CERVICAL BELW C2,22551,25.0,Inpatient,Not transferred (admitted from home),70,...,,,,All layers of incision (deep and superficial) ...,,,Local,No Complication,0,
5089868,2016,male,White,No,ARTHRD ANT INTERBODY DECOMPRESS CERVICAL BELW C2,22551,25.0,Inpatient,Not transferred (admitted from home),60,...,,,,All layers of incision (deep and superficial) ...,,,,No Complication,0,
5092013,2016,female,White,No,ARTHRD ANT INTERBODY DECOMPRESS CERVICAL BELW C2,22551,25.0,Inpatient,Not transferred (admitted from home),47,...,,,,All layers of incision (deep and superficial) ...,,,,No Complication,0,
5121528,2016,female,White,No,ARTHRD ANT INTERBODY DECOMPRESS CERVICAL BELW C2,22551,25.0,Inpatient,Not transferred (admitted from home),55,...,,,,All layers of incision (deep and superficial) ...,,,,No Complication,0,
5156517,2016,female,White,No,ARTHRD ANT INTERBODY DECOMPRESS CERVICAL BELW C2,22551,25.0,Inpatient,Not transferred (admitted from home),60,...,,,,All layers of incision (deep and superficial) ...,,,Local,No Complication,0,


In [4]:
#See all columns.

print(list(data.columns))

['PUFYEAR', 'SEX', 'RACE_NEW', 'ETHNICITY_HISPANIC', 'PRNCPTX', 'CPT', 'WORKRVU', 'INOUT', 'TRANST', 'AGE', 'ADMYR', 'OPERYR', 'DISCHDEST', 'ANESTHES', 'SURGSPEC', 'ELECTSURG', 'HEIGHT', 'WEIGHT', 'DIABETES', 'SMOKE', 'DYSPNEA', 'FNSTATUS2', 'VENTILAT', 'HXCOPD', 'ASCITES', 'HXCHF', 'HYPERMED', 'RENAFAIL', 'DIALYSIS', 'DISCANCR', 'WNDINF', 'STEROID', 'WTLOSS', 'BLEEDDIS', 'TRANSFUS', 'PRSEPIS', 'DPRNA', 'DPRBUN', 'DPRCREAT', 'DPRALBUM', 'DPRBILI', 'DPRSGOT', 'DPRALKPH', 'DPRWBC', 'DPRHCT', 'DPRPLATE', 'DPRPTT', 'DPRPT', 'DPRINR', 'PRSODM', 'PRBUN', 'PRCREAT', 'PRALBUM', 'PRBILI', 'PRSGOT', 'PRALKPH', 'PRWBC', 'PRHCT', 'PRPLATE', 'PRPTT', 'PRINR', 'PRPT', 'OTHERPROC1', 'OTHERCPT1', 'OTHERWRVU1', 'OTHERPROC2', 'OTHERCPT2', 'OTHERWRVU2', 'OTHERPROC3', 'OTHERCPT3', 'OTHERWRVU3', 'OTHERPROC4', 'OTHERCPT4', 'OTHERWRVU4', 'OTHERPROC5', 'OTHERCPT5', 'OTHERWRVU5', 'OTHERPROC6', 'OTHERCPT6', 'OTHERWRVU6', 'OTHERPROC7', 'OTHERCPT7', 'OTHERWRVU7', 'OTHERPROC8', 'OTHERCPT8', 'OTHERWRVU8', 'OTHERPRO

In [5]:
#Check data shape.

data.shape

(53085, 273)

In [6]:
#Define variables of interest (predictor variables, inclusion/exclusion criteria, outcomes of interest).

variables = ['SEX', 'RACE_NEW', 'ETHNICITY_HISPANIC', 'CPT', 'INOUT', 'TRANST', 'AGE', 'DISCHDEST', 'ANESTHES', 'SURGSPEC', 'ELECTSURG', 'HEIGHT', 'WEIGHT', 'DIABETES', 'SMOKE', 'DYSPNEA', 'FNSTATUS2', 'VENTILAT', 'HXCOPD', 'ASCITES', 'HXCHF', 'HYPERMED', 'RENAFAIL', 'DIALYSIS', 'DISCANCR', 'WNDINF', 'STEROID', 'WTLOSS', 'BLEEDDIS', 'TRANSFUS', 'PRSEPIS', 'PRSODM', 'PRBUN', 'PRCREAT', 'PRALBUM', 'PRBILI', 'PRSGOT', 'PRALKPH', 'PRWBC', 'PRHCT', 'PRPLATE', 'PRPTT', 'PRINR', 'PRPT', 'OTHERCPT1', 'OTHERCPT2', 'OTHERCPT3', 'OTHERCPT4', 'OTHERCPT5', 'OTHERCPT6', 'OTHERCPT7', 'OTHERCPT8', 'OTHERCPT9', 'OTHERCPT10', 'CONCPT1', 'CONCPT2', 'CONCPT3', 'CONCPT4', 'CONCPT5', 'CONCPT6', 'CONCPT7', 'CONCPT8', 'CONCPT9', 'CONCPT10', 'EMERGNCY', 'WNDCLAS', 'ASACLAS', 'OPTIME', 'TOTHLOS', 'HTOODAY', 'NSUPINFEC', 'NWNDINFD', 'NORGSPCSSI', 'NDEHIS', 'NOUPNEUMO', 'NREINTUB', 'NPULEMBOL', 'NFAILWEAN', 'NRENAINSF', 'NOPRENAFL', 'NURNINFEC', 'NCNSCVA', 'NCDARREST', 'NCDMI', 'NOTHBLEED', 'NOTHDVT', 'NOTHSYSEP', 'NOTHSESHOCK', 'PODIAG', 'PODIAG10', 'STILLINHOSP', 'READMISSION1']

In [7]:
#Remove unwanted columns and check data shape.

data = data[variables]

data.shape

(53085, 92)

In [8]:
#See the ICD codes' unique value counts for the patient cohort.

icd_codes_df = data['PODIAG10'].value_counts(normalize=False, dropna=False).to_frame()
icd_codes_index = icd_codes_df.index.tolist()

In [9]:
#Exclude patients with ICD codes that were used less than 10 in the patient population.

icd_codes_df = data['PODIAG10'].value_counts(normalize=False, dropna=False).to_frame()

icd_codes_df.columns =['Value']

icd_codes_df = icd_codes_df[icd_codes_df['Value'] >= 10]

icd_to_include = icd_codes_df.index.tolist()

data = data[data.PODIAG10.isin(icd_to_include)]

In [10]:
#Get the descriptions for ICD codes.

icd10 = pd.read_csv("/content/drive/MyDrive/ACDF/icd10_descriptions.csv", index_col = 'PODIAG10', encoding = 'latin1', low_memory = False)
icd10 = icd10.filter(items = icd_to_include, axis=0)

In [11]:
#Save the ICD codes with descriptions and value counts.

icd = pd.concat([icd10, icd_codes_df], axis=1)
icd.to_csv('/content/drive/MyDrive/ACDF/icd.csv')
icd

Unnamed: 0,PODIAGTX10,Value
M48.02,"Spinal stenosis, cervical region",13234
M47.12,"Other spondylosis with myelopathy, cervical re...",5901
M47.22,"Other spondylosis with radiculopathy, cervical...",4346
M54.12,"Radiculopathy, cervical region",3016
M50.122,Cervical disc disorder at C5-C6 level with rad...,2463
M50.123,Cervical disc disorder at C6-C7 level with rad...,1794
M50.12,"Cervical disc disorder with radiculopathy, mid...",1499
M47.812,Spondylosis without myelopathy or radiculopath...,1411
M50.01,"Cervical disc disorder with myelopathy, high c...",1347
M50.022,Cervical disc disorder at C5-C6 level with mye...,1198


In [12]:
#Review the ICD table for excluding patients with exclude patients diagnosed with a fracture, neoplasm, infection, instrumentation related complications or lumbar/thoracic/sacral site diagnoses.

icd_to_exclude = ['S12.500A',' G06.1',' M48.06', 'M51.16',' M46.22', 'S12.400A', 'C79.51', 'M43.16', 'T84.226A', 'S12.600A', 'M84.58XA', 'M48.061', 'S12.590A', 'S12.9XXA', 'M47.012', 'T84.216A', 'M54.16', 'S12.100A', 'S12.300A', 'T84.296A', 'M51.36', 'T84.89XA', 'S12.490A', 'M47.26', 'G06.2', 'T84.84XA', 'T84.098A', 'M48.32', 'S12.690A', 'S12.501A', 'S12.430A', 'M47.816', 'M84.48XA', 'M54.5', 'M51.37']

data = data[~data.PODIAG10.isin(icd_to_exclude)]

In [13]:
#Drop patients with missing ICD codes.

data = data[data['PODIAG10'].notna()]

In [14]:
#Check data for ICD codes.

data['PODIAG10'].value_counts(normalize=False, dropna=False)

M48.02      13234
M47.12       5901
M47.22       4346
M54.12       3016
M50.122      2463
M50.123      1794
M50.12       1499
M47.812      1411
M50.01       1347
M50.022      1198
M50.121      1101
M50.021      1040
M50.20        930
M50.02        802
M50.222       720
M50.22        708
M50.11        635
M50.10        511
M50.223       496
M50.023       461
G95.9         423
M50.221       418
M50.00        396
M43.12        384
M47.892       338
M50.21        322
M96.0         317
M54.2         309
M50.30        291
M50.13        282
M50.0         237
M25.78        205
M50.1         177
G95.20        172
M50.322       154
M50.32        138
M53.2X2       114
S13.161A      110
M50.323        99
M50.321        97
M50.31         96
M43.02         84
M40.202        81
G06.1          78
G95.89         76
M50.23         75
M50.03         75
S13.151A       71
M48.03         71
M48.06         63
M50.020        62
S14.123A       57
M48.062        48
M46.22         48
G95.29         48
S14.125A  

In [15]:
#Define the CPT to codes to exclude (posterior approach surgery and total disc arthroplasty).

data.loc[data['CONCPT1'] == 22590, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT1'] == 22595, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT1'] == 22600, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT1'] == 22614, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT1'] == 22856, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT1'] == 22858, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT1'] == 22861, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT1'] == 22864, 'CPT_EX'] = 'Yes'

data.loc[data['CONCPT2'] == 22590, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT2'] == 22595, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT2'] == 22600, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT2'] == 22614, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT2'] == 22856, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT2'] == 22858, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT2'] == 22861, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT2'] == 22864, 'CPT_EX'] = 'Yes'

data.loc[data['CONCPT3'] == 22590, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT3'] == 22595, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT3'] == 22600, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT3'] == 22614, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT3'] == 22856, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT3'] == 22858, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT3'] == 22861, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT3'] == 22864, 'CPT_EX'] = 'Yes'

data.loc[data['CONCPT4'] == 22590, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT4'] == 22595, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT4'] == 22600, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT4'] == 22614, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT4'] == 22856, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT4'] == 22858, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT4'] == 22861, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT4'] == 22864, 'CPT_EX'] = 'Yes'

data.loc[data['CONCPT5'] == 22590, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT5'] == 22595, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT5'] == 22600, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT5'] == 22614, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT5'] == 22856, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT5'] == 22858, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT5'] == 22861, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT5'] == 22864, 'CPT_EX'] = 'Yes'

data.loc[data['CONCPT6'] == 22590, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT6'] == 22595, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT6'] == 22600, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT6'] == 22614, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT6'] == 22856, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT6'] == 22858, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT6'] == 22861, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT6'] == 22864, 'CPT_EX'] = 'Yes'

data.loc[data['CONCPT7'] == 22590, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT7'] == 22595, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT7'] == 22600, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT7'] == 22614, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT7'] == 22856, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT7'] == 22858, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT7'] == 22861, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT7'] == 22864, 'CPT_EX'] = 'Yes'

data.loc[data['CONCPT8'] == 22590, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT8'] == 22595, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT8'] == 22600, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT8'] == 22614, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT8'] == 22856, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT8'] == 22858, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT8'] == 22861, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT8'] == 22864, 'CPT_EX'] = 'Yes'

data.loc[data['CONCPT9'] == 22590, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT9'] == 22595, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT9'] == 22600, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT9'] == 22614, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT9'] == 22856, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT9'] == 22858, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT9'] == 22861, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT9'] == 22864, 'CPT_EX'] = 'Yes'

data.loc[data['CONCPT10'] == 22590, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT10'] == 22595, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT10'] == 22600, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT10'] == 22614, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT10'] == 22856, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT10'] == 22858, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT10'] == 22861, 'CPT_EX'] = 'Yes'
data.loc[data['CONCPT10'] == 22864, 'CPT_EX'] = 'Yes'

data.loc[data['OTHERCPT1'] == 22590, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT1'] == 22595, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT1'] == 22600, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT1'] == 22614, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT1'] == 22856, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT1'] == 22858, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT1'] == 22861, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT1'] == 22864, 'CPT_EX'] = 'Yes'

data.loc[data['OTHERCPT2'] == 22590, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT2'] == 22595, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT2'] == 22600, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT2'] == 22614, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT2'] == 22856, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT2'] == 22858, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT2'] == 22861, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT2'] == 22864, 'CPT_EX'] = 'Yes'

data.loc[data['OTHERCPT3'] == 22590, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT3'] == 22595, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT3'] == 22600, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT3'] == 22614, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT3'] == 22856, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT3'] == 22858, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT3'] == 22861, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT3'] == 22864, 'CPT_EX'] = 'Yes'

data.loc[data['OTHERCPT4'] == 22590, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT4'] == 22595, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT4'] == 22600, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT4'] == 22614, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT4'] == 22856, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT4'] == 22858, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT4'] == 22861, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT4'] == 22864, 'CPT_EX'] = 'Yes'

data.loc[data['OTHERCPT5'] == 22590, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT5'] == 22595, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT5'] == 22600, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT5'] == 22614, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT5'] == 22856, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT5'] == 22858, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT5'] == 22861, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT5'] == 22864, 'CPT_EX'] = 'Yes'

data.loc[data['OTHERCPT6'] == 22590, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT6'] == 22595, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT6'] == 22600, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT6'] == 22614, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT6'] == 22856, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT6'] == 22858, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT6'] == 22861, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT6'] == 22864, 'CPT_EX'] = 'Yes'

data.loc[data['OTHERCPT7'] == 22590, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT7'] == 22595, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT7'] == 22600, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT7'] == 22614, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT7'] == 22856, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT7'] == 22858, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT7'] == 22861, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT7'] == 22864, 'CPT_EX'] = 'Yes'

data.loc[data['OTHERCPT8'] == 22590, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT8'] == 22595, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT8'] == 22600, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT8'] == 22614, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT8'] == 22856, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT8'] == 22858, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT8'] == 22861, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT8'] == 22864, 'CPT_EX'] = 'Yes'

data.loc[data['OTHERCPT9'] == 22590, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT9'] == 22595, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT9'] == 22600, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT9'] == 22614, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT9'] == 22856, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT9'] == 22858, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT9'] == 22861, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT9'] == 22864, 'CPT_EX'] = 'Yes'

data.loc[data['OTHERCPT10'] == 22590, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT10'] == 22595, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT10'] == 22600, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT10'] == 22614, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT10'] == 22856, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT10'] == 22858, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT10'] == 22861, 'CPT_EX'] = 'Yes'
data.loc[data['OTHERCPT10'] == 22864, 'CPT_EX'] = 'Yes'

data.loc[data['CPT_EX'] != 'Yes', 'CPT_EX'] = 'No'

data['CPT_EX'].value_counts(dropna=False)

No     49624
Yes     1138
Name: CPT_EX, dtype: int64

In [16]:
#Exclude patients with CPT codes that were assigned to be excluding.

data = data[(data['CPT_EX'] == 'No')]

data['CPT_EX'].value_counts(normalize=False, dropna=False)

No    49624
Name: CPT_EX, dtype: int64

In [17]:
#Check data for elective surgeries.

data['ELECTSURG'].value_counts(normalize=False, dropna=False)

Yes        46517
No          3086
Unknown       21
Name: ELECTSURG, dtype: int64

In [18]:
#Apply inclusion criteria for elective surgeries.

data = data[(data['ELECTSURG'] == 'Yes')]

data['ELECTSURG'].value_counts(normalize=False, dropna=False)

Yes    46517
Name: ELECTSURG, dtype: int64

In [19]:
#Check data for emergency surgery.

data['EMERGNCY'].value_counts(normalize=False, dropna=False)

No     46500
Yes       17
Name: EMERGNCY, dtype: int64

In [20]:
#Apply exclusion criteria for emergency surgery.

data = data[(data['EMERGNCY'] == 'No')]

data['EMERGNCY'].value_counts(normalize=False, dropna=False)

No    46500
Name: EMERGNCY, dtype: int64

In [21]:
#Check data for anesthesia type.

data['ANESTHES'].value_counts(normalize=False, dropna=False)

General            46340
Other                 85
Spinal                36
MAC/IV Sedation       34
None                   1
Local                  1
Regional               1
Unknown                1
Epidural               1
Name: ANESTHES, dtype: int64

In [22]:
#Apply inclusion criteria for general anesthesia.

data = data[(data['ANESTHES'] == 'General')]

data['ANESTHES'].value_counts(normalize=False, dropna=False)

General    46340
Name: ANESTHES, dtype: int64

In [23]:
#Check data for surgical specialties.

data['SURGSPEC'].value_counts(normalize=False, dropna=False)

Neurosurgery                  33335
Orthopedics                   12850
General Surgery                 109
Otolaryngology (ENT)             17
Thoracic                         10
Cardiac Surgery                   4
Urology                           3
Vascular                          3
Gynecology                        3
Obstetrics                        3
Plastics                          2
Interventional Radiologist        1
Name: SURGSPEC, dtype: int64

In [24]:
#Apply inclusion criteria for surgical specialties.

data = data[(data['SURGSPEC'] == 'Neurosurgery') | (data['SURGSPEC'] == 'Orthopedics')]

data['SURGSPEC'].value_counts(normalize=False, dropna=False)

Neurosurgery    33335
Orthopedics     12850
Name: SURGSPEC, dtype: int64

In [25]:
#Check data for wound class.

data['WNDCLAS'].value_counts(normalize=False, dropna=False)

1-Clean                 46004
2-Clean/Contaminated      148
3-Contaminated             19
4-Dirty/Infected           14
Name: WNDCLAS, dtype: int64

In [26]:
#Apply exclusion criteria for wound class.

data = data[(data['WNDCLAS'] == '1-Clean')]

data['WNDCLAS'].value_counts(normalize=False, dropna=False)

1-Clean    46004
Name: WNDCLAS, dtype: int64

In [27]:
#Check data for preoperative sepsis.

data['PRSEPIS'].value_counts(normalize=False, dropna=False)

None            45882
SIRS              116
Sepsis              5
Septic Shock        1
Name: PRSEPIS, dtype: int64

In [28]:
#Apply exclusion criteria for preoperative sepsis.

data = data[(data['PRSEPIS'] == 'None')]

data['PRSEPIS'].value_counts(normalize=False, dropna=False)

None    45882
Name: PRSEPIS, dtype: int64

In [29]:
#Check data for ASA Class.

data['ASACLAS'].value_counts(normalize=False, dropna=False)

2-Mild Disturb      22973
3-Severe Disturb    20776
1-No Disturb         1296
4-Life Threat         798
None assigned          37
5-Moribund              2
Name: ASACLAS, dtype: int64

In [30]:
#Apply exclusion criteria for ASA class.

data = data[(data['ASACLAS'] != '4-Life Threat') & (data['ASACLAS'] != '5-Moribund') & (data['ASACLAS'] != 'None assigned')]

data['ASACLAS'].value_counts(normalize=False, dropna=False)

2-Mild Disturb      22973
3-Severe Disturb    20776
1-No Disturb         1296
Name: ASACLAS, dtype: int64

In [31]:
#Check data for patients still in hospital after 30 days postoperatively.

data['STILLINHOSP'].value_counts(normalize=False, dropna=False)

No     45024
Yes       21
Name: STILLINHOSP, dtype: int64

In [32]:
#Apply exclusion criteria for patients still in hospital after 30 days postoperatively.

data = data[(data['STILLINHOSP'] == 'No')]

data['STILLINHOSP'].value_counts(normalize=False, dropna=False)

No    45024
Name: STILLINHOSP, dtype: int64

In [33]:
#Create BMI column.

lbs_to_kg_ratio = 0.453592
inch_to_meter_ratio = 0.0254

data['HEIGHT'] *= inch_to_meter_ratio
data['WEIGHT'] *= lbs_to_kg_ratio

data['BMI'] = data['WEIGHT']/(data['HEIGHT']**2)
print(min(data['BMI']))
print(max(data['BMI']))

14.266228532457065
130.91171475348625


In [34]:
#Check data for race.

data['RACE_NEW'].value_counts(normalize=False, dropna=False)

White                                        35248
Black or African American                     4774
Unknown/Not Reported                          3856
Asian                                          769
American Indian or Alaska Native               228
Native Hawaiian or Pacific Islander            120
Some Other Race                                 15
Native Hawaiian or Other Pacific Islander       11
Race combinations with low frequency             3
Name: RACE_NEW, dtype: int64

In [35]:
#Check data for ethnicity.

data['ETHNICITY_HISPANIC'].value_counts(normalize=False, dropna=False)

No         39184
Unknown     3286
Yes         2554
Name: ETHNICITY_HISPANIC, dtype: int64

In [36]:
#Simplify race and ethnicity columns.

data.loc[data['RACE_NEW'] == 'White', 'RACE'] = 'White'
data.loc[data['RACE_NEW'] == 'Black or African American', 'RACE'] = 'Black or African American'
data.loc[data['RACE_NEW'] == 'Asian', 'RACE'] = 'Asian'
data.loc[data['RACE_NEW'] == 'American Indian or Alaska Native', 'RACE'] = 'Other'
data.loc[data['RACE_NEW'] == 'Native Hawaiian or Pacific Islander', 'RACE'] = 'Other'
data.loc[data['RACE_NEW'] == 'Some Other Race', 'RACE'] = 'Other'
data.loc[data['RACE_NEW'] == 'Native Hawaiian or Other Pacific Islander', 'RACE'] = 'Other'
data.loc[data['RACE_NEW'] == 'Race combinations with low frequency e', 'RACE'] = 'Other'

data.loc[data['ETHNICITY_HISPANIC'] == 'Yes', 'RACE'] = 'Hispanic'

data['RACE'].value_counts(normalize=False, dropna=False)

White                        33605
Black or African American     4727
NaN                           3056
Hispanic                      2554
Asian                          756
Other                          326
Name: RACE, dtype: int64

In [37]:
#Check data for transfer status.

data['TRANST'].value_counts(normalize=False, dropna=False)

Not transferred (admitted from home)               44742
From acute care hospital inpatient                   118
Nursing home - Chronic care - Intermediate care       63
Transfer from other                                   58
Outside emergency department                          36
Unknown                                                7
Name: TRANST, dtype: int64

In [38]:
#Simplify transfer status column.

data.loc[data['TRANST'] == 'Not transferred (admitted from home)', 'TRANST'] = 'Not transferred'
data.loc[data['TRANST'] == 'From acute care hospital inpatient', 'TRANST'] = 'Transferred'
data.loc[data['TRANST'] == 'Outside emergency department', 'TRANST'] = 'Transferred'
data.loc[data['TRANST'] == 'Nursing home - Chronic care - Intermediate care', 'TRANST'] = 'Transferred'
data.loc[data['TRANST'] == 'Transfer from other', 'TRANST'] = 'Transferred'

data['TRANST'].value_counts(normalize=False, dropna=False)

Not transferred    44742
Transferred          275
Unknown                7
Name: TRANST, dtype: int64

In [39]:
#Check data for dyspnea.

data['DYSPNEA'].value_counts(normalize=False, dropna=False)

No                   42941
MODERATE EXERTION     1963
AT REST                120
Name: DYSPNEA, dtype: int64

In [40]:
#Simplify dyspnea column.

data.loc[data['DYSPNEA'] == 'No', 'DYSPNEA'] = 'No'
data.loc[data['DYSPNEA'] == 'MODERATE EXERTION', 'DYSPNEA'] = 'Yes'
data.loc[data['DYSPNEA'] == 'AT REST', 'DYSPNEA'] = 'Yes'

data['DYSPNEA'].value_counts(normalize=False, dropna=False)

No     42941
Yes     2083
Name: DYSPNEA, dtype: int64

In [41]:
#Check data for diabetes status.

data['DIABETES'].value_counts(normalize=False, dropna=False)

NO             37305
NON-INSULIN     5144
INSULIN         2575
Name: DIABETES, dtype: int64

In [42]:
#Simplify diabetes column.

data.loc[data['DIABETES'] == 'NO', 'DIABETES'] = 'No'
data.loc[data['DIABETES'] == 'NON-INSULIN', 'DIABETES'] = 'Yes'
data.loc[data['DIABETES'] == 'INSULIN', 'DIABETES'] = 'Yes'

data['DIABETES'].value_counts(normalize=False, dropna=False)

No     37305
Yes     7719
Name: DIABETES, dtype: int64

In [43]:
#Cast ASA class as ordered categorical.

cat_type1 = CategoricalDtype(categories=['1-No Disturb','2-Mild Disturb','3-Severe Disturb'], ordered=True)
data['ASACLAS'].astype(cat_type1)

5090996     3-Severe Disturb
5089868       2-Mild Disturb
5092013     3-Severe Disturb
5121528     3-Severe Disturb
5157828       2-Mild Disturb
5246673       2-Mild Disturb
5248778     3-Severe Disturb
5270376       2-Mild Disturb
5270905       2-Mild Disturb
5301219       2-Mild Disturb
5310530     3-Severe Disturb
5327772       2-Mild Disturb
5368354       2-Mild Disturb
5375538     3-Severe Disturb
5423817     3-Severe Disturb
5423407       2-Mild Disturb
5423979       2-Mild Disturb
5551375     3-Severe Disturb
5573149       2-Mild Disturb
5595869       2-Mild Disturb
5598310       2-Mild Disturb
5664020       2-Mild Disturb
5665558       2-Mild Disturb
5716421       2-Mild Disturb
5780052       2-Mild Disturb
5865095       2-Mild Disturb
5863819       2-Mild Disturb
5885127     3-Severe Disturb
6484244     3-Severe Disturb
6483433       2-Mild Disturb
6505306       2-Mild Disturb
6565459       2-Mild Disturb
6575975     3-Severe Disturb
5839419       2-Mild Disturb
5867058     3-

In [44]:
#Cast functional status as ordered categorical.

cat_type2 = CategoricalDtype(categories=['Unknown','Independent','Partiallly Dependent', 'Totally Dependent'], ordered=True)
data['FNSTATUS2'].astype(cat_type2)

5090996           Independent
5089868           Independent
5092013           Independent
5121528           Independent
5157828           Independent
5246673           Independent
5248778           Independent
5270376           Independent
5270905           Independent
5301219           Independent
5310530           Independent
5327772           Independent
5368354           Independent
5375538           Independent
5423817           Independent
5423407           Independent
5423979           Independent
5551375                   NaN
5573149           Independent
5595869           Independent
5598310           Independent
5664020           Independent
5665558           Independent
5716421           Independent
5780052           Independent
5865095           Independent
5863819           Independent
5885127           Independent
6484244           Independent
6483433           Independent
6505306           Independent
6565459           Independent
6575975           Independent
5839419   

In [45]:
#Convert 90+ to 91 and AGE column to integer.

data.loc[data['AGE'] == '90+', 'AGE'] = 91
data['AGE'] = pd.to_numeric(data['AGE'], downcast='integer')

In [46]:
#Show patients for each CPT code.

data['CPT'].value_counts()

22551    41989
22554     2688
22552      312
22585       35
Name: CPT, dtype: int64

In [47]:
#Classify tumors into extradural vs. intradural and create a column named 'IEDUR' for it.

data.loc[data['CPT'] == 22552, 'LEVELS'] = 'Multi'
data.loc[data['CONCPT1'] == 22552, 'LEVELS'] = 'Multi'
data.loc[data['CONCPT2'] == 22552, 'LEVELS'] = 'Multi'
data.loc[data['CONCPT3'] == 22552, 'LEVELS'] = 'Multi'
data.loc[data['CONCPT4'] == 22552, 'LEVELS'] = 'Multi'
data.loc[data['CONCPT5'] == 22552, 'LEVELS'] = 'Multi'
data.loc[data['CONCPT6'] == 22552, 'LEVELS'] = 'Multi'
data.loc[data['CONCPT7'] == 22552, 'LEVELS'] = 'Multi'
data.loc[data['CONCPT9'] == 22552, 'LEVELS'] = 'Multi'
data.loc[data['CONCPT10'] == 22552, 'LEVELS'] = 'Multi'
data.loc[data['OTHERCPT1'] == 22552, 'LEVELS'] = 'Multi'
data.loc[data['OTHERCPT2'] == 22552, 'LEVELS'] = 'Multi'
data.loc[data['OTHERCPT3'] == 22552, 'LEVELS'] = 'Multi'
data.loc[data['OTHERCPT4'] == 22552, 'LEVELS'] = 'Multi'
data.loc[data['OTHERCPT5'] == 22552, 'LEVELS'] = 'Multi'
data.loc[data['OTHERCPT6'] == 22552, 'LEVELS'] = 'Multi'
data.loc[data['OTHERCPT7'] == 22552, 'LEVELS'] = 'Multi'
data.loc[data['OTHERCPT9'] == 22552, 'LEVELS'] = 'Multi'
data.loc[data['OTHERCPT10'] == 22552, 'LEVELS'] = 'Multi'

data.loc[data['CPT'] == 22585, 'LEVELS'] = 'Multi'
data.loc[data['CONCPT1'] == 22585, 'LEVELS'] = 'Multi'
data.loc[data['CONCPT2'] == 22585, 'LEVELS'] = 'Multi'
data.loc[data['CONCPT3'] == 22585, 'LEVELS'] = 'Multi'
data.loc[data['CONCPT4'] == 22585, 'LEVELS'] = 'Multi'
data.loc[data['CONCPT5'] == 22585, 'LEVELS'] = 'Multi'
data.loc[data['CONCPT6'] == 22585, 'LEVELS'] = 'Multi'
data.loc[data['CONCPT7'] == 22585, 'LEVELS'] = 'Multi'
data.loc[data['CONCPT9'] == 22585, 'LEVELS'] = 'Multi'
data.loc[data['CONCPT10'] == 22585, 'LEVELS'] = 'Multi'
data.loc[data['OTHERCPT1'] == 22585, 'LEVELS'] = 'Multi'
data.loc[data['OTHERCPT2'] == 22585, 'LEVELS'] = 'Multi'
data.loc[data['OTHERCPT3'] == 22585, 'LEVELS'] = 'Multi'
data.loc[data['OTHERCPT4'] == 22585, 'LEVELS'] = 'Multi'
data.loc[data['OTHERCPT5'] == 22585, 'LEVELS'] = 'Multi'
data.loc[data['OTHERCPT6'] == 22585, 'LEVELS'] = 'Multi'
data.loc[data['OTHERCPT7'] == 22585, 'LEVELS'] = 'Multi'
data.loc[data['OTHERCPT9'] == 22585, 'LEVELS'] = 'Multi'
data.loc[data['OTHERCPT10'] == 22585, 'LEVELS'] = 'Multi'

data.loc[data['LEVELS'] != 'Multi', 'LEVELS'] = 'Single'

data['LEVELS'].value_counts(dropna=False)

Single    23531
Multi     21493
Name: LEVELS, dtype: int64

In [48]:
#Define major complications.

data['MAJRCOMP'] = data['NWNDINFD'] + data['NORGSPCSSI'] + data['NDEHIS'] + data['NREINTUB'] + data['NPULEMBOL'] + data['NFAILWEAN'] + data['NRENAINSF'] + data['NOPRENAFL'] + data['NCNSCVA'] + data['NCDARREST'] + data['NCDMI'] + data['NOTHBLEED'] + data['NOTHDVT'] + data['NOTHSYSEP'] + data['NOTHSESHOCK']

In [49]:
#Show number of major complications per patient.

data['MAJRCOMP'].value_counts()

0    44388
1      455
2      112
3       50
4       12
5        6
7        1
Name: MAJRCOMP, dtype: int64

In [50]:
#Convert major complications into categorical data in a column named 'COMP'.

data.loc[data['MAJRCOMP'] == 0, 'COMP'] = 'No'
data.loc[data['MAJRCOMP'] >= 1, 'COMP'] = 'Yes'

In [51]:
#Show major complications as categorical.

data['COMP'].value_counts()

No     44388
Yes      636
Name: COMP, dtype: int64

In [52]:
#See LOS per patient.

data['TOTHLOS'].value_counts()

1.0     27682
2.0      6902
0.0      5398
3.0      2252
4.0       943
5.0       515
6.0       337
7.0       259
8.0       178
9.0       104
10.0       90
11.0       68
12.0       32
14.0       29
13.0       27
15.0       24
20.0       17
19.0       14
18.0       14
17.0       13
22.0       12
21.0       11
16.0        9
23.0        7
30.0        7
31.0        6
29.0        5
24.0        5
32.0        5
35.0        4
28.0        4
25.0        3
41.0        2
36.0        2
59.0        2
27.0        2
43.0        2
39.0        1
63.0        1
37.0        1
33.0        1
34.0        1
72.0        1
46.0        1
44.0        1
57.0        1
50.0        1
93.0        1
48.0        1
26.0        1
Name: TOTHLOS, dtype: int64

In [53]:
#See 75th percentile of LOS.
data.TOTHLOS.quantile(0.75)

2.0

In [54]:
#Convert total length of stay into categorical data in a column named 'LOS'.

data.loc[data['TOTHLOS'] <= data.TOTHLOS.quantile(0.75), 'LOS'] = 'No'
data.loc[data['TOTHLOS'] > data.TOTHLOS.quantile(0.75), 'LOS'] = 'Yes'

In [55]:
#Show major complications as categorical.

data['LOS'].value_counts(dropna=False)

No     39982
Yes     5017
NaN       25
Name: LOS, dtype: int64

In [56]:
#Drop patients with unknown LOS.

data = data[data['LOS'].notna()]

In [57]:
#Show readmission status.

data['READMISSION1'].value_counts(dropna=False)

No     43685
Yes     1308
NaN        6
Name: READMISSION1, dtype: int64

In [58]:
#Drop patients with unknown readmission status.

data = data[data['READMISSION1'].notna()]

In [59]:
#Show readmission status after dropping patients with unknown readmission status.

data['READMISSION1'].value_counts(dropna=False)

No     43685
Yes     1308
Name: READMISSION1, dtype: int64

In [60]:
#Show discharge status.

data['DISCHDEST'].value_counts(dropna=False)

Home                            43264
Rehab                             792
Skilled Care, Not Home            600
Facility Which was Home           186
Separate Acute Care                59
Against Medical Advice (AMA)       33
Unskilled Facility Not Home        23
Unknown                            16
Expired                            14
Hospice                             4
Multi-level Senior Community        2
Name: DISCHDEST, dtype: int64

In [61]:
#Convert discharge destination into binary data (home vs. non-home discharge) in a column named 'DISCHARGE'.

data.loc[data['DISCHDEST'] == 'Home', 'DISCHARGE'] = 'No'
data.loc[data['DISCHDEST'] == 'Facility Which was Home', 'DISCHARGE'] = 'No'
data.loc[data['DISCHDEST'] == 'Rehab', 'DISCHARGE'] = 'Yes'
data.loc[data['DISCHDEST'] == 'Skilled Care, Not Home', 'DISCHARGE'] = 'Yes'
data.loc[data['DISCHDEST'] == 'Separate Acute Care', 'DISCHARGE'] = 'Yes'
data.loc[data['DISCHDEST'] == 'Multi-level Senior Community', 'DISCHARGE'] = 'Yes'

In [62]:
#Show discharge destination status after converting it to binary data.

data['DISCHARGE'].value_counts(dropna=False)

No     43450
Yes     1453
NaN       90
Name: DISCHARGE, dtype: int64

In [63]:
#Drop patients with unknown discharge status.

data = data[data['DISCHARGE'].notna()]

In [64]:
#Show discharge status after dropping patients with unknown discharge status.

data['DISCHARGE'].value_counts(dropna=False)

No     43450
Yes     1453
Name: DISCHARGE, dtype: int64

In [65]:
#Check data.

data.shape

(44903, 100)

In [66]:
#See all columns.

print(list(data.columns))

['SEX', 'RACE_NEW', 'ETHNICITY_HISPANIC', 'CPT', 'INOUT', 'TRANST', 'AGE', 'DISCHDEST', 'ANESTHES', 'SURGSPEC', 'ELECTSURG', 'HEIGHT', 'WEIGHT', 'DIABETES', 'SMOKE', 'DYSPNEA', 'FNSTATUS2', 'VENTILAT', 'HXCOPD', 'ASCITES', 'HXCHF', 'HYPERMED', 'RENAFAIL', 'DIALYSIS', 'DISCANCR', 'WNDINF', 'STEROID', 'WTLOSS', 'BLEEDDIS', 'TRANSFUS', 'PRSEPIS', 'PRSODM', 'PRBUN', 'PRCREAT', 'PRALBUM', 'PRBILI', 'PRSGOT', 'PRALKPH', 'PRWBC', 'PRHCT', 'PRPLATE', 'PRPTT', 'PRINR', 'PRPT', 'OTHERCPT1', 'OTHERCPT2', 'OTHERCPT3', 'OTHERCPT4', 'OTHERCPT5', 'OTHERCPT6', 'OTHERCPT7', 'OTHERCPT8', 'OTHERCPT9', 'OTHERCPT10', 'CONCPT1', 'CONCPT2', 'CONCPT3', 'CONCPT4', 'CONCPT5', 'CONCPT6', 'CONCPT7', 'CONCPT8', 'CONCPT9', 'CONCPT10', 'EMERGNCY', 'WNDCLAS', 'ASACLAS', 'OPTIME', 'TOTHLOS', 'HTOODAY', 'NSUPINFEC', 'NWNDINFD', 'NORGSPCSSI', 'NDEHIS', 'NOUPNEUMO', 'NREINTUB', 'NPULEMBOL', 'NFAILWEAN', 'NRENAINSF', 'NOPRENAFL', 'NURNINFEC', 'NCNSCVA', 'NCDARREST', 'NCDMI', 'NOTHBLEED', 'NOTHDVT', 'NOTHSYSEP', 'NOTHSESHO

In [67]:
#Drop unwanted columns.

drop = ['RACE_NEW', 'ETHNICITY_HISPANIC', 'CPT',  'DISCHDEST', 'ANESTHES', 'ELECTSURG', 'PRSEPIS', 'OTHERCPT1', 'OTHERCPT2', 'OTHERCPT3', 'OTHERCPT4', 'OTHERCPT5', 'OTHERCPT6', 'OTHERCPT7', 'OTHERCPT8', 'OTHERCPT9', 'OTHERCPT10', 'CONCPT1', 'CONCPT2', 'CONCPT3', 'CONCPT4', 'CONCPT5', 'CONCPT6', 'CONCPT7', 'CONCPT8', 'CONCPT9', 'CONCPT10', 'EMERGNCY', 'WNDCLAS', 'OPTIME', 'HTOODAY', 'NSUPINFEC', 'NWNDINFD', 'NORGSPCSSI', 'NDEHIS', 'NOUPNEUMO', 'NREINTUB', 'NPULEMBOL', 'NFAILWEAN', 'NRENAINSF', 'NOPRENAFL', 'NURNINFEC', 'NCNSCVA', 'NCDARREST', 'NCDMI', 'NOTHBLEED', 'NOTHDVT', 'NOTHSYSEP', 'NOTHSESHOCK', 'PODIAG', 'PODIAG10', 'STILLINHOSP', 'CPT_EX', 'MAJRCOMP']
data.drop(drop, axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [68]:
#Save data.

data.to_csv('/content/drive/MyDrive/ACDF/acdf_clean.csv')

In [69]:
#See categorical and continuous variables.

print('Numerical columns: {}'.format(list(data.select_dtypes('number').columns)))
print()
print('Categorical columns: {}'.format(list(data.select_dtypes('object').columns)))

Numerical columns: ['AGE', 'HEIGHT', 'WEIGHT', 'PRSODM', 'PRBUN', 'PRCREAT', 'PRALBUM', 'PRBILI', 'PRSGOT', 'PRALKPH', 'PRWBC', 'PRHCT', 'PRPLATE', 'PRPTT', 'PRINR', 'PRPT', 'TOTHLOS', 'BMI']

Categorical columns: ['SEX', 'INOUT', 'TRANST', 'SURGSPEC', 'DIABETES', 'SMOKE', 'DYSPNEA', 'FNSTATUS2', 'VENTILAT', 'HXCOPD', 'ASCITES', 'HXCHF', 'HYPERMED', 'RENAFAIL', 'DIALYSIS', 'DISCANCR', 'WNDINF', 'STEROID', 'WTLOSS', 'BLEEDDIS', 'TRANSFUS', 'ASACLAS', 'READMISSION1', 'RACE', 'LEVELS', 'COMP', 'LOS', 'DISCHARGE']


In [70]:
#Define numerical and categorical columns.

num_cols = ['AGE', 'HEIGHT', 'WEIGHT', 'PRSODM', 'PRBUN', 'PRCREAT', 'PRALBUM', 'PRBILI', 'PRSGOT', 'PRALKPH', 'PRWBC', 'PRHCT', 'PRPLATE', 'PRPTT', 'PRINR', 'PRPT', 'TOTHLOS', 'BMI']

cat_cols = ['SEX', 'INOUT', 'TRANST', 'SURGSPEC', 'DIABETES', 'SMOKE', 'DYSPNEA', 'FNSTATUS2', 'VENTILAT', 'HXCOPD', 'ASCITES', 'HXCHF', 'HYPERMED', 'RENAFAIL', 'DIALYSIS', 'DISCANCR', 'WNDINF', 'STEROID', 'WTLOSS', 'BLEEDDIS', 'TRANSFUS', 'ASACLAS', 'READMISSION1', 'RACE', 'LEVELS', 'COMP', 'LOS', 'DISCHARGE']

In [71]:
#Check missing values for numerical columns.

missing_num = data[num_cols].isnull().mean().round(4).mul(100).sort_values(ascending=False)

missing_num = pd.DataFrame(missing_num)

missing_num.columns = ['Value']

missing_num = missing_num[missing_num['Value'] > 0]

print(missing_num.index)

missing_num = missing_num[missing_num['Value'] > 25]

missing_num = list(missing_num.index)

print(missing_num)

Index(['PRPT', 'PRBILI', 'PRALKPH', 'PRSGOT', 'PRALBUM', 'PRPTT', 'PRINR',
       'PRBUN', 'PRSODM', 'PRCREAT', 'PRPLATE', 'PRWBC', 'PRHCT', 'BMI',
       'HEIGHT', 'WEIGHT'],
      dtype='object')
['PRPT', 'PRBILI', 'PRALKPH', 'PRSGOT', 'PRALBUM', 'PRPTT', 'PRINR']


In [72]:
#Drop numerical columns with missing values over 25%.

data.drop(missing_num, axis=1, inplace=True)

In [73]:
#Define new numerical columns.

num_cols = [x for x in num_cols if x not in missing_num]
print(num_cols)

['AGE', 'HEIGHT', 'WEIGHT', 'PRSODM', 'PRBUN', 'PRCREAT', 'PRWBC', 'PRHCT', 'PRPLATE', 'TOTHLOS', 'BMI']


In [74]:
#Impute missing numerical values.

num_imputer = KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean')
data[num_cols] = num_imputer.fit_transform(data[num_cols])

In [75]:
#Check numerical variables with missing values after imputation.

data[num_cols].isnull().mean().round(4).mul(100).sort_values(ascending=False)

AGE        0.0
HEIGHT     0.0
WEIGHT     0.0
PRSODM     0.0
PRBUN      0.0
PRCREAT    0.0
PRWBC      0.0
PRHCT      0.0
PRPLATE    0.0
TOTHLOS    0.0
BMI        0.0
dtype: float64

In [76]:
#Check missing values for categorical columns.

missing_cat = data[cat_cols].isnull().mean().round(4).mul(100).sort_values(ascending=False)

missing_cat = pd.DataFrame(missing_cat)

missing_cat.columns = ['Value']

missing_cat = missing_cat[missing_cat['Value'] > 0]

print(missing_cat.index)

missing_cat = missing_cat[missing_cat['Value'] > 25]

missing_cat = list(missing_cat.index)

print(missing_cat)

Index(['RACE'], dtype='object')
[]


In [77]:
#Drop categorical columns with missing values over 25%.

data.drop(missing_cat, axis=1, inplace=True)

In [78]:
#Define new categorical columns.

cat_cols = [x for x in cat_cols if x not in missing_cat]
print(cat_cols)

['SEX', 'INOUT', 'TRANST', 'SURGSPEC', 'DIABETES', 'SMOKE', 'DYSPNEA', 'FNSTATUS2', 'VENTILAT', 'HXCOPD', 'ASCITES', 'HXCHF', 'HYPERMED', 'RENAFAIL', 'DIALYSIS', 'DISCANCR', 'WNDINF', 'STEROID', 'WTLOSS', 'BLEEDDIS', 'TRANSFUS', 'ASACLAS', 'READMISSION1', 'RACE', 'LEVELS', 'COMP', 'LOS', 'DISCHARGE']


In [79]:
#Replace missing categorical values with 'Unknown'.

for col in cat_cols:
    data[col].fillna(value='Unknown', inplace=True)

In [80]:
#Check missing values after imputation.

data[cat_cols].isnull().mean().round(4).mul(100).sort_values(ascending=False)

SEX             0.0
INOUT           0.0
LOS             0.0
COMP            0.0
LEVELS          0.0
RACE            0.0
READMISSION1    0.0
ASACLAS         0.0
TRANSFUS        0.0
BLEEDDIS        0.0
WTLOSS          0.0
STEROID         0.0
WNDINF          0.0
DISCANCR        0.0
DIALYSIS        0.0
RENAFAIL        0.0
HYPERMED        0.0
HXCHF           0.0
ASCITES         0.0
HXCOPD          0.0
VENTILAT        0.0
FNSTATUS2       0.0
DYSPNEA         0.0
SMOKE           0.0
DIABETES        0.0
SURGSPEC        0.0
TRANST          0.0
DISCHARGE       0.0
dtype: float64

In [81]:
#Save imputed data.

data.to_csv('/content/drive/MyDrive/ACDF/acdf_imputed.csv')

In [82]:
#RobustScale data.

data[num_cols] = RobustScaler().fit_transform(data[num_cols])

In [83]:
#Normalize data.

data[num_cols] = MinMaxScaler().fit_transform(data[num_cols])

In [84]:
#Save scaled data.

data.to_csv('/content/drive/MyDrive/ACDF/acdf_scaled.csv')

In [85]:
#One hot encoding for categorical values.

data_final = pd.get_dummies(data, columns = cat_cols, drop_first = True)

In [86]:
#Save final data.

data_final.to_csv('/content/drive/MyDrive/ACDF/acdf_final.csv')

In [87]:
#One hot encoding for categorical values.

data_gradio = pd.get_dummies(data, columns = cat_cols, drop_first = False)

In [88]:
#Save final data.

data_gradio.to_csv('/content/drive/MyDrive/ACDF/acdf_gradio.csv')