**Notebook content:**
- create matrix of patients and events


In [622]:
%reset -f 

In [623]:
from myDefs.defs import *
from IPython.display import display

pd.set_option('display.max_columns', 30)
pd.set_option('display.max_rows', 200)
#pd.set_option('display.float_format', lambda x: '%.f' % x)
pd.options.display.precision = 4

#import sys
np.set_printoptions(threshold=sys.maxsize) #- print the full NumPy array

In [624]:
OUTPUT_PATH = DATA_PATH + 'matrix1.xlsx'

### **Read files**

In [625]:
#read events
path = DATA_PATH + "parseData2.csv"
df = pd.read_csv(path, sep=',') 
#parse date of surgery
df['eventStartDate'] = pd.to_datetime(df['eventStartDate'], format='%Y-%m-%d')
df['eventEndDate'] = pd.to_datetime(df['eventEndDate'], format='%Y-%m-%d')

In [626]:
#read annonymous file
path = "{}parseAnonymous0.csv".format(DATA_PATH)
anonymous = pd.read_csv(path, sep=',')
#parse date of surgery
anonymous['Date of surgery'] = pd.to_datetime(anonymous['Date of surgery'], format='%Y-%m-%d')

In [627]:
print(df.shape)
print(anonymous.shape)


(1683616, 22)
(718, 14)


# Filter events and slice data

### anonymous

In [628]:
#get only patient with a surgery date, which are not duplicate (first surgey)
anonymous = anonymous[anonymous['Date of surgery'].notna()].loc[~anonymous['pid'].duplicated()]
print("anonymous:", anonymous.shape)

anonymous: (525, 14)


In [629]:
#filter patients not in anonymous
df = df[df['pid'].isin(anonymous['pid'])]
df['pid'].nunique()

525

In [630]:
#get data from anonymous 

#map: 1 if Anastomotic Leak, 0 o.w.
anonymous['Anastomotic Leak'] = 0
anonymous.loc[anonymous['Complications']=='Anastomotic Leak', 'Anastomotic Leak'] = 1

#drop columns
cols_to_remove = ['Name of surgery',
                  'Days of hospitalization', 'Patient classification', 
                  'Responsible surgeon', 'Kk', 'Simple', 'Severe', 'Complications']
anonymous = anonymous.drop(columns=cols_to_remove).set_index('pid')

In [631]:
anonymous

Unnamed: 0_level_0,Sex,Age,Date of surgery,Urgency,Surgical approach,Anastomotic Leak
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1123813,male,59.0,2018-07-18,elective,Open,1
1135541,female,43.0,2018-03-12,elective,Open,0
3918745,female,57.0,2018-05-22,elective,Laparoscopic,0
4894647,,,2018-01-15,elective,Open,0
6541482,female,65.0,2018-07-06,urgent,Open,0
...,...,...,...,...,...,...
1031926105,female,63.0,2018-04-11,elective,Open,0
1032366104,female,63.0,2018-03-28,elective,Open,0
1033842103,male,61.0,2018-03-27,elective,Open,1
1034462102,male,76.0,2018-03-14,elective,Open,0


### events

In [632]:
#choose in model, laboratory, nPatients 
npatients = 40

laboratory = \
df[df['inModel']==1]\
.loc[df['category'] == 'laboratory']\
.loc[df.groupby('featureName')['pid']\
     .transform('nunique') > npatients].index

len(laboratory)

406075

In [633]:
#choose in model, physical, nPatients > 

npatients = 40

physical = \
df[df['inModel']==1]\
.loc[df['category'] == 'physical']\
.loc[df.groupby('featureName')['pid']\
     .transform('nunique') > npatients].index

len(physical)

434163

In [634]:
drug = \
df[df['inModel']==1]\
.loc[df['category'] == 'drug']\
.loc[df.groupby('featureName')['pid']\
     .transform('nunique') > 10].index

len(drug)

25641

In [635]:
#slice events
data = df.loc[laboratory | physical | drug]

data.shape

(865879, 22)

# print events

In [636]:
#print features
data['featureName'].value_counts().sort_values(ascending=False)

heart rate                       92465
blood pressure                   62621
body temperature                 57178
saturation                       51773
FiO2                             33969
SpO2.pr                          24613
SpO2.ir_amp                      24267
blood pressure diastolic         22393
blood pressure systolic          21942
respiratory rate, total          17816
respiratory rate, CO2            13802
Potassium                        12276
Sodium                           12259
HGB                              11802
Saturation, O2                   11801
WBC_1                            11469
NEUTRO (abs)                     10462
PSV                               9553
Nucleated RBC (abs)               8210
Nucleated RBC (%)                 8209
HCT                               8079
PLT                               8062
MPV                               8057
RBC                               8052
MCHC                              8049
MCH                      

# Create A Features Table

### MAT0: Anonymous

In [637]:

mat0 = pd.concat([anonymous], axis=1, keys=["general"], names = [""])
mat0 = pd.concat([mat0], axis=1, keys=["constant"], names = [""])
mat0.columns = mat0.columns.reorder_levels([1,2,0]) 
mat0

Unnamed: 0_level_0,general,general,general,general,general,general
Unnamed: 0_level_1,Sex,Age,Date of surgery,Urgency,Surgical approach,Anastomotic Leak
Unnamed: 0_level_2,constant,constant,constant,constant,constant,constant
pid,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3
1123813,male,59.0,2018-07-18,elective,Open,1
1135541,female,43.0,2018-03-12,elective,Open,0
3918745,female,57.0,2018-05-22,elective,Laparoscopic,0
4894647,,,2018-01-15,elective,Open,0
6541482,female,65.0,2018-07-06,urgent,Open,0
...,...,...,...,...,...,...
1031926105,female,63.0,2018-04-11,elective,Open,0
1032366104,female,63.0,2018-03-28,elective,Open,0
1033842103,male,61.0,2018-03-27,elective,Open,1
1034462102,male,76.0,2018-03-14,elective,Open,0


### MAT1: laboratory & physical categories

In [638]:
data = df.loc[laboratory | physical]

In [639]:

before_frames = []
after_frames = []

#for each patient and surgery date in anonymous 
for pid, row in anonymous.iterrows(): 
    
    #get surgery date of current patient 
    surgery_date = row['Date of surgery']
    
    #print(pid, surgery_date)
    
    #get events before and after date of surgery
    patient = data.loc[data['pid'] == pid, :]
    before = patient[patient['eventStartDate'] < surgery_date]
    after = patient[patient['eventStartDate'] >= surgery_date]
    
    #print(row['pid'], "nBefore:", len(before), "nAfter:", len(after))
    
    #calculate statistics
    x1 = before.groupby('featureName')['numeric'].agg(['mean', 'median', 'min', 'max']) 
    x2 = after.groupby('featureName')['numeric'].agg(['mean', 'median', 'min', 'max'])
    
    #reshape and add pid
    y1 = x1.stack().to_frame().T.assign(pid=pid)
    y2 = x2.stack().to_frame().T.assign(pid=pid)

    #add to list
    before_frames.append(y1)
    after_frames.append(y2)
    
    #display(y1)
    #display(y2)
    
#y1 = pd.concat([x1], keys=['before_surgery'], names=['date'])
#y2 = pd.concat([x2], keys=['after_surgery'], names=['date'])

before_concat = pd.concat(before_frames).set_index('pid', drop=True) 
after_concat = pd.concat(after_frames).set_index('pid', drop=True) 

mat1 = pd.concat([before_concat, after_concat], axis=1, 
                        keys=['before_surgery', 'after_surgery'], 
                        names=['time','eventName', 'statistic'])

print("before shape:", before_concat.shape, "Num events:", before_concat.columns.get_level_values(0).nunique())
print("after shape:", after_concat.shape, "Num events:", after_concat.columns.get_level_values(0).nunique())
print("result shape:", mat1.shape)

before shape: (525, 360) Num events: 90
after shape: (525, 356) Num events: 89
result shape: (525, 716)


In [640]:
mat1.head()

time,before_surgery,before_surgery,before_surgery,before_surgery,before_surgery,before_surgery,before_surgery,before_surgery,before_surgery,before_surgery,before_surgery,before_surgery,before_surgery,before_surgery,before_surgery,...,after_surgery,after_surgery,after_surgery,after_surgery,after_surgery,after_surgery,after_surgery,after_surgery,after_surgery,after_surgery,after_surgery,after_surgery,after_surgery,after_surgery,after_surgery
eventName,Albumin,Albumin,Albumin,Albumin,Alkaline Phosphatase,Alkaline Phosphatase,Alkaline Phosphatase,Alkaline Phosphatase,Amylase,Amylase,Amylase,Amylase,BASO (%),BASO (%),BASO (%),...,"respiratory rate, CO2","respiratory rate, CO2","respiratory rate, CO2","respiratory rate, total","respiratory rate, total","respiratory rate, total","respiratory rate, total",saturation,saturation,saturation,saturation,weight,weight,weight,weight
statistic,max,mean,median,min,max,mean,median,min,max,mean,median,min,max,mean,median,...,mean,median,min,max,mean,median,min,max,mean,median,min,max,mean,median,min
pid,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3,Unnamed: 22_level_3,Unnamed: 23_level_3,Unnamed: 24_level_3,Unnamed: 25_level_3,Unnamed: 26_level_3,Unnamed: 27_level_3,Unnamed: 28_level_3,Unnamed: 29_level_3,Unnamed: 30_level_3,Unnamed: 31_level_3
1123813,4.2,4.2,4.2,4.2,97.0,97.0,97.0,97.0,63.0,63.0,63.0,63.0,0.5,0.5,0.5,...,9.8,11.0,0.0,68.0,18.5,17.0,6.0,100.0,97.1693,98.0,85.0,108.0,96.9667,105.0,85.3
1135541,,,,,,,,,,,,,,,,...,6.6471,10.0,0.0,,,,,100.0,97.9277,98.0,94.0,68.0,68.0,68.0,68.0
3918745,3.9,3.9,3.9,3.9,90.0,90.0,90.0,90.0,53.0,53.0,53.0,53.0,0.4,0.4,0.4,...,4.6842,0.0,0.0,,,,,100.0,96.7879,97.0,91.0,,,,
4894647,4.3,4.3,4.3,4.3,70.0,70.0,70.0,70.0,,,,,0.3,0.3,0.3,...,6.5854,10.0,0.0,,,,,100.0,96.7396,98.0,0.0,85.0,71.125,74.6,0.0
6541482,3.1,1.9429,1.95,1.4,123.0,94.0,112.5,33.0,68.0,32.6923,29.0,9.0,0.3,0.1214,0.1,...,8.125,10.0,0.0,,,,,100.0,97.232,98.0,80.0,58.0,54.6667,53.0,53.0


In [641]:
print(mat1.columns.nlevels)
mat1.columns

3


MultiIndex([('before_surgery',                 'Albumin',    'max'),
            ('before_surgery',                 'Albumin',   'mean'),
            ('before_surgery',                 'Albumin', 'median'),
            ('before_surgery',                 'Albumin',    'min'),
            ('before_surgery',    'Alkaline Phosphatase',    'max'),
            ('before_surgery',    'Alkaline Phosphatase',   'mean'),
            ('before_surgery',    'Alkaline Phosphatase', 'median'),
            ('before_surgery',    'Alkaline Phosphatase',    'min'),
            ('before_surgery',                 'Amylase',    'max'),
            ('before_surgery',                 'Amylase',   'mean'),
            ...
            ( 'after_surgery', 'respiratory rate, total', 'median'),
            ( 'after_surgery', 'respiratory rate, total',    'min'),
            ( 'after_surgery',              'saturation',    'max'),
            ( 'after_surgery',              'saturation',   'mean'),
            ( 'aft

### MAT2: drug category

In [642]:
data = df.loc[drug]

In [643]:
mat2 = data.loc[:,['pid', 'featureName']].groupby(['pid', 'featureName']).apply(len).unstack().fillna(0) 

mat2 = pd.concat([mat2], axis=1, keys=['all_days'], names = ["time"])
mat2 = pd.concat([mat2], axis=1, keys=['count'], names = ["statistic"])
mat2.columns = mat2.columns.reorder_levels([1,2,0]) 

mat2

time,all_days,all_days,all_days,all_days,all_days,all_days,all_days,all_days,all_days,all_days,all_days,all_days,all_days,all_days,all_days,all_days,all_days,all_days,all_days,all_days,all_days,all_days,all_days,all_days,all_days,all_days,all_days,all_days,all_days
featureName,Amoxicillin / clavulanic acid,Atorvastatin,Bisoprolol,Bondormin,Cefazolin,Ceftriaxone,Ciprofloxacin,Fentanyl,Fentanyl patch,Fleet enema,Furosemide,Hydrocortisone,Labetalol,Lorazepam,Magnesium sulfate,Magnesium sulfate,Meroken new,Meropenem,Metronidazole,Morphine hcl,Morphine hcl,Norepinephrine,Norepinephrine + dextrose,Optalgin,Piperacillin / tazobactam,Prednisone,Propranolol,Vaben,Vancomycin
statistic,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count
pid,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3,Unnamed: 22_level_3,Unnamed: 23_level_3,Unnamed: 24_level_3,Unnamed: 25_level_3,Unnamed: 26_level_3,Unnamed: 27_level_3,Unnamed: 28_level_3,Unnamed: 29_level_3
1.1238e+06,0.0,0.0,2.0,0.0,6.0,13.0,9.0,0.0,0.0,1.0,16.0,11.0,0.0,0.0,4.0,0.0,1.0,22.0,52.0,3.0,0.0,30.0,0.0,2.0,11.0,0.0,0.0,0.0,0.0
1.1355e+06,0.0,0.0,0.0,1.0,0.0,10.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,26.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4.8946e+06,7.0,0.0,0.0,0.0,5.0,12.0,15.0,0.0,3.0,1.0,38.0,1.0,0.0,3.0,1.0,1.0,1.0,0.0,73.0,0.0,0.0,0.0,0.0,3.0,18.0,0.0,0.0,2.0,0.0
6.5415e+06,0.0,0.0,40.0,0.0,4.0,15.0,2.0,0.0,0.0,0.0,151.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,51.0,2.0,0.0,20.0,0.0,0.0,62.0,0.0,44.0,0.0,0.0
7.9673e+06,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1.0319e+09,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,2.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1.0324e+09,1.0,0.0,0.0,0.0,3.0,1.0,0.0,0.0,4.0,2.0,2.0,0.0,0.0,0.0,24.0,2.0,2.0,0.0,4.0,0.0,2.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0
1.0338e+09,0.0,0.0,0.0,19.0,4.0,27.0,2.0,1.0,32.0,1.0,8.0,0.0,0.0,0.0,13.0,10.0,1.0,35.0,62.0,0.0,5.0,0.0,0.0,2.0,25.0,0.0,0.0,0.0,1.0
1.0345e+09,0.0,0.0,14.0,0.0,5.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,4.0,0.0,2.0,0.0,1.0,0.0,3.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### MATRIX: combine matrices

In [644]:
matrix = pd.concat([mat0, mat1, mat2], axis=1, keys=['metadata', 'numeric_events', 'drugs'])

In [645]:
matrix = matrix.drop(columns=['Date of surgery'], level=2)

In [646]:
matrix.columns = matrix.columns.set_names(['type', 'time','feature', 'statistic'])

In [647]:
matrix.head()

type,metadata,metadata,metadata,metadata,metadata,numeric_events,numeric_events,numeric_events,numeric_events,numeric_events,numeric_events,numeric_events,numeric_events,numeric_events,numeric_events,...,drugs,drugs,drugs,drugs,drugs,drugs,drugs,drugs,drugs,drugs,drugs,drugs,drugs,drugs,drugs
time,general,general,general,general,general,before_surgery,before_surgery,before_surgery,before_surgery,before_surgery,before_surgery,before_surgery,before_surgery,before_surgery,before_surgery,...,all_days,all_days,all_days,all_days,all_days,all_days,all_days,all_days,all_days,all_days,all_days,all_days,all_days,all_days,all_days
feature,Sex,Age,Urgency,Surgical approach,Anastomotic Leak,Albumin,Albumin,Albumin,Albumin,Alkaline Phosphatase,Alkaline Phosphatase,Alkaline Phosphatase,Alkaline Phosphatase,Amylase,Amylase,...,Magnesium sulfate,Magnesium sulfate,Meroken new,Meropenem,Metronidazole,Morphine hcl,Morphine hcl,Norepinephrine,Norepinephrine + dextrose,Optalgin,Piperacillin / tazobactam,Prednisone,Propranolol,Vaben,Vancomycin
statistic,constant,constant,constant,constant,constant,max,mean,median,min,max,mean,median,min,max,mean,...,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count
pid,Unnamed: 1_level_4,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4,Unnamed: 20_level_4,Unnamed: 21_level_4,Unnamed: 22_level_4,Unnamed: 23_level_4,Unnamed: 24_level_4,Unnamed: 25_level_4,Unnamed: 26_level_4,Unnamed: 27_level_4,Unnamed: 28_level_4,Unnamed: 29_level_4,Unnamed: 30_level_4,Unnamed: 31_level_4
1123813.0,male,59.0,elective,Open,1,4.2,4.2,4.2,4.2,97.0,97.0,97.0,97.0,63.0,63.0,...,4.0,0.0,1.0,22.0,52.0,3.0,0.0,30.0,0.0,2.0,11.0,0.0,0.0,0.0,0.0
1135541.0,female,43.0,elective,Open,0,,,,,,,,,,,...,0.0,0.0,0.0,0.0,26.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3918745.0,female,57.0,elective,Laparoscopic,0,3.9,3.9,3.9,3.9,90.0,90.0,90.0,90.0,53.0,53.0,...,,,,,,,,,,,,,,,
4894647.0,,,elective,Open,0,4.3,4.3,4.3,4.3,70.0,70.0,70.0,70.0,,,...,1.0,1.0,1.0,0.0,73.0,0.0,0.0,0.0,0.0,3.0,18.0,0.0,0.0,2.0,0.0
6541482.0,female,65.0,urgent,Open,0,3.1,1.9429,1.95,1.4,123.0,94.0,112.5,33.0,68.0,32.6923,...,7.0,0.0,0.0,0.0,51.0,2.0,0.0,20.0,0.0,0.0,62.0,0.0,44.0,0.0,0.0


In [648]:
matrix.index

Float64Index([   1123813.0,    1135541.0,    3918745.0,    4894647.0,
                 6541482.0,    7967340.0,   10766460.0,   10887548.0,
                10962340.0,   11194617.0,
              ...
              1025171111.0, 1026431110.0, 1027149109.0, 1028677108.0,
              1030904106.0, 1031926105.0, 1032366104.0, 1033842103.0,
              1034462102.0, 1035830101.0],
             dtype='float64', name='pid', length=525)

# Write outputs

In [649]:
#write output
if WRITE_FLAG:
    matrix.to_excel(DATA_PATH + 'matrix1.xlsx')

## שונות

In [None]:
ann['cumcount'] = ann.groupby('pid').cumcount()
ann['pid'].astype(str) +"_"+ ann['cumcount'].astype(str)