**Notebook content:**
-


In [200]:
# Clear the interactive namespace without asking for confirmation (-f).
%reset -f 

In [201]:
from myDefs.defs import *
from IPython.display import display

# Imported explicitly: `sys` was previously reachable only if the star import
# above happened to re-export it.
import sys

# pandas display limits for wide / long frames
pd.set_option('display.max_columns', 30)
pd.set_option('display.max_rows', 200)
#pd.set_option('display.float_format', lambda x: '%.f' % x)
pd.options.display.precision = 4

# print NumPy arrays in full instead of summarising them
np.set_printoptions(threshold=sys.maxsize)

In [202]:
# Destination file for this stage, located directly under DATA_PATH
OUTPUT_PATH = "{}parseData3.csv".format(DATA_PATH)

### **Read files**

In [203]:
# Load the events table and convert its start/end date columns to datetimes
path = DATA_PATH + "parseData2.csv"
df = pd.read_csv(path, sep=',')
for date_col in ('eventStartDate', 'eventEndDate'):
    df[date_col] = pd.to_datetime(df[date_col], format='%Y-%m-%d')

In [204]:
# Load the anonymous patient file and convert the surgery date to a datetime
path = DATA_PATH + "parseAnonymous0.csv"
anonymous = pd.read_csv(path, sep=',').assign(
    **{'Date of surgery': lambda frame: pd.to_datetime(frame['Date of surgery'],
                                                       format='%Y-%m-%d')}
)

In [205]:
#read events info
#eventsinfo = pd.read_excel(ORGANIZE_EVENTS_PATH + "EventsInfo.xlsx")

In [206]:
# (rows, columns) of the events table, then of the anonymous patient file
print(df.shape, anonymous.shape, sep="\n")


(1683616, 22)
(718, 14)


# Filter events and slice data

### anonymous

In [207]:
# Keep one row per patient: restrict to rows that have a surgery date first,
# then keep each pid's first remaining row (in file order). Flagging duplicates
# on the unfiltered frame would drop a patient entirely when that patient's
# first listed row has no surgery date.
has_surgery_date = anonymous[anonymous['Date of surgery'].notna()]
anonymous_ix = has_surgery_date.index[~has_surgery_date['pid'].duplicated()]
print("anonymous:", anonymous.loc[anonymous_ix].shape)

anonymous: (525, 14)


In [208]:
# Build the per-patient table from the selected rows of the anonymous file
anon = anonymous.loc[anonymous_ix].copy()

# Binary label: 1 when the recorded complication is 'Anastomotic Leak', 0 otherwise
anon['Anastomotic Leak'] = anon['Complications'].eq('Anastomotic Leak').astype('int64')

# Drop the columns that are not carried forward, then index the table by pid
cols_to_remove = [
    'Name of surgery',
    'Date of surgery',
    'Days of hospitalization',
    'Patient classification',
    'Responsible surgeon',
    'Kk',
    'Simple',
    'Severe',
    'Complications',
]
anon = anon.drop(columns=cols_to_remove).set_index('pid')

### events

In [209]:
# Index labels of laboratory events that are flagged inModel and whose feature
# was recorded for more than `npatients` distinct patients. The patient count
# per feature is taken over every row of df, not only the laboratory rows.
npatients = 40

patients_per_feature = df.groupby('featureName')['pid'].transform('nunique')
laboratory = df.index[
    (df['inModel'] == 1)
    & (df['category'] == 'laboratory')
    & (patients_per_feature > npatients)
]

len(laboratory)

550110

In [210]:
# Index labels of physical events that are flagged inModel and whose feature
# was recorded for more than `npatients` distinct patients. The patient count
# per feature is taken over every row of df, not only the physical rows.
npatients = 40

patients_per_feature = df.groupby('featureName')['pid'].transform('nunique')
physical = df.index[
    (df['inModel'] == 1)
    & (df['category'] == 'physical')
    & (patients_per_feature > npatients)
]

len(physical)

621803

In [211]:
# Index labels of drug events that are flagged inModel and whose feature was
# recorded for more than 10 distinct patients (counted over every row of df).
patients_per_feature = df.groupby('featureName')['pid'].transform('nunique')
drugs = df.index[
    (df['inModel'] == 1)
    & (df['category'] == 'drug')
    & (patients_per_feature > 10)
]

len(drugs)

32943

In [212]:
# Slice the selected events. The three selections hold index labels of df, so
# they are merged with Index.union and looked up by label with .loc:
# the `|` operator on Index objects is no longer a set union in pandas 2.x,
# and .iloc expects positions rather than labels.
data = df.loc[laboratory.union(physical).union(drugs)]

data.shape

(1204856, 22)

# print events

In [213]:
# Number of events per feature, largest count first
feature_counts = data['featureName'].value_counts()
feature_counts.sort_values(ascending=False)

heart rate                       119108
blood pressure                    82194
body temperature                  72884
saturation                        67932
FiO2                              40932
SpO2                              35402
SpO2.pr                           32507
SpO2.ir_amp                       32100
blood pressure diastolic          29303
blood pressure systolic           28682
respiratory rate, total           21208
respiratory rate, CO2             17861
PEEP                              17841
Potassium                         15952
Sodium                            15923
HGB                               15383
Saturation, O2                    15171
WBC_1                             14626
NEUTRO (abs)                      13023
PSV                               11162
Nucleated RBC (%)                 10607
Nucleated RBC (abs)               10607
HCT                               10481
PLT                               10463
MPV                               10447


In [214]:
# The non-missing entries of the `numeric` column
data['numeric'].dropna()

0            2.14
1          129.00
2            3.60
3          147.00
4           24.00
            ...  
1683611     64.00
1683612    103.00
1683613     18.00
1683614      2.70
1683615      8.10
Name: numeric, Length: 1200847, dtype: float64

# Create A Features Table

### laboratory & physical categories

In [215]:
# Laboratory and physical events only. Both selections hold index labels of df,
# so merge them with Index.union and select by label (.loc); `|` on Index
# objects is not a set union in pandas 2.x and .iloc expects positions.
data = df.loc[laboratory.union(physical)]

In [222]:

# Statistics computed for every feature of a patient, per surgery period
stat_names = ['mean', 'median', 'min', 'max']


def summarise_events(events, pid):
    """Return a one-row frame of per-feature statistics of `numeric`, tagged with `pid`.

    events - event rows of a single patient (may be empty)
    pid    - patient id stored in the added `pid` column

    The columns of the result are (featureName, statistic) pairs plus `pid`.
    """
    stats = events.groupby('featureName')['numeric'].agg(stat_names)
    # one row per patient: flatten the (feature x statistic) table into columns
    return stats.stack().to_frame().T.assign(pid=pid)


before_frames = []
after_frames = []

# For each selected patient, summarise the events before and from the surgery date.
# NOTE(review): .head() limits this run to the first 5 selected patients —
# drop it to build the table for every patient.
for _, row in anonymous.loc[anonymous_ix].head().iterrows():

    surgery_date = row['Date of surgery']

    # events of the current patient, split at the surgery date
    patient = data.loc[data['pid'] == row['pid'], :]
    before = patient[patient['eventStartDate'] < surgery_date]
    after = patient[patient['eventStartDate'] >= surgery_date]

    before_frames.append(summarise_events(before, row['pid']))
    after_frames.append(summarise_events(after, row['pid']))

before_concat = pd.concat(before_frames).set_index('pid', drop=True)
after_concat = pd.concat(after_frames).set_index('pid', drop=True)

# side-by-side table with a (date, eventName, statistics) column hierarchy
result_data = pd.concat([before_concat, after_concat], axis=1,
                        keys=['before_surgery', 'after_surgery'],
                        names=['date', 'eventName', 'statistics'])

print("before shape:", before_concat.shape, "Num events:", before_concat.columns.get_level_values(1).nunique())
# label fixed: this line reports the after-surgery frame
print("after shape:", after_concat.shape, "Num events:", after_concat.columns.get_level_values(1).nunique())
print("result shape:", result_data.shape)

before shape: (5, 28) Num events: 4
before shape: (5, 92) Num events: 4
result shape: (5, 120)


In [220]:
# Show the combined before/after-surgery feature table (indexed by pid)
result_data

date,before_surgery,before_surgery,before_surgery,before_surgery,before_surgery,before_surgery,before_surgery,before_surgery,before_surgery,before_surgery,before_surgery,before_surgery,before_surgery,before_surgery,before_surgery,...,after_surgery,after_surgery,after_surgery,after_surgery,after_surgery,after_surgery,after_surgery,after_surgery,after_surgery,after_surgery,after_surgery,after_surgery,after_surgery,after_surgery,after_surgery
eventName,Albumin,Albumin,Albumin,Albumin,Alkaline Phosphatase,Alkaline Phosphatase,Alkaline Phosphatase,Alkaline Phosphatase,Amylase,Amylase,Amylase,Amylase,BASO (%),BASO (%),BASO (%),...,"respiratory rate, CO2","respiratory rate, CO2","respiratory rate, CO2","respiratory rate, total","respiratory rate, total","respiratory rate, total","respiratory rate, total",saturation,saturation,saturation,saturation,weight,weight,weight,weight
statistics,max,mean,median,min,max,mean,median,min,max,mean,median,min,max,mean,median,...,mean,median,min,max,mean,median,min,max,mean,median,min,max,mean,median,min
pid,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3,Unnamed: 22_level_3,Unnamed: 23_level_3,Unnamed: 24_level_3,Unnamed: 25_level_3,Unnamed: 26_level_3,Unnamed: 27_level_3,Unnamed: 28_level_3,Unnamed: 29_level_3,Unnamed: 30_level_3,Unnamed: 31_level_3
1123813,4.2,4.2,4.2,4.2,97.0,97.0,97.0,97.0,63.0,63.0,63.0,63.0,0.5,0.5,0.5,...,9.8,11.0,0.0,68.0,18.5,17.0,6.0,100.0,97.1693,98.0,85.0,108.0,96.9667,105.0,85.3
1135541,,,,,,,,,,,,,,,,...,6.6471,10.0,0.0,,,,,100.0,97.9277,98.0,94.0,68.0,68.0,68.0,68.0
3918745,3.9,3.9,3.9,3.9,90.0,90.0,90.0,90.0,53.0,53.0,53.0,53.0,0.4,0.4,0.4,...,4.6842,0.0,0.0,,,,,100.0,96.7879,97.0,91.0,,,,
4894647,4.3,4.3,4.3,4.3,70.0,70.0,70.0,70.0,,,,,0.3,0.3,0.3,...,6.5854,10.0,0.0,,,,,100.0,96.7396,98.0,0.0,85.0,71.125,74.6,0.0
6541482,3.1,1.9429,1.95,1.4,123.0,94.0,112.5,33.0,68.0,32.6923,29.0,9.0,0.3,0.1214,0.1,...,8.125,10.0,0.0,,,,,100.0,97.232,98.0,80.0,58.0,54.6667,53.0,53.0


### drug category

In [250]:
# Drug events only. `drugs` holds index labels of df, so select by label
# (.loc); .iloc would treat the labels as positions.
data = df.loc[drugs]

In [249]:

# Number of events per (pid, featureName) pair. size() counts the rows of each
# group directly instead of calling Python's len once per group.
dr = data.loc[:, ['pid', 'featureName']]
dr.groupby(['pid', 'featureName']).size()


pid         featureName      
1.1238e+06  Bisoprolol            2
            Cefazolin             6
            Ceftriaxone          13
            Ciprofloxacin         9
            Fleet enema           1
                                 ..
1.0358e+09  Magnesium sulfate     1
            Meroken new           1
            Metronidazole         1
            Morphine hcl          1
            Propranolol          26
Length: 2701, dtype: int64

In [None]:
# Draft cell: merges `a` with `res` on 'pid' and writes the result to draft1.csv.
# NOTE(review): `metadata` and `a` are not assigned in any visible cell, and
# `res` is only assigned in a later cell — this fails on a fresh kernel.
# NOTE(review): the file name here starts with "/", while the other paths
# append directly to DATA_PATH — confirm which form DATA_PATH expects.
metadata.columns
a.merge(res, on='pid').to_csv(DATA_PATH + "/draft1.csv", index=False)

#pd.concat([metadata.columns], keys=['a'], names=['nomi'])

In [None]:
# Draft cell: column-wise concatenation of `metadata` and the feature table.
# NOTE(review): `metadata` is not assigned in any visible cell.
# NOTE(review): `levels` takes the unique values of each MultiIndex level and is
# normally paired with `keys`; the level names passed here look like they were
# meant for `names` — confirm before running.
pd.concat([metadata, result_data], axis=1, levels=['date','eventName', 'statistics'])


In [None]:
if WRITE_FLAG:
    # Write output. The body must be indented under the `if`; left flush with
    # it, the cell does not parse.
    # NOTE(review): `result` is not assigned in any visible cell — confirm
    # which frame is meant to be written.
    result.to_excel(DATA_PATH + 'draft1_.xlsx')

## שונות

In [None]:
def create_feature(df, feature_name, alter_names, column_value='dValue', print_output=True):
    '''
    Extract the events of one feature from the events table.

    df - data frame of all events
    feature_name - name of the column that will hold the feature values
    alter_names - values of "eventName" that identify this feature
    column_value - column of df the feature values are read from
    print_output - when True, print a short summary and display the first rows
                   (the summary also reads the 'sValue', 'eventDesc', 'dValue'
                   and 'iValue' columns of df)

    Returns a frame with 'pid', 'eventStartDate', 'eventEndDate',
    'unitOfMeasure' and a `feature_name` column, one row per matching event.
    '''
    # rows whose event name is one of the accepted alternatives
    matching = df[df["eventName"].isin(alter_names)]

    # identifying columns, plus the chosen value column stored under the feature name
    base_columns = ['pid', 'eventStartDate', 'eventEndDate', 'unitOfMeasure']
    feature = matching[base_columns].copy()
    feature[feature_name] = matching[column_value]

    if not print_output:
        return feature

    n_events = matching.shape[0]
    print("Num {} events:".format(feature_name), n_events)

    print("\nHas sValue:", matching['sValue'].any())
    print("Has event description:", matching['eventDesc'].any())

    numeric_summary = matching[['dValue', 'iValue']].describe()
    print("\nDescribe numeric values:\n", numeric_summary)

    print("\nUnit of measure:", matching['unitOfMeasure'].unique())

    print("\nResult:")
    display(feature.head())

    return feature


In [None]:
# Add a numeric suffix to the pid of patients that appear more than once
# (patients who underwent more than one surgery).

# Copy the selected columns so the assignments below modify an independent
# frame rather than a slice of `anonymous`.
res = anonymous[['pid', 'Sex', 'Age', 'Date of surgery']].copy()

# convert pid to string so a suffix can be appended
res['pid'] = res['pid'].astype(str)

# number each row within its pid group from 1 to the size of that group
cumcount = res.groupby('pid').cumcount() + 1

# rows whose pid occurs more than once
idx = res['pid'].duplicated(keep=False)

# append "_<n>" to the duplicated pids only
res.loc[idx, 'pid'] = res.loc[idx, 'pid'] + "_" + cumcount[idx].astype(str)

res[idx];


In [None]:
# Zero-based position of each row within its pid group, stored on `ann`,
# then shown joined to the pid as "<pid>_<position>"
row_position = ann.groupby('pid').cumcount()
ann['cumcount'] = row_position
ann['pid'].astype(str).str.cat(ann['cumcount'].astype(str), sep="_")