**Notebook content:**
- 
- 

In [52]:
%reset -f 

In [53]:
import sys
import pandas as pd
import numpy as np
import datetime
from IPython.display import display

pd.set_option('display.max_columns', 30)
pd.set_option('display.max_rows', 200)
#pd.set_option('display.float_format', lambda x: '%.f' % x)
pd.options.display.precision = 4

#import sys
np.set_printoptions(threshold=sys.maxsize) #- print the full NumPy array

from myDefs.defs import *

# visualization
import seaborn as sns
import matplotlib.pyplot as plt 
from matplotlib.pyplot import figure
%matplotlib inline


**Read events file**

In [54]:
#read events
path = DATA_PATH + "parseData1.csv"
df = pd.read_csv(path, sep=',') 
#parse date of surgery
df['eventStartDate'] = pd.to_datetime(df['eventStartDate'], format='%Y-%m-%d')
df['eventEndDate'] = pd.to_datetime(df['eventEndDate'], format='%Y-%m-%d')

#read annonymous file
path = "{}parseAnonymous0.csv".format(DATA_PATH)
anonymous = pd.read_csv(path, sep=',')
#parse date of surgery
anonymous['Date of surgery'] = pd.to_datetime(anonymous['Date of surgery'], format='%Y-%m-%d')

#read events info
eventsinfo = pd.read_excel(EVENTS_INFO_PATH)

In [55]:
print(df.shape)
print(anonymous.shape)
print(eventsinfo.shape)

(1683616, 17)
(718, 14)
(3755, 15)


In [8]:
df.head();

# slice data

In [201]:
#get only patient with a surgery date, which are not duplicate (first surgey)
anonymous_ix = anonymous[(anonymous['Date of surgery'].notna()) 
                         & (~anonymous['pid'].duplicated())].index
print("anonymous:", anonymous.loc[anonymous_ix].shape)


#filter events by thier info
info_ix = eventsinfo[(eventsinfo['category'] == 'laboratory') 
                     & (eventsinfo['inMatrix'] == 1) 
                     & (eventsinfo['sValue'].isna())
                    & (eventsinfo['nPatients'] > 300)].index
print("eventsinfo:", eventsinfo.loc[info_ix].shape)

#get filtered data events 
df_ix = df[df['eventName'].isin(eventsinfo.loc[info_ix,'eventName']) ].index
print("df:", df.loc[df_ix].shape)



anonymous: (525, 14)
eventsinfo: (14, 15)
df: (91304, 17)


In [None]:
#get meta data from anonymous 

metadata = anonymous.loc[anonymous_ix]

#map: 1 if Anastomotic Leak, 0 o.w.
metadata['Anastomotic Leak'] = 0
metadata.loc[metadata['Complications']=='Anastomotic Leak', 'Anastomotic Leak'] = 1

#drop columns
cols_to_remove = ['Date of surgery', 'Days of hospitalization', 'Patient classification', 
                  'Responsible surgeon', 'Kk', 'Simple', 'Severe', 'Complications']
metadata = metadata.drop(columns=cols_to_remove).set_index('pid')

# Create A Features Table

In [243]:

before_frames = []
after_frames = []

for index, row in anonymous.loc[anonymous_ix].head().iterrows(): #.head(10)

    #get surgery date of current patient 
    surgery_date = row['Date of surgery']
    
    #print(row['pid'], surgery_date)
    
    #get events before and after date of surgery
    data = df.loc[df_ix]
    data = data[data['pid'] == row['pid']]
    before = data[data['eventStartDate'] < surgery_date]
    after = data[data['eventStartDate'] >= surgery_date]
    
    #display(data['eventName'].value_counts())
    
    #calculate statistics
    x1 = before.groupby('eventName')['dValue'].agg(['mean', 'median', 'min', 'max']) 
    x2 = after.groupby('eventName')['dValue'].agg([ 'mean', 'median', 'min', 'max'])
    
    #reshape and add pid
    y1 = x1.stack().to_frame().T.assign(pid=row['pid'])
    y2 = x2.stack().to_frame().T.assign(pid=row['pid'])

    #add to list
    before_frames.append(y1)
    after_frames.append(y2)
    
    #display(y1)
    #display(y2)
    
#y1 = pd.concat([x1], keys=['before_surgery'], names=['date'])
#y2 = pd.concat([x2], keys=['after_surgery'], names=['date'])

before_concat = pd.concat(before_frames).set_index('pid', drop=True) 
after_concat = pd.concat(after_frames).set_index('pid', drop=True) 

result_data = pd.concat([before_concat, after_concat], axis=1, 
                        keys=['before_surgery', 'after_surgery'], 
                        names=['date','eventName', 'statistics'])

print("before shape:", before_concat.shape, "Num events:", before_concat.columns.get_level_values(1).nunique())
print("before shape:", after_concat.shape, "Num events:", after_concat.columns.get_level_values(1).nunique())
print("result shape:", result_data.shape)

before shape: (5, 56) Num events: 4
before shape: (5, 56) Num events: 4
result shape: (5, 112)


In [246]:
result_data

date,before_surgery,before_surgery,before_surgery,before_surgery,before_surgery,before_surgery,before_surgery,before_surgery,before_surgery,before_surgery,before_surgery,before_surgery,before_surgery,before_surgery,before_surgery,...,after_surgery,after_surgery,after_surgery,after_surgery,after_surgery,after_surgery,after_surgery,after_surgery,after_surgery,after_surgery,after_surgery,after_surgery,after_surgery,after_surgery,after_surgery
eventName,Chloride -Blood,Chloride -Blood,Chloride -Blood,Chloride -Blood,Estimated Globulin -Blood,Estimated Globulin -Blood,Estimated Globulin -Blood,Estimated Globulin -Blood,Fibrinogen,Fibrinogen,Fibrinogen,Fibrinogen,Magnesium -Blood,Magnesium -Blood,Magnesium -Blood,...,PO2 - general,PO2 - general,PO2 - general,Potassium -Blood,Potassium -Blood,Potassium -Blood,Potassium -Blood,SGOT (AST) -Blood,SGOT (AST) -Blood,SGOT (AST) -Blood,SGOT (AST) -Blood,Sodium -Blood,Sodium -Blood,Sodium -Blood,Sodium -Blood
statistics,max,mean,median,min,max,mean,median,min,max,mean,median,min,max,mean,median,...,mean,median,min,max,mean,median,min,max,mean,median,min,max,mean,median,min
pid,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3,Unnamed: 22_level_3,Unnamed: 23_level_3,Unnamed: 24_level_3,Unnamed: 25_level_3,Unnamed: 26_level_3,Unnamed: 27_level_3,Unnamed: 28_level_3,Unnamed: 29_level_3,Unnamed: 30_level_3,Unnamed: 31_level_3
1123813,105.0,105.0,105.0,105.0,3.7,3.7,3.7,3.7,,,,,2.1,2.1,2.1,...,147.536,119.15,85.9,5.7,4.4222,4.5,3.3,205.0,35.6863,28.0,10.0,143.0,135.0833,136.0,126.0
1135541,,,,,,,,,,,,,,,,...,238.775,229.05,195.2,6.0,4.1,3.95,3.4,59.0,27.0714,20.5,14.0,141.0,138.3125,139.0,134.0
3918745,102.0,102.0,102.0,102.0,3.0,3.0,3.0,3.0,,,,,2.2,2.2,2.2,...,27.3,27.3,27.3,5.8,4.85,4.7,4.2,104.0,104.0,104.0,104.0,139.0,138.5,138.5,138.0
4894647,101.0,101.0,101.0,101.0,2.6,2.6,2.6,2.6,,,,,,,,...,142.8417,138.0,24.9,4.9,3.8845,3.9,2.9,779.0,104.3529,21.0,6.0,150.0,139.5,139.0,129.0
6541482,110.0,100.1429,99.5,90.0,3.1,2.3929,2.45,1.2,507.0,507.0,507.0,507.0,2.7,2.0385,2.0,...,152.3083,91.0,38.2,5.1,3.99,3.95,3.1,52.0,27.4839,26.0,13.0,142.0,134.65,134.0,129.0


In [245]:
metadata
#a.merge(res, on='pid').to_csv(DATA_PATH + "/draft1.csv", index=False)

Unnamed: 0_level_0,Sex,Age,Name of surgery,Urgency,Surgical approach,Anastomotic Leak
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1123813,male,59.0,CYTOREDUCTIVE SURGERY WITH HIPEC (HYPERTHERMIC...,elective,Open,1
1135541,female,43.0,OPEN DISTAL SUBTOTAL GASTRECTOMY 436320,elective,Open,0
3918745,female,57.0,LAPAROSCOPIC GASTRIC BYPASS WITH GASTRIC BAND ...,elective,Laparoscopic,0
4894647,,,LUMPECTOMY (WITHOUT LYMPH NODE EXCISION) 19120...,elective,Open,0
6541482,female,65.0,EXPLORATORY LAPAROTOMY 490000,urgent,Open,0
...,...,...,...,...,...,...
1031926105,female,63.0,APPENDECTOMY;CYTOREDUCTIVE SURGERY WITH HIPEC ...,elective,Open,0
1032366104,female,63.0,"ABDOMINAL HYSTERECTOMY, TOTAL WITH BSO (T.A.H ...",elective,Open,0
1033842103,male,61.0,CYTOREDUCTIVE SURGERY WITH HIPEC (HYPERTHERMIC...,elective,Open,1
1034462102,male,76.0,CYTOREDUCTIVE SURGERY WITH HIPEC (HYPERTHERMIC...,elective,Open,0


In [259]:
pd.concat([metadata, result_data], axis=1, levels=['date','eventName', 'statistics'])


Unnamed: 0_level_0,Sex,Age,Name of surgery,Urgency,Surgical approach,Anastomotic Leak,"(before_surgery, Chloride -Blood, max)","(before_surgery, Chloride -Blood, mean)","(before_surgery, Chloride -Blood, median)","(before_surgery, Chloride -Blood, min)","(before_surgery, Estimated Globulin -Blood, max)","(before_surgery, Estimated Globulin -Blood, mean)","(before_surgery, Estimated Globulin -Blood, median)","(before_surgery, Estimated Globulin -Blood, min)","(before_surgery, Fibrinogen, max)",...,"(after_surgery, PO2 - general, mean)","(after_surgery, PO2 - general, median)","(after_surgery, PO2 - general, min)","(after_surgery, Potassium -Blood, max)","(after_surgery, Potassium -Blood, mean)","(after_surgery, Potassium -Blood, median)","(after_surgery, Potassium -Blood, min)","(after_surgery, SGOT (AST) -Blood, max)","(after_surgery, SGOT (AST) -Blood, mean)","(after_surgery, SGOT (AST) -Blood, median)","(after_surgery, SGOT (AST) -Blood, min)","(after_surgery, Sodium -Blood, max)","(after_surgery, Sodium -Blood, mean)","(after_surgery, Sodium -Blood, median)","(after_surgery, Sodium -Blood, min)"
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
1123813,male,59.0,CYTOREDUCTIVE SURGERY WITH HIPEC (HYPERTHERMIC...,elective,Open,1,105.0,105.0000,105.0,105.0,3.7,3.7000,3.70,3.7,,...,147.5360,119.15,85.9,5.7,4.4222,4.50,3.3,205.0,35.6863,28.0,10.0,143.0,135.0833,136.0,126.0
1135541,female,43.0,OPEN DISTAL SUBTOTAL GASTRECTOMY 436320,elective,Open,0,,,,,,,,,,...,238.7750,229.05,195.2,6.0,4.1000,3.95,3.4,59.0,27.0714,20.5,14.0,141.0,138.3125,139.0,134.0
3918745,female,57.0,LAPAROSCOPIC GASTRIC BYPASS WITH GASTRIC BAND ...,elective,Laparoscopic,0,102.0,102.0000,102.0,102.0,3.0,3.0000,3.00,3.0,,...,27.3000,27.30,27.3,5.8,4.8500,4.70,4.2,104.0,104.0000,104.0,104.0,139.0,138.5000,138.5,138.0
4894647,,,LUMPECTOMY (WITHOUT LYMPH NODE EXCISION) 19120...,elective,Open,0,101.0,101.0000,101.0,101.0,2.6,2.6000,2.60,2.6,,...,142.8417,138.00,24.9,4.9,3.8845,3.90,2.9,779.0,104.3529,21.0,6.0,150.0,139.5000,139.0,129.0
6541482,female,65.0,EXPLORATORY LAPAROTOMY 490000,urgent,Open,0,110.0,100.1429,99.5,90.0,3.1,2.3929,2.45,1.2,507.0,...,152.3083,91.00,38.2,5.1,3.9900,3.95,3.1,52.0,27.4839,26.0,13.0,142.0,134.6500,134.0,129.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1031926105,female,63.0,APPENDECTOMY;CYTOREDUCTIVE SURGERY WITH HIPEC ...,elective,Open,0,,,,,,,,,,...,,,,,,,,,,,,,,,
1032366104,female,63.0,"ABDOMINAL HYSTERECTOMY, TOTAL WITH BSO (T.A.H ...",elective,Open,0,,,,,,,,,,...,,,,,,,,,,,,,,,
1033842103,male,61.0,CYTOREDUCTIVE SURGERY WITH HIPEC (HYPERTHERMIC...,elective,Open,1,,,,,,,,,,...,,,,,,,,,,,,,,,
1034462102,male,76.0,CYTOREDUCTIVE SURGERY WITH HIPEC (HYPERTHERMIC...,elective,Open,0,,,,,,,,,,...,,,,,,,,,,,,,,,


In [238]:
head = result_data.head().iloc[:,:8]
head["nomi"] = "nomi"
head

eventName,Chloride -Blood,Chloride -Blood,Chloride -Blood,Chloride -Blood,Estimated Globulin -Blood,Estimated Globulin -Blood,Estimated Globulin -Blood,Estimated Globulin -Blood,nomi
Unnamed: 0_level_1,max,mean,median,min,max,mean,median,min,Unnamed: 9_level_1
pid,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
1123813,105.0,105.0,105.0,105.0,3.7,3.7,3.7,3.7,nomi
1135541,,,,,,,,,nomi
3918745,102.0,102.0,102.0,102.0,3.0,3.0,3.0,3.0,nomi
4894647,101.0,101.0,101.0,101.0,2.6,2.6,2.6,2.6,nomi
6541482,110.0,100.1429,99.5,90.0,3.1,2.3929,2.45,1.2,nomi


In [200]:
#write output
result.to_excel(DATA_PATH + 'draft1.xlsx')

In [196]:
#indexing
result.loc[:,('before_surgery','Chloride -Blood')]

Unnamed: 0_level_0,max,mean,median,min
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1123813,105.0,105.0000,105.0,105.0
1135541,,,,
3918745,102.0,102.0000,102.0,102.0
4894647,101.0,101.0000,101.0,101.0
6541482,110.0,100.1429,99.5,90.0
...,...,...,...,...
1031926105,,,,
1032366104,107.0,107.0000,107.0,107.0
1033842103,107.0,107.0000,107.0,107.0
1034462102,104.0,104.0000,104.0,104.0


In [63]:
'''
Adding suffix to duplicate patients (pateints who underwent > 1 surgery) 
'''

res = anonymous.copy()
anonymous.head()
res = res[['pid', 'Sex', 'Age', 'Date of surgery']]

#convert column to strig 
res['pid'] = res['pid'].astype(str)

#Number each item in each group from 1 to the length of that group.
cumcount = res.groupby('pid').cumcount() + 1


#which patients are duplicated
idx = res['pid'].duplicated(keep=False)

#concatenate suffixes
res.loc[idx, 'pid'] = res['pid'] + "_" + cumcount.astype(str)

res[idx];


In [285]:
    
def stupid(row, df, cat):
        
    #get surgery date of current patient 
    surgery_date = row['Date of surgery']
    
    #get events before and after date of surgery
    
    df = df[df['pid'] == row['pid']]
    events_before = df[df['eventStartDate'] < surgery_date]
    events_after = df[df['eventStartDate'] >= surgery_date]
    
    #get relevant columns
    before = events_before[['eventName', 'dValue']]
    after = events_after[['eventName', 'dValue']]
    
    
    x = before.groupby('eventName')['dValue'].agg(['min', 'max']) #, 'mean', 'median'
    
    
    
    y1 = x.stack().to_frame().T
    y2 = x.stack().to_frame().T
    
    display(y1)
    
    z= pd.concat([y1.iloc[:,1:], y2], axis=0,sort=False)
    display(z)
    return 0
    ''' 
    #groupby 
    before = before.groupby('eventName')['dValue'].agg({'sum_col' : np.sum,
                                                         'date' : [np.min, np.max]})
    after = after.groupby('eventName')['dValue'].describe()[['min', 'max']]
    
    print(before)
    #empty data frame of all events 
    events_names = cat['eventName'].drop_duplicates()
    all_events = pd.Series(np.nan, index=events_names)
    
    print(pd.concat([before, after]))
   
    #merge
    merged1 = pd.concat([all_events, values_before], axis=1,sort=False).sort_index()
    merged2 = pd.concat([all_events, values_after], axis=1,sort=False).sort_index()

    
    #convert to dict
    d1 = merged1.to_dict()
    d2 = merged2.to_dict()
   
    titles = ["pid", "before_surgery", "after_surgery"]
    
    res = pd.Series([row["pid"], d1,d2], index=titles)
    #display(res)
    return res
    '''

    
 
    

In [223]:
row = sub_anon.iloc[0,:]
#get surgery date of current patient 
surgery_date = row['Date of surgery']
    
#get events before and after date of surgery

events_before = df.loc[(df['pid'] == row['pid']) & (df['eventStartDate'] < surgery_date), ['eventName', 'dValue']]

#groupby 
yy = events_before.groupby('eventName')['dValue'].describe()[['min', 'max']].head()

## שונות

In [12]:
def create_feature(df, feature_name, alter_names, column_value='dValue', print_output=True):

    '''
    df - data frame of all events 
    feature_name - name of feature 
    alter_names - alternative names of feature
    
    '''
   
    #get the required events
    sub_events = df[df["eventName"].isin(alter_names)]

    #select only rellevant columns 
    feature = sub_events.loc[:, ['pid', 'eventStartDate', 'eventEndDate', 'unitOfMeasure']]
    
    #get values of event 
    feature[feature_name] = sub_events[column_value]
    
    if print_output:
        
        print("Num {} events:".format(feature_name), sub_events.shape[0])     

        print("\nHas sValue:", sub_events['sValue'].any())
        print("Has event description:", sub_events['eventDesc'].any())

        print("\nDescribe numeric values:\n", sub_events[['dValue', 'iValue']].describe())

        print("\nUnit of measure:", sub_events['unitOfMeasure'].unique())
    
        print("\nResult:")
        display(feature.head())


    return feature


In [13]:
df[(df['eventName']=='LDH -Blood') & df['sValue'].notna()]#['dValue'].sum() # & (df['dValue']>100)

Unnamed: 0,pid,admissionId,eventName,eventStartDate,eventEndDate,dValue,iValue,sValue,eventDesc,unitOfMeasure,orderNumber,organismId,eventCode,eventCodeOrg,eventType,eventTypeOrg,sourceName
1277065,532580000.0,1853037.0,LDH -Blood,2018-08-25 04:33:00,2018-08-25 04:33:00,0.0,0.0,>12000,,IU/l,,,14805-6,100183615,loinc,conv_lab_cham,[MIRROR_Chameleon].[Lab_Results]
1433372,1001200000.0,1923493.0,LDH -Blood,2018-10-02 22:11:00,2018-10-02 22:11:00,0.0,0.0,>1200,,IU/l,,,14805-6,100183615,loinc,conv_lab_cham,[MIRROR_Chameleon].[Lab_Results]


In [14]:

df[df['eventName']=='Potassium']['unitOfMeasure'].value_counts()

mmol/l    5691
meq/l       11
mmol/L       4
Name: unitOfMeasure, dtype: int64

In [15]:
alter_names = ['חום.', 'חום', 'Temperature', 'DATEX.body temp.value_1', 'DATEX.body temp.value_2']
body_heat = create_feature(df, 'body heat', alter_names)

Num body heat events: 72884

Has sValue: False
Has event description: False

Describe numeric values:
            dValue   iValue
count  72884.0000  72884.0
mean      55.7426      0.0
std       44.4312      0.0
min      -36.6000      0.0
25%       36.6000      0.0
50%       36.9000      0.0
75%       96.9800      0.0
max     6582.2000      0.0

Unit of measure: ['C' nan 'F']

Result:


Unnamed: 0,pid,eventStartDate,eventEndDate,unitOfMeasure,body heat
78,1013300000.0,2018-02-16 01:51:02.320,2018-02-16 01:51:02.320,C,36.6
80,74266000.0,2018-02-16 00:38:30.073,2018-02-16 00:38:30.073,C,36.9
87,485460000.0,2018-02-16 03:21:22.237,2018-02-16 03:21:22.237,C,36.6
128,74266000.0,2018-02-16 07:52:17.077,2018-02-16 07:52:17.077,C,37.2
133,485460000.0,2018-02-16 07:39:44.650,2018-02-16 07:39:44.650,C,37.0


In [16]:
def fahrenheit_to_celsius(f):
    return (f - 32) * 5.0/9.0

#.apply(fahrenheit_to_celsius)
fahrenheits = body_heat['unitOfMeasure']=='F'
body_heat.loc[fahrenheits, 'body heat'] = body_heat.loc[fahrenheits, 'body heat'].apply(fahrenheit_to_celsius)
body_heat.loc[fahrenheits, 'unitOfMeasure'] = 'C'

body_heat[body_heat['body heat'] > 40];

In [17]:
ann.groupby('pid').apply(lambda x:np.arange(len(x)))

NameError: name 'ann' is not defined

In [None]:
ann['cumcount'] = ann.groupby('pid').cumcount()
ann['pid'].astype(str) +"_"+ ann['cumcount'].astype(str)