**Notebook content:**
- 
- 

In [82]:
%reset -f 

In [171]:
import sys
import pandas as pd
import numpy as np
import datetime
import json
from IPython.display import display

pd.set_option('display.max_columns', 30)
pd.set_option('display.max_rows', 200)
#pd.set_option('display.float_format', lambda x: '%.f' % x)
pd.options.display.precision = 4

#import sys
np.set_printoptions(threshold=sys.maxsize) #- print the full NumPy array

from myDefs.defs import *

# visualization
import seaborn as sns
import matplotlib.pyplot as plt 
from matplotlib.pyplot import figure
%matplotlib inline


**Read events file**

In [172]:
#all date columns below are parsed with the same format
DATE_FORMAT = '%Y-%m-%d'

#load the events table and parse its start/end dates
path = DATA_PATH + "parseData1.csv"
df = pd.read_csv(path)
for date_col in ('eventStartDate', 'eventEndDate'):
    df[date_col] = pd.to_datetime(df[date_col], format=DATE_FORMAT)

#load the anonymous file and parse the date of surgery
path = "{}parseAnonymous0.csv".format(DATA_PATH)
anonymous = pd.read_csv(path)
anonymous['Date of surgery'] = pd.to_datetime(anonymous['Date of surgery'], format=DATE_FORMAT)


In [185]:
#load the events info sheet (one row per eventName)
events_info_file = EVENTS_INFO_PATH + "EventsInfo.xlsx"
eventsinfo = pd.read_excel(events_info_file)

In [174]:
#report (rows, columns) of each loaded table, in loading order
for table in (df, anonymous, eventsinfo):
    print(table.shape)

(1683616, 17)
(718, 14)
(3755, 15)


In [222]:
df.head()

Unnamed: 0,pid,admissionId,eventName,eventStartDate,eventEndDate,dValue,iValue,sValue,eventDesc,unitOfMeasure,orderNumber,organismId,eventCode,eventCodeOrg,eventType,eventTypeOrg,sourceName,numeric,feature_name
0,2.0,1840641.0,WBC,2018-02-15 10:58:00,2018-02-15 10:58:00,2.14,0.0,,,K/microL,,,26464-8,100109500,loinc,conv_lab_cham,[MIRROR_Chameleon].[Lab_Results],2.14,WBC
1,1013300000.0,1840641.0,Alkaline Phosphatase -Blood,2018-02-15 10:25:00,2018-02-15 10:25:00,129.0,0.0,,,IU/l,,,77141-0,100184075,loinc,conv_lab_cham,[MIRROR_Chameleon].[Lab_Results],129.0,Alkaline Phosphatase
2,1013300000.0,1840641.0,"Protein -Blood, total",2018-02-15 10:25:00,2018-02-15 10:25:00,3.6,0.0,,,g/dl,,,2885-2,100184155,loinc,conv_lab_cham,[MIRROR_Chameleon].[Lab_Results],3.6,Protein
3,1013300000.0,1840641.0,Sodium -Blood,2018-02-15 10:25:00,2018-02-15 10:25:00,147.0,0.0,,,meq/l,,,2951-2,100184295,loinc,conv_lab_cham,[MIRROR_Chameleon].[Lab_Results],147.0,Sodium
4,1013300000.0,1840641.0,Urea -Blood,2018-02-15 10:25:00,2018-02-15 10:25:00,24.0,0.0,,,mg/dl,,,3091-6,100184520,loinc,conv_lab_cham,[MIRROR_Chameleon].[Lab_Results],24.0,Urea


# Replace values (sValue -> dValue)

In [202]:
def convert_to_dict(x):
    '''
    Parse the JSON text of a 'dict' cell into a Python dictionary.

    x - JSON text in which quotes may appear doubled ("")

    Returns the parsed object; every value equal to the string 'None'
    is replaced by a real None.
    '''
    #collapse doubled quotes before handing the text to the JSON parser
    parsed = json.loads(x.replace('""', '"'))

    #find the textual 'None' markers first, then overwrite them
    none_keys = [key for key in parsed if parsed[key] == 'None']
    for key in none_keys:
        parsed[key] = None
    return parsed

#replace dValue by values in dictionary
def replace_values(x, info):
    '''
    Build the value column for the events of a single eventName.

    x - data frame with the events of one eventName (one groupby group);
        reads the columns 'eventName', 'sValue' and 'dValue'
    info - events info data frame with the columns 'eventName' and 'dict';
           'dict' holds a {raw value: replacement} dictionary or is missing

    Returns a Series aligned with x: sValue where it is present, dValue
    otherwise, translated through the dictionary of the event when there
    is one. x itself is not modified.
    '''
    if x.empty:
        return x['dValue']

    event_name = x['eventName'].iloc[0]

    #sValue takes precedence over dValue wherever it is present
    values = x['dValue'].where(x['sValue'].isna(), x['sValue'])

    #the event may be absent from info, or present without a dictionary
    mappings = info.loc[info['eventName'] == event_name, 'dict']
    mapping = mappings.iloc[0] if len(mappings) else None

    #apply only a real, non-empty dictionary: calling Series.replace with a missing
    #value (NaN) and no `value` is not a no-op - older pandas pad-fills the missing
    #entries from the previous row, newer pandas deprecates/rejects the call
    if not isinstance(mapping, dict) or not mapping:
        return values
    return values.replace(mapping)

#convert dict column (JSON text) to dictionaries
#only entries that are still text are parsed, so running this cell again after the
#conversion leaves the already-parsed dictionaries untouched instead of failing
is_json_text = eventsinfo['dict'].apply(lambda value: isinstance(value, str))
eventsinfo.loc[is_json_text, 'dict'] = eventsinfo.loc[is_json_text, 'dict'].apply(convert_to_dict)

#replace values: one call of replace_values per eventName, results aligned back to df
df['numeric'] = df.groupby('eventName', group_keys=False).apply(replace_values, eventsinfo)

In [176]:
#create column feature name
#eventName -> feature_name, for the eventsinfo rows that define a feature_name
has_feature_name = eventsinfo['feature_name'].notna()
names_dict = dict(zip(eventsinfo.loc[has_feature_name, 'eventName'],
                      eventsinfo.loc[has_feature_name, 'feature_name']))

#events that are not in the mapping keep their eventName
df['feature_name'] = df['eventName'].replace(names_dict)

In [157]:
df['feature_name'].nunique()

3688

# Filter events

# slice data

In [177]:
#get only patients with a surgery date, keeping the first row of each pid
#NOTE(review): duplicated() goes by row position, not by date, so "first surgery"
#holds only if the rows are ordered by date; also, a pid whose first row has no
#date is dropped even when a later row has one - confirm this is intended
has_surgery_date = anonymous['Date of surgery'].notna()
first_row_of_pid = ~anonymous['pid'].duplicated()
anonymous_ix = anonymous[has_surgery_date & first_row_of_pid].index
print("anonymous:", anonymous.loc[anonymous_ix].shape)

anonymous: (525, 14)


In [186]:
#filter events by their info
#minimal number of patients an event must have to be kept
MIN_PATIENTS_LAB = 300
MIN_PATIENTS_OTHER = 30

in_matrix = eventsinfo['inMatrix'] == 1

#filter 1: laboratory events
filter1 = ((eventsinfo['category'] == 'laboratory')
           & in_matrix
           & (eventsinfo['nPatients'] > MIN_PATIENTS_LAB))

#filter 2: physical events
filter2 = ((eventsinfo['category'] == 'physical')
           & in_matrix
           & (eventsinfo['nPatients'] > MIN_PATIENTS_OTHER))

#filter 3: drug events whose eventsinfo sValue is missing
#defined but not part of info_ix below
filter3 = ((eventsinfo['category'] == 'drug')
           & in_matrix
           & (eventsinfo['sValue'].isna())
           & (eventsinfo['nPatients'] > MIN_PATIENTS_OTHER))

#index of the events info rows that pass filter 1 or filter 2
info_ix = eventsinfo[filter1 | filter2].index

info_ix.shape
#'Fentanyl'
#'Fentanyl'

(66,)

In [325]:
#slice events data: index of the events whose name passed the events info filters
selected_names = eventsinfo.loc[info_ix, 'eventName']
is_selected = df['eventName'].isin(selected_names)
df_ix = df.index[is_selected.to_numpy()]
print("df:", df.loc[df_ix].shape)


df: (741740, 20)


In [None]:
#get meta data from anonymous
#explicit copy: the new column below must not be written into a slice of `anonymous`
metadata = anonymous.loc[anonymous_ix].copy()

#map: 1 if Anastomotic Leak, 0 o.w.
metadata['Anastomotic Leak'] = (metadata['Complications'] == 'Anastomotic Leak').astype(int)

#drop columns
cols_to_remove = ['Date of surgery', 'Days of hospitalization', 'Patient classification', 
                  'Responsible surgeon', 'Kk', 'Simple', 'Severe', 'Complications']
metadata = metadata.drop(columns=cols_to_remove).set_index('pid')

# convert units

In [189]:
#see which of the selected events have a convertUnit entry
selected_info = eventsinfo.loc[info_ix]
needs_conversion = selected_info['convertUnit'].notna()
selected_info.loc[needs_conversion, ['eventName', 'feature_name', 'unit']]

Unnamed: 0,eventName,feature_name,unit
15,חום,body temperature,"['F', 'C']"
17,Temperature,body temperature,['F']
182,WBC,,"[nan, '/HPF', 'K/microL', 'cells/mm3', 'LEU/UL']"


In [226]:
#create new column of unit
df['unit'] = df['unitOfMeasure']

#### Body temperature

In [274]:
def fahrenheit_to_celsius(f):
    '''
    Convert a temperature from degrees Fahrenheit to degrees Celsius.

    f - temperature in Fahrenheit

    Returns the temperature in Celsius.
    '''
    #shift to the Celsius zero point, then rescale (multiply first, divide last)
    scaled = (f - 32) * 5.0
    return scaled / 9.0

#readings above this value (in Celsius) are treated as noise
MAX_PLAUSIBLE_TEMP_C = 45

is_temperature = df['feature_name'] == 'body temperature'

#convert Fahrenheit readings to Celsius
#rows are selected by the working `unit` column, not by the raw `unitOfMeasure`:
#converted rows are relabelled 'C' below, so running this cell again does not
#convert the same readings a second time
temp_ix = df.index[(is_temperature & (df['unit'] == 'F')).to_numpy()]
df.loc[temp_ix, 'numeric'] = df.loc[temp_ix, 'numeric'].apply(fahrenheit_to_celsius)
df.loc[temp_ix, 'unit'] = 'C'

#remove noise: blank out temperature readings above the threshold
temp = df.loc[is_temperature, 'numeric']
noise_ix = temp[temp > MAX_PLAUSIBLE_TEMP_C].index
df.loc[noise_ix, 'numeric'] = None

#### WBC

In [332]:
#temporarily leave only "K/microL": drop every WBC event measured in another unit
#(rows are removed from df in place; indexes computed from df before this point
#may still contain the removed labels)
is_wbc = df['feature_name'] == 'WBC'
other_unit = df['unitOfMeasure'] != 'K/microL'
wbc_ix = df.index[(is_wbc & other_unit).to_numpy()]

df.drop(wbc_ix, inplace=True)

# print events

In [337]:
#number of events per feature name
df.groupby('feature_name').size()

feature_name
 heart rate / Ventricular Rate    942
% Alpha1                            9
% Alpha2                            9
% Gamma                             9
% LYMPHO                            2
                                 ... 
תרבית מניקור מותניCSF               7
תרגול אקטיבי                        2
תרופות קבועות                       3
תשובת CT חזה בטן אגן                1
תשובת ייעוץ מרפאת כאב               1
Length: 3684, dtype: int64

# Create A Features Table

In [None]:

#number of patients to process; 5 is the preview size this cell was run with
#set to None to build the table for every selected patient
N_PATIENTS = 5

#statistics calculated per feature, before and after the surgery
STATISTICS = ['mean', 'median', 'min', 'max']

#the selected events, sliced once for all patients
#df_ix may hold labels of rows that were dropped from df after it was computed,
#so keep only the labels that still exist in df
selected_events = df.loc[df.index.intersection(df_ix)]

patients = anonymous.loc[anonymous_ix]
if N_PATIENTS is not None:
    patients = patients.head(N_PATIENTS)

before_frames = []
after_frames = []

for _, row in patients.iterrows():

    #get surgery date of current patient
    surgery_date = row['Date of surgery']

    #get events of the current patient, before and after date of surgery
    #NOTE(review): events dated on the surgery date itself fall into "after" - confirm
    data = selected_events.loc[selected_events['pid'] == row['pid'], :]
    before = data[data['eventStartDate'] < surgery_date]
    after = data[data['eventStartDate'] >= surgery_date]

    for events, frames in ((before, before_frames), (after, after_frames)):
        #calculate statistics per feature
        stats = events.groupby('feature_name')['numeric'].agg(STATISTICS)

        #reshape to a single row whose columns are (feature, statistic) and add pid
        frames.append(stats.stack().to_frame().T.assign(pid=row['pid']))

before_concat = pd.concat(before_frames).set_index('pid', drop=True)
after_concat = pd.concat(after_frames).set_index('pid', drop=True)

#side by side, under a top column level that tells before from after
result_data = pd.concat([before_concat, after_concat], axis=1,
                        keys=['before_surgery', 'after_surgery'],
                        names=['date', 'eventName', 'statistics'])

print("before shape:", before_concat.shape, "Num events:", before_concat.columns.get_level_values(1).nunique())
print("after shape:", after_concat.shape, "Num events:", after_concat.columns.get_level_values(1).nunique())
print("result shape:", result_data.shape)

In [None]:
result_data

In [None]:
metadata.columns
#a.merge(res, on='pid').to_csv(DATA_PATH + "/draft1.csv", index=False)

#pd.concat([metadata.columns], keys=['a'], names=['nomi'])

In [None]:
#metadata has single-level columns while result_data has three column levels
#(date, eventName, statistics); lift metadata to the same depth before joining,
#otherwise the joined columns are a mix of plain names and tuples
metadata_wide = metadata.copy()
metadata_wide.columns = pd.MultiIndex.from_tuples(
    [('metadata', col, '') for col in metadata.columns],
    names=['date', 'eventName', 'statistics'])

#join on the pid index; `levels` is dropped because it is only meaningful together with `keys`
result = pd.concat([metadata_wide, result_data], axis=1)
result


In [None]:
#write output
#writes `result` to an Excel file under DATA_PATH; `result` must have been
#defined by an earlier cell, this cell does not create it
result.to_excel(DATA_PATH + 'draft1_.xlsx')

In [None]:
    
def stupid(row, df, cat):
    '''
    Scratch helper: display the per-event min/max of one patient's numeric
    values before and after the date of surgery.

    row - one patient row; reads 'pid' and 'Date of surgery'
    df - data frame of events; reads 'pid', 'eventStartDate', 'eventName'
         and 'numeric'
    cat - not used; kept so that existing calls keep working

    Returns 0; the tables are only shown with display().
    '''
    #get surgery date of current patient
    surgery_date = row['Date of surgery']

    #get events of the patient before and after date of surgery
    patient_events = df[df['pid'] == row['pid']]
    events_before = patient_events[patient_events['eventStartDate'] < surgery_date]
    events_after = patient_events[patient_events['eventStartDate'] >= surgery_date]

    #statistics per event name, separately for each period
    stats = ['min', 'max'] #, 'mean', 'median'
    x_before = events_before.groupby('eventName')['numeric'].agg(stats)
    x_after = events_after.groupby('eventName')['numeric'].agg(stats)

    #reshape each period to a single row whose columns are (eventName, statistic)
    y1 = x_before.stack().to_frame().T
    y2 = x_after.stack().to_frame().T

    display(y1)

    #NOTE(review): the first column of y1 is left out of the stacked table, as in
    #the original code - confirm this is intended
    z = pd.concat([y1.iloc[:, 1:], y2], axis=0, sort=False)
    display(z)
    return 0

    
 
    

In [None]:
#scratch cell: min/max of dValue per event, before the surgery, for one patient
#NOTE(review): `sub_anon` is not defined in any cell of this notebook as shown, so
#this cell fails on a fresh kernel; presumably it was a subset of `anonymous` - confirm
row = sub_anon.iloc[0,:]
#get surgery date of current patient 
surgery_date = row['Date of surgery']

#get the events of this patient that started before the date of surgery
#(only the event name and its dValue are kept)

events_before = df.loc[(df['pid'] == row['pid']) & (df['eventStartDate'] < surgery_date), ['eventName', 'dValue']]

#groupby: describe dValue per event name, keep min and max, first 5 events only
yy = events_before.groupby('eventName')['dValue'].describe()[['min', 'max']].head()

## Miscellaneous (שונות)

In [None]:
def create_feature(df, feature_name, alter_names, column_value='dValue', print_output=True):

    '''
    Extract one feature from the events table.

    df - data frame of all events 
    feature_name - name of feature; used as the name of the value column of the result
    alter_names - alternative names of feature: the eventName values to collect,
                  given as a list-like of names or as a single name (string)
    column_value - column of df that holds the value of the feature
    print_output - if True, print a short summary of the selected events and
                   display the first rows of the result

    Returns a new data frame with the columns 'pid', 'eventStartDate',
    'eventEndDate', 'unitOfMeasure' and feature_name, one row per selected
    event. df is not modified.
    '''

    #isin() does not accept a bare string; treat it as a single name
    if isinstance(alter_names, str):
        alter_names = [alter_names]

    #get the required events
    sub_events = df[df["eventName"].isin(alter_names)]

    #select only relevant columns
    #explicit copy: the value column below must not be written into a slice of df
    feature = sub_events[['pid', 'eventStartDate', 'eventEndDate', 'unitOfMeasure']].copy()

    #get values of event 
    feature[feature_name] = sub_events[column_value]

    if print_output:

        print("Num {} events:".format(feature_name), sub_events.shape[0])     

        print("\nHas sValue:", sub_events['sValue'].any())
        print("Has event description:", sub_events['eventDesc'].any())

        print("\nDescribe numeric values:\n", sub_events[['dValue', 'iValue']].describe())

        print("\nUnit of measure:", sub_events['unitOfMeasure'].unique())

        print("\nResult:")
        display(feature.head())

    return feature


In [None]:
'''
Adding suffix to duplicate patients (pateints who underwent > 1 surgery) 
'''

#work on a copy limited to the identifying columns, with pid as text
res = anonymous[['pid', 'Sex', 'Age', 'Date of surgery']].copy()
res['pid'] = res['pid'].astype(str)

#which patients appear in more than one row
idx = res['pid'].duplicated(keep=False)

#running number (1, 2, ...) of each row within its patient
cumcount = res.groupby('pid').cumcount().add(1)

#append "_<running number>" to the pid of the repeated patients only
suffixed_pid = res['pid'] + "_" + cumcount.astype(str)
res.loc[idx, 'pid'] = suffixed_pid

res[idx];


In [None]:
#scratch cell: number the rows of each pid from 0 and show "<pid>_<number>"
#NOTE(review): `ann` is not defined in any cell of this notebook as shown, so this
#cell fails on a fresh kernel; presumably it stood for the anonymous table - confirm
ann['cumcount'] = ann.groupby('pid').cumcount()
ann['pid'].astype(str) +"_"+ ann['cumcount'].astype(str)