**Notebook content:**
- Add columns such as "featureName" and "category", using the file "EventsInfo".
- Convert units to keep uniformity.

In [37]:
%reset -f 

In [38]:
from myDefs.defs import *
from IPython.display import display

# Explicit import: `sys` is needed for the NumPy print threshold below and
# should not depend on the star import above happening to re-export it.
import sys

# pandas display limits used when frames are rendered in this notebook
pd.set_option('display.max_columns', 30)
pd.set_option('display.max_rows', 200)
pd.set_option('display.precision', 4)

# print NumPy arrays in full instead of summarising them
np.set_printoptions(threshold=sys.maxsize)


In [39]:
OUTPUT_PATH = DATA_PATH + "parseData2.csv"

### **Read files**

In [40]:
#read events file 
path = DATA_PATH + "parseData1.csv"
df = pd.read_csv(path, sep=',') 

In [41]:
#read events info
eventsinfo = pd.read_excel(ORGANIZE_EVENTS_PATH + "EventsInfo.xlsx")

In [42]:
print(df.shape)
print(eventsinfo.shape)

(1683616, 18)
(3755, 18)


In [81]:
eventsinfo[['eventName', 'featureName', 'inModel', 'category', 'count', 'nPatients', 'dValues', 'source']].sort_values(by='inModel', ascending=False).head()

Unnamed: 0,eventName,featureName,inModel,category,count,nPatients,dValues,source
5,%Reticulo,Reticulocytes (%),1.0,laboratory,239,89,"{'count': 239.0, 'mean': 2.6644351464435165, '...",['Lab Results']
1693,NORADRENALINE/ NOREPINEPHRINE BITARTRATE 4 mg ...,Norepinephrine + dextrose,1.0,drug,15,1,4,['Med Execution Medicine']
1685,NORADRENALINE/ NOREPINEPHRINE BITARTRATE 4 mg ...,Norepinephrine + dextrose,1.0,drug,3,1,4,['Med Execution Medicine']
1686,NORADRENALINE/ NOREPINEPHRINE BITARTRATE 4 mg ...,Norepinephrine + dextrose,1.0,drug,4,2,4,['Med Execution Medicine']
1687,NORADRENALINE/ NOREPINEPHRINE BITARTRATE 4 mg ...,Norepinephrine + dextrose,1.0,drug,12,5,4,['Med Execution Medicine']


# Add new columns

## "featureName"

In [43]:
# Translate every raw eventName into its featureName, with EventsInfo as the lookup table.
# If an eventName occurs more than once in EventsInfo, the last occurrence wins.
names_dict = {
    event: feature
    for event, feature in zip(eventsinfo['eventName'], eventsinfo['featureName'])
}
# na_action='ignore' propagates missing event names as NaN without looking them up
df['featureName'] = df['eventName'].map(names_dict, na_action='ignore')

In [44]:
df['featureName'].nunique()

2488

In [45]:
df.head()

Unnamed: 0,pid,admissionId,eventName,eventStartDate,eventEndDate,dValue,iValue,sValue,eventDesc,unitOfMeasure,orderNumber,organismId,eventCode,eventCodeOrg,eventType,eventTypeOrg,sourceName,source,featureName
0,1013300000.0,1840641.0,WBC,2018-02-15 10:58:00.0000000,2018-02-15 10:58:00.0000000,2.14,0.0,,,K/microL,,,26464-8,100109500,loinc,conv_lab_cham,[MIRROR_Chameleon].[Lab_Results],Lab Results,WBC
1,1013300000.0,1840641.0,Alkaline Phosphatase -Blood,2018-02-15 10:25:00.0000000,2018-02-15 10:25:00.0000000,129.0,0.0,,,IU/l,,,77141-0,100184075,loinc,conv_lab_cham,[MIRROR_Chameleon].[Lab_Results],Lab Results,Alkaline Phosphatase
2,1013300000.0,1840641.0,"Protein -Blood, total",2018-02-15 10:25:00.0000000,2018-02-15 10:25:00.0000000,3.6,0.0,,,g/dl,,,2885-2,100184155,loinc,conv_lab_cham,[MIRROR_Chameleon].[Lab_Results],Lab Results,"Protein, total"
3,1013300000.0,1840641.0,Sodium -Blood,2018-02-15 10:25:00.0000000,2018-02-15 10:25:00.0000000,147.0,0.0,,,meq/l,,,2951-2,100184295,loinc,conv_lab_cham,[MIRROR_Chameleon].[Lab_Results],Lab Results,Sodium
4,1013300000.0,1840641.0,Urea -Blood,2018-02-15 10:25:00.0000000,2018-02-15 10:25:00.0000000,24.0,0.0,,,mg/dl,,,3091-6,100184520,loinc,conv_lab_cham,[MIRROR_Chameleon].[Lab_Results],Lab Results,Urea


## "numeric", replace values (sValue -> dValue)

In [46]:
def convert_to_dict(x):
    """Parse a JSON string and turn every 'None' string value into a real None.

    Only top-level values that are exactly the string 'None' are replaced;
    all other entries are returned as parsed.
    """
    parsed = json.loads(x)
    # collect the keys first, then overwrite, so the dict is not changed while iterating
    placeholder_keys = [key for key in parsed if parsed[key] == 'None']
    for key in placeholder_keys:
        parsed[key] = None
    return parsed

'''
 try:
        d = json.loads(x)
    except:
        display(x)
        return
'''

# Parse the text stored in the 'dict' column into Python dictionaries;
# rows without an entry are left untouched.
has_dict = eventsinfo['dict'].notna()
eventsinfo.loc[has_dict, 'dict'] = eventsinfo.loc[has_dict, 'dict'].apply(convert_to_dict)

In [47]:
#replace dValue by values in dictionary
def replace_values(x, info):
    """Build the raw 'numeric' values for the rows of a single event.

    Parameters
    ----------
    x : DataFrame
        Rows sharing one eventName; must contain the columns 'eventName',
        'dValue' and 'sValue'. The frame is not modified.
    info : DataFrame
        Lookup table with the columns 'eventName' and 'dict'. 'dict' holds
        the replacement mapping for the event, or a missing value if none.

    Returns
    -------
    Series
        Indexed like ``x``: sValue where it is present, dValue otherwise,
        with the event's mapping (if any) applied to the result.
    """
    event_name = x['eventName'].iloc[0]
    matches = info.loc[info['eventName'] == event_name, 'dict']
    # an event missing from `info` simply has no mapping (instead of an IndexError)
    mapping = matches.iloc[0] if len(matches) else None

    # Prefer sValue over dValue. A new Series is built rather than assigning
    # into `x`, so the caller's frame (a group handed in by groupby) stays as is.
    has_text = x['sValue'].notna()
    values = x['dValue'].mask(has_text, x['sValue'])

    # Only a real, non-empty dict is a usable mapping. Passing a missing value
    # (NaN) to Series.replace without `value` does not mean "replace nothing":
    # depending on the pandas version it pads matches or raises.
    if not isinstance(mapping, dict) or not mapping:
        return values
    return values.replace(mapping)

# Run replace_values once per eventName group. group_keys=False keeps the group
# key out of the result index, so the values line up with df's own index.
events_by_name = df.groupby('eventName', group_keys=False)
df['numeric'] = events_by_name.apply(replace_values, eventsinfo)

In [48]:
#convert column to numeric
# errors='coerce' turns every entry that cannot be parsed as a number into NaN;
# downcast='float' asks pandas for the smallest float dtype that fits the values.
df['numeric'] = pd.to_numeric(df['numeric'], downcast='float', errors='coerce')

In [49]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1683616 entries, 0 to 1683615
Data columns (total 20 columns):
pid               1683616 non-null float64
admissionId       1683616 non-null float64
eventName         1628158 non-null object
eventStartDate    1683616 non-null object
eventEndDate      1683616 non-null object
dValue            1556931 non-null float64
iValue            1556931 non-null float64
sValue            38589 non-null object
eventDesc         136875 non-null object
unitOfMeasure     966464 non-null object
orderNumber       16699 non-null float64
organismId        2126 non-null float64
eventCode         1683616 non-null object
eventCodeOrg      1534034 non-null object
eventType         1683616 non-null object
eventTypeOrg      1541809 non-null object
sourceName        1683577 non-null object
source            1683577 non-null object
featureName       1628158 non-null object
numeric           1585705 non-null float32
dtypes: float32(1), float64(6), object(13)
memory

## "unit" 

In [50]:
#create new column of unit
df['unit'] = df['unitOfMeasure']

## "category"

In [51]:
# Look up the category that EventsInfo records for each event name.
category_dict = {
    event: category
    for event, category in zip(eventsinfo['eventName'], eventsinfo['category'])
}
df['category'] = df['eventName'].map(category_dict)

## "inModel"

In [52]:
# Copy the 'inModel' value that EventsInfo records for each event name.
inmodel_dict = {
    event: in_model
    for event, in_model in zip(eventsinfo['eventName'], eventsinfo['inModel'])
}
df['inModel'] = df['eventName'].map(inmodel_dict)

## Reorder columns

In [53]:
# Reorder: keys and the derived columns (featureName, numeric, unit, category,
# inModel) first, followed by the raw source columns.
# NOTE(review): 'admissionId' is present in df before this cell but is not in
# the list below, so it is dropped here — confirm that this is intended.
df = df[['pid', 'eventName', 
         'featureName','eventStartDate', 'eventEndDate',  'numeric', 'unit', 'category', 'inModel',
         'dValue', 'iValue', 'sValue', 'eventDesc', 'unitOfMeasure', 'source', 'sourceName',
         'orderNumber', 'organismId', 'eventCode', 'eventCodeOrg', 'eventType', 'eventTypeOrg']]

In [54]:
df.columns

Index(['pid', 'eventName', 'featureName', 'eventStartDate', 'eventEndDate',
       'numeric', 'unit', 'category', 'inModel', 'dValue', 'iValue', 'sValue',
       'eventDesc', 'unitOfMeasure', 'source', 'sourceName', 'orderNumber',
       'organismId', 'eventCode', 'eventCodeOrg', 'eventType', 'eventTypeOrg'],
      dtype='object')

In [55]:
df.head()

Unnamed: 0,pid,eventName,featureName,eventStartDate,eventEndDate,numeric,unit,category,inModel,dValue,iValue,sValue,eventDesc,unitOfMeasure,source,sourceName,orderNumber,organismId,eventCode,eventCodeOrg,eventType,eventTypeOrg
0,1013300000.0,WBC,WBC,2018-02-15 10:58:00.0000000,2018-02-15 10:58:00.0000000,2.14,K/microL,laboratory,1.0,2.14,0.0,,,K/microL,Lab Results,[MIRROR_Chameleon].[Lab_Results],,,26464-8,100109500,loinc,conv_lab_cham
1,1013300000.0,Alkaline Phosphatase -Blood,Alkaline Phosphatase,2018-02-15 10:25:00.0000000,2018-02-15 10:25:00.0000000,129.0,IU/l,laboratory,1.0,129.0,0.0,,,IU/l,Lab Results,[MIRROR_Chameleon].[Lab_Results],,,77141-0,100184075,loinc,conv_lab_cham
2,1013300000.0,"Protein -Blood, total","Protein, total",2018-02-15 10:25:00.0000000,2018-02-15 10:25:00.0000000,3.6,g/dl,laboratory,1.0,3.6,0.0,,,g/dl,Lab Results,[MIRROR_Chameleon].[Lab_Results],,,2885-2,100184155,loinc,conv_lab_cham
3,1013300000.0,Sodium -Blood,Sodium,2018-02-15 10:25:00.0000000,2018-02-15 10:25:00.0000000,147.0,meq/l,laboratory,1.0,147.0,0.0,,,meq/l,Lab Results,[MIRROR_Chameleon].[Lab_Results],,,2951-2,100184295,loinc,conv_lab_cham
4,1013300000.0,Urea -Blood,Urea,2018-02-15 10:25:00.0000000,2018-02-15 10:25:00.0000000,24.0,mg/dl,laboratory,1.0,24.0,0.0,,,mg/dl,Lab Results,[MIRROR_Chameleon].[Lab_Results],,,3091-6,100184520,loinc,conv_lab_cham


# Convert units

In [56]:
# Events used by the model (drugs excluded) that have a non-null 'multiUnits'
# entry in EventsInfo: these are the candidates for unit conversion.
needs_conversion = (
    (eventsinfo['inModel'] == 1)
    & (eventsinfo['category'] != 'drug')
    & eventsinfo['multiUnits'].notna()
)
eventsinfo.loc[needs_conversion, ['eventName', 'featureName', 'unit']]

Unnamed: 0,eventName,featureName,unit
2081,Potassium,Potassium,"['mmol/L', 'mmol/l', 'meq/l']"
2391,Sodium,Sodium,"['mmol/L', 'mmol/l', 'meq/l']"
2635,Temperature,body temperature,['F']
2825,WBC,WBC,"['cells/mm3', 'K/microL', 'LEU/UL', '/HPF']"
3156,חום,body temperature,"['C', 'F']"


#### Body temperature

In [57]:
def fahrenheit_to_celsius(f):
    """Return the temperature `f`, given in degrees Fahrenheit, in degrees Celsius."""
    above_freezing = f - 32
    return above_freezing * 5.0 / 9.0

# Convert Fahrenheit readings of body temperature to Celsius.
# Rows are selected through the working 'unit' column (set to 'C' below) rather
# than the raw 'unitOfMeasure', which never changes. On the first run both
# columns are identical, and re-running the cell no longer converts the same
# rows a second time.
is_body_temp = df['featureName'] == 'body temperature'
temp_ix = df.index[is_body_temp & (df['unit'] == 'F')]
df.loc[temp_ix, 'numeric'] = df.loc[temp_ix, 'numeric'].apply(fahrenheit_to_celsius)
df.loc[temp_ix, 'unit'] = 'C'

#remove noise
# readings above this value are treated as noise and blanked out
MAX_BODY_TEMP = 45
temp = df.loc[is_body_temp, 'numeric']
temp_ix = temp[temp > MAX_BODY_TEMP].index
df.loc[temp_ix, 'numeric'] = None

#### WBC

In [58]:
df.loc[df['eventName'] == 'WBC'].head()

Unnamed: 0,pid,eventName,featureName,eventStartDate,eventEndDate,numeric,unit,category,inModel,dValue,iValue,sValue,eventDesc,unitOfMeasure,source,sourceName,orderNumber,organismId,eventCode,eventCodeOrg,eventType,eventTypeOrg
0,1013300000.0,WBC,WBC,2018-02-15 10:58:00.0000000,2018-02-15 10:58:00.0000000,2.14,K/microL,laboratory,1.0,2.14,0.0,,,K/microL,Lab Results,[MIRROR_Chameleon].[Lab_Results],,,26464-8,100109500,loinc,conv_lab_cham
29,1013300000.0,WBC,WBC,2018-02-15 10:58:00.0000000,2018-02-15 10:58:00.0000000,2.14,K/microL,laboratory,1.0,2.14,0.0,,,K/microL,Lab Results,[MIRROR_Chameleon].[Lab_Results],,,26464-8,100109500,loinc,conv_lab_cham
193,74266000.0,WBC,WBC,2018-02-18 12:22:00.0000000,2018-02-18 12:22:00.0000000,11.12,K/microL,laboratory,1.0,11.12,0.0,,,K/microL,Lab Results,[MIRROR_Chameleon].[Lab_Results],,,26464-8,100109500,loinc,conv_lab_cham
758,1013300000.0,WBC,WBC,2018-02-10 16:22:00.0000000,2018-02-10 16:22:00.0000000,7.65,K/microL,laboratory,1.0,7.65,0.0,,,K/microL,Lab Results,[MIRROR_Chameleon].[Lab_Results],,,26464-8,100109500,loinc,conv_lab_cham
867,485460000.0,WBC,WBC,2018-02-16 09:28:00.0000000,2018-02-16 09:28:00.0000000,8.42,K/microL,laboratory,1.0,8.42,0.0,,,K/microL,Lab Results,[MIRROR_Chameleon].[Lab_Results],,,26464-8,100109500,loinc,conv_lab_cham


In [59]:
# WBC values are not converted between units. Instead, the three most frequent
# units each get their own feature name (WBC_1, WBC_2, WBC_3); rows with any
# other unit keep the plain 'WBC' name.
wbc = df.loc[df['eventName'] == 'WBC', 'unitOfMeasure'].value_counts()
display(wbc)

for i, unit in enumerate(wbc.head(3).index, start=1):
    same_unit = (df['featureName'] == 'WBC') & (df['unitOfMeasure'] == unit)
    wbc_ix = df.index[same_unit]
    df.loc[wbc_ix, 'featureName'] = 'WBC_' + str(i)
    df.loc[wbc_ix, 'unit'] = unit


K/microL     14626
/HPF           468
LEU/UL         137
cells/mm3        5
Name: unitOfMeasure, dtype: int64

#### Weight

In [60]:
#convert to kg
# Divides every 'CurrentWeight' value by 1000 and labels the result 'kg'
# (presumably the source records grams — TODO confirm).
# NOTE(review): rows are selected by eventName only, so running this cell a
# second time divides the same values by 1000 again.
weight_ix = df.loc[df['eventName'] == 'CurrentWeight'].index
df.loc[weight_ix, 'numeric'] = df.loc[weight_ix, 'numeric'].apply(lambda x: x/1000)
df.loc[weight_ix, 'unit'] = 'kg'

#### Height

In [61]:
hight_ix = df.loc[df['eventName'] == 'height'].index

In [62]:
#convert to cm
# Multiplies the value of one specific row by 100 (presumably that reading was
# entered in metres — TODO confirm).
# NOTE(review): 459519 is a hard-coded row label, so this depends on the exact
# input file and row order. Running the cell twice multiplies twice.
df.loc[459519, 'numeric'] = df.loc[459519, 'numeric'] * 100

#### Potassium and Sodium
Potassium (K⁺) and sodium (Na⁺) are monovalent ions, so 1 mmol/l = 1 meq/l.
No conversion is needed.

In [65]:
df.loc[(df['eventName'] == 'Uric acid -Blood') & df['sValue'].notna()]

Unnamed: 0,pid,eventName,featureName,eventStartDate,eventEndDate,numeric,unit,category,inModel,dValue,iValue,sValue,eventDesc,unitOfMeasure,source,sourceName,orderNumber,organismId,eventCode,eventCodeOrg,eventType,eventTypeOrg
28952,1013300000.0,Uric acid -Blood,Uric acid,2018-03-20 11:09:00.0000000,2018-03-20 11:09:00.0000000,,mg/dl,laboratory,,0.0,0.0,<1.5,,mg/dl,Lab Results,[MIRROR_Chameleon].[Lab_Results],,,3084-1,100184550,loinc,conv_lab_cham
72794,632710000.0,Uric acid -Blood,Uric acid,2018-07-04 05:36:00.0000000,2018-07-04 05:36:00.0000000,,mg/dl,laboratory,,0.0,0.0,<1.5,,mg/dl,Lab Results,[MIRROR_Chameleon].[Lab_Results],,,3084-1,100184550,loinc,conv_lab_cham
82115,644360000.0,Uric acid -Blood,Uric acid,2018-07-26 04:38:00.0000000,2018-07-26 04:38:00.0000000,,mg/dl,laboratory,,0.0,0.0,<1.5,,mg/dl,Lab Results,[MIRROR_Chameleon].[Lab_Results],,,3084-1,100184550,loinc,conv_lab_cham
82361,644360000.0,Uric acid -Blood,Uric acid,2018-07-28 05:10:00.0000000,2018-07-28 05:10:00.0000000,,mg/dl,laboratory,,0.0,0.0,<1.5,,mg/dl,Lab Results,[MIRROR_Chameleon].[Lab_Results],,,3084-1,100184550,loinc,conv_lab_cham
94875,644360000.0,Uric acid -Blood,Uric acid,2018-07-27 04:56:00.0000000,2018-07-27 04:56:00.0000000,,mg/dl,laboratory,,0.0,0.0,<1.5,,mg/dl,Lab Results,[MIRROR_Chameleon].[Lab_Results],,,3084-1,100184550,loinc,conv_lab_cham
139460,632710000.0,Uric acid -Blood,Uric acid,2018-07-05 05:28:00.0000000,2018-07-05 05:28:00.0000000,,mg/dl,laboratory,,0.0,0.0,<1.5,,mg/dl,Lab Results,[MIRROR_Chameleon].[Lab_Results],,,3084-1,100184550,loinc,conv_lab_cham
186446,11339000.0,Uric acid -Blood,Uric acid,2018-10-28 03:48:00.0000000,2018-10-28 03:48:00.0000000,,mg/dl,laboratory,,0.0,0.0,<1.5,,mg/dl,Lab Results,[MIRROR_Chameleon].[Lab_Results],,,3084-1,100184550,loinc,conv_lab_cham
206141,11339000.0,Uric acid -Blood,Uric acid,2018-10-29 03:57:00.0000000,2018-10-29 03:57:00.0000000,,mg/dl,laboratory,,0.0,0.0,<1.5,,mg/dl,Lab Results,[MIRROR_Chameleon].[Lab_Results],,,3084-1,100184550,loinc,conv_lab_cham
215981,632710000.0,Uric acid -Blood,Uric acid,2018-07-06 12:58:00.0000000,2018-07-06 12:58:00.0000000,,mg/dl,laboratory,,0.0,0.0,<1.5,,mg/dl,Lab Results,[MIRROR_Chameleon].[Lab_Results],,,3084-1,100184550,loinc,conv_lab_cham
259665,778430000.0,Uric acid -Blood,Uric acid,2019-04-18 13:23:00.0000000,2019-04-18 13:23:00.0000000,,mg/dl,laboratory,,0.0,0.0,<1.5,,mg/dl,Lab Results,[MIRROR_Chameleon].[Lab_Results],,,3084-1,100184550,loinc,conv_lab_cham


# Write outputs

In [224]:
# Persist the enriched events table as one CSV file; skipped unless WRITE_FLAG is truthy.
if WRITE_FLAG:
    df.to_csv(path_or_buf=OUTPUT_PATH, index=False, sep=',')

### drugs

In [226]:
# List every drug feature with the number of distinct patients who received it,
# most widely used first.
drugs = (
    df.loc[df['category'] == 'drug']
    .groupby(['featureName'])['pid']
    .nunique()
    .sort_values(ascending=False)
    .to_frame()
    .reset_index()
)
if WRITE_FLAG:
    drugs.to_excel(ORGANIZE_EVENTS_PATH + "drugs.xlsx", index=False)