**Notebook content:**
- Compute statistics of dates of events, such as number of events before surgery date. 

In [89]:
%reset -f 

In [90]:
import sys
import pandas as pd
import numpy as np
import datetime
from IPython.display import display

pd.set_option('display.max_columns', 30)
pd.set_option('display.max_rows', 200)
#pd.set_option('display.float_format', lambda x: '%.f' % x)
pd.options.display.precision = 4

#import sys
np.set_printoptions(threshold=sys.maxsize) #- print the full NumPy array

from myDefs.defs import *

# visualization
import seaborn as sns
import matplotlib.pyplot as plt 
from matplotlib.pyplot import figure
%matplotlib inline


**Read events file**

In [91]:
file = DATA_PATH + "parseData1.csv"
df = pd.read_csv(file, sep=',')  

In [92]:
df.shape

(1683616, 17)

In [93]:
df.head()

Unnamed: 0,pid,admissionId,eventName,eventStartDate,eventEndDate,dValue,iValue,sValue,eventDesc,unitOfMeasure,orderNumber,organismId,eventCode,eventCodeOrg,eventType,eventTypeOrg,sourceName
0,1013300000.0,1840641.0,WBC,2018-02-15 10:58:00.0000000,2018-02-15 10:58:00.0000000,2.14,0.0,,,K/microL,,,26464-8,100109500,loinc,conv_lab_cham,[MIRROR_Chameleon].[Lab_Results]
1,1013300000.0,1840641.0,Alkaline Phosphatase -Blood,2018-02-15 10:25:00.0000000,2018-02-15 10:25:00.0000000,129.0,0.0,,,IU/l,,,77141-0,100184075,loinc,conv_lab_cham,[MIRROR_Chameleon].[Lab_Results]
2,1013300000.0,1840641.0,"Protein -Blood, total",2018-02-15 10:25:00.0000000,2018-02-15 10:25:00.0000000,3.6,0.0,,,g/dl,,,2885-2,100184155,loinc,conv_lab_cham,[MIRROR_Chameleon].[Lab_Results]
3,1013300000.0,1840641.0,Sodium -Blood,2018-02-15 10:25:00.0000000,2018-02-15 10:25:00.0000000,147.0,0.0,,,meq/l,,,2951-2,100184295,loinc,conv_lab_cham,[MIRROR_Chameleon].[Lab_Results]
4,1013300000.0,1840641.0,Urea -Blood,2018-02-15 10:25:00.0000000,2018-02-15 10:25:00.0000000,24.0,0.0,,,mg/dl,,,3091-6,100184520,loinc,conv_lab_cham,[MIRROR_Chameleon].[Lab_Results]


**Read anonymous file**

In [94]:
file = DATA_PATH + "parseAnonymous0.csv"
anonymous = pd.read_csv(file, sep=',') 

In [95]:
anonymous.shape

(718, 14)

In [129]:
anonymous.head()

Unnamed: 0,pid,Days of hospitalization,Sex,Age,Date of surgery,Patient classification,Name of surgery,Responsible surgeon,Urgency,Surgical approach,Complications,Kk,Simple,Severe
0,1068368,24.0,,,NaT,Surgical Oncology,DIAGNOSTIC/ STAGING LAPAROSCOPY 493200;LAPAROS...,Hermon Hila,elective,Laparoscopic,,1.0,,
1,1123813,42.0,male,59.0,2018-07-18,Surgical Oncology,CYTOREDUCTIVE SURGERY WITH HIPEC (HYPERTHERMIC...,Nissan Aviram,elective,Open,Anastomotic Leak,1.0,1.0,1.0
2,1128298,,,,NaT,Upper Gi,LAPAROSCOPIC CHOLECYSTECTOMY 475620; LAPAROSCO...,,elective,Laparoscopic,,1.0,,
3,1129307,,,,NaT,Upper Gi,"OPEN TOTAL GASTRECTOMY, WITH ESOPHAGOENTEROSTO...",,urgent,Open,,1.0,,
4,1135541,,female,43.0,2018-03-12,Upper Gi,OPEN DISTAL SUBTOTAL GASTRECTOMY 436320,,elective,Open,,1.0,,


### Parse Dates

Parse dates of events in events file.

In [97]:
#parse date of start event
df['eventStartDate'] = pd.to_datetime(df['eventStartDate'], format='%Y-%m-%d')

#parse date of end event
df['eventEndDate'] = pd.to_datetime(df['eventEndDate'], format='%Y-%m-%d')

Parse date of surgery in anonymous file.

In [98]:
#parse date of surgery
anonymous['Date of surgery'] = pd.to_datetime(anonymous['Date of surgery'], format='%Y-%m-%d')

### Sort by patient id and event start dates

In [99]:
df.sort_values(["pid", "eventStartDate"], inplace=True)

# Time

## Column eventStartDate, eventEndDate

In [100]:
df['eventStartDate'].head()

1669916   2018-10-03 08:36:34.433
1669917   2018-10-03 08:36:34.433
1669918   2018-10-03 08:36:34.467
1669919   2018-10-03 08:36:34.467
1669920   2018-10-03 08:36:34.497
Name: eventStartDate, dtype: datetime64[ns]

### Date range
Dates of events are from **01.01.2017** until **21.05.2019**

In [101]:
#get time interval of events starts 
print(df['eventStartDate'].min(), "-" ,df['eventStartDate'].max())

2009-01-01 00:00:00 - 2019-05-20 23:31:34.520000


In [102]:
#get time interval of events ends 
print(df['eventEndDate'].min(), "-" ,df['eventEndDate'].max())

2009-01-01 00:00:00 - 2019-05-21 13:45:39.617000


Years: **2017-2019**. Year 2009 is probably a mistake. Most of the data is from 2018.

In [103]:
df['eventStartDate'].dt.year.value_counts()

2018    1490826
2019     126189
2017      66600
2009          1
Name: eventStartDate, dtype: int64

In [104]:
#get min date of events starts  exculding year 2019 
df[df['eventStartDate'].dt.year != 2009]['eventStartDate'].min()

Timestamp('2017-01-01 00:00:00')

In [105]:
#get min date of events starts  exculding year 2019 
df[df['eventEndDate'].dt.year != 2009]['eventEndDate'].min()

Timestamp('2017-01-01 00:00:00')

In [106]:
#verify that patient contains the 2009 year is not empty
#the event in this year is Carotid endarterectomy (כריתת רחם)
df[df['pid']==722269414].shape

(500, 17)

**Duration of events**

Most events last less than 1 hour, however there are: 
- 16,682 (~1%) which last more than an hour.
- 12,981 (~0.8%) which last more than a day - many of these events are related to bacterial culture.

In [107]:
df['eventDuration'] = df['eventEndDate'] - df['eventStartDate']

In [108]:
df['eventDuration'].describe()

count                    1683616
mean      0 days 00:57:59.862615
std       0 days 12:26:34.163699
min              0 days 00:00:00
25%              0 days 00:00:00
50%              0 days 00:00:00
75%              0 days 00:00:00
max      33 days 14:11:04.577000
Name: eventDuration, dtype: object

In [109]:
duration = df[['eventDuration']]

hour_plus = df[df['eventDuration'].dt.seconds > 3600]
print(hour_plus.shape[0], "->", (hour_plus.shape[0]/df["eventName"].count())*100, "%")

day_plus = df[df['eventDuration'].dt.days > 1]
print(day_plus.shape[0], "->", (day_plus.shape[0]/df["eventName"].count())*100, "%")

16682 -> 1.0245934362635567 %
12981 -> 0.7972813449308974 %


In [110]:
hour_plus;

In [111]:
hour_plus["eventName"].value_counts();

# Statistics of dates
For each patient show statstics of the event dates.

For example, how many events before date of surgery and after day of surgery.

## Create Table 1

Columns of table 1:
- **start_day** - date of first event.
- **end_day** - date of last event.
- **period** - end_day - starts_day. 
- **nunique_days** - number of days for which there are events.
- **n_events** - total number of events for this patient.

In [112]:
anonymous_full = anonymous[~anonymous['Date of surgery'].isna()] #missing values, should be available later 

In [113]:
anonymous_full.shape

(552, 14)

In [114]:
df.head();

In [115]:
#get first and last days we have for patient
start_day = df.groupby('pid')['eventStartDate'].min().dt.normalize().rename("start_day")
end_day = df.groupby('pid')['eventEndDate'].max().dt.normalize().rename("end_day")

#time duration per patient, i.e., period in which data was collected
period = pd.Series(end_day - start_day).rename("period")


In [116]:
#get number of days in which data was collected
def nuniqueDays(x):
    dates = pd.concat([x['eventStartDate'].dt.normalize(), x['eventEndDate'].dt.normalize()])
    return dates.nunique()
  
nunique_days = df.groupby('pid').apply(nuniqueDays).rename("nunique_days")

#nunique_days = df.groupby('pid')['eventStartDate'].apply(lambda x : x.dt.normalize().nunique()).rename("nunique_days")

In [117]:
#get total number of events 
nevents = df.groupby('pid')['eventStartDate'].count().rename("n_events")

In [118]:
times = pd.concat([start_day,end_day, period, nunique_days, nevents], axis=1)
times

Unnamed: 0_level_0,start_day,end_day,period,nunique_days,n_events
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1.0684e+06,2018-10-03,2018-12-09,67 days,61,2716
1.1238e+06,2018-07-17,2019-04-14,271 days,125,11738
1.1283e+06,2018-04-18,2018-09-29,164 days,17,1080
1.1293e+06,2018-01-04,2018-10-07,276 days,42,5079
1.1355e+06,2018-03-12,2018-04-01,20 days,19,1634
...,...,...,...,...,...
1.0319e+09,2018-04-10,2018-04-18,8 days,9,1485
1.0324e+09,2018-03-27,2019-04-15,384 days,63,5324
1.0338e+09,2018-03-26,2018-07-24,120 days,117,10254
1.0345e+09,2018-03-13,2018-08-02,142 days,12,1608


In [119]:
anonymous = anonymous.sort_values(["pid", 'Date of surgery'])

In [120]:
#there are patients who underwent more than 1 surgery  
#thus put each date of surgery in a saperate column
def split_surgeries(x, max_surgeries):   
    padding = [np.datetime64('NaT')] * (max_surgeries-x.values.shape[0])
    ls = list(x.values) + padding
    titels = ["surgery_{}".format(i+1) for i in range(max_surgeries)]
    return pd.DataFrame([ls], columns=titels)

max_surgeries = anonymous.groupby("pid")['Date of surgery'].count().max()

surgeries_dates = anonymous.groupby("pid")['Date of surgery'].apply(split_surgeries, max_surgeries)
surgeries_dates.reset_index(level=1, drop=True, inplace=True)

In [121]:
surgeries_dates;

In [122]:
surgeries_dates[~surgeries_dates["surgery_2"].isna()];

In [123]:
times = times.merge(surgeries_dates, on="pid") 

## Show Table 1

In [124]:
times

Unnamed: 0_level_0,start_day,end_day,period,nunique_days,n_events,surgery_1,surgery_2,surgery_3
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1.0684e+06,2018-10-03,2018-12-09,67 days,61,2716,NaT,NaT,NaT
1.1238e+06,2018-07-17,2019-04-14,271 days,125,11738,2018-07-18,NaT,NaT
1.1283e+06,2018-04-18,2018-09-29,164 days,17,1080,NaT,NaT,NaT
1.1293e+06,2018-01-04,2018-10-07,276 days,42,5079,NaT,NaT,NaT
1.1355e+06,2018-03-12,2018-04-01,20 days,19,1634,2018-03-12,NaT,NaT
...,...,...,...,...,...,...,...,...
1.0319e+09,2018-04-10,2018-04-18,8 days,9,1485,2018-04-11,NaT,NaT
1.0324e+09,2018-03-27,2019-04-15,384 days,63,5324,2018-03-28,NaT,NaT
1.0338e+09,2018-03-26,2018-07-24,120 days,117,10254,2018-03-27,NaT,NaT
1.0345e+09,2018-03-13,2018-08-02,142 days,12,1608,2018-03-14,NaT,NaT


In [125]:
times.describe()

Unnamed: 0,period,nunique_days,n_events
count,680,680.0,680.0
mean,93 days 02:13:24.705882,22.6294,2475.9059
std,189 days 22:37:56.572378,32.9828,5753.7136
min,1 days 00:00:00,2.0,76.0
25%,3 days 00:00:00,4.0,350.75
50%,12 days 12:00:00,10.0,822.0
75%,132 days 00:00:00,23.0,2083.5
max,3585 days 00:00:00,275.0,77817.0


## Create Table 2

Columns of table 2:
- **nEvents_before** - number of events before _first surgery.
- **nEvents_in** - number of events in _first surgery day.
- **nEvents_after** - number of events after _first surgery day.
- **nDays_before** - number of days before _first surgery.
- **nDays_before** - total number of events after _first surgery day.

In [126]:
def getNUniqueDays(df):
    dates = pd.concat([df['eventStartDate'].dt.normalize(), df['eventEndDate'].dt.normalize()])
    return dates.nunique()


def analyseDates(x, dates):
    
    #get surgery date of current patient 
    pid = x.iloc[0]["pid"]
    surgery_date = dates.loc[pid, "surgery_1"]
    
    titels = ["nEvents_before", "nEvents_in", "nEvents_after", "nDays_before", "nDays_after"]
    
    if pd.isnull(surgery_date):
        return None
        #t = [0] * len(titels)
     
    #dates before and surgery
    before_surgery = x[x['eventStartDate'] < surgery_date]
    surgery_day = x[x['eventStartDate'].dt.normalize() == surgery_date]
    after_surgery = x[x['eventStartDate'] >= surgery_date + datetime.timedelta(days=1)]
        
    #number of unique days   
    ndays_before = getNUniqueDays(before_surgery)
    ndays_after = getNUniqueDays(after_surgery)

    #create data frame
    data = [before_surgery.shape[0], surgery_day.shape[0], after_surgery.shape[0], 
         ndays_before, ndays_after]
    df = pd.DataFrame([data], columns=titels)  
    
    return df

    

before_after = df.groupby('pid').apply(analyseDates, times)

## Show Table 1

In [127]:
before_after

Unnamed: 0_level_0,Unnamed: 1_level_0,nEvents_before,nEvents_in,nEvents_after,nDays_before,nDays_after
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1.1238e+06,0,74,514,11150,1,123
1.1355e+06,0,0,285,1349,0,18
3.9187e+06,0,64,295,185,1,2
4.8946e+06,0,58,304,6620,2,96
6.5415e+06,0,3411,320,5289,18,41
...,...,...,...,...,...,...
1.0319e+09,0,15,613,857,1,7
1.0324e+09,0,64,539,4721,1,61
1.0338e+09,0,61,541,9652,2,115
1.0345e+09,0,57,614,937,1,10


In [128]:
before_after.describe()

Unnamed: 0,nEvents_before,nEvents_in,nEvents_after,nDays_before,nDays_after
count,525.0,525.0,525.0,525.0,525.0
mean,550.3181,317.301,1652.4971,5.04,16.5162
std,3280.1188,220.5777,3910.874,11.3585,27.6613
min,0.0,0.0,0.0,0.0,0.0
25%,4.0,182.0,113.0,1.0,2.0
50%,63.0,241.0,371.0,1.0,6.0
75%,182.0,411.0,1193.0,4.0,16.0
max,50721.0,1443.0,33466.0,98.0,244.0
