**Notebook content:**
- Compute statistics of dates of events, such as number of events before surgery date. 

In [1]:
%reset -f 

In [2]:
import sys
import pandas as pd
import numpy as np
import datetime
from IPython.display import display

pd.set_option('display.max_columns', 30)
pd.set_option('display.max_rows', 200)
#pd.set_option('display.float_format', lambda x: '%.f' % x)
pd.options.display.precision = 4

#import sys
np.set_printoptions(threshold=sys.maxsize) #- print the full NumPy array

from myDefs.defs import *

# visualization
import seaborn as sns
import matplotlib.pyplot as plt 
from matplotlib.pyplot import figure
%matplotlib inline


**Read events file**

In [None]:
# Load the events table from the project data directory (comma is read_csv's default separator).
file = f"{DATA_PATH}parseData1.csv"
df = pd.read_csv(file)

In [None]:
df.shape

In [None]:
df.head()

**Read anonymous file**

In [None]:
# Load the anonymous table from the project data directory (comma is read_csv's default separator).
file = f"{DATA_PATH}parseAnonymous0.csv"
anonymous = pd.read_csv(file)

In [None]:
anonymous.shape

In [None]:
anonymous.head()

### Parse Dates

Parse dates of events in events file.

In [None]:
# Parse the start of each event into a datetime column.
# NOTE(review): later cells measure durations shorter than a day, so these columns presumably
# carry a time-of-day part; a date-only format may be rejected by strict parsers —
# TODO confirm against the CSV contents and the installed pandas version.
df['eventStartDate'] = pd.to_datetime(df['eventStartDate'], format='%Y-%m-%d')

# Parse the end of each event the same way.
df['eventEndDate'] = pd.to_datetime(df['eventEndDate'], format='%Y-%m-%d')

Parse date of surgery in anonymous file.

In [None]:
# Parse the surgery date into a datetime column so it can be compared with the event dates.
anonymous['Date of surgery'] = pd.to_datetime(anonymous['Date of surgery'], format='%Y-%m-%d')

### Sort by patient id and event start dates

In [None]:
# Order events by patient, then chronologically by start date.
df = df.sort_values(["pid", "eventStartDate"])

# Time

## Column eventStartDate, eventEndDate

In [None]:
df['eventStartDate'].head()

### Date range
Dates of events are from **01.01.2017** until **21.05.2019**

In [None]:
# Earliest and latest event start, printed as "min - max".
print(f"{df['eventStartDate'].min()} - {df['eventStartDate'].max()}")

In [None]:
# Earliest and latest event end, printed as "min - max".
print(f"{df['eventEndDate'].min()} - {df['eventEndDate'].max()}")

Years: **2017-2019**. Year 2009 is probably a mistake. Most of the data is from 2018.

In [None]:
df['eventStartDate'].dt.year.value_counts()

In [None]:
#get min date of event starts, excluding year 2009
df[df['eventStartDate'].dt.year != 2009]['eventStartDate'].min()

In [None]:
#get min date of event ends, excluding year 2009
df[df['eventEndDate'].dt.year != 2009]['eventEndDate'].min()

In [None]:
# Verify that the patient whose record contains the 2009 date has rows in the events table
# (the output is the (rows, columns) shape of that patient's slice).
# The original note names the 2009 event as "Carotid endarterectomy", but its Hebrew gloss
# translates to "hysterectomy" — NOTE(review): confirm which procedure is meant.
df[df['pid']==722269414].shape

**Duration of events**

Most events last less than 1 hour, however there are: 
- 16,682 (~1%) which last more than an hour.
- 12,981 (~0.8%) which last more than a day - many of these events are related to bacterial culture.

In [None]:
# Elapsed time of every event (end minus start), stored as a timedelta column.
df['eventDuration'] = df['eventEndDate'] - df['eventStartDate']

In [None]:
df['eventDuration'].describe()

In [None]:
duration = df[['eventDuration']]

# Compare the whole timedelta against the threshold.
# `.dt.seconds` exposes only the seconds *component* of a timedelta (0-86399) and ignores
# whole days, so e.g. an event lasting exactly two days would not be counted as longer than
# an hour, and negative durations wrap around to large positive values.
hour_plus = df[df['eventDuration'] > pd.Timedelta(hours=1)]
print(hour_plus.shape[0], "->", (hour_plus.shape[0]/df["eventName"].count())*100, "%")

# `.dt.days > 1` only matches durations of two days or more; "more than a day" is a direct
# comparison with a one-day timedelta. Rows with a missing duration compare as False.
day_plus = df[df['eventDuration'] > pd.Timedelta(days=1)]
print(day_plus.shape[0], "->", (day_plus.shape[0]/df["eventName"].count())*100, "%")

In [None]:
hour_plus;

In [None]:
hour_plus["eventName"].value_counts();

# Statistics of dates
For each patient, show statistics of the event dates.

For example, how many events before date of surgery and after day of surgery.

## Create Table 1

Columns of table 1:
- **start_day** - date of first event.
- **end_day** - date of last event.
- **period** - end_day - start_day.
- **nunique_days** - number of days for which there are events.
- **n_events** - total number of events for this patient.

In [None]:
# Keep only the rows with a known surgery date (the missing ones should become available later).
anonymous_full = anonymous[anonymous['Date of surgery'].notna()]

In [None]:
anonymous_full.shape

In [None]:
df.head();

In [None]:
# First calendar day on which each patient has an event start.
start_day = (
    df.groupby('pid')['eventStartDate']
    .min()
    .dt.normalize()
    .rename("start_day")
)

# Last calendar day on which each patient has an event end.
end_day = (
    df.groupby('pid')['eventEndDate']
    .max()
    .dt.normalize()
    .rename("end_day")
)

# Length of the per-patient window in which data was collected.
period = (end_day - start_day).rename("period")


In [None]:
# Number of distinct calendar days on which data was collected.
def nuniqueDays(x):
    """Count the distinct calendar days on which an event in ``x`` starts or ends.

    ``x`` needs datetime columns ``eventStartDate`` and ``eventEndDate``;
    missing dates are not counted.
    """
    start_days = x['eventStartDate'].dt.normalize()
    end_days = x['eventEndDate'].dt.normalize()
    return pd.concat([start_days, end_days]).nunique()
  
# Run the day counter once per patient; the result is indexed by pid.
nunique_days = df.groupby('pid').apply(nuniqueDays).rename("nunique_days")

# Earlier variant kept for reference: it counts distinct days from event start dates only.
#nunique_days = df.groupby('pid')['eventStartDate'].apply(lambda x : x.dt.normalize().nunique()).rename("nunique_days")

In [None]:
#get total number of events per patient
# count() tallies non-missing eventStartDate values, so an event without a start date is not included
nevents = df.groupby('pid')['eventStartDate'].count().rename("n_events")

In [None]:
# Assemble Table 1: the per-patient series are aligned on pid and placed side by side as columns.
times = pd.concat([start_day,end_day, period, nunique_days, nevents], axis=1)
times

In [None]:
# Order rows by patient and then by surgery date, so each patient's surgeries are listed chronologically.
anonymous = anonymous.sort_values(["pid", 'Date of surgery'])

In [None]:
#there are patients who underwent more than 1 surgery
#thus put each date of surgery in a separate column
def split_surgeries(x, max_surgeries):
    """Spread one patient's surgery dates over a single wide row.

    Parameters
    ----------
    x : pandas.Series
        Surgery dates of one patient; may contain missing values (NaT).
    max_surgeries : int
        Number of ``surgery_<k>`` columns to produce.

    Returns
    -------
    pandas.DataFrame
        One row with columns ``surgery_1`` .. ``surgery_<max_surgeries>``. The known
        dates come first, in their order in ``x``; the remaining cells are NaT.
    """
    # Missing dates are dropped here and re-added as padding below. The row width then
    # depends only on max_surgeries, even when that limit was derived with count(), which
    # ignores missing values while the raw group length does not.
    known = list(x.dropna().values)
    padding = [np.datetime64('NaT')] * (max_surgeries - len(known))
    titels = ["surgery_{}".format(i+1) for i in range(max_surgeries)]
    return pd.DataFrame([known + padding], columns=titels)

# The patient with the most recorded surgery dates determines how many surgery_<k> columns are needed.
max_surgeries = anonymous.groupby("pid")['Date of surgery'].count().max()

# Build one wide row per patient. apply() stacks the rows under a (pid, row-number) index,
# so the inner level is dropped to leave pid as the only index.
surgeries_dates = (
    anonymous.groupby("pid")['Date of surgery']
    .apply(split_surgeries, max_surgeries)
    .reset_index(level=1, drop=True)
)

In [None]:
surgeries_dates;

In [None]:
surgeries_dates[~surgeries_dates["surgery_2"].isna()];

In [None]:
# Attach the per-patient surgery columns to Table 1. The default inner join on "pid" keeps
# only patients present in both tables.
# Surgery columns left over from a previous run of this cell are removed first, so that
# re-running it does not create suffixed duplicates (surgery_1_x / surgery_1_y) in place
# of the surgery_1 column.
times = times.drop(columns=surgeries_dates.columns, errors="ignore").merge(surgeries_dates, on="pid")

## Show Table 1

In [None]:
times

In [None]:
times.describe()

## Create Table 2

Columns of table 2:
- **nEvents_before** - number of events before the day of the first surgery.
- **nEvents_in** - number of events on the day of the first surgery.
- **nEvents_after** - number of events after the day of the first surgery.
- **nDays_before** - number of days with events before the day of the first surgery.
- **nDays_after** - number of days with events after the day of the first surgery.

In [None]:
def _eventDays(df):
    """Return the calendar day (midnight timestamp) of every event start and end in ``df``."""
    return pd.concat([df['eventStartDate'].dt.normalize(), df['eventEndDate'].dt.normalize()])


def getNUniqueDays(df):
    """Count the distinct calendar days on which an event in ``df`` starts or ends."""
    return _eventDays(df).nunique()


def analyseDates(x, dates):
    """Summarise one patient's events relative to the day of the first surgery.

    Parameters
    ----------
    x : pandas.DataFrame
        Events of a single patient with datetime columns ``eventStartDate`` and
        ``eventEndDate``. The patient id is read from the ``pid`` column when present,
        otherwise from ``x.name`` (the group key set by ``groupby().apply``).
    dates : pandas.DataFrame
        Table indexed by patient id with a ``surgery_1`` column.

    Returns
    -------
    pandas.DataFrame or None
        A single row with the columns ``nEvents_before``, ``nEvents_in``,
        ``nEvents_after``, ``nDays_before`` and ``nDays_after``. ``None`` when the group is
        empty, the patient is not listed in ``dates``, or the surgery date is missing.
    """
    titels = ["nEvents_before", "nEvents_in", "nEvents_after", "nDays_before", "nDays_after"]

    if x.empty:
        return None

    # Fall back to the group key when the grouping column is not part of the group frame.
    pid = x["pid"].iloc[0] if "pid" in x.columns else x.name

    # Patients without an entry in `dates` are skipped rather than raising KeyError.
    if pid not in dates.index:
        return None

    surgery_date = dates.loc[pid, "surgery_1"]
    if pd.isnull(surgery_date):
        return None

    # Work with whole calendar days so a time-of-day on the surgery date cannot shift the split.
    surgery_day = pd.Timestamp(surgery_date).normalize()
    day_after = surgery_day + pd.Timedelta(days=1)

    # Split the events by the day on which they start.
    starts = x['eventStartDate']
    before_surgery = x[starts < surgery_day]
    on_surgery_day = x[starts.dt.normalize() == surgery_day]
    after_surgery = x[starts >= day_after]

    # An event that starts before the surgery may end on or after the surgery day; only the
    # days that actually precede the surgery day count towards nDays_before.
    days_before = _eventDays(before_surgery)
    ndays_before = days_before[days_before < surgery_day].nunique()
    ndays_after = getNUniqueDays(after_surgery)

    data = [before_surgery.shape[0], on_surgery_day.shape[0], after_surgery.shape[0],
            ndays_before, ndays_after]
    return pd.DataFrame([data], columns=titels)

    

# Build Table 2: analyseDates is called once per patient with `times` as the surgery-date lookup;
# patients for which it returns None do not appear in the result.
before_after = df.groupby('pid').apply(analyseDates, times)

## Show Table 2

In [None]:
before_after

In [None]:
before_after.describe()