**Notebook content:**
- Sort rows.
- Drop empty rows.
- Drop empty and irrelevant columns.
- Drop duplicate rows.

In [76]:
# Clear the interactive namespace before anything else runs; -f skips the confirmation prompt.
%reset -f 

In [77]:
import sys, os
import numpy as np
import pandas as pd

from IPython.display import display
# Append the parent directory of sys.path[0] to the module search path,
# presumably so the `utils` package imported on the next line resolves.
# NOTE(review): assumes sys.path[0] is the notebook's own directory — confirm.
sys.path.append(os.path.dirname(sys.path[0]))  # TODO: replace with proper environment/package setup
from utils.constants import *

In [78]:
# Notebook display settings: show up to 50 columns and 50 rows of a frame,
# render floats with four decimal places, and print NumPy arrays in full
# instead of summarising them.
pd.options.display.max_columns = 50
pd.options.display.max_rows = 50
pd.options.display.float_format = lambda value: '%.4f' % value
np.set_printoptions(threshold=sys.maxsize)

In [79]:
# Name of the raw input CSV file; it is joined with DPATH_DATA when the file is read.
INPUT_FILE = "data0.csv"

In [80]:
# Load the raw CSV from the project data directory into `df` and show its
# (rows, columns) dimensions as the cell output.
input_path = os.path.join(DPATH_DATA, INPUT_FILE)
df = pd.read_csv(filepath_or_buffer=input_path)
df.shape

(1683669, 44)

**Headers**

https://docs.google.com/spreadsheets/d/1rS5fCmB2nW72YMb0Pqb3ZbKIQO2jzqTS6cnaDWRuYB0/edit#gid=1712287935


In [81]:
# Overview of the raw frame: per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1683669 entries, 0 to 1683668
Data columns (total 44 columns):
rowId                     1683650 non-null object
id                        1683655 non-null object
pid                       1683616 non-null float64
admissionId               1683616 non-null float64
altAdmissionId            0 non-null float64
altPid                    39 non-null object
bValue                    1556931 non-null object
abnormalFlags             0 non-null float64
bed                       1171800 non-null object
bodyLocation              0 non-null float64
cancelDate                0 non-null float64
cancelled                 1683610 non-null object
converted                 1683616 non-null object
dValue                    1556931 non-null float64
eventCode                 1683616 non-null object
eventCodeOrg              1534034 non-null object
eventDesc                 136875 non-null object
eventEndDate              1683616 non-null object
eventName 

In [82]:
# Percentage of missing values per column, ordered from the most complete
# column to the least complete one.
(
    df.isna()
    .mean()
    .round(5)
    .mul(100)
    .sort_values()
    .to_frame("% missing values")
)

Unnamed: 0,% missing values
rowId,0.001
id,0.001
messageId,0.003
eventType,0.003
eventEndDate,0.003
eventCode,0.003
converted,0.003
eventStartDate,0.003
pid,0.003
admissionId,0.003


In [83]:
# List every column's dtype, ordered by dtype, so columns of the same type appear together.
df.dtypes.sort_values()

dValue                    float64
iValue                    float64
facility                  float64
orderNumber               float64
organismId                float64
eventResultType           float64
eventResult               float64
pointOfCare               float64
presentOnAdmission        float64
principalDiagnosis        float64
room                      float64
messageId                 float64
normalRange               float64
specimenCollectionSite    float64
cancelDate                float64
bodyLocation              float64
specimenId                float64
abnormalFlags             float64
altAdmissionId            float64
admissionId               float64
pid                       float64
sValue                     object
tValue                     object
transferrable              object
parentId                   object
unitOfMeasure              object
sourceName                 object
rowId                      object
eventStartDate             object
eventType     

### Sort rows 

In [84]:
# Convert the event start/end columns from text to datetimes; both are
# parsed with the same year-month-day format.
for date_col in ("eventStartDate", "eventEndDate"):
    df[date_col] = pd.to_datetime(df[date_col], format="%Y-%m-%d")

# Order the rows by patient id, then by event start date within each patient.
df.sort_values(by=["pid", "eventStartDate"], inplace=True)

### Drop columns

In [85]:
# Remove columns that carry no information at all (every value is missing).
empty_col_mask = df.isna().all(axis=0)
print("num columns where all values are null:", empty_col_mask.sum())
df.dropna(axis="columns", how="all", inplace=True)

num columns where all values are null: 14


In [86]:
# Columns treated as irrelevant for the cleaned dataset and removed from the frame.
cols_to_drop = [
    "rowId", "id", "altPid", "bed",
    "cancelled", "converted",
    "messageId", "parentId", "tValue",
    "transferrable", "careGiver", "Time_Stamp",
]

print("num irrelevant columnas:", len(cols_to_drop))
# errors="ignore" keeps this cell re-runnable: without it, executing the cell a
# second time (or after one of these columns was already removed by the
# all-NaN column drop) raises KeyError because the labels no longer exist.
df.drop(columns=cols_to_drop, inplace=True, errors="ignore")

num irrelevant columnas: 12


### Drop rows

In [87]:
# Discard rows in which every remaining column is missing.
empty_row_mask = df.isna().all(axis=1)
print("num rows where all values are null:", empty_row_mask.sum())
df.dropna(axis="index", how="all", inplace=True)

num rows where all values are null: 53


In [88]:
# Count rows that exactly repeat an earlier row across all columns, then
# keep only the first occurrence of each.
n_duplicates = df.duplicated(keep="first").sum()
print("\nNum duplicate rows:", n_duplicates)
df.drop_duplicates(keep="first", inplace=True)


Num duplicate rows: 96568


In [89]:
# Sanity check: count the rows that have no patient id.
n_missing_pid = df["pid"].isna().sum()
print("Num rows with missing patient id:", n_missing_pid)

Num rows with missing patient id: 0


## Resulting dataframe

In [90]:
# Dimensions (rows, columns) of the frame after the cleaning steps above.
df.shape

(1587048, 18)

In [91]:
# Per-column non-null counts and dtypes of the cleaned frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1587048 entries, 1669969 to 1592287
Data columns (total 18 columns):
pid               1587048 non-null float64
admissionId       1587048 non-null float64
bValue            1507828 non-null object
dValue            1507828 non-null float64
eventCode         1587048 non-null object
eventCodeOrg      1488929 non-null object
eventDesc         86365 non-null object
eventEndDate      1587048 non-null datetime64[ns]
eventName         1552195 non-null object
eventStartDate    1587048 non-null datetime64[ns]
eventType         1587048 non-null object
eventTypeOrg      1493838 non-null object
iValue            1507828 non-null float64
orderNumber       15551 non-null float64
organismId        1031 non-null float64
sValue            35412 non-null object
sourceName        1587009 non-null object
unitOfMeasure     944714 non-null object
dtypes: datetime64[ns](2), float64(6), object(10)
memory usage: 230.1+ MB
