# 02 Exploratory Data Analysis

Notebook goal: initial Exploratory Data Analysis to understand features, data cleanliness and basic (inter)correlations.

## 1. Load data

Load the data processed in the previous notebook.

In [None]:
import pandas as pd
from pandas_profiling import ProfileReport
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

%matplotlib inline

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [None]:
# Read the source parquet file into a DataFrame.
df = pd.read_parquet(path="../data/original-data.parquet")

## Explore Data types

In [None]:
# Column dtypes exactly as read from the parquet file (before any conversion).
df.dtypes

In [None]:
# Distinct values of the arrival day-of-week column.
df["arrival_day_of_week"].unique()

In [None]:
# Distinct values of the arrival month-name column.
df["arrival_month_name"].unique()

## Convert datetimes

In [None]:
# Work on a deep copy so the frame loaded above stays untouched.
df2 = df.copy(deep=True)

In [None]:
# Parse the spell discharge timestamp into a datetime column.
df2["DISCHARGE_DATE_HOSPITAL_PROVIDER_SPELL"] = pd.to_datetime(
    df2["DISCHARGE_DATE_HOSPITAL_PROVIDER_SPELL"],
    format="%Y-%m-%d %H:%M:%S.%f",
)

In [None]:
# Original note: "mainly nan" - presumably about DISCHARGE_READY_DATE, which
# was inspected with the commented-out line below (TODO confirm):
# df2.DISCHARGE_READY_DATE.sample(10)

# Parse the expected discharge timestamp into a datetime column.
df2["EXPECTED_DISCHARGE_DATE"] = pd.to_datetime(
    df2["EXPECTED_DISCHARGE_DATE"],
    format="%Y-%m-%d %H:%M:%S.%f",
)

In [None]:
# Parse the first ward-stay start timestamp into a datetime column.
df2["FIRST_START_DATE_TIME_WARD_STAY"] = pd.to_datetime(
    df2["FIRST_START_DATE_TIME_WARD_STAY"],
    format="%Y-%m-%d %H:%M:%S.%f",
)

In [None]:
# Parse the spell start timestamp into a datetime column.
df2["START_DATE_TIME_HOSPITAL_PROVIDER_SPELL"] = pd.to_datetime(
    df2["START_DATE_TIME_HOSPITAL_PROVIDER_SPELL"],
    format="%Y-%m-%d %H:%M:%S.%f",
)

In [None]:
# Re-check the dtypes after the pd.to_datetime conversions above.
df2.dtypes

## Data ordering

How is the data ordered?

In [None]:
# Author's observation from the first ten rows: not ordered by local patient id.
# NB: value_counts() on this column shows repeat visits - could this be a feature?
df2["LOCAL_PATIENT_IDENTIFIER"].head(10)

In [None]:
# Author's observation from the first ten rows: not ordered by spell start date.
df2["START_DATE_TIME_HOSPITAL_PROVIDER_SPELL"].head(10)

In [None]:
# Author's observation from the first ten rows: not ordered by discharge date either.
df2["DISCHARGE_DATE_HOSPITAL_PROVIDER_SPELL"].head(10)

In [None]:
# Author's observation: not ordered by cds identifier.
# A fixed random_state makes the ten sampled rows - and therefore the output
# this observation rests on - identical on every Restart & Run All.
df2["cds_unique_identifier"].sample(10, random_state=0)

### Order by arrival date

In [None]:
# Sort by spell start time and renumber the rows 0..n-1.
df3 = df2.sort_values(
    "START_DATE_TIME_HOSPITAL_PROVIDER_SPELL",
    ignore_index=True,
)

In [None]:
# First ten rows of the frame sorted by spell start time.
df3.head(10)

### Missing data

In [None]:
# Null map of df3: one heatmap row per record, one heatmap column per field.
sns.set_theme(rc={"figure.figsize": (15, 8)})
sns.heatmap(df3.isna(), cbar=False);

In [None]:
# Number of null values in each column.
df3.isna().sum()

In [None]:
# Distinct values held by IS_major.
df3["IS_major"].unique()

In [None]:
# Frequency of each IS_major value.
df3["IS_major"].value_counts()

In [None]:
# Extract the columns of df3 in which every value is null.
# Using .all() keeps the check tied to df3 itself, rather than comparing
# df3's null counts against the row count of a different frame (df), which is
# only correct while the two frames happen to have the same length.
all_null = df3.isna().all()
empty_cols = all_null[all_null].index.array

In [None]:
# Show the distinct values of each column flagged as empty, to confirm it
# holds nothing but nulls. Values are read from df3 - the frame the columns
# were detected in and are dropped from - not from the raw df.
for col in empty_cols:
    print(f"{col} values: {df3[col].unique()}")

In [None]:
# Remove the columns listed in empty_cols.
df4 = df3.drop(empty_cols, axis="columns")

In [None]:
# Shape of the subset of rows where stroke_ward_stay is populated.
df4.loc[df4["stroke_ward_stay"].notna()].shape

In [None]:
# Frequency of each non-null stroke_ward_stay value.
df4["stroke_ward_stay"].value_counts()

Question:

1. When does the block of missing data end?


In [None]:
# First row that has a presenting complaint, with its spell start time.
# first_valid_index() returns an index *label*, so the row is looked up with
# .loc; the positional .iloc only gives the same row while labels happen to
# equal positions.
df4.loc[
    df4["presenting_complaint"].first_valid_index(),
    ["START_DATE_TIME_HOSPITAL_PROVIDER_SPELL", "presenting_complaint"],
]

In [None]:
# Row immediately before the first non-null presenting_complaint.
# first_valid_index() returns a label, so translate it to a position before
# stepping back one row (assumes a unique index, so get_loc returns an int).
first_complaint_pos = df4.index.get_loc(
    df4["presenting_complaint"].first_valid_index()
)
# Guard: position 0 minus 1 would silently wrap round to the *last* row.
assert first_complaint_pos > 0, "presenting_complaint is already populated in the first row"
df4.iloc[first_complaint_pos - 1][
    ["START_DATE_TIME_HOSPITAL_PROVIDER_SPELL", "presenting_complaint"]
]

## Correlation plot

In [None]:
# Pearson correlation (the pandas default method) over numeric and boolean
# columns only. Selecting them explicitly reproduces what older pandas did
# implicitly and avoids the error pandas >= 2.0 raises when corr() meets
# non-numeric (e.g. string) columns. Timedelta columns are excluded because
# numpy treats timedelta64 as a subtype of np.number.
corr = df4.select_dtypes(include=["number", "bool"], exclude=["timedelta"]).corr()

In [None]:
sns.set_theme(style="white")

# Lower-triangle correlation heatmap: the upper triangle (diagonal included)
# is masked out, and a diverging palette is centred on zero.
f, ax = plt.subplots(figsize=(15, 8))
sns.heatmap(
    corr,
    ax=ax,
    mask=np.triu(np.ones_like(corr, dtype=bool)),
    cmap=sns.diverging_palette(230, 20, as_cmap=True),
    center=0,
    vmax=0.3,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.5},
)

In [None]:
# Distinct values (printed) and their frequencies (displayed) for the
# first ward-stay identifier.
print(df4["FIRST_WARD_STAY_IDENTIFIER"].unique())
df4["FIRST_WARD_STAY_IDENTIFIER"].value_counts()

In [None]:
# Drop the columns the author identified as empty or single-valued.
df5 = df4.drop(
    [
        "FIRST_WARD_STAY_IDENTIFIER",
        "LENGTH_OF_STAY_IN_MINUTES",
        "START_DATE_HOSPITAL_PROVIDER_SPELL",
        "EXPECTED_DISCHARGE_DATE_TIME",
    ],
    axis="columns",
)

In [None]:
# Shape of the subset of rows with a negative (invalid) length of stay.
df5.loc[df5["LENGTH_OF_STAY"].lt(0)].shape

In [None]:
# Keep only rows with a non-negative length of stay.
# ">= 0" is the exact complement of the "< 0" test used to count invalid
# rows; the previous "> -1" would let fractional values between -1 and 0
# through. Rows with a missing LENGTH_OF_STAY are dropped as well, because
# NaN >= 0 evaluates to False.
df6 = df5[df5["LENGTH_OF_STAY"] >= 0]

## Check duplicate rows

In [None]:
# Preview every copy of the fully duplicated rows (keep=False marks all
# occurrences), ordered by spell start time.
df6.loc[df6.duplicated(keep=False)].sort_values(
    "START_DATE_TIME_HOSPITAL_PROVIDER_SPELL"
).head(5)

In [None]:
# Remove exact duplicate rows, keeping the first occurrence of each.
df7 = df6.drop_duplicates(keep="first")

In [None]:
# Shapes before and after de-duplication, one per line.
print(df6.shape, df7.shape, sep="\n")

## Check duplicate columns

In [None]:
# Are the two start-time columns exactly equal?
df7["FIRST_START_DATE_TIME_WARD_STAY"].equals(
    df7["START_DATE_TIME_HOSPITAL_PROVIDER_SPELL"]
)

In [None]:
# Column-wise difference between the two start times for 100 sampled rows
# (second column minus first). A fixed random_state makes the sampled rows
# identical on every Restart & Run All.
df7[
    ["FIRST_START_DATE_TIME_WARD_STAY", "START_DATE_TIME_HOSPITAL_PROVIDER_SPELL"]
].sample(100, random_state=0).diff(axis=1)

In [None]:
# Inspect one specific row by its index label.
# NOTE(review): 154218 is a hard-coded label - presumably picked from the
# random sample in the previous cell (confirm); .loc raises KeyError if that
# label is not present in df7.
df7.loc[154218]

In [None]:
# Number of rows whose spell start is strictly later than the first
# ward-stay start.
(
    df7["START_DATE_TIME_HOSPITAL_PROVIDER_SPELL"]
    - df7["FIRST_START_DATE_TIME_WARD_STAY"]
    > pd.Timedelta(0)
).sum()

In [None]:
# Remove the first ward-stay start column, which the author treats as a
# duplicate of the spell start column.
df8 = df7.drop(columns=["FIRST_START_DATE_TIME_WARD_STAY"])

In [None]:
# (rows, columns) after dropping the duplicate column.
df8.shape

## Pandas profiling

In [None]:
# Show the installed pandas version.
pd.__version__
# Note: pandas-profiling has a bug with pandas 1.4.1, see
# https://github.com/ydataai/pandas-profiling/issues/911
# Use a lower pandas version (e.g. 1.3.5).

In [None]:
# The dataset is large and profiling crashes unless minimal=True is used.
profile = ProfileReport(
    df8,
    minimal=True,
    title="Pandas Profiling Report",
)

In [None]:
# Display the profiling report built in the previous cell.
profile

In [None]:
# Frequency of each attendance_type value.
df8["attendance_type"].value_counts()

In [None]:
# Drop the columns the author judged invalid or incomplete.
df9 = df8.drop(
    [
        "FIRST_REGULAR_DAY_OR_NIGHT_ADMISSION_DESCRIPTION",
        "wait",
        "attendance_type",
        "initial_wait",
        "arrival_day_of_week",
        "arrival_month_name",
    ],
    axis="columns",
)

In [None]:
# Null map of the final frame: one heatmap row per record, one heatmap
# column per remaining field.
sns.set_theme(rc={"figure.figsize": (15, 8)})
sns.heatmap(df9.isna(), cbar=False);

## Export snapshot data

In [None]:
# Write the cleaned frame to a parquet snapshot.
df9.to_parquet(path="../data/02-eda.parquet")