# 02 Exploratory Data Analysis

Notebook goal: initial Exploratory Data Analysis to understand features, data cleanliness and basic (inter)correlations.

## 1. Load data

Load the data that was processed in the previous notebook.

In [None]:
import pandas as pd
from pandas_profiling import ProfileReport
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

%matplotlib inline

# lift pandas' truncation limits so wide frames and long series display in full
for display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(display_option, None)

In [None]:
# read the parquet snapshot from the data directory into a DataFrame
original_data_df = pd.read_parquet(path="../data/original-data.parquet")

## Explore Data types

In [None]:
# inspect the dtype of every column as loaded from parquet
original_data_df.dtypes

In [None]:
# distinct values present in the arrival day-of-week column
original_data_df["arrival_day_of_week"].unique()

In [None]:
# distinct values present in the arrival month-name column
original_data_df["arrival_month_name"].unique()

## Convert datetimes

In [None]:
# work on a deep copy so the frame loaded from disk stays untouched
datetimes_df = original_data_df.copy(deep=True)

In [None]:
# parse the spell discharge column into datetimes using an explicit timestamp layout
datetimes_df["DISCHARGE_DATE_HOSPITAL_PROVIDER_SPELL"] = pd.to_datetime(
    datetimes_df["DISCHARGE_DATE_HOSPITAL_PROVIDER_SPELL"],
    format="%Y-%m-%d %H:%M:%S.%f",
)

In [None]:
# original note: "mainly nan" - unclear whether this refers to this column or
# to DISCHARGE_READY_DATE (previously spot-checked with .sample(10)); TODO confirm
# parse the expected discharge column into datetimes using an explicit timestamp layout
datetimes_df["EXPECTED_DISCHARGE_DATE"] = pd.to_datetime(
    datetimes_df["EXPECTED_DISCHARGE_DATE"],
    format="%Y-%m-%d %H:%M:%S.%f",
)

In [None]:
# parse the first ward-stay start column into datetimes using an explicit timestamp layout
datetimes_df["FIRST_START_DATE_TIME_WARD_STAY"] = pd.to_datetime(
    datetimes_df["FIRST_START_DATE_TIME_WARD_STAY"],
    format="%Y-%m-%d %H:%M:%S.%f",
)

In [None]:
# parse the spell start column into datetimes using an explicit timestamp layout
datetimes_df["START_DATE_TIME_HOSPITAL_PROVIDER_SPELL"] = pd.to_datetime(
    datetimes_df["START_DATE_TIME_HOSPITAL_PROVIDER_SPELL"],
    format="%Y-%m-%d %H:%M:%S.%f",
)

In [None]:
# re-inspect dtypes to confirm the pd.to_datetime conversions took effect
datetimes_df.dtypes

## Sanity checks

Does the LENGTH_OF_STAY match start/end dates?

In [None]:
# confirm both spell boundary columns have the expected dtype before subtracting them
spell_boundary_cols = [
    "DISCHARGE_DATE_HOSPITAL_PROVIDER_SPELL",
    "START_DATE_TIME_HOSPITAL_PROVIDER_SPELL",
]
datetimes_df[spell_boundary_cols].dtypes

In [None]:
# Discharge is whole day, admission is datetime
# fixed random_state so the same rows are shown on every Restart & Run All
datetimes_df[
    [
        "DISCHARGE_DATE_HOSPITAL_PROVIDER_SPELL",
        "START_DATE_TIME_HOSPITAL_PROVIDER_SPELL",
    ]
].sample(10, random_state=0)

In [None]:
# calculate derived LoS (discharge minus admission), rounded up to whole days
# use a true ceiling rather than floor + 1: floor + 1 over-counts by one day
# whenever the difference is already an exact number of days
# ("24h" rather than "D" so the frequency is a fixed 24-hour span)
datetimes_df["DER_los"] = (
    (
        datetimes_df["DISCHARGE_DATE_HOSPITAL_PROVIDER_SPELL"]
        - datetimes_df["START_DATE_TIME_HOSPITAL_PROVIDER_SPELL"]
    )
    .dt.ceil("24h")
    .dt.days
)

In [None]:
# eyeball the derived and recorded length of stay side by side - do they agree?
datetimes_df.loc[:, ["DER_los", "LENGTH_OF_STAY"]].head(10)

In [None]:
# compare recorded vs derived LoS (recorded minus derived, in days)
# the signed mean alone can be ~0 while positive and negative errors cancel,
# so also report the mean absolute difference and how many rows disagree
los_diff = datetimes_df["LENGTH_OF_STAY"] - datetimes_df["DER_los"]
pd.Series(
    {
        "mean_diff": los_diff.mean(),
        "mean_abs_diff": los_diff.abs().mean(),
        # rows where either value is missing compare as False and are not counted
        "n_mismatched_rows": (los_diff.abs() > 0).sum(),
    }
)

In [None]:
# drop the derived column
# rebinding (instead of inplace=True) with errors="ignore" keeps this cell
# idempotent: re-running it no longer raises KeyError once the column is gone
datetimes_df = datetimes_df.drop(columns="DER_los", errors="ignore")

## Data ordering

How is the data ordered?

In [None]:
# first ten patient identifiers in file order - not ordered by local patient id
datetimes_df["LOCAL_PATIENT_IDENTIFIER"].head(10)

# nb. value_counts() shows repeat visits - could this be a feature?

In [None]:
# first ten spell start timestamps in file order - not ordered by start-date
datetimes_df["START_DATE_TIME_HOSPITAL_PROVIDER_SPELL"].head(10)

In [None]:
# first ten discharge dates in file order - not ordered by end-date either
datetimes_df["DISCHARGE_DATE_HOSPITAL_PROVIDER_SPELL"].head(10)

In [None]:
# not by cds
# look at the first rows in file order (as the other ordering checks do):
# a random sample shuffles rows and so cannot reveal how the file is ordered
datetimes_df["cds_unique_identifier"].head(10)

### Order by arrival date

In [None]:
# order rows by spell start and renumber the index 0..n-1 in the new order
sorted_datetimes_df = datetimes_df.sort_values(
    "START_DATE_TIME_HOSPITAL_PROVIDER_SPELL",
    ignore_index=True,
)

In [None]:
# first ten rows after sorting by spell start
sorted_datetimes_df.iloc[:10]

### Missing data

In [None]:
# plot the null mask of the sorted frame as a heatmap (no colour bar)
sns.set_theme(rc={"figure.figsize": (15, 8)})
missing_mask = sorted_datetimes_df.isna()
sns.heatmap(missing_mask, cbar=False);

In [None]:
# number of missing values in each column
sorted_datetimes_df.isna().sum()

In [None]:
# distinct values present in IS_major (including any missing marker)
sorted_datetimes_df["IS_major"].unique()

In [None]:
# frequency of each non-null IS_major value
sorted_datetimes_df["IS_major"].value_counts()

In [None]:
# collect the names of columns in which every single value is null
is_all_null = sorted_datetimes_df.isna().all()
empty_cols = sorted_datetimes_df.columns[is_all_null.to_numpy()].array

In [None]:
# show the distinct values of each supposedly-empty column as a sanity check
for empty_col in empty_cols:
    distinct_values = sorted_datetimes_df[empty_col].unique()
    print(f"{empty_col} values: {distinct_values}")

In [None]:
# new frame without the entirely-null columns (the sorted frame is left intact)
remove_empty_cols_df = sorted_datetimes_df.drop(empty_cols, axis="columns")

In [None]:
# shape of the subset of rows that have a stroke_ward_stay value
remove_empty_cols_df.dropna(subset=["stroke_ward_stay"]).shape

In [None]:
# frequency of each non-null stroke_ward_stay value
remove_empty_cols_df["stroke_ward_stay"].value_counts()

## Identify early block of missing data

The first ~100k rows are missing a significant amount of data - is this due to the introduction of a new system?

1. When does the block of missing data end? Does this align with a new clinical system?


In [None]:
# find the first valid index (ie. not null) for the presenting_complaint field, one of the frequently null fields
# first_valid_index() returns an index *label*, so look the row up with .loc;
# .iloc only gave the same row while the index happened to be 0..n-1
first_complaint_label = remove_empty_cols_df["presenting_complaint"].first_valid_index()
assert first_complaint_label is not None, "presenting_complaint is entirely null"
remove_empty_cols_df.loc[
    first_complaint_label,
    ["START_DATE_TIME_HOSPITAL_PROVIDER_SPELL", "presenting_complaint"],
]

In [None]:
# check the previous entry to confirm it has a missing "presenting complaint"
# translate the label returned by first_valid_index() into a position before
# stepping back one row, and refuse to step back from position 0
# (iloc[-1] would silently show the LAST row of the frame instead)
first_complaint_label = remove_empty_cols_df["presenting_complaint"].first_valid_index()
assert first_complaint_label is not None, "presenting_complaint is entirely null"
first_complaint_pos = remove_empty_cols_df.index.get_loc(first_complaint_label)
assert first_complaint_pos > 0, "first row already has a presenting_complaint"
remove_empty_cols_df.iloc[first_complaint_pos - 1][
    ["START_DATE_TIME_HOSPITAL_PROVIDER_SPELL", "presenting_complaint"]
]

## Correlation plot

In [None]:
# Pearson correlation by default
# restrict to numeric/bool columns explicitly: pandas >= 2.0 no longer drops
# non-numeric columns silently and raises on strings/datetimes, while
# select_dtypes also works on the older 1.3.x releases
corr = remove_empty_cols_df.select_dtypes(include=["number", "bool"]).corr()

In [None]:
sns.set_theme(style="white")

# Mask the upper triangle (diagonal included): it mirrors the lower triangle
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(15, 8))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw the heatmap on the explicit axes.
# The colour scale spans the full correlation range [-1, 1]; a ceiling of 0.3
# saturates the palette so every stronger correlation looks identical.
sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap,
    vmin=-1,
    vmax=1,
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.5},
    ax=ax,
)
ax.set_title("Correlation matrix (lower triangle)");

In [None]:
# distinct values (printed) and their frequencies (displayed) for the ward stay identifier
ward_stay_ids = remove_empty_cols_df["FIRST_WARD_STAY_IDENTIFIER"]
print(ward_stay_ids.unique())
ward_stay_ids.value_counts()

In [None]:
# columns judged empty / single-valued during the exploration
low_cardinality_cols = [
    "FIRST_WARD_STAY_IDENTIFIER",
    "LENGTH_OF_STAY_IN_MINUTES",
    "START_DATE_HOSPITAL_PROVIDER_SPELL",
    "EXPECTED_DISCHARGE_DATE_TIME",
]
# new frame without them; the input frame is left intact
remove_low_cardinality_df = remove_empty_cols_df.drop(
    low_cardinality_cols, axis="columns"
)

In [None]:
# shape of the subset of rows whose recorded LoS is negative (i.e. invalid)
has_negative_los = remove_low_cardinality_df["LENGTH_OF_STAY"].lt(0)
remove_low_cardinality_df.loc[has_negative_los].shape

In [None]:
# Remove invalid LoS
# keep only rows with LoS >= 0 - the exact complement of the "< 0" check used
# to count invalid rows (a "> -1" threshold would keep fractional negatives)
# rows with a missing LoS compare as False and are dropped, as before
valid_los_df = remove_low_cardinality_df[remove_low_cardinality_df.LENGTH_OF_STAY >= 0]

## Check duplicate rows

In [None]:
# show every member of each duplicated group (keep=False), ordered by spell start
is_duplicated = valid_los_df.duplicated(keep=False)
valid_los_df.loc[is_duplicated].sort_values(
    "START_DATE_TIME_HOSPITAL_PROVIDER_SPELL"
).head()

In [None]:
# keep the first occurrence of each fully-identical row
no_duplicate_rows_df = valid_los_df.drop_duplicates(keep="first")

In [None]:
# (rows, columns) remaining after de-duplication
print(no_duplicate_rows_df.shape)

## Check duplicate columns

In [None]:
# is the first ward-stay start column an exact duplicate of the spell start column?
ward_stay_start = no_duplicate_rows_df["FIRST_START_DATE_TIME_WARD_STAY"]
spell_start = no_duplicate_rows_df["START_DATE_TIME_HOSPITAL_PROVIDER_SPELL"]
ward_stay_start.equals(spell_start)

In [None]:
# they are different, so work out what the difference is between the columns
# (second column minus first); fixed random_state so the same rows are shown
# on every Restart & Run All
no_duplicate_rows_df[
    ["FIRST_START_DATE_TIME_WARD_STAY", "START_DATE_TIME_HOSPITAL_PROVIDER_SPELL"]
].sample(10, random_state=0).diff(axis=1)

In [None]:
# count rows where the ward-stay start genuinely differs from the spell start
# rows with a missing (NaT) ward-stay start give a NaT difference and are excluded
# differences in BOTH directions are counted: testing only "> 0" would miss
# every row where the ward stay starts after the spell start
ward_vs_spell_delta = (
    no_duplicate_rows_df["START_DATE_TIME_HOSPITAL_PROVIDER_SPELL"]
    - no_duplicate_rows_df["FIRST_START_DATE_TIME_WARD_STAY"]
)
(ward_vs_spell_delta.notna() & ward_vs_spell_delta.ne(pd.Timedelta(0))).sum()

In [None]:
# author's finding: FIRST_START_DATE_TIME_WARD_STAY is sparse and rarely differs
# from the spell start, so it is dropped as redundant
no_duplicate_cols_df = no_duplicate_rows_df.drop(
    "FIRST_START_DATE_TIME_WARD_STAY", axis="columns"
)

In [None]:
# (rows, columns) after dropping the redundant ward-stay start column
no_duplicate_cols_df.shape

## Pandas profiling

In [None]:
# record the installed pandas version before running the profiling report
pd.__version__
# note bug with version 1.4.1: https://github.com/ydataai/pandas-profiling/issues/911
# use lower version (e.g. 1.3.5)

In [None]:
# minimal=True is required: the full report crashes on a dataset this large
profile = ProfileReport(
    no_duplicate_cols_df,
    minimal=True,
    title="Pandas Profiling Report",
)

In [None]:
# bare expression: display the profiling report as the cell output
profile

In [None]:
# frequency of each non-null attendance_type value
no_duplicate_cols_df["attendance_type"].value_counts()

In [None]:
# columns judged invalid / incomplete during the exploration
incomplete_cols = [
    "FIRST_REGULAR_DAY_OR_NIGHT_ADMISSION_DESCRIPTION",
    "wait",
    "attendance_type",
    "initial_wait",
    "arrival_day_of_week",
    "arrival_month_name",
]
# new frame without them; the input frame is left intact
no_incomplete_cols_df = no_duplicate_cols_df.drop(incomplete_cols, axis="columns")

In [None]:
# re-plot the null mask for the cleaned frame (no colour bar)
sns.set_theme(rc={"figure.figsize": (15, 8)})
remaining_missing_mask = no_incomplete_cols_df.isna()
sns.heatmap(remaining_missing_mask, cbar=False);

## Export snapshot data

In [None]:
# persist the cleaned frame as a parquet snapshot in the data directory
no_incomplete_cols_df.to_parquet(path="../data/02-eda.parquet")

## Export data dictionary fields

Data dictionary will be created in Google Sheets/Excel

In [None]:
# write the column dtypes and the describe() summary as CSVs
# for import into Excel/Google Sheets
column_dtypes = no_incomplete_cols_df.dtypes
column_dtypes.to_csv("../data/cols.csv")
summary_stats = no_incomplete_cols_df.describe()
summary_stats.to_csv("../data/describe.csv")