# Data cleaning

This notebook processes the data exported from the EDA notebook, following feedback from the data owner.

In [None]:
import pandas as pd
import seaborn as sns

# Raise pandas' display limits so long and wide frames are shown in cell output
# (up to 500 rows and 100 columns).
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 100)

%matplotlib inline

In [None]:
# Load data: read the parquet file exported by the EDA notebook.
# The path is relative, so it resolves against the kernel's working directory.
eda_data_df = pd.read_parquet("../data/02-eda.parquet")

In [None]:
# list the columns available in the loaded frame
eda_data_df.columns

## Drop early rows missing data

As explored during EDA, the earliest ~100k rows are missing a lot of data due to the introduction of a new clinical system. Remove these rows rather than impute.

In [None]:
# visualise when the missing rows end; wait_minutes is a newer field
# (the series is plotted against the frame's index, so missing values show as a gap)
eda_data_df.wait_minutes.plot()

In [None]:
# Build a new dataframe that starts at the first row with a non-null
# wait_minutes: find that row's index label, then keep every row whose index
# label is greater than or equal to it.
first_wait_label = eda_data_df.wait_minutes.first_valid_index()
keep_from_first_wait = eda_data_df.index >= first_wait_label
modern_data_df = eda_data_df[keep_from_first_wait]

# re-plot to check whether the leading gap has gone
modern_data_df.wait_minutes.plot()

In [None]:
# the gap is still visible, check first few values
# (head() shows the first five rows of the trimmed frame)
modern_data_df.wait_minutes.head()

In [None]:
# There is one odd row with a value for wait_minutes ahead of the remaining
# gap; remove it so the next cell can find the real start of the data.
#
# The result is reassigned rather than dropped with `inplace=True`:
# modern_data_df was produced by filtering eda_data_df, and an in-place drop on
# such a frame can raise SettingWithCopyWarning. `errors="ignore"` keeps the
# cell idempotent, so re-running it once the row has gone does not raise
# KeyError.
# NOTE(review): 66955 is a hard-coded index label tied to the current export;
# with errors="ignore" a changed export would silently drop nothing — confirm
# the label whenever the input file is regenerated.
modern_data_df = modern_data_df.drop(index=66955, errors="ignore")

In [None]:
# Reprocess to find the beginning of the "modern" block of data: look up the
# first non-null wait_minutes again and keep only rows from that index label on.
modern_start_label = modern_data_df.wait_minutes.first_valid_index()
in_modern_block = modern_data_df.index >= modern_start_label
modern_data_df = modern_data_df[in_modern_block]

# confirm visually
modern_data_df.wait_minutes.plot()

## Drop redundant/agreed columns

These columns are dropped as agreed with the data SME.

In [None]:
# Columns to remove, grouped by the reason for removal.

# Redundant columns
redundant_cols = [
    "Frailty Proxy",
    "all_breach_reason_codes",
    "ae_attendance_category_code",
    "all_diagnosis_codes",
    "all_investigation_codes",
    "all_local_investigation_codes",
    "all_local_treatment_codes",
    "all_treatment_codes",
    "PATIENT_CLASSIFICATION",
    "PATIENT_GENDER_CURRENT",
    "SOURCE_OF_ADMISSION_HOSPITAL_PROVIDER_SPELL",
    "TREATMENT_FUNCTION_CODE_AT_ADMISSION",
    "MAIN_SPECIALTY_CODE_AT_ADMISSION",
    "ae_initial_assessment_triage_category_code",
    "ae_initial_assessment_triage_category",
    "major_minor",
    "manchester_triage_category",
]

# Identifier columns
identifier_cols = [
    "LOCAL_PATIENT_IDENTIFIER",
    "previous_30_day_hospital_provider_spell_number",
    "ED_attendance_episode_number",
    "unique_internal_ED_admission_number",
    "unique_internal_IP_admission_number",
]

# Less useful columns
less_useful_cols = ["wait_minutes", "initial_wait_minutes"]

# A single drop over the combined list; drop() returns a new frame, so
# modern_data_df itself is left unchanged.
reduced_cols_df = modern_data_df.drop(
    columns=redundant_cols + identifier_cols + less_useful_cols
)

In [None]:
# (rows, columns) remaining after the column drops
reduced_cols_df.shape

In [None]:
# Visualise missing data: heatmap of the boolean null mask (rows x columns).
sns.set(rc={"figure.figsize": (15, 8)})
reduced_null_mask = reduced_cols_df.isna()
# colour bar is disabled; trailing semicolon suppresses the Axes repr
sns.heatmap(reduced_null_mask, cbar=False);

## Assign NaN values

* SME agrees that NaN = N for `stroke_ward_stay`
* SME agrees that None = N for `IS_major`

In [None]:
# value frequencies before imputation (value_counts() excludes NaN by default)
reduced_cols_df.stroke_ward_stay.value_counts()

In [None]:
# Fill stroke_ward_stay: missing values are treated as "N".
# Work on a copy so reduced_cols_df keeps its original nulls.
imputed_df = reduced_cols_df.copy()
# Assign the filled column back rather than calling fillna(inplace=True) on the
# attribute-accessed column: that form is chained assignment, which emits a
# FutureWarning in pandas 2.x and does not update the frame under Copy-on-Write.
imputed_df["stroke_ward_stay"] = imputed_df["stroke_ward_stay"].fillna("N")
# confirm the new "N" count
imputed_df.stroke_ward_stay.value_counts()

In [None]:
# value frequencies before imputation (value_counts() excludes NaN by default)
imputed_df.IS_major.value_counts()

In [None]:
# Fill IS_major: missing values are treated as "N".
# Assign the filled column back rather than calling fillna(inplace=True) on the
# attribute-accessed column: that form is chained assignment, which emits a
# FutureWarning in pandas 2.x and does not update the frame under Copy-on-Write.
imputed_df["IS_major"] = imputed_df["IS_major"].fillna("N")
# confirm the new "N" count
imputed_df.IS_major.value_counts()

## Drop sparse rows

In [None]:
# check for null values across dataset (number of nulls per column)
imputed_df.isnull().sum()

In [None]:
# Some columns have < 1000 null values, and a larger subset have ~68000
# missing. Drop the rows where any of the columns listed below is null.
sparse_row_cols = [
    "ADMISSION_METHOD_HOSPITAL_PROVIDER_SPELL_DESCRIPTION",
    "EXPECTED_DISCHARGE_DATE",
    "MAIN_SPECIALTY_CODE_AT_ADMISSION_DESCRIPTION",
    "POST_CODE_AT_ADMISSION_DATE_DISTRICT",
    "IMD county decile",
    "all_diagnoses",
]
# how="any" is the dropna default, spelled out here for the reader
removed_sparse_rows_df = imputed_df.dropna(subset=sparse_row_cols, how="any")

## Add derived fields

These were removed during EDA and can now be calculated for the full dataset

In [None]:
# Derive calendar features from the spell start timestamp. assign() returns a
# new frame, so removed_sparse_rows_df is not modified.
spell_start = removed_sparse_rows_df.START_DATE_TIME_HOSPITAL_PROVIDER_SPELL
derived_df = removed_sparse_rows_df.assign(
    # first three characters of the day name
    arrival_day_of_week=spell_start.dt.day_name().str[:3],
    # first three characters of the month name
    arrival_month_name=spell_start.dt.month_name().str[:3],
)

In [None]:
# sanity check: distinct values produced for the derived weekday column
derived_df.arrival_day_of_week.unique()

In [None]:
# Mid-week appears busier. Weekend is quietest
# (row counts per derived weekday value)
derived_df.arrival_day_of_week.value_counts()

In [None]:
# sanity check: distinct values produced for the derived month column
derived_df.arrival_month_name.unique()

In [None]:
# Winter months are busiest
# (row counts per derived month value)
derived_df.arrival_month_name.value_counts()

## Homogenise binary fields

Many fields are encoded as Y/N or similar; convert these into binary (0/1) fields.

In [None]:
# Recode flag columns to 0/1 on a copy, leaving derived_df untouched.
binary_fields_df = derived_df.copy()

# Existing flag columns: "N" becomes 0 and every other value becomes 1.
# NOTE(review): "every other value" includes NaN, so a missing flag is recoded
# as 1 — confirm these columns contain no nulls at this point.
for yn_col in (
    "stroke_ward_stay",
    "IS_care_home_on_admission",
    "IS_care_home_on_discharge",
):
    binary_fields_df[yn_col] = binary_fields_df[yn_col].apply(
        lambda flag: 0 if flag == "N" else 1
    )

# create new fields; a value absent from the mapping becomes NaN
illness_codes = {"Illness": 1, "Injury": 0}
elective_codes = {"Non-elective admission": 0, "Elective admission": 1}
binary_fields_df["IS_illness_not_injury"] = binary_fields_df[
    "Illness Injury Flag"
].map(illness_codes)
binary_fields_df["IS_elective"] = binary_fields_df.elective_or_non_elective.map(
    elective_codes
)

# drop old
binary_fields_df = binary_fields_df.drop(
    columns=["Illness Injury Flag", "elective_or_non_elective"]
)

In [None]:
# Check new binary fields: print each one's null count and value frequencies.
binary_check_fields = (
    "stroke_ward_stay",
    "IS_care_home_on_admission",
    "IS_care_home_on_discharge",
    "IS_illness_not_injury",
    "IS_elective",
)
for field in binary_check_fields:
    field_values = binary_fields_df[field]
    null_count = field_values.isnull().sum()
    print(f"{field} has {null_count} null values and values:")
    print(field_values.value_counts())

## Check genders

In [None]:
# there are only 13 "not specified" genders; all others are M/F
# (frequency of each gender description)
binary_fields_df.PATIENT_GENDER_CURRENT_DESCRIPTION.value_counts()

In [None]:
# Drop "not specified" values: collect the index labels of the matching rows,
# then drop those labels. drop() returns a new frame.
# NOTE(review): the comparison is an exact, case-sensitive match on
# "Not specified" — confirm it matches the spelling in the data.
is_unspecified_gender = (
    binary_fields_df.PATIENT_GENDER_CURRENT_DESCRIPTION == "Not specified"
)
unspecified_gender_labels = binary_fields_df.loc[is_unspecified_gender].index
genders_df = binary_fields_df.drop(index=unspecified_gender_labels)

In [None]:
# check null values (number of nulls per column)
# there are still some columns where the majority (~400k) of values are null;
# these can be encoded in a null field during modelling, e.g. ae_arrival_mode
genders_df.isnull().sum()

In [None]:
# Plot null values: heatmap of the boolean null mask (rows x columns).
sns.set(rc={"figure.figsize": (15, 8)})
final_null_mask = genders_df.isna()
# colour bar is disabled; trailing semicolon suppresses the Axes repr
sns.heatmap(final_null_mask, cbar=False);

In [None]:
# (rows, columns) of the frame that is exported below
genders_df.shape

In [None]:
# export data: write the cleaned frame to parquet
# (the path is relative, so it resolves against the kernel's working directory)
genders_df.to_parquet("../data/03-clean-data.parquet")