# Feature engineering

This notebook processes cleaned data into the feature set used for modelling.

The decisions around feature engineering are the culmination of a number of explorations of the data, including modelling of the full dataset, which is not included in this repository.

In [None]:
import numpy as np
import pandas as pd

# Lift pandas' display truncation for both rows and columns
# (a value of None means "no limit").
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, None)

In [None]:
# Read the cleaned dataset from parquet (relative path).
CLEAN_DATA_PATH = "../../data/clean-data.parquet"
clean_data_df = pd.read_parquet(CLEAN_DATA_PATH)

## Add derived fields

These were removed during cleaning due to missing data, and can be recalculated:

In [None]:
# Rebuild the two calendar fields from the spell start timestamp.
# Each name is abbreviated to its first three characters (e.g. "Mon", "Jan").
spell_start = clean_data_df["START_DATE_TIME_HOSPITAL_PROVIDER_SPELL"]
derived_df = clean_data_df.assign(
    arrival_day_of_week=spell_start.dt.day_name().str[:3],
    arrival_month_name=spell_start.dt.month_name().str[:3],
)

## Select agreed columns

The columns selected below were agreed with the data subject-matter expert (SME).

In [None]:
# Feature columns to keep for modelling. The order here is the column order
# of `subset_df`, so do not reorder without checking downstream cells.
columns = [
    "ADMISSION_METHOD_HOSPITAL_PROVIDER_SPELL_DESCRIPTION",
    "ae_arrival_mode",
    "IS_major",
    "AGE_ON_ADMISSION",
    "ED CountLast12m",
    "EL CountLast12m",
    "IS_elective",
    # NOTE(review): no space here, unlike "ED CountLast12m" / "EL CountLast12m"
    # above — confirm this matches the column name in the cleaned data.
    "EMCountLast12m",
    "IS_illness_not_injury",
    "IS_cancer",
    "IS_care_home_on_admission",
    "IS_chronic_kidney_disease",
    "IS_COPD",
    "IS_coronary_heart_disease",
    "IS_dementia",
    "IS_diabetes",
    "IS_frailty_proxy",
    "IS_hypertension",
    "IS_mental_health",
    "MAIN_SPECIALTY_CODE_AT_ADMISSION_DESCRIPTION",
    "OP First CountLast12m",
    "OP FU CountLast12m",
    "SOURCE_OF_ADMISSION_HOSPITAL_PROVIDER_SPELL_DESCRIPTION",
    "stroke_ward_stay",
    "LENGTH_OF_STAY",
    # Derived calendar fields added to `derived_df`.
    "arrival_day_of_week",
    "arrival_month_name",
]

# Keep all rows, restricted to the agreed columns (raises KeyError if any is missing).
subset_df = derived_df.loc[:, columns]

## One-hot encode categorical data

In [None]:
# Split the object-dtype (categorical) columns into those that contain at
# least one null and those that contain none.
categorical_features = subset_df.columns[subset_df.dtypes == "object"]

# Number of nulls per categorical column, computed once and reused for both splits.
null_counts = subset_df[categorical_features].isnull().sum()

categorical_features_with_null = list(null_counts.index[null_counts > 0])
categorical_features_without_null = list(null_counts.index[null_counts == 0])

print(categorical_features_with_null)
print(categorical_features_without_null)

In [None]:
# One-hot encode the categorical features that contain nulls.
#
# Without a NaN indicator, a null would be encoded only implicitly (all of the
# feature's dummy columns are 0), which hides how important missingness is.
# dummy_na=True therefore adds an explicit "<feature>_nan" column per feature.
#
# Fix: the input frame was `df2`, which is never defined in this notebook and
# raises NameError on a fresh kernel. `subset_df` is the frame that
# `categorical_features_with_null` was derived from, so it is used here.
#
# NOTE(review): an earlier note in this cell said "na values are highly
# correlated so dropping", which contradicts dummy_na=True — confirm whether
# the NaN indicator columns should be kept or dropped. (Still marked TBC.)
df3 = pd.get_dummies(subset_df, columns=categorical_features_with_null, dummy_na=True)

In [None]:
# One-hot encode the categorical features that have no nulls.
# Every category keeps its own column (drop_first=False): dropping the first
# level would avoid the "dummy variable trap" (redundant columns), but could
# cost interpretability if the dropped level (e.g. Monday) turns out to matter.
df4 = pd.get_dummies(
    data=df3,
    columns=categorical_features_without_null,
    drop_first=False,
)

In [None]:
# Check how each engineered feature correlates with LENGTH_OF_STAY,
# ordered by absolute correlation (strongest first).
#
# Fix: the original called `datetime_df.corr()`, but `datetime_df` is never
# defined in this notebook and raises NameError on a fresh kernel.
# NOTE(review): `df4` (the fully one-hot-encoded frame) is assumed to be the
# intended input — confirm no intermediate datetime step is missing.
corr = df4.corr()
strongest_first = corr["LENGTH_OF_STAY"].abs().sort_values(ascending=False).index
corr.loc[strongest_first, "LENGTH_OF_STAY"]

In [None]:
# TODO: export the engineered feature set to parquet (not yet implemented in this cell)