# Data smart (for tree based methods)
- Imputation based on domain knowledge
- Imputation with train and test data

In [None]:
import cudf
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from preprocessing_tools import drop_missing_train_test, get_top_abs_correlations, encoder_train_test

In [None]:
# Load the labelled (training) and unlabelled (test) tables.
# index_col=0 makes the first CSV column the row index of each frame.
train_path = "../data/TrainingWiDS2021.csv"
test_path = "../data/UnlabeledWiDS2021.csv"
train, test = (pd.read_csv(path, index_col=0) for path in (train_path, test_path))

In [None]:
# Read the pickled collection of exam column names.
# NOTE(review): pickle executes arbitrary code on load — only use with trusted project files.
exam_cols_path = "../data/exam_cols.pkl"
with open(exam_cols_path, "rb") as handle:
    exams_cols = pickle.load(handle)

In [None]:
# with open("../data/data_fe.pkl", "rb") as f:
#     data = pickle.load(f)

# train = data["train"]
# test = data["test"]

## Split data into covariates and response

In [None]:
RESPONSE_COL = "diabetes_mellitus"
DROP_X_COLS = ["readmission_status", "encounter_id", "hospital_id"]

# Response: the target column of the labelled table.
y_train_val = train[RESPONSE_COL]

# Covariates: everything except the columns in DROP_X_COLS (and, for the
# labelled table, the response itself). The source frames are copied first so
# `train` and `test` are left untouched.
x_train_val = train.copy().drop(columns=[*DROP_X_COLS, RESPONSE_COL])
x_test = test.copy().drop(columns=DROP_X_COLS)

## Encoding categorical

In [None]:
%%time
x_train_val, x_test, string_cols_le = encoder_train_test(x_train_val, x_test)
x_train_val = x_train_val.to_pandas()
x_test = x_test.to_pandas()

## Data imputation based on domain knowledge

**Hypotheses**
- For exam variables, NaN means the exam was not performed. Imputing with -99 keeps the information that the exam was not performed.
- Physical attributes such as age, bmi, height, gender and weight can be used to impute one another.
- For categorical variables, let's keep the effect of NaN by imputing with -99.

### Physical variables (age, bmi, ethnicity, gender, height and weight)

In [None]:
%%time
physical_cols = ["age", "bmi", "ethnicity", "gender", "height", "weight"]
train_physical = x_train_val.loc[:, physical_cols]
test_physical = x_test.loc[:, physical_cols]

imputer = KNNImputer()
train_test_physical_imputed = imputer.fit_transform(pd.concat([train_physical, test_physical], axis=0))
train_test_physical_imputed = pd.DataFrame(train_test_physical_imputed, columns=physical_cols)

In [None]:
# Undo the train/test stacking positionally: the first rows belong to the
# training table, the last rows to the test table.
n_train_rows = len(train_physical)
n_test_rows = len(test_physical)
train_physical_imputed = train_test_physical_imputed.head(n_train_rows)
test_physical_imputed = train_test_physical_imputed.tail(n_test_rows)

### Exams variables

In [None]:
# Keep only the exam columns that actually exist in the feature table
# (np.intersect1d returns the sorted, de-duplicated common values).
available_columns = list(x_train_val.columns)
exams_cols = np.intersect1d(exams_cols, available_columns)

In [None]:
# Exam columns: a missing value is filled with the sentinel -99 in both tables.
train_exams_imputed, test_exams_imputed = (
    frame.loc[:, exams_cols].fillna(-99) for frame in (x_train_val, x_test)
)

### Other variables

In [None]:
# NOTE: despite its name, `others_cols` lists the columns that are ALREADY
# handled (physical + exam). It is used below as the set of columns to drop
# in order to obtain the remaining ("other") columns.
handled_groups = (physical_cols, exams_cols)
others_cols = np.concatenate(handled_groups)

In [None]:
# Per-column count of missing values among the remaining columns of the
# training features (physical and exam columns excluded).
x_train_val.drop(columns=others_cols).isna().sum()

In [None]:
# Per-column count of missing values among the remaining columns of the TEST
# features (physical and exam columns excluded). This cell was an exact
# duplicate of the training-set check, which left the test set uninspected.
x_test.drop(others_cols, axis=1).isna().sum()

In [None]:
# Remaining columns (everything that is neither physical nor exam):
# missing values are filled with the sentinel -99 in both tables.
train_others_imputed, test_others_imputed = (
    frame.drop(columns=others_cols).fillna(-99) for frame in (x_train_val, x_test)
)

### Merging all imputed

In [None]:
# Reassemble the feature tables from the three imputed pieces.
#
# The physical block must be the KNN-imputed one: the raw `train_physical` /
# `test_physical` frames still contain NaNs, and using them would silently
# discard the imputation.
#
# pd.concat(axis=1) aligns rows by index label, and the imputed physical
# frames may not carry the same row labels as the other pieces. Their rows are
# in the same order, so relabel them by position before concatenating.
train_physical_aligned = train_physical_imputed.set_axis(train_exams_imputed.index, axis=0)
test_physical_aligned = test_physical_imputed.set_axis(test_exams_imputed.index, axis=0)

x_train_val = pd.concat([train_physical_aligned, train_exams_imputed, train_others_imputed], axis=1)
x_test = pd.concat([test_physical_aligned, test_exams_imputed, test_others_imputed], axis=1)

In [None]:
SEED = 529
TEST_SIZE = 0.2

# Hold out TEST_SIZE of the labelled rows for validation; the fixed seed
# makes the split reproducible.
split_options = {"test_size": TEST_SIZE, "random_state": SEED}
x_train, x_val, y_train, y_val = train_test_split(x_train_val, y_train_val, **split_options)

In [None]:
# Bundle every artefact of this notebook into one dictionary for pickling.
# Full labelled set and the unlabelled test set first ...
data_clean = {
    "x_train_val": x_train_val,
    "y_train_val": y_train_val,
    "x_test": x_test,
}
# ... followed by the train/validation split of the labelled set.
data_clean.update(
    {
        "x_train": x_train,
        "y_train": y_train,
        "x_val": x_val,
        "y_val": y_val,
    }
)

In [None]:
# Persist the bundle of cleaned tables to disk as a pickle.
output_path = "../data/data_smart_no_correlated_drop.pkl"
with open(output_path, "wb") as sink:
    pickle.dump(data_clean, sink)