In [None]:
import cudf
import pandas as pd
import numpy as np
import pickle
from preprocessing_tools import drop_missing_train_test, get_top_abs_correlations, encoder_train_test

In [None]:
# Load the labelled training set and the unlabelled test set; in both files
# the first CSV column is used as the row index.
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "../data/TrainingWiDS2021.csv",
        "../data/UnlabeledWiDS2021.csv",
    )
)

## Split data into covariates and response

In [None]:
# Target column, and columns that must never be used as features.
RESPONSE_COL = "diabetes_mellitus"
DROP_X_COLS = ["readmission_status", "encounter_id"]

# Training features: everything except the excluded columns and the target.
# `DataFrame.drop` already returns a new frame, so no explicit `.copy()` is
# needed, and plain list concatenation builds the label list without a
# detour through a NumPy array (DROP_X_COLS itself is left untouched).
x_train_val = train.drop(columns=DROP_X_COLS + [RESPONSE_COL])
y_train_val = train.loc[:, RESPONSE_COL]

# The unlabelled file only needs the excluded columns removed.
x_test = test.drop(columns=DROP_X_COLS)

## Drop missing

In [None]:
# Threshold handed to `drop_missing_train_test`.
# NOTE(review): presumably the maximum tolerated fraction of missing values
# per column -- confirm against preprocessing_tools.
MISSING_TRESH = 0.5

# Filter train and test together through the project helper. `treshold`
# (sic) is the keyword exactly as this call spells it.
# Both names are rebound, so re-running this cell alone filters the already
# filtered frames again.
x_train_val, x_test = drop_missing_train_test(x_train_val, x_test, treshold=MISSING_TRESH)

## Preprocess covariates

In [None]:
# Columns stored with dtype "object" are treated as categorical; every other
# column is treated as numeric. The split is decided on the training frame
# and then applied unchanged to the test frame.
is_object_col = x_train_val.dtypes == "object"
cat_cols = x_train_val.columns[is_object_col].values
num_cols = x_train_val.columns[~is_object_col].values

x_num_train_val, x_cat_train_val = x_train_val[num_cols], x_train_val[cat_cols]
x_num_test, x_cat_test = x_test[num_cols], x_test[cat_cols]

### Removing correlated features

In [None]:
# Pairs whose absolute correlation reaches this value are kept for removal.
CORR_TRESH = 0.8

# Mean-impute into a scratch frame used only for the correlation screening;
# `x_num_train_val` itself keeps its missing values.
train_col_means = x_num_train_val.mean()
x_num_inputed = x_num_train_val.fillna(train_col_means)

# NOTE(review): `n` looks like a cap on the number of pairs returned, set
# very high so nothing is truncated -- confirm in preprocessing_tools.
corr_info = get_top_abs_correlations(x_num_inputed, n=10_000_000)
reaches_threshold = corr_info["corr_abs"] >= CORR_TRESH
corr_selected = corr_info.loc[reaches_threshold]

In [None]:
# Every distinct feature named in the "level_1" column of the selected pairs
# is removed; the set collapses features that appear in several pairs.
to_drop = list(set(corr_selected["level_1"].tolist()))

# Drop the same columns from the numeric training features and from the full
# test frame. Both names are rebound, so this cell is not idempotent:
# re-running it without re-running the cells above raises KeyError because
# the columns are already gone.
x_num_train_val = x_num_train_val.drop(columns=to_drop)
x_test = x_test.drop(columns=to_drop)

## Encoding categorical features

In [None]:
%%time
# Encode the categorical columns with the project helper. The arguments are
# asymmetric: the training side receives only the categorical columns, while
# the test side receives the whole test frame (after the correlated columns
# were dropped). The later selection of all training columns from
# `x_test_le` relies on the numeric test columns passing through the helper.
# NOTE(review): `string_cols_le` is not used anywhere in the visible cells.
x_train_le, x_test_le, string_cols_le = encoder_train_test(x_cat_train_val, x_test)
# Convert the helper's outputs to pandas objects.
# NOTE(review): presumably cuDF frames, given `import cudf` -- confirm.
x_train_le = x_train_le.to_pandas()
x_test_le = x_test_le.to_pandas()

In [None]:
# Join the numeric features with the encoded categorical features
# column-wise. `pd.concat(axis=1)` aligns on the row index with an outer
# join, so a mismatched index (e.g. one reset during the round trip through
# the encoder) would silently add NaN-padded rows instead of failing.
x_train_val = pd.concat([x_num_train_val, x_train_le], axis=1)
assert len(x_train_val) == len(x_num_train_val) == len(x_train_le), (
    "row index mismatch between numeric and encoded categorical features"
)

# Give the test frame exactly the training columns, in the same order.
x_test_le = x_test_le[x_train_val.columns.tolist()]

In [None]:
# Bundle the cleaned training features, the training response and the
# cleaned test features under fixed keys for serialization.
data_clean = {
    "x_train": x_train_val,
    "y_train": y_train_val,
    "x_test": x_test_le,
}

In [None]:
# Persist the cleaned bundle as a pickle file; the `with` block closes the
# handle even if serialization fails.
with open("../data/data_remove_redudant.pkl", "wb") as pkl_file:
    pickle.dump(data_clean, pkl_file)