In [None]:
import cudf
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from preprocessing_tools import drop_missing_train_test, get_top_abs_correlations

In [None]:
# Load the labelled training set and the unlabelled set, in that order;
# the first CSV column of each file becomes the DataFrame index.
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("../data/TrainingWiDS2021.csv", "../data/UnlabeledWiDS2021.csv")
)

## Split data into covariates and response

In [None]:
RESPONSE_COL = "diabetes_mellitus"
DROP_X_COLS = ["readmission_status", "encounter_id", "hospital_id"]

# Covariates: everything except the dropped columns and the response.
# `drop` already returns a new frame, so `train` itself is left untouched.
x_train_val = train.drop(columns=[*DROP_X_COLS, RESPONSE_COL])
# Response: the target column on its own.
y_train_val = train[RESPONSE_COL]

# The unlabelled set has no response column, so only the dropped columns are removed.
x_test = test.drop(columns=DROP_X_COLS)

## Drop missing

In [None]:
# Value handed to the project helper as its `treshold` keyword (spelling is the helper's own).
MISSING_TRESH = 0.5

# NOTE(review): the helper lives in preprocessing_tools; presumably it removes
# columns whose missing fraction exceeds the threshold, consistently in both
# frames -- confirm against its implementation.
x_train_val, x_test = drop_missing_train_test(
    x_train_val,
    x_test,
    treshold=MISSING_TRESH,
)

## Split features into categorical and numerical

In [None]:
# Columns with dtype "object" are treated as categorical, all others as numerical.
# The column lists are derived from the training frame and reused for the test frame.
is_categorical = x_train_val.dtypes == "object"
num_cols = x_train_val.columns[~is_categorical].values
cat_cols = x_train_val.columns[is_categorical].values

x_num_train_val = x_train_val.loc[:, num_cols]
x_cat_train_val = x_train_val.loc[:, cat_cols]

x_num_test = x_test.loc[:, num_cols]
x_cat_test = x_test.loc[:, cat_cols]

## Data imputation

In [None]:
# Column means are computed on the training data only and then used to fill
# missing values in both the training and the test frames.
means = x_num_train_val.mean()
x_num_train_val, x_num_test = (
    frame.fillna(means) for frame in (x_num_train_val, x_num_test)
)

## PCA

In [None]:
# Passed to PCA as `n_components`.
VARIANCE_EXPLAINED = 0.99

# Fit on the imputed numerical training features only; `fit` returns the estimator.
pca = PCA(n_components=VARIANCE_EXPLAINED).fit(x_num_train_val)
# Displayed as the cell output: number of components actually kept.
pca.n_components_

In [None]:
# Project both numerical frames with the fitted PCA and wrap the resulting
# arrays in DataFrames (default integer column labels and a fresh RangeIndex).
x_num_train_val_pca, x_num_test_pca = (
    pd.DataFrame(pca.transform(frame)) for frame in (x_num_train_val, x_num_test)
)

## Encode categorical features

In [None]:
n_train_val = x_cat_train_val.shape[0]
n_test = x_cat_test.shape[0]

# Stack train and test ROWS (axis=0) so that get_dummies sees every category
# level once and both sets end up with an identical set of dummy columns.
# Concatenating along axis=1 would instead place the two frames side by side
# with duplicated column names.
x_cat = pd.concat([x_cat_train_val, x_cat_test], axis=0)
x_dummy = pd.get_dummies(x_cat)

# Split back by position: the first n_train_val rows are the training rows,
# the remaining n_test rows are the test rows.
x_train_dummy = x_dummy.iloc[:n_train_val]
x_test_dummy = x_dummy.iloc[n_train_val:]

# Sanity checks: no rows were lost or duplicated by the round trip.
assert x_dummy.shape[0] == n_train_val + n_test
assert x_train_dummy.shape[0] == n_train_val
assert x_test_dummy.shape[0] == n_test

In [None]:
# Join the PCA components and the dummy columns side by side. Both parts get a
# fresh 0..n-1 index first so that rows are matched by position, not by label.
x_train_val = pd.concat(
    [part.reset_index(drop=True) for part in (x_num_train_val_pca, x_train_dummy)],
    axis=1,
)
x_test = pd.concat(
    [part.reset_index(drop=True) for part in (x_num_test_pca, x_test_dummy)],
    axis=1,
)

In [None]:
SEED = 529
TEST_SIZE = 0.2

# Hold out a validation set with a fixed seed so the split is reproducible.
# NOTE(review): x_train_val was re-indexed while y_train_val still carries the
# CSV index, so the two correspond by row position only -- this assumes no rows
# were dropped between loading and here; confirm.
x_train, x_val, y_train, y_val = train_test_split(
    x_train_val,
    y_train_val,
    random_state=SEED,
    test_size=TEST_SIZE,
)

In [None]:
# Bundle the full training data, the test features and the train/validation
# split into one dictionary keyed by variable name.
data_clean = dict(
    x_train_val=x_train_val,
    y_train_val=y_train_val,
    x_test=x_test,
    x_train=x_train,
    y_train=y_train,
    x_val=x_val,
    y_val=y_val,
)

In [None]:
# Persist the bundled data sets as a single pickle file; the context manager
# closes the file handle once the dump has finished.
with open("../data/data_pca_99_dummy.pkl", mode="wb") as f:
    pickle.dump(data_clean, f)