In [None]:
import pickle
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

%load_ext autoreload

%autoreload 2

import pandas as pd

## ETL and feature engineering

### Load Data

In [None]:
# Read the raw loan-default training set from disk into a DataFrame.
train_data = pd.read_csv(
    filepath_or_buffer="../data/loan-default-prediction/train_v2.csv",
)

### Removal of unnecessary columns

In [None]:
# Load the pickled `columns_to_be_deleted` object (presumably the names of the
# columns to drop from the training data -- confirm against the producer).
# The context manager closes the file even if unpickling raises.
# SECURITY: pickle.load can execute arbitrary code; only load this file when
# it was produced by a trusted step of this project.
# NOTE(review): later cells read `train_without_unnecessary_columns` and
# `unnecessary_column_remover`, neither of which is defined in the visible
# notebook. The cell that applies this column list appears to be missing and
# must be restored for the notebook to survive Restart & Run All.
with open('../data/columns_to_be_deleted', 'rb') as dbfile:
    columns_to_be_deleted = pickle.load(dbfile)

### Null value replacement

In [None]:
from null_value_replacer import NullValueReplacer

# Build the project's NullValueReplacer configured with "median", then fit it
# on the input frame and transform that same frame in a single call.
# NOTE(review): `train_without_unnecessary_columns` is not defined in the
# visible notebook; confirm the cell that creates it still exists.
null_value_replacer = NullValueReplacer("median")
with_filled_in_missing_values = null_value_replacer.fit_transform(
    train_without_unnecessary_columns
)

### Scaler

In [None]:
# Create an unfitted MinMaxScaler with its default settings; it is fitted on
# the selected training columns in a later cell.
min_max_scaler = MinMaxScaler()

In [None]:
# Select every column to be rescaled: all columns except "id" and "loss".
# "loss" is exported separately as the target (y.csv) further down; "id" is
# presumably a row identifier -- confirm.
columnsToScale = list(
    filter(
        lambda column_name: column_name not in ("id", "loss"),
        with_filled_in_missing_values.columns.values,
    )
)


In [None]:
# Fit the scaler on the selected columns and rescale them in the same step.
scaled_train = min_max_scaler.fit_transform(
    with_filled_in_missing_values.loc[:, columnsToScale]
)


### PCA

In [None]:
# Configure PCA with the exact ("full") SVD solver.
# Per the original author's note, 72 components keep 97% explained variance.
pca = PCA(
    svd_solver='full',
    n_components=72,
)

In [None]:
# Fit the PCA on the scaled training features; the fitted estimator is the
# cell's displayed value.
pca.fit(X=scaled_train)

In [None]:
# Project the scaled training features with the PCA fitted above.
pca_transformed = pca.transform(X=scaled_train)

In [None]:
# Sanity check: display the shape of the PCA-transformed result.
pca_transformed.shape

### Save Models

Transformers to be saved:
- unnecessary_column_remover
- null_value_replacer
- min_max_scaler
- pca

In [None]:
# Collect the fitted transformers in a single dict, keyed by name, so they are
# pickled together as one object.
etl_models = {
    "unnecessary_column_remover": unnecessary_column_remover,
    "null_value_replacer": null_value_replacer,
    "min_max_scaler": min_max_scaler,
    "pca": pca,
}

# Open with 'wb' (truncate) rather than 'ab': in append mode every re-run adds
# another pickle to the end of the file, while a single pickle.load() returns
# only the first one, so readers would silently get the oldest models.
# The context manager closes the file even if pickling raises.
with open('./models/models.pickle', 'wb') as dbfile:
    pickle.dump(etl_models, dbfile)

### Save transformed data

In [None]:
# Wrap the PCA output in a DataFrame so it can be exported as CSV.
X = pd.DataFrame(pca_transformed)

In [None]:
# Persist the transformed features without a header row. The index is still
# written as the first column because `index` is left at its pandas default.
X.to_csv(
    path_or_buf="../data/transformed_x.csv",
    header=False,
)

In [None]:
# Persist the "loss" column as the target file, again without a header row.
with_filled_in_missing_values["loss"].to_csv(
    path_or_buf="../data/y.csv",
    header=False,
)