In [None]:
import joblib
import os
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyRegressor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import sklearn.preprocessing as pp

In [None]:
# One-time working-directory bootstrap. `first_run` only exists once this cell
# has executed in the current kernel, so re-running the cell does not climb
# another directory level.
try:
    _ = first_run
except NameError:
    first_run = True
    # Move to the parent of the current working directory. os.path.dirname is
    # separator-aware: splitting on a hard-coded "/" leaves Windows paths
    # unchanged and produces "" (an invalid chdir target) one level below root.
    os.chdir(os.path.dirname(os.getcwd()))
    # Imported only after the chdir -- presumably `_aux` is resolved relative
    # to the new working directory; confirm against the project layout.
    import _aux.features as F

# Load Data

In [None]:
# Training features, indexed by the first CSV column. Read once and reused
# below instead of parsing the same file a second time.
X_train = pd.read_csv(
    "../data/train/X_train.csv",
    index_col=0,
)

# Features plus the columns of y_train.csv, aligned on the shared index.
# DataFrame.join returns a new frame, so X_train itself is left untouched.
df = X_train.join(pd.read_csv("../data/train/y_train.csv", index_col=0))

# Variables from our exploration

### 1. Personal
- age
### 2. Status
- status_last_archived_0_24m
- account_worst_status_0_12m
- is_last_arch_worst_status_possible
- is_account_worst_status_0_12m_normal

### 3. Account
- num_active_div_by_paid_inv_0_12m_is_above_1
### 4. Archived
- num_arch_dc_0_12m
- num_arch_dc_0_12m_binned
- num_arch_ok_0_12m
### 5. Order
- is_merchant_category_blacklisted
    - Tobacco
    - Sex toys
    - Plants & Flowers
    - Dating services

# Preprocessors

In [None]:
def _one_hot():
    """Return a fresh one-hot encoder that ignores categories unseen during fit."""
    return pp.OneHotEncoder(handle_unknown="ignore")


def _identity():
    """Return a fresh FunctionTransformer with no function (values pass through)."""
    return pp.FunctionTransformer()


# Each entry is (step name, transformer, input columns). Step names are kept
# exactly as-is because they become part of the fitted/persisted object.
_column_steps = [
    # 1. Personal
    ("personal_feature_1", pp.StandardScaler(), ["age"]),
    # 2. Status -- each raw status column is emitted twice: one-hot encoded
    # and passed through unchanged.
    # NOTE(review): the feature list in the markdown names
    # `is_last_arch_worst_status_possible` and
    # `is_account_worst_status_0_12m_normal`, which are not selected here;
    # confirm the two pass-through steps are meant to re-use the raw columns.
    ("status_feature_1", _one_hot(), ["status_last_archived_0_24m"]),
    ("status_feature_2", _identity(), ["status_last_archived_0_24m"]),
    ("status_feature_3", _one_hot(), ["account_worst_status_0_12m"]),
    ("status_feature_4", _identity(), ["account_worst_status_0_12m"]),
    # 3. Account
    ("account_feature_1", _identity(), ["num_active_div_by_paid_inv_0_12m_is_above_1"]),
    # 4. Archived (step names keep their original spelling)
    ("archieved_feature_1", pp.StandardScaler(), ["num_arch_dc_0_12m"]),
    ("archieved_feature_2", _one_hot(), ["num_arch_dc_0_12m_binned"]),
    ("archieved_feature_3", pp.StandardScaler(), ["num_arch_ok_0_12m"]),
    # 5. Order
    ("order_feature_1", _identity(), ["is_merchant_category_blacklisted"]),
]

# Columns not named above are dropped; the per-column steps run with n_jobs=-1.
preprocessor = ColumnTransformer(
    transformers=_column_steps,
    n_jobs=-1,
    remainder="drop",
)

In [None]:
# Full feature pipeline: the project's ExtraColumnCreator (presumably adds the
# engineered columns the preprocessor selects -- confirm in _aux.features),
# then the column-wise preprocessor, then a FunctionTransformer with no
# function registered under the step name "model".
pipe = Pipeline(
    [
        ("column_creator", F.ExtraColumnCreator()),
        ("preprocessor", preprocessor),
        ("model", pp.FunctionTransformer()),
    ]
)

# NOTE(review): `df` is the features joined with the labels, so the "default"
# column is part of X here as well as being passed as y. The preprocessor uses
# remainder="drop", but confirm ExtraColumnCreator does not read it.
pipe = pipe.fit(df, df["default"])

# Make sure the output directory exists so the dump cannot fail after the fit
# has already been paid for (e.g. on a fresh checkout).
artifact_path = "../ml_artifacts/preprocessor.joblib.gz"
os.makedirs(os.path.dirname(artifact_path), exist_ok=True)
joblib.dump(pipe, artifact_path)