In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are reloaded
# before each cell executes, so edits to local .py files take effect without a
# kernel restart. (Keep comments on their own lines; text after a line magic is
# passed to the magic as an argument.)
%load_ext autoreload
%autoreload 2

In [58]:
import os
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import sklearn.preprocessing as pp

In [3]:
# One-time working-directory setup, guarded by the `first_run` flag so that
# re-running this cell does not climb one more directory level each time.
# Presumably the notebook sits one level below the project root and the chdir is
# what makes the project-local `_aux` package importable -- TODO confirm.
if "first_run" not in globals():
    # os.path.dirname is separator-agnostic; splitting on "/" was a no-op on
    # Windows paths and produced "" (an invalid chdir target) for a top-level dir.
    os.chdir(os.path.dirname(os.getcwd()))
    # Set the flag only after the chdir succeeded, so a failed attempt is retried
    # on the next run instead of being silently skipped.
    first_run = True

# Imported outside the guard: a repeated import is a cheap cache hit, and if the
# first attempt failed (e.g. an error inside the module) re-running the cell
# retries it instead of leaving `F` undefined for the rest of the session.
import _aux.features as F

# Load Data

In [4]:
# Read X_train.csv and y_train.csv (first CSV column used as the index in both)
# and left-join the y_train columns onto the X_train rows by that index.
df = pd.read_csv("../data/train/X_train.csv", index_col=0).join(
    other=pd.read_csv("../data/train/y_train.csv", index_col=0),
    how="left",
)

# Variables from our exploration

### 1. Personal
- age
### 2. Status
- status_last_archived_0_24m
- account_worst_status_0_12m
- is_last_arch_worst_status_possible
- is_account_worst_status_0_12m_normal

### 3. Account
- num_active_div_by_paid_inv_0_12m_is_above_1
### 4. Archived
- num_arch_dc_0_12m
- num_arch_dc_0_12m_binned
- num_arch_ok_0_12m
### 5. Order
- is_merchant_category_blacklisted
    - Tobacco
    - Sex toys
    - Plants & Flowers
    - Dating services

# Preprocessors

In [110]:
# Column-wise preprocessing; each entry is (name, transformer, columns):
#   - pp.StandardScaler()  -> standardises the column (zero mean, unit variance)
#   - pp.OneHotEncoder(handle_unknown="ignore")
#                          -> indicator columns; a category not seen during fit
#                             is encoded as all zeros instead of raising
#   - "passthrough"        -> column is forwarded unchanged
#
# "passthrough" replaces the argument-less pp.FunctionTransformer() used before.
# The transformed values are identical, but an identity FunctionTransformer does
# not provide get_feature_names_out(), so preprocessor.get_feature_names_out()
# failed; with "passthrough" the output columns can be labelled.
#
# Transformer names (including the "archieved" spelling) are left untouched: they
# are the keys used by set_params / named_transformers_ and the prefixes of the
# generated feature names.
#
# NOTE(review): the markdown variable list mentions
# is_last_arch_worst_status_possible, is_account_worst_status_0_12m_normal and
# num_active_div_by_paid_inv_0_12m_is_above_1, but status_feature_2,
# status_feature_4 and account_feature_1 forward the raw columns instead --
# confirm which is intended.
preprocessor = ColumnTransformer(
    transformers=[
        ("personal_feature_1",  pp.StandardScaler(),                       ["age"]),
        ("status_feature_1",    pp.OneHotEncoder(handle_unknown="ignore"), ["status_last_archived_0_24m"]),
        ("status_feature_2",    "passthrough",                             ["status_last_archived_0_24m"]),
        ("status_feature_3",    pp.OneHotEncoder(handle_unknown="ignore"), ["account_worst_status_0_12m"]),
        ("status_feature_4",    "passthrough",                             ["account_worst_status_0_12m"]),
        ("account_feature_1",   "passthrough",                             ["num_active_div_by_paid_inv_0_12m"]),
        ("archieved_feature_1", pp.StandardScaler(),                       ["num_arch_dc_0_12m"]),
        ("archieved_feature_2", pp.OneHotEncoder(handle_unknown="ignore"), ["num_arch_dc_0_12m_binned"]),
        ("archieved_feature_3", pp.StandardScaler(),                       ["num_arch_ok_0_12m"]),
        ("order_feature_1",     "passthrough",                             ["is_merchant_category_blacklisted"]),
    ],
    n_jobs=-1,         # run the transformers in parallel on all available cores
    remainder="drop",  # every column not listed above is discarded
)

In [108]:
# Two-stage pipeline: F.ExtraColumnCreator runs first, then the column-wise
# preprocessor is applied to its output.
pipe = Pipeline(
    steps=[
        ("column_creator", F.ExtraColumnCreator()),
        ("preprocessor", preprocessor),
    ],
)

# Fit on the training frame and show the transformed matrix as a DataFrame
# (bare last expression so the notebook renders it with rich display).
pd.DataFrame(data=pipe.fit_transform(df))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,0.385621,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.00000,-0.163973,1.0,0.0,0.0,0.231769,0.0
1,1.541811,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.00000,-0.163973,1.0,0.0,0.0,-0.395708,0.0
2,-0.385173,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.03125,-0.163973,1.0,0.0,0.0,1.423975,0.0
3,0.616859,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.00000,-0.163973,1.0,0.0,0.0,-0.270213,0.0
4,-0.231014,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.00000,-0.163973,1.0,0.0,0.0,-0.270213,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71975,-1.387205,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.00000,-0.163973,1.0,0.0,0.0,-0.395708,0.0
71976,-0.539332,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,2.0,0.00000,-0.163973,1.0,0.0,0.0,-0.395708,0.0
71977,-0.616411,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,2.0,0.00000,-0.163973,1.0,0.0,0.0,-0.395708,0.0
71978,1.079335,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.10000,-0.163973,1.0,0.0,0.0,0.043526,0.0
