In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [11]:
# Read the raw transactions CSV (path is relative to the notebook's
# working directory) into a DataFrame.
raw_csv_path = '../data/raw/creditcard.csv'
df = pd.read_csv(raw_csv_path)

### All Transformations

In [12]:
from sklearn.preprocessing import (
    StandardScaler,
    MinMaxScaler,
    FunctionTransformer,
    QuantileTransformer
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import QuantileTransformer
from sklearn.decomposition import PCA


# Fallback preprocessing: fill missing values with the per-column median,
# then standardize each column to zero mean and unit variance.
median_imputer = SimpleImputer(strategy='median')
standardizer = StandardScaler()
default_pipeline = make_pipeline(median_imputer, standardizer)

# Map each input value x to its RBF similarity with a single reference
# point, exp(-gamma * (x - 79000)**2), producing one output column per
# input column (hence the one-to-one feature names).
time_rbf_kwargs = dict(Y=[[79000]], gamma=1e-8)

Time_rbf_transformer = FunctionTransformer(
    func=rbf_kernel,
    kw_args=time_rbf_kwargs,
    feature_names_out='one-to-one',
)

# Heavy-tailed features: map each column to an approximately normal
# distribution through its empirical quantiles, then standardize.
#
# QuantileTransformer estimates quantiles from a random subsample of the
# rows when the input is larger than its `subsample` limit, so an unseeded
# transformer yields slightly different fitted quantiles on every run.
# random_state is pinned (same seed as the train/test split) so that
# Restart & Run All reproduces the same transformed features.
q_heavy_tail_pipeline = make_pipeline(
    QuantileTransformer(output_distribution='normal', random_state=10),
    StandardScaler(),
)

# V2, V5, V7 with Amount: standardize the four columns so they contribute
# on a comparable scale, then project them onto 2 principal components.
vn_amount_pipeline = make_pipeline(
    StandardScaler(),
    PCA(n_components=2)
)

# Columns routed through the quantile-based heavy-tail pipeline.
heavy_tail_columns = [
    'V1', 'V2', 'V3', 'V4', 'V5',
    'V6', 'V7', 'V8', 'V9', 'V10',
    'V11', 'V12', 'V14', 'V15',
    'V16', 'V17', 'V18', 'V19', 'V20',
    'V21', 'V22', 'V23', 'V24', 'V25',
    'V27', 'V28', 'Amount',
]

# Columns combined by PCA. 'Amount' was missing from this list even though
# the pipeline is named and documented as "V2, V5, V7 with Amount"; it is
# included here so the PCA actually sees Amount. The output is still
# 2 components, so the overall feature count is unchanged.
vn_amount_columns = ['V2', 'V5', 'V7', 'Amount']

# Full feature-engineering step. A column may feed more than one transformer
# (V2, V5, V7 and Amount are used by both 'heavy_tail' and 'vn_amount').
# Any column not named below (V13 and V26 are absent from both lists) is
# handled by `default_pipeline` via `remainder`.
all_transformation = ColumnTransformer(
    transformers=[
        ('Time_rbf', Time_rbf_transformer, ['Time']),
        ('heavy_tail', q_heavy_tail_pipeline, heavy_tail_columns),
        ('vn_amount', vn_amount_pipeline, vn_amount_columns),
    ],
    remainder=default_pipeline,
)

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
# Hold out 20% of the rows for testing. Stratifying on 'Class' keeps the
# class proportions the same in both parts; the fixed seed makes the split
# repeatable.
split_options = dict(
    test_size=0.2,
    stratify=df['Class'],
    random_state=10,
)
train, test = train_test_split(df, **split_options)

In [18]:
# Separate the feature columns from the 'Class' target for each split.
X_train = train.drop(columns='Class')
X_test = test.drop(columns='Class')
y_train = train['Class']
y_test = test['Class']