In [1]:
import pandas as pd
import numpy as np

import os
import sys

In [2]:
# Load the training transactions into a DataFrame. The path is relative, so it
# is resolved against the kernel's current working directory.
df = pd.read_csv('../../resources/transactions_train.csv')

In [3]:
df

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrig,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud
0,1,PAYMENT,9839.64,C1231006815,170136.00,160296.36,M1979787155,0.00,0.00,0
1,1,PAYMENT,1864.28,C1666544295,21249.00,19384.72,M2044282225,0.00,0.00,0
2,1,TRANSFER,181.00,C1305486145,181.00,0.00,C553264065,0.00,0.00,1
3,1,CASH_OUT,181.00,C840083671,181.00,0.00,C38997010,21182.00,0.00,1
4,1,PAYMENT,11668.14,C2048537720,41554.00,29885.86,M1230701703,0.00,0.00,0
...,...,...,...,...,...,...,...,...,...,...
6351188,699,TRANSFER,162326.52,C1557504343,162326.52,0.00,C404511346,0.00,0.00,1
6351189,699,CASH_OUT,162326.52,C1532317723,162326.52,0.00,C446134087,0.00,162326.52,1
6351190,699,TRANSFER,2763398.31,C577803442,2763398.31,0.00,C619602282,0.00,0.00,1
6351191,699,CASH_OUT,2763398.31,C1491503658,2763398.31,0.00,C454424230,339515.35,3102913.66,1


### Convert dtypes

In [4]:
# Normalise column dtypes in place on `df`.
# 1) Columns that already carry one of the listed numeric dtypes are passed
#    through pd.to_numeric.
numeric_cols = df.select_dtypes(['int64','int16','float32','float64','int8']).columns
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric)

# 2) Object / category columns are stored as pandas categoricals
#    (each column gets its own set of categories).
categorical_cols = df.select_dtypes(['object','category']).columns
df[categorical_cols] = df[categorical_cols].astype('category')

In [5]:
df.columns

Index(['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrig',
       'newbalanceOrig', 'nameDest', 'oldbalanceDest', 'newbalanceDest',
       'isFraud'],
      dtype='object')

In [6]:
# Split the frame into the feature matrix and the label vector.
# The target is selected by name rather than by position so that a reordered
# or extended CSV cannot silently change which column is used as the label.
X = df.drop(columns=['isFraud'])
y = df['isFraud'].to_numpy()

In [7]:
X

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrig,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest
0,1,PAYMENT,9839.64,C1231006815,170136.00,160296.36,M1979787155,0.00,0.00
1,1,PAYMENT,1864.28,C1666544295,21249.00,19384.72,M2044282225,0.00,0.00
2,1,TRANSFER,181.00,C1305486145,181.00,0.00,C553264065,0.00,0.00
3,1,CASH_OUT,181.00,C840083671,181.00,0.00,C38997010,21182.00,0.00
4,1,PAYMENT,11668.14,C2048537720,41554.00,29885.86,M1230701703,0.00,0.00
...,...,...,...,...,...,...,...,...,...
6351188,699,TRANSFER,162326.52,C1557504343,162326.52,0.00,C404511346,0.00,0.00
6351189,699,CASH_OUT,162326.52,C1532317723,162326.52,0.00,C446134087,0.00,162326.52
6351190,699,TRANSFER,2763398.31,C577803442,2763398.31,0.00,C619602282,0.00,0.00
6351191,699,CASH_OUT,2763398.31,C1491503658,2763398.31,0.00,C454424230,339515.35,3102913.66


In [8]:
y

array([0, 0, 1, ..., 1, 1, 0])

In [9]:
# Non-numeric columns: everything whose dtype is not a number.
# The generic 'number' selector is used instead of an explicit dtype list so
# that int32 / uint* / float16 columns are not mistaken for categoricals.
cat_columns = X.select_dtypes(exclude='number').columns

# Numeric columns: integers as well as floats.
num_columns = X.select_dtypes(include='number').columns

In [10]:
num_columns

Index(['step', 'amount', 'oldbalanceOrig', 'newbalanceOrig', 'oldbalanceDest',
       'newbalanceDest'],
      dtype='object')

In [11]:
cat_columns

Index(['type', 'nameOrig', 'nameDest'], dtype='object')

In [12]:
# Full column list: numeric columns first, categorical columns after them.
all_columns = num_columns.append(cat_columns)

# Echo the three column indexes: categorical, numeric, then combined.
for column_index in (cat_columns, num_columns, all_columns):
    print(column_index)

Index(['type', 'nameOrig', 'nameDest'], dtype='object')
Index(['step', 'amount', 'oldbalanceOrig', 'newbalanceOrig', 'oldbalanceDest',
       'newbalanceDest'],
      dtype='object')
Index(['step', 'amount', 'oldbalanceOrig', 'newbalanceOrig', 'oldbalanceDest',
       'newbalanceDest', 'type', 'nameOrig', 'nameDest'],
      dtype='object')


### Data preparation pipeline

In [13]:
from sklearn.pipeline import Pipeline, FeatureUnion

from sklearn.preprocessing import PowerTransformer

from sklearn.compose import ColumnTransformer

import category_encoders as ce

In [14]:
X

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrig,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest
0,1,PAYMENT,9839.64,C1231006815,170136.00,160296.36,M1979787155,0.00,0.00
1,1,PAYMENT,1864.28,C1666544295,21249.00,19384.72,M2044282225,0.00,0.00
2,1,TRANSFER,181.00,C1305486145,181.00,0.00,C553264065,0.00,0.00
3,1,CASH_OUT,181.00,C840083671,181.00,0.00,C38997010,21182.00,0.00
4,1,PAYMENT,11668.14,C2048537720,41554.00,29885.86,M1230701703,0.00,0.00
...,...,...,...,...,...,...,...,...,...
6351188,699,TRANSFER,162326.52,C1557504343,162326.52,0.00,C404511346,0.00,0.00
6351189,699,CASH_OUT,162326.52,C1532317723,162326.52,0.00,C446134087,0.00,162326.52
6351190,699,TRANSFER,2763398.31,C577803442,2763398.31,0.00,C619602282,0.00,0.00
6351191,699,CASH_OUT,2763398.31,C1491503658,2763398.31,0.00,C454424230,339515.35,3102913.66


In [15]:
import pandas as pd
import numpy as np

import os
import sys

# Project root = two directory levels above the notebook's working directory.
# os.path.dirname is used instead of splitting on '/' so the computation also
# works with Windows path separators and never collapses to an empty string.
BASE_DIR = os.path.dirname(os.path.dirname(os.getcwd()))

# Make the project's `src` package importable; the membership check keeps
# sys.path free of duplicates when the cell is re-run.
if BASE_DIR not in sys.path:
    sys.path.append(BASE_DIR)

from src.loaders.featuring import CumTransactionCustomerinStep, AggAmountTransactionofCustomer, CountFrequency

In [16]:
# One small pipeline per feature group. The custom transformers come from
# src.loaders.featuring; their behaviour is defined there, not in this notebook.

# Custom transformer configured with the 'step' and 'nameOrig' columns.
cum_transaction_customer_in_step_pipeline = Pipeline([
    (
        "Cumulate Transaction Customer in Step",
        CumTransactionCustomerinStep(step='step', customer_ids='nameOrig'),
    ),
])

# Custom aggregation over 'amount' per 'nameOrig', followed by a power transform.
agg_amount_transaction_of_customer_pipeline = Pipeline([
    (
        "Aggregate Amount Transaction of Customer",
        AggAmountTransactionofCustomer(amount='amount', customer_ids='nameOrig'),
    ),
    ("Scaler", PowerTransformer()),
])

# Custom frequency transformer applied to the origin and destination account ids.
count_frequence_pipeline = Pipeline([
    (
        "Count frequence of Customer",
        CountFrequency(variables=['nameOrig', 'nameDest']),
    ),
])

# Power transform for the plain numeric columns.
scale_numerical_pipeline = Pipeline([
    ("Scale Numerical variables", PowerTransformer()),
])

# Target-based CatBoost encoding for categorical columns.
category_to_numerical_pipeline = Pipeline([
    (
        "Categorical variables to Numerical variables",
        ce.cat_boost.CatBoostEncoder(),
    ),
])


# FeatureUnion that concatenates the outputs of all five pipelines.
# NOTE(review): a FeatureUnion hands the same input to every member, with no
# per-pipeline column selection — confirm this object is still needed.
feature_branches = [
    ('cum_transaction', cum_transaction_customer_in_step_pipeline),
    ('mean_amount', agg_amount_transaction_of_customer_pipeline),
    ('freq_customer', count_frequence_pipeline),
    ('category_encoder', category_to_numerical_pipeline),
    ('numeric_scaler', scale_numerical_pipeline),
]
preprocessor = FeatureUnion(transformer_list=feature_branches)

In [17]:
preprocessor

In [18]:
# Route each feature pipeline to the columns it needs. Each entry is
# (name, transformer, input columns); the outputs are concatenated in this order.
data_pipeline = ColumnTransformer(
    transformers=[
        (
            'cum_transaction',
            cum_transaction_customer_in_step_pipeline,
            ['step', 'nameOrig'],
        ),
        (
            'mean_amount',
            agg_amount_transaction_of_customer_pipeline,
            ['amount', 'nameOrig'],
        ),
        (
            'freq_customer',
            count_frequence_pipeline,
            ['nameOrig', 'nameDest'],
        ),
        (
            'category_encoder',
            category_to_numerical_pipeline,
            ['type'],
        ),
        (
            'numeric_scaler',
            scale_numerical_pipeline,
            ['amount', 'oldbalanceOrig', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest'],
        ),
    ],
    # Same as the default: columns not listed above are not passed through.
    remainder='drop',
)

In [19]:
data_pipeline

In [20]:
X

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrig,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest
0,1,PAYMENT,9839.64,C1231006815,170136.00,160296.36,M1979787155,0.00,0.00
1,1,PAYMENT,1864.28,C1666544295,21249.00,19384.72,M2044282225,0.00,0.00
2,1,TRANSFER,181.00,C1305486145,181.00,0.00,C553264065,0.00,0.00
3,1,CASH_OUT,181.00,C840083671,181.00,0.00,C38997010,21182.00,0.00
4,1,PAYMENT,11668.14,C2048537720,41554.00,29885.86,M1230701703,0.00,0.00
...,...,...,...,...,...,...,...,...,...
6351188,699,TRANSFER,162326.52,C1557504343,162326.52,0.00,C404511346,0.00,0.00
6351189,699,CASH_OUT,162326.52,C1532317723,162326.52,0.00,C446134087,0.00,162326.52
6351190,699,TRANSFER,2763398.31,C577803442,2763398.31,0.00,C619602282,0.00,0.00
6351191,699,CASH_OUT,2763398.31,C1491503658,2763398.31,0.00,C454424230,339515.35,3102913.66


In [21]:
# Fit every transformer of data_pipeline on X (y is passed along to fitting)
# and keep the resulting transformed feature matrix.
features = data_pipeline.fit_transform(X, y)

In [26]:
features.shape

(6351193, 10)

In [25]:
features[:5]

array([[ 1.00000000e+00, -9.54966991e-01,  1.00000000e+00,
         1.00000000e+00,  1.21504731e-03, -9.53579562e-01,
         7.99284571e-01,  1.11360603e+00, -1.13436084e+00,
        -1.22433565e+00],
       [ 1.00000000e+00, -1.66980028e+00,  1.00000000e+00,
         1.00000000e+00,  6.07523657e-04, -1.66844851e+00,
         3.78806480e-01,  8.89647475e-01, -1.13436084e+00,
        -1.22433565e+00],
       [ 1.00000000e+00, -2.47038514e+00,  1.00000000e+00,
         4.40000000e+01,  1.21504731e-03, -2.46946078e+00,
        -4.77213888e-01, -8.63663768e-01, -1.13436084e+00,
        -1.22433565e+00],
       [ 1.00000000e+00, -2.47038514e+00,  1.00000000e+00,
         4.10000000e+01,  1.21504731e-03, -2.46946078e+00,
        -4.77213888e-01, -8.63663768e-01,  2.19437247e-01,
        -1.22433565e+00],
       [ 1.00000000e+00, -8.73887576e-01,  1.00000000e+00,
         1.00000000e+00,  4.05015772e-04, -8.72513844e-01,
         5.11090364e-01,  9.38744768e-01, -1.13436084e+00,
        -1.