In [124]:
#Libraries
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
import pickle
import category_encoders as ce
from sklearn.preprocessing import StandardScaler

In [112]:
#Load the dataset that was prepared in the previous notebook
data_fraud_path = '../data/processed/data_fraud_01.parquet'
data_fraud = pd.read_parquet(data_fraud_path)

In [113]:
#Preview the first five rows to check that the file was loaded as expected
data_fraud.head(n=5)

Unnamed: 0,step,type,amount,device,connection_time,oldbalance_org,age,newbalance_orig,zone,user_number,user_connections,security_alert,oldbalance_dest,newbalance_dest,isfraud
0,1,PAYMENT,9839.64,mac,0.140039,170136.0,85,160296.36,capital,138,5,1,0.0,0.0,0
1,1,PAYMENT,1864.28,mac,0.49689,21249.0,57,19384.72,country,909,1,0,0.0,0.0,0
2,1,TRANSFER,181.0,pc,0.78115,181.0,66,0.0,capital,2569,10,0,0.0,0.0,1
3,1,CASH_OUT,181.0,mac,0.565068,181.0,31,0.0,country,1787,3,0,21182.0,0.0,1
4,1,PAYMENT,11668.14,mac,0.517114,41554.0,90,29885.86,country,3997,8,0,0.0,0.0,0


### Missing values

The variables in which we have missing values ("device" and "zone") are categorical, so to replace them we are going to create a new category.

In [114]:
#Summary of the null values per column, with the most affected columns first
null_counts = data_fraud.isna().sum().sort_values(ascending=False)
data_fraud_null_columns = null_counts.to_frame('columns_null').reset_index()

#Fraction of the rows of the dataframe that are null in each column
n_rows = len(data_fraud)
data_fraud_null_columns['columns_percentage'] = data_fraud_null_columns['columns_null'] / n_rows
data_fraud_null_columns

Unnamed: 0,index,columns_null,columns_percentage
0,device,104580,0.099735
1,zone,104414,0.099577
2,step,0,0.0
3,type,0,0.0
4,amount,0,0.0
5,connection_time,0,0.0
6,oldbalance_org,0,0.0
7,age,0,0.0
8,newbalance_orig,0,0.0
9,user_number,0,0.0


In [115]:
#Names of the columns that are treated as categorical
var_category = [
    "type",
    "device",
    "zone",
    "security_alert",
]

#Names of the columns that are treated as numeric
var_num = [
    "step",
    "amount",
    "connection_time",
    "oldbalance_org",
    "age",
    "newbalance_orig",
    "user_number",
    "user_connections",
    "oldbalance_dest",
    "newbalance_dest",
]

#Cast the categorical columns to the pandas "category" dtype (this overwrites the columns of data_fraud in place)
categorical_columns = data_fraud[var_category]
data_fraud[var_category] = categorical_columns.astype("category")

We are going to use SimpleImputer to replace null values and OneHotEncoder to encode categorical variables:

In [173]:
#SimpleImputer + OneHotEncoder
#Null values in the categorical columns are first replaced by the explicit label 'unknown'
#(previously misspelled 'unkown'), so that "missing" becomes a category of its own, and the
#columns are then one-hot encoded. handle_unknown='ignore' makes the encoder emit an all-zero
#row for categories that were not seen during fit instead of raising an error.
onehot_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

#We create the preprocessor, which applies the pipeline above to the categorical columns
#NOTE(review): only var_category is listed here and ColumnTransformer defaults to remainder='drop',
#so every other column (including the ones in var_num) is discarded by this preprocessor.
#Confirm that this is what the modelling notebooks expect.
preprocessor = ColumnTransformer(
    transformers=[('cat', onehot_transformer, var_category)])

In [174]:
#We save the (still unfitted) preprocessor so that it can be applied in the modelling notebooks
with open('../models/preprocessor.pickle', 'wb') as preprocessor_file:
    pickle.dump(preprocessor, preprocessor_file)

### Split

In [135]:
#We split the dataset in train and test, keeping 20% of the rows for test.
#stratify is used because the data is unbalanced: it keeps the proportion of fraud cases
#approximately the same in both splits.
#random_state fixes the shuffle so that re-running the notebook reproduces the same split
#(and therefore the same parquet files saved at the end of the notebook).
X_train, X_test, y_train, y_test = train_test_split(data_fraud.drop('isfraud',axis=1), 
                                                   data_fraud['isfraud'], 
                                                   stratify=data_fraud['isfraud'], 
                                                   test_size=0.2,
                                                   random_state=42)

### Encoding 

In [157]:
#Columns that are one-hot encoded with category_encoders
var_category_e = [
    "type",
    "device",
    "zone",
    "security_alert",
]

#The encoder is fitted on the training split only; fit() returns the fitted encoder
ohe = ce.OneHotEncoder(cols=var_category_e)
model = ohe.fit(X_train, y_train)

In [172]:
#Apply the encoder fitted on the training split to both splits.
#OneHotEncoder.transform only takes the features: it has no target parameter, so a second
#positional argument would be interpreted as override_return_df. The target is therefore
#not passed (the test target in particular is never needed to encode the test features).
X_train_t = model.transform(X_train)
X_test_t = model.transform(X_test)

#Names of the columns after the encoding
X_train_t.columns

Index(['step', 'type_1', 'type_2', 'type_3', 'type_4', 'type_5', 'amount',
       'device_1', 'device_2', 'device_3', 'device_4', 'connection_time',
       'oldbalance_org', 'age', 'newbalance_orig', 'zone_1', 'zone_2',
       'zone_3', 'zone_4', 'user_number', 'user_connections',
       'security_alert_1', 'security_alert_2', 'oldbalance_dest',
       'newbalance_dest'],
      dtype='object')

### Scale

We have decided to scale the data so that they are all in a similar range and because modeling is faster when data is scaled.

In [159]:
#The scaler is fitted on the encoded training split only and then applied to both splits
scaler = StandardScaler()
model_scaled = scaler.fit(X_train_t)  #fit() returns the scaler itself, so model_scaled is the same object as scaler

#By default transform() returns a NumPy array, so the DataFrames are rebuilt with the column
#names and the row index of the encoded frame that was scaled (X_train_t / X_test_t)
X_train_scaled = pd.DataFrame(scaler.transform(X_train_t), columns=X_train_t.columns, index=X_train_t.index)
X_test_scaled = pd.DataFrame(scaler.transform(X_test_t), columns=X_test_t.columns, index=X_test_t.index)

### Variable selection

We have decided not to use any variable selection method and we are going to use all of them because we are not facing a problem in which we have too many variables, since we only have 14. We consider that we shouldn't eliminate any because we would have very few left to train the model.

In [161]:
#Saving the data
X_train.to_parquet("../data/processed/X_train.parquet")
X_test.to_parquet("../data/processed/X_test.parquet")
y_train.to_frame().to_parquet("../data/processed/y_train.parquet")
y_test.to_frame().to_parquet("../data/processed/y_test.parquet")
X_train_scaled.to_parquet("../data/processed/X_train_scaled.parquet")
X_test_scaled.to_parquet("../data/processed/X_test_scaled.parquet")