# Advanced Feature Engineering

In this notebook we will look at advanced feature engineering and compare how pandas and RAPIDS cuDF perform when running the same pipeline.

## RAPIDS cuDF

In [1]:
from feature_engineering_2 import (
    pos_cash, process_unified, process_bureau_and_balance, 
    process_previous_applications, installments_payments,
    credit_card_balance
    )
import cudf as xd
import gc
import numpy as np
import pandas as pd

In [2]:
# Reserve a pooled GPU memory allocation up front for cuDF.
initial_pool_bytes = 14e9  # roughly 14GB pool
xd.set_allocator(pool=True, initial_pool_size=initial_pool_bytes)

In [3]:
# Load IPython's autoreload extension and set it to mode 2, so that edits to
# imported modules (e.g. feature_engineering_2) are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
%%time
bureau_balance = xd.read_parquet('raw_data/bureau_balance.parquet')
bureau = xd.read_parquet('raw_data/bureau.parquet')
cc_balance = xd.read_parquet('raw_data/cc_balance.parquet')
payments = xd.read_parquet('raw_data/payments.parquet')
pc_balance = xd.read_parquet('raw_data/pc_balance.parquet')
prev = xd.read_parquet('raw_data/prev.parquet')
train = xd.read_parquet('raw_data/train.parquet')
test = xd.read_parquet('raw_data/test.parquet')

train_index = train.index
test_index = test.index

train_target = train['TARGET']
unified = xd.concat([train.drop('TARGET', axis=1), test])

del(train)
del(test)
gc.collect()

CPU times: user 794 ms, sys: 268 ms, total: 1.06 s
Wall time: 1.06 s


19

In [5]:
%%time

unified_feat = process_unified(unified, xd)

bureau_agg = process_bureau_and_balance(bureau, bureau_balance, xd)
del unified, bureau, bureau_balance

prev_agg = process_previous_applications(prev, xd)
pos_agg = pos_cash(pc_balance, xd)
ins_agg = installments_payments(payments, xd)
cc_agg = credit_card_balance(cc_balance, xd)

del prev, pc_balance, payments, cc_balance
gc.collect()

unified_feat = unified_feat.join(bureau_agg, how='left', on='SK_ID_CURR') \
    .join(prev_agg, how='left', on='SK_ID_CURR') \
    .join(pos_agg, how='left', on='SK_ID_CURR') \
    .join(ins_agg, how='left', on='SK_ID_CURR') \
    .join(cc_agg, how='left', on='SK_ID_CURR')

del bureau_agg, prev_agg, pos_agg, ins_agg, cc_agg
gc.collect()

# we can't use bool column types in xgb later on
bool_columns = [col for col in unified_feat.columns if (unified_feat[col].dtype in ['bool']) ]    
unified_feat[bool_columns] = unified_feat[bool_columns].astype('int64')

# We will label encode for xgb later on
from sklearn.preprocessing import LabelEncoder
# label encode cats
label_encode_dict = {}

categorical = unified_feat.select_dtypes(include=pd.CategoricalDtype).columns 
for column in categorical:
    label_encode_dict[column] = LabelEncoder()
    unified_feat[column] =  label_encode_dict[column].fit_transform(unified_feat[column])
    unified_feat[column] = unified_feat[column].astype('int64')

### Fix for Int64D
Int64D = unified_feat.select_dtypes(include=[pd.Int64Dtype]).columns
unified_feat[Int64D] = unified_feat[Int64D].fillna(0)
unified_feat[Int64D] = unified_feat[Int64D].astype('int64')

### fix unit8
uint8 = unified_feat.select_dtypes(include=['uint8']).columns
unified_feat[uint8] = unified_feat[uint8].astype('int64')

#unified_feat.replace([np.inf, -np.inf], np.nan, inplace=True)
na_cols = unified_feat.isna().any()[unified_feat.isna().any()==True].index.to_arrow().to_pylist()
unified_feat[na_cols] = unified_feat[na_cols].fillna(0)

train_feats = unified_feat.loc[train_index].merge(train_target, how='left', 
                                               left_index=True, right_index=True)
test_feats = unified_feat.loc[test_index]

CPU times: user 4.21 s, sys: 779 ms, total: 4.98 s
Wall time: 5.05 s


In [6]:
%%time
train_feats.to_parquet('data_eng/feats/train_feats.parquet')
del train_feats

test_feats.to_parquet('data_eng/feats/test_feats.parquet')

CPU times: user 748 ms, sys: 292 ms, total: 1.04 s
Wall time: 1.04 s


## Pandas

In [7]:
# Import everything the pandas cells below use, so this section also runs on a
# fresh kernel (or on a machine without cuDF) instead of depending on names
# imported by the cuDF section above.
from feature_engineering_2 import (
    pos_cash, process_unified, process_bureau_and_balance,
    process_previous_applications, installments_payments,
    credit_card_balance
    )
import pandas as xd  # rebinds the `xd` alias so the same code now runs on pandas
import gc
import numpy as np
import pandas as pd

In [8]:
%%time
bureau_balance = xd.read_parquet('raw_data/bureau_balance.parquet')
bureau = xd.read_parquet('raw_data/bureau.parquet')
cc_balance = xd.read_parquet('raw_data/cc_balance.parquet')
payments = xd.read_parquet('raw_data/payments.parquet')
pc_balance = xd.read_parquet('raw_data/pc_balance.parquet')
prev = xd.read_parquet('raw_data/prev.parquet')
train = xd.read_parquet('raw_data/train.parquet')
test = xd.read_parquet('raw_data/test.parquet')

train_index = train.index
test_index = test.index

train_target = train['TARGET']
unified = xd.concat([train.drop('TARGET', axis=1), test])

del(train)
del(test)
gc.collect()

CPU times: user 6.25 s, sys: 4.63 s, total: 10.9 s
Wall time: 7.57 s


0

In [9]:
# Workaround: the process functions do not work with columns of type
# `category`, so the affected columns are cast to plain `object` first.
category_columns = (
    (bureau_balance, 'STATUS'),
    (bureau, 'CREDIT_ACTIVE'),
    (bureau, 'CREDIT_CURRENCY'),
    (prev, 'NAME_CONTRACT_STATUS'),
)
for frame, column_name in category_columns:
    frame[column_name] = frame[column_name].astype('object')

In [10]:
%%time

unified_feat = process_unified(unified, xd)

bureau_agg = process_bureau_and_balance(bureau, bureau_balance, xd)

prev_agg = process_previous_applications(prev, xd)
pos_agg = pos_cash(pc_balance, xd)
ins_agg = installments_payments(payments, xd)
cc_agg = credit_card_balance(cc_balance, xd)

unified_feat = unified_feat.join(bureau_agg, how='left', on='SK_ID_CURR') \
    .join(prev_agg, how='left', on='SK_ID_CURR') \
    .join(pos_agg, how='left', on='SK_ID_CURR') \
    .join(ins_agg, how='left', on='SK_ID_CURR') \
    .join(cc_agg, how='left', on='SK_ID_CURR')

# we can't use bool column types in xgb later on
bool_columns = [col for col in unified_feat.columns if (unified_feat[col].dtype in ['bool']) ]    
unified_feat[bool_columns] = unified_feat[bool_columns].astype('int64')

# We will label encode for xgb later on
from sklearn.preprocessing import LabelEncoder
# label encode cats
label_encode_dict = {}

categorical = unified_feat.select_dtypes(include=pd.CategoricalDtype).columns 
for column in categorical:
    label_encode_dict[column] = LabelEncoder()
    unified_feat[column] =  label_encode_dict[column].fit_transform(unified_feat[column])
    unified_feat[column] = unified_feat[column].astype('int64')

### Fix for Int64D
Int64D = unified_feat.select_dtypes(include=[pd.Int64Dtype]).columns
unified_feat[Int64D] = unified_feat[Int64D].fillna(0)
unified_feat[Int64D] = unified_feat[Int64D].astype('int64')

### fix unit8
uint8 = unified_feat.select_dtypes(include=['uint8']).columns
unified_feat[uint8] = unified_feat[uint8].astype('int64')

nan_columns = unified_feat.columns[unified_feat.isna().any()].tolist()
unified_feat.replace([np.inf, -np.inf], np.nan, inplace=True)
unified_feat[nan_columns] = unified_feat[nan_columns].fillna(0)

train_feats = unified_feat.loc[train_index].merge(train_target, how='left', 
                                               left_index=True, right_index=True)
test_feats = unified_feat.loc[test_index]

CPU times: user 47.8 s, sys: 9.15 s, total: 56.9 s
Wall time: 56.9 s


In [11]:
%%time
train_feats.to_parquet('data_eng/feats/train_feats.parquet')
test_feats.to_parquet('data_eng/feats/test_feats.parquet')

CPU times: user 5.72 s, sys: 315 ms, total: 6.04 s
Wall time: 5.39 s
