# Advanced Feature Engineering

In this notebook we will look at advanced feature engineering and compare the run time of pandas and cuDF on the same pipeline.

## RAPIDS cuDF

In [1]:
from feature_engineering_2 import process_unified, process_bureau_and_balance, process_previous_applications
import cudf as dd
import gc

In [2]:
# Load IPython's autoreload extension and switch it to mode 2 so that edits to
# imported .py modules (here: feature_engineering_2) are picked up live,
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
%%time
# Read every raw table through `dd` (bound to cuDF in this section).
# NOTE(review): cc_balance, payments, pc_balance and prev are loaded here but
# are not referenced by any later cell visible in this notebook.
train = dd.read_parquet('raw_data/train.parquet')
test = dd.read_parquet('raw_data/test.parquet')
bureau = dd.read_parquet('raw_data/bureau.parquet')
bureau_balance = dd.read_parquet('raw_data/bureau_balance.parquet')
prev = dd.read_parquet('raw_data/prev.parquet')
cc_balance = dd.read_parquet('raw_data/cc_balance.parquet')
pc_balance = dd.read_parquet('raw_data/pc_balance.parquet')
payments = dd.read_parquet('raw_data/payments.parquet')

# Remember the index labels of each split so the combined frame can be
# separated again after feature engineering.
train_index, test_index = train.index, test.index

# Keep the label aside, then stack train (without its label) on top of test.
train_target = train['TARGET']
unified = dd.concat([train.drop('TARGET', axis=1), test])

# The per-split frames are no longer needed: drop the names and trigger a
# garbage-collection pass. The return value of gc.collect() is the cell output.
del train, test
gc.collect()

CPU times: user 1.35 s, sys: 357 ms, total: 1.71 s
Wall time: 1.73 s


0

In [4]:
%%time

# Features computed on the combined train+test frame; `dd` (cuDF in this
# section) is handed to the helper alongside the data.
unified_feat = process_unified(unified, dd)

# Bureau features built from bureau + bureau_balance, then left-joined onto
# the unified features using SK_ID_CURR.
bureau_agg = process_bureau_and_balance(bureau, bureau_balance, dd)
unified_feat = unified_feat.join(bureau_agg, on='SK_ID_CURR', how='left')

# bool columns cannot be used in xgb later on, so cast them to int64
bool_columns = [name for name in unified_feat.columns
                if unified_feat[name].dtype == 'bool']
unified_feat[bool_columns] = unified_feat[bool_columns].astype('int64')

# Split the combined frame back into test and train via the saved indexes;
# the train part gets its TARGET column back through an index-on-index merge.
# NOTE(review): selecting with .loc on the saved indexes presumably relies on
# train/test index labels being unique and disjoint - confirm against the data.
test_feats = unified_feat.loc[test_index]
train_feats = unified_feat.loc[train_index].merge(
    train_target, left_index=True, right_index=True, how='left'
)

CPU times: user 1.5 s, sys: 778 ms, total: 2.28 s
Wall time: 2.3 s


In [5]:
%%time
# Persist both feature sets as parquet files under data_eng/feats/.
for feats, path in (
    (train_feats, 'data_eng/feats/train_feats.parquet'),
    (test_feats, 'data_eng/feats/test_feats.parquet'),
):
    feats.to_parquet(path)

CPU times: user 389 ms, sys: 195 ms, total: 584 ms
Wall time: 584 ms


## Pandas

In [7]:
from feature_engineering_2 import process_unified, process_bureau_and_balance
import pandas as dd
import gc

In [8]:
%%time
# Read every raw table through `dd` (bound to pandas in this section).
# NOTE(review): cc_balance, payments, pc_balance and prev are loaded here but
# are not referenced by any later cell visible in this notebook.
train = dd.read_parquet('raw_data/train.parquet')
test = dd.read_parquet('raw_data/test.parquet')
bureau = dd.read_parquet('raw_data/bureau.parquet')
bureau_balance = dd.read_parquet('raw_data/bureau_balance.parquet')
prev = dd.read_parquet('raw_data/prev.parquet')
cc_balance = dd.read_parquet('raw_data/cc_balance.parquet')
pc_balance = dd.read_parquet('raw_data/pc_balance.parquet')
payments = dd.read_parquet('raw_data/payments.parquet')

# Remember the index labels of each split so the combined frame can be
# separated again after feature engineering.
train_index, test_index = train.index, test.index

# Keep the label aside, then stack train (without its label) on top of test.
train_target = train['TARGET']
unified = dd.concat([train.drop('TARGET', axis=1), test])

# The per-split frames are no longer needed: drop the names and trigger a
# garbage-collection pass. The return value of gc.collect() is the cell output.
del train, test
gc.collect()

CPU times: user 5.26 s, sys: 4.22 s, total: 9.48 s
Wall time: 5.08 s


0

In [9]:
# Workaround: the process functions do not work with columns of type
# `category`, so the affected columns are cast to `object` in place.
for frame, column in (
    (bureau_balance, 'STATUS'),
    (bureau, 'CREDIT_ACTIVE'),
    (bureau, 'CREDIT_CURRENCY'),
):
    frame[column] = frame[column].astype('object')

In [10]:
%%time

# Features computed on the combined train+test frame; `dd` (pandas in this
# section) is handed to the helper alongside the data.
unified_feat = process_unified(unified, dd)

# Bureau features built from bureau + bureau_balance, then left-joined onto
# the unified features using SK_ID_CURR.
bureau_agg = process_bureau_and_balance(bureau, bureau_balance, dd)
unified_feat = unified_feat.join(bureau_agg, on='SK_ID_CURR', how='left')

# bool columns cannot be used in xgb later on, so cast them to int64
bool_columns = [name for name in unified_feat.columns
                if unified_feat[name].dtype == 'bool']
unified_feat[bool_columns] = unified_feat[bool_columns].astype('int64')

# Split the combined frame back into test and train via the saved indexes;
# the train part gets its TARGET column back through an index-on-index merge.
# NOTE(review): selecting with .loc on the saved indexes presumably relies on
# train/test index labels being unique and disjoint - confirm against the data.
test_feats = unified_feat.loc[test_index]
train_feats = unified_feat.loc[train_index].merge(
    train_target, left_index=True, right_index=True, how='left'
)

CPU times: user 8.85 s, sys: 1.45 s, total: 10.3 s
Wall time: 10.3 s


In [11]:
%%time
# Persist both feature sets as parquet files under data_eng/feats/.
# NOTE(review): these are the same paths the cuDF section writes to, so the
# files produced there are overwritten by this cell.
for feats, path in (
    (train_feats, 'data_eng/feats/train_feats.parquet'),
    (test_feats, 'data_eng/feats/test_feats.parquet'),
):
    feats.to_parquet(path)

CPU times: user 2.79 s, sys: 147 ms, total: 2.94 s
Wall time: 2.31 s
