# Advanced Feature Engineering

In this notebook we will look at advanced feature engineering and compare how pandas and RAPIDS cuDF perform on the same pipeline.

## RAPIDS cuDF

In [14]:
from feature_engineering_2 import (
    pos_cash, process_unified, process_bureau_and_balance, 
    process_previous_applications, installments_payments,
    credit_card_balance
    )
import cudf as dd
import gc
import numpy as np
import pandas as pd

In [15]:
# Switch cuDF (imported as `dd`) to its "managed" allocator before any data is read.
# Author's note: this allows GPU RAM to overflow into normal RAM and hence
# avoids Out of Memory errors.
dd.set_allocator("managed")

In [16]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [21]:
%%time
bureau_balance = dd.read_parquet('raw_data/bureau_balance.parquet')
bureau = dd.read_parquet('raw_data/bureau.parquet')
cc_balance = dd.read_parquet('raw_data/cc_balance.parquet')
payments = dd.read_parquet('raw_data/payments.parquet')
pc_balance = dd.read_parquet('raw_data/pc_balance.parquet')
prev = dd.read_parquet('raw_data/prev.parquet')
train = dd.read_parquet('raw_data/train.parquet')
test = dd.read_parquet('raw_data/test.parquet')

train_index = train.index
test_index = test.index

train_target = train['TARGET']
unified = dd.concat([train.drop('TARGET', axis=1), test])

del(train)
del(test)
gc.collect()

CPU times: user 1.33 s, sys: 255 ms, total: 1.58 s
Wall time: 1.58 s


0

In [48]:
%%time

unified_feat = process_unified(unified, dd)

bureau_agg = process_bureau_and_balance(bureau, bureau_balance, dd)

prev_agg = process_previous_applications(prev, dd)
pos_agg = pos_cash(pc_balance, dd)
ins_agg = installments_payments(payments, dd)
cc_agg = credit_card_balance(cc_balance, dd)

unified_feat = unified_feat.join(bureau_agg, how='left', on='SK_ID_CURR') \
    .join(prev_agg, how='left', on='SK_ID_CURR') \
    .join(pos_agg, how='left', on='SK_ID_CURR') \
    .join(ins_agg, how='left', on='SK_ID_CURR') \
    .join(cc_agg, how='left', on='SK_ID_CURR')

# we can't use bool column types in xgb later on
bool_columns = [col for col in unified_feat.columns if (unified_feat[col].dtype in ['bool']) ]    
unified_feat[bool_columns] = unified_feat[bool_columns].astype('int64')

# We will label encode for xgb later on
from sklearn.preprocessing import LabelEncoder
# label encode cats
label_encode_dict = {}

categorical = unified_feat.select_dtypes(include=pd.CategoricalDtype).columns 
for column in categorical:
    label_encode_dict[column] = LabelEncoder()
    unified_feat[column] =  label_encode_dict[column].fit_transform(unified_feat[column])
    unified_feat[column] = unified_feat[column].astype('int64')

### Fix for Int64D
Int64D = unified_feat.select_dtypes(include=[pd.Int64Dtype]).columns
unified_feat[Int64D] = unified_feat[Int64D].fillna(0)
unified_feat[Int64D] = unified_feat[Int64D].astype('int64')

### fix unit8
uint8 = unified_feat.select_dtypes(include=['uint8']).columns
unified_feat[uint8] = unified_feat[uint8].astype('int64')

#unified_feat.replace([np.inf, -np.inf], np.nan, inplace=True)
na_cols = unified_feat.isna().any()[unified_feat.isna().any()==True].index.to_arrow().to_pylist()
unified_feat[na_cols] = unified_feat[na_cols].fillna(0)

train_feats = unified_feat.loc[train_index].merge(train_target, how='left', 
                                               left_index=True, right_index=True)
test_feats = unified_feat.loc[test_index]

CPU times: user 13.7 s, sys: 2.37 s, total: 16.1 s
Wall time: 16.1 s


In [47]:
%%time
train_feats.to_parquet('data_eng/feats/train_feats.parquet')
test_feats.to_parquet('data_eng/feats/test_feats.parquet')

CPU times: user 3.65 s, sys: 437 ms, total: 4.08 s
Wall time: 4.35 s


## Pandas

In [3]:
# Import everything the pandas cells below use, so this section also runs on its
# own instead of relying on names left in the kernel by the cuDF section.
from feature_engineering_2 import (
    pos_cash, process_unified, process_bureau_and_balance,
    process_previous_applications, installments_payments,
    credit_card_balance
    )
import pandas as dd  # same `dd` alias as the cuDF section, now bound to pandas
import gc
import numpy as np
import pandas as pd

In [4]:
%%time
bureau_balance = dd.read_parquet('raw_data/bureau_balance.parquet')
bureau = dd.read_parquet('raw_data/bureau.parquet')
cc_balance = dd.read_parquet('raw_data/cc_balance.parquet')
payments = dd.read_parquet('raw_data/payments.parquet')
pc_balance = dd.read_parquet('raw_data/pc_balance.parquet')
prev = dd.read_parquet('raw_data/prev.parquet')
train = dd.read_parquet('raw_data/train.parquet')
test = dd.read_parquet('raw_data/test.parquet')

train_index = train.index
test_index = test.index

train_target = train['TARGET']
unified = dd.concat([train.drop('TARGET', axis=1), test])

del(train)
del(test)
gc.collect()

CPU times: user 7.03 s, sys: 4.39 s, total: 11.4 s
Wall time: 5.4 s


0

In [5]:
# Workaround: the process functions do not work with columns of type `category`,
# so the affected columns are cast to plain `object` before they are processed.
for _frame, _column in (
    (bureau_balance, 'STATUS'),
    (bureau, 'CREDIT_ACTIVE'),
    (bureau, 'CREDIT_CURRENCY'),
    (prev, 'NAME_CONTRACT_STATUS'),
):
    _frame[_column] = _frame[_column].astype('object')

In [8]:
%%time

unified_feat = process_unified(unified, dd)

bureau_agg = process_bureau_and_balance(bureau, bureau_balance, dd)

prev_agg = process_previous_applications(prev, dd)
pos_agg = pos_cash(pc_balance, dd)
ins_agg = installments_payments(payments, dd)
cc_agg = credit_card_balance(cc_balance, dd)

unified_feat = unified_feat.join(bureau_agg, how='left', on='SK_ID_CURR') \
    .join(prev_agg, how='left', on='SK_ID_CURR') \
    .join(pos_agg, how='left', on='SK_ID_CURR') \
    .join(ins_agg, how='left', on='SK_ID_CURR') \
    .join(cc_agg, how='left', on='SK_ID_CURR')

# we can't use bool column types in xgb later on
bool_columns = [col for col in unified_feat.columns if (unified_feat[col].dtype in ['bool']) ]    
unified_feat[bool_columns] = unified_feat[bool_columns].astype('int64')

# We will label encode for xgb later on
from sklearn.preprocessing import LabelEncoder
# label encode cats
label_encode_dict = {}

categorical = unified_feat.select_dtypes(include=pd.CategoricalDtype).columns 
for column in categorical:
    label_encode_dict[column] = LabelEncoder()
    unified_feat[column] =  label_encode_dict[column].fit_transform(unified_feat[column])
    unified_feat[column] = unified_feat[column].astype('int64')

### Fix for Int64D
Int64D = unified_feat.select_dtypes(include=[pd.Int64Dtype]).columns
unified_feat[Int64D] = unified_feat[Int64D].fillna(0)
unified_feat[Int64D] = unified_feat[Int64D].astype('int64')

### fix unit8
uint8 = unified_feat.select_dtypes(include=['uint8']).columns
unified_feat[uint8] = unified_feat[uint8].astype('int64')

nan_columns = unified_feat.columns[unified_feat.isna().any()].tolist()
unified_feat.replace([np.inf, -np.inf], np.nan, inplace=True)
unified_feat[nan_columns] = unified_feat[nan_columns].fillna(0)

train_feats = unified_feat.loc[train_index].merge(train_target, how='left', 
                                               left_index=True, right_index=True)
test_feats = unified_feat.loc[test_index]

CPU times: user 36.1 s, sys: 2 s, total: 38.1 s
Wall time: 38.1 s


In [13]:
%%time
train_feats.to_parquet('data_eng/feats/train_feats.parquet')
test_feats.to_parquet('data_eng/feats/test_feats.parquet')

CPU times: user 5.53 s, sys: 280 ms, total: 5.81 s
Wall time: 4.83 s
