# Advanced Feature Engineering

In this notebook we will look at advanced feature engineering and compare how pandas and RAPIDS cuDF perform on the same pipeline.

## RAPIDS cuDF

In [1]:
from feature_engineering import (
    pos_cash, process_unified, process_bureau_and_balance, 
    process_previous_applications, installments_payments,
    credit_card_balance
    )
import cudf as xd
import gc
import rmm
import numpy as np
import pandas as pd

In [None]:
import os

# Switch the working directory before the relative 'raw_data/...' and
# 'data_eng/...' paths used further down are resolved.
# The default is the location this notebook was written against; set the
# PROJECT_ROOT environment variable to run it from a different checkout.
# NOTE(review): presumably this directory is expected to contain `raw_data/`
# and `data_eng/` - confirm against the project layout.
PROJECT_ROOT = os.environ.get('PROJECT_ROOT', '/home/cdsw')
os.chdir(PROJECT_ROOT)

In [2]:
# Re-initialise RMM with managed memory enabled for device allocations.
# NOTE(review): the original remark here read "roughly 14GB pool", but this call
# passes neither a pool allocator flag nor a pool size, so no pool of a fixed
# size is requested. Presumably 14GB refers to the capacity of the GPU that was
# used - confirm, or pass pool_allocator / initial_pool_size explicitly.
rmm.reinitialize(managed_memory=True)

In [3]:
%load_ext autoreload
%autoreload 2

In [9]:
%%time
# Read every raw table through the `xd` alias (bound to cuDF by the import cell
# of this section). The paths are listed in the same order in which the
# resulting frames are unpacked below.
raw_paths = [
    'raw_data/bureau_balance.parquet',
    'raw_data/bureau.parquet',
    'raw_data/cc_balance.parquet',
    'raw_data/payments.parquet',
    'raw_data/pc_balance.parquet',
    'raw_data/prev.parquet',
    'raw_data/train.parquet',
    'raw_data/test.parquet',
]
(
    bureau_balance,
    bureau,
    cc_balance,
    payments,
    pc_balance,
    prev,
    train,
    test,
) = map(xd.read_parquet, raw_paths)

CPU times: user 1.61 s, sys: 720 ms, total: 2.33 s
Wall time: 2.3 s


In [10]:
# Set the label column aside; it is merged back onto the training rows once the
# features have been built.
train_target = train['TARGET']

# A step that dropped every column whose dtype differs between train and test
# used to live here as commented-out code; it is not executed.

# Remember the index labels of each split so the combined frame can be split
# again later with `.loc`.
# NOTE(review): this only separates the splits correctly if train and test carry
# distinct index labels and those labels are still present on the frame after
# process_unified and the merges - confirm.
train_index, test_index = train.index, test.index

# Stack train (without its label) on top of test so that both splits go through
# exactly the same feature engineering.
unified = xd.concat([train.drop('TARGET', axis=1), test])

# The individual splits are no longer needed once `unified` exists.
del train, test
gc.collect()

575

In [11]:
%%time

# We will label encode for xgb later on
from sklearn.preprocessing import LabelEncoder

# --- 1. Per-table feature engineering ----------------------------------------
# `xd` is handed to every helper so that it works with the dataframe library
# used in this section.
unified_feat = process_unified(unified, xd)

bureau_agg = process_bureau_and_balance(bureau, bureau_balance, xd)
# Drop the raw inputs as soon as they have been consumed so that their memory
# can be reclaimed before the remaining tables are processed.
del unified, bureau, bureau_balance

prev_agg = process_previous_applications(prev, xd)
pos_agg = pos_cash(pc_balance, xd)
ins_agg = installments_payments(payments, xd)
cc_agg = credit_card_balance(cc_balance, xd)

del prev, pc_balance, payments, cc_balance
gc.collect()

# --- 2. Join every aggregate onto the application rows -----------------------
unified_feat = (
    unified_feat
    .merge(bureau_agg, how='left', on='SK_ID_CURR')
    .merge(prev_agg, how='left', on='SK_ID_CURR')
    .merge(pos_agg, how='left', on='SK_ID_CURR')
    .merge(ins_agg, how='left', on='SK_ID_CURR')
    .merge(cc_agg, how='left', on='SK_ID_CURR')
)

del bureau_agg, prev_agg, pos_agg, ins_agg, cc_agg
gc.collect()

# --- 3. Normalise dtypes ------------------------------------------------------
# we can't use bool column types in xgb later on
# (dtypes are read from the frame's dtype table instead of pulling out each
# column just to look at its dtype)
bool_columns = [col for col, dtype in unified_feat.dtypes.items() if dtype == 'bool']
unified_feat[bool_columns] = unified_feat[bool_columns].astype('int64')

# Label encode categorical columns; the fitted encoders are kept per column in
# `label_encode_dict`.
label_encode_dict = {}

categorical = unified_feat.select_dtypes(include=pd.CategoricalDtype).columns
for column in categorical:
    encoder = LabelEncoder()
    label_encode_dict[column] = encoder
    unified_feat[column] = encoder.fit_transform(unified_feat[column])
    unified_feat[column] = unified_feat[column].astype('int64')

# Nullable Int64 columns: fill the missing values with 0, then cast to plain int64.
Int64D = unified_feat.select_dtypes(include=[pd.Int64Dtype]).columns
unified_feat[Int64D] = unified_feat[Int64D].fillna(0).astype('int64')

# uint8 columns are widened to int64 as well.
uint8 = unified_feat.select_dtypes(include=['uint8']).columns
unified_feat[uint8] = unified_feat[uint8].astype('int64')

# --- 4. Fill the remaining missing values with 0 -----------------------------
# NOTE(review): unlike the pandas section, +/-inf values are NOT converted to
# NaN here (the `replace([np.inf, -np.inf], np.nan)` call was commented out in
# this cell), so the two pipelines can yield different features. Confirm whether
# that is intended.
# The per-column "has any NaN" mask is computed once and reused for the lookup.
has_na = unified_feat.isna().any()
na_cols = has_na[has_na].index.to_arrow().to_pylist()
unified_feat[na_cols] = unified_feat[na_cols].fillna(0)

# --- 5. Split back into train / test -----------------------------------------
# NOTE(review): the split relies on the index labels captured before
# concatenation still identifying the right rows after the merges above. If
# train and test share labels, or the merges re-number the index, rows end up in
# the wrong split - confirm, or split on SK_ID_CURR instead.
train_feats = unified_feat.loc[train_index].merge(
    train_target, how='left', left_index=True, right_index=True
)
test_feats = unified_feat.loc[test_index]

  return infer_dtype_from_object(dtype)


CPU times: user 8.07 s, sys: 2.49 s, total: 10.6 s
Wall time: 13.7 s


In [40]:
%%time
import os

# Make sure the output folder exists so the writes below do not fail on a fresh
# checkout where `data_eng/feats` has not been created yet.
os.makedirs('data_eng/feats', exist_ok=True)

train_feats.to_parquet('data_eng/feats/train_feats.parquet')
# The training frame is dropped before the test frame is written.
del train_feats

test_feats.to_parquet('data_eng/feats/test_feats.parquet')

CPU times: user 3.35 s, sys: 708 ms, total: 4.06 s
Wall time: 6.52 s


## Pandas

In [4]:
from feature_engineering import (
    pos_cash, process_unified, process_bureau_and_balance, 
    process_previous_applications, installments_payments,
    credit_card_balance 
    )
import pandas as xd
import gc

In [5]:
%%time
# Read every raw table through the `xd` alias (bound to pandas by the import
# cell of this section). The paths are listed in the same order in which the
# resulting frames are unpacked below.
raw_paths = [
    'raw_data/bureau_balance.parquet',
    'raw_data/bureau.parquet',
    'raw_data/cc_balance.parquet',
    'raw_data/payments.parquet',
    'raw_data/pc_balance.parquet',
    'raw_data/prev.parquet',
    'raw_data/train.parquet',
    'raw_data/test.parquet',
]
(
    bureau_balance,
    bureau,
    cc_balance,
    payments,
    pc_balance,
    prev,
    train,
    test,
) = map(xd.read_parquet, raw_paths)

# Remember the index labels of each split so the combined frame can be split
# again later with `.loc`.
# NOTE(review): this only separates the splits correctly if train and test carry
# distinct index labels and those labels are still present on the frame after
# feature engineering - confirm.
train_index, test_index = train.index, test.index

# Set the label aside, then stack train (without its label) on top of test so
# that both splits go through exactly the same feature engineering.
train_target = train['TARGET']
unified = xd.concat([train.drop('TARGET', axis=1), test])

# The individual splits are no longer needed once `unified` exists.
del train, test
gc.collect()

CPU times: user 9.39 s, sys: 4.6 s, total: 14 s
Wall time: 4.94 s


0

In [6]:
# Workaround: the process functions do not work with columns of type `category`,
# so each affected column is cast to `object` first.
category_casts = (
    (bureau_balance, 'STATUS'),
    (bureau, 'CREDIT_ACTIVE'),
    (bureau, 'CREDIT_CURRENCY'),
    (prev, 'NAME_CONTRACT_STATUS'),
)
for frame, column in category_casts:
    frame[column] = frame[column].astype('object')

# Do not leave loop helpers (and the frame references they hold) in the namespace.
del category_casts, frame, column

In [7]:
%%time

# We will label encode for xgb later on
from sklearn.preprocessing import LabelEncoder

# --- 1. Per-table feature engineering ----------------------------------------
# `xd` is handed to every helper so that it works with the dataframe library
# used in this section.
unified_feat = process_unified(unified, xd)

bureau_agg = process_bureau_and_balance(bureau, bureau_balance, xd)

prev_agg = process_previous_applications(prev, xd)
pos_agg = pos_cash(pc_balance, xd)
ins_agg = installments_payments(payments, xd)
cc_agg = credit_card_balance(cc_balance, xd)

# --- 2. Join every aggregate onto the application rows -----------------------
unified_feat = (
    unified_feat
    .merge(bureau_agg, how='left', on='SK_ID_CURR')
    .merge(prev_agg, how='left', on='SK_ID_CURR')
    .merge(pos_agg, how='left', on='SK_ID_CURR')
    .merge(ins_agg, how='left', on='SK_ID_CURR')
    .merge(cc_agg, how='left', on='SK_ID_CURR')
)

# --- 3. Normalise dtypes ------------------------------------------------------
# we can't use bool column types in xgb later on
# (dtypes are read from the frame's dtype table instead of pulling out each
# column just to look at its dtype)
bool_columns = [col for col, dtype in unified_feat.dtypes.items() if dtype == 'bool']
unified_feat[bool_columns] = unified_feat[bool_columns].astype('int64')

# Label encode categorical columns; the fitted encoders are kept per column in
# `label_encode_dict`.
label_encode_dict = {}

categorical = unified_feat.select_dtypes(include=pd.CategoricalDtype).columns
for column in categorical:
    encoder = LabelEncoder()
    label_encode_dict[column] = encoder
    unified_feat[column] = encoder.fit_transform(unified_feat[column])
    unified_feat[column] = unified_feat[column].astype('int64')

# Nullable Int64 columns: fill the missing values with 0, then cast to plain int64.
Int64D = unified_feat.select_dtypes(include=[pd.Int64Dtype]).columns
unified_feat[Int64D] = unified_feat[Int64D].fillna(0).astype('int64')

# uint8 columns are widened to int64 as well.
uint8 = unified_feat.select_dtypes(include=['uint8']).columns
unified_feat[uint8] = unified_feat[uint8].astype('int64')

# --- 4. Turn +/-inf into NaN, then fill every missing value with 0 -----------
# The replacement has to happen BEFORE the NaN columns are collected: otherwise
# a column that contains inf but no NaN is not in the list and keeps the NaNs
# that the replacement just introduced.
unified_feat = unified_feat.replace([np.inf, -np.inf], np.nan)
nan_columns = unified_feat.columns[unified_feat.isna().any()].tolist()
unified_feat[nan_columns] = unified_feat[nan_columns].fillna(0)

# --- 5. Split back into train / test -----------------------------------------
# NOTE(review): the split relies on the index labels captured before
# concatenation still identifying the right rows after the merges above. If
# train and test share labels, or the merges re-number the index, rows end up in
# the wrong split - confirm, or split on SK_ID_CURR instead.
train_feats = unified_feat.loc[train_index].merge(
    train_target, how='left', left_index=True, right_index=True
)
test_feats = unified_feat.loc[test_index]

  income_by_organisation = unified[['AMT_INCOME_TOTAL', 'ORGANIZATION_TYPE']].groupby('ORGANIZATION_TYPE').median()['AMT_INCOME_TOTAL']


CPU times: user 1min 1s, sys: 12.3 s, total: 1min 13s
Wall time: 1min 16s


In [8]:
%%time
import os

# Make sure the output folder exists so the writes below do not fail on a fresh
# checkout where `data_eng/feats` has not been created yet.
os.makedirs('data_eng/feats', exist_ok=True)

# NOTE(review): these are the same two paths the cuDF section writes to, so
# whichever section runs last overwrites the other's output - confirm that this
# is intended.
train_feats.to_parquet('data_eng/feats/train_feats.parquet')
test_feats.to_parquet('data_eng/feats/test_feats.parquet')

CPU times: user 7.83 s, sys: 873 ms, total: 8.7 s
Wall time: 9.52 s
