## Feature Engineering

In [19]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
import datetime
import missingno as msno
import lightgbm as lgb
import xgboost as xgb
from sklearn import preprocessing
from sklearn import metrics
from sklearn.model_selection import KFold, TimeSeriesSplit, train_test_split,StratifiedKFold
import gc
from statistics import mean 
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
import pickle

In [20]:
train_full = pd.read_pickle('data/train_full.pkl')
test_full = pd.read_pickle('data/test_full.pkl')


In [21]:
train_full.shape

(590540, 434)

Start with Time features

In [22]:
train_feat = train_full.copy()
test_feat = test_full.copy()

REMOVE_COLS = []

In [23]:
train_feat['Transaction_day_of_week'] = np.floor((train_full['TransactionDT'] / (3600 * 24) - 1) % 7)
test_feat['Transaction_day_of_week'] = np.floor((test_full['TransactionDT'] / (3600 * 24) - 1) % 7)
train_feat['Transaction_hour'] = np.floor(train_full['TransactionDT'] / 3600) % 24
test_feat['Transaction_hour'] = np.floor(test_full['TransactionDT'] / 3600) % 24

In [24]:
# decimal part of the transaction amount.
train_feat['TransactionAmt_decimal'] = ((train_full['TransactionAmt'] - train_full['TransactionAmt'].astype(int)) * 1000).astype(int)
test_feat['TransactionAmt_decimal'] = ((test_full['TransactionAmt'] - test_full['TransactionAmt'].astype(int)) * 1000).astype(int)

# Whether or not decimal in transaction amount
train_feat['TransactionAmt_decimalTF'] = np.where(train_feat['TransactionAmt_decimal']!=0,1,0)
test_feat['TransactionAmt_decimalTF'] = np.where(test_feat['TransactionAmt_decimal']!=0,1,0)

Extra Time Features

In [25]:
START_DATE = datetime.datetime.strptime('2017-11-30', '%Y-%m-%d')
dates_range = pd.date_range(start='2017-10-01', end='2019-01-01')
us_holidays = calendar().holidays(start=dates_range.min(), end=dates_range.max())

for df in [train_feat, test_feat]:
    # Temporary variables for aggregation
    df['DT'] = df['TransactionDT'].apply(lambda x: (START_DATE + datetime.timedelta(seconds = x)))
    df['DT_M'] = ((df['DT'].dt.year-2017)*12 + df['DT'].dt.month).astype(np.int8)
    df['DT_W'] = ((df['DT'].dt.year-2017)*52 + df['DT'].dt.weekofyear).astype(np.int8)
    df['DT_D'] = ((df['DT'].dt.year-2017)*365 + df['DT'].dt.dayofyear).astype(np.int16)
    df['DT_hour'] = (df['DT'].dt.hour).astype(np.int8)
    # Holidays
    df['is_holiday'] = (df['DT'].dt.date.astype('datetime64').isin(us_holidays)).astype(np.int8)

for col in ['DT_M','DT_W','DT_D', 'DT_hour']:
    temp_df = pd.concat([train_feat[[col]], test_feat[[col]]])
    fq_encode = temp_df[col].value_counts().to_dict()
            
    train_feat[col+'_total'] = train_feat[col].map(fq_encode)
    test_feat[col+'_total']  = test_feat[col].map(fq_encode)
    
    # We can't use it as solo feature
    REMOVE_COLS.append(col+'_total')
    
# Remove temporary features from final list
REMOVE_COLS += ['DT','DT_M','DT_W','DT_D','DT_hour']

Try frequency encoding by timeblock

In [26]:
def timeblock_frequency_encoding(train_df, test_df, periods, columns, 
                                 with_proportions=True, only_proportions=False):
    for period in periods:
        for col in columns:
            new_col = col +'_'+ period
            train_df[new_col] = train_df[col].astype(str)+'_'+train_df[period].astype(str)
            test_df[new_col]  = test_df[col].astype(str)+'_'+test_df[period].astype(str)

            temp_df = pd.concat([train_df[[new_col]], test_df[[new_col]]])
            fq_encode = temp_df[new_col].value_counts().to_dict()

            train_df[new_col] = train_df[new_col].map(fq_encode)
            test_df[new_col]  = test_df[new_col].map(fq_encode)
            
            if only_proportions:
                train_df[new_col] = train_df[new_col]/train_df[period+'_total']
                test_df[new_col]  = test_df[new_col]/test_df[period+'_total']

            if with_proportions:
                train_df[new_col+'_proportions'] = train_df[new_col]/train_df[period+'_total']
                test_df[new_col+'_proportions']  = test_df[new_col]/test_df[period+'_total']

    return train_df, test_df

# time block frequency encoding 
for df in [train_feat, test_feat]:
    df['bank_type'] = df['card3'].astype(str) +'_'+ df['card5'].astype(str)
REMOVE_COLS.append('bank_type')
periods = ['DT_M','DT_W','DT_D']

# Product type
train_feat['product_type'] = train_feat['ProductCD'].astype(str)+'_'+train_feat['TransactionAmt'].astype(str)
test_feat['product_type'] = test_feat['ProductCD'].astype(str)+'_'+test_feat['TransactionAmt'].astype(str)
REMOVE_COLS.append('product_type')
i_cols = ['product_type', 'bank_type']
periods = ['DT_D','DT_W','DT_M']
train_feat, test_feat = timeblock_frequency_encoding(train_feat, test_feat, periods, i_cols, 
                                                 with_proportions=False, only_proportions=True)

Card1-Card6
- frequency encoding
- lets try group the rare cards in the 2nd round

In [27]:
for col in ['card1','card2','card3','card4','card5','card6']:
    train_feat[col]=train_feat[col].astype(object)
    test_feat[col]=test_feat[col].astype(object)
    
    temp_df = pd.concat([train_feat[[col]], test_feat[[col]]])
    fq_encode = temp_df[col].value_counts(dropna=False).to_dict()   
    train_feat[col+'_freq'] = train_feat[col].map(fq_encode)
    test_feat[col+'_freq']  = test_feat[col].map(fq_encode)


In [28]:
train_feat[['card1_freq','card2_freq','card3_freq','card4_freq','card5_freq','card6_freq']].head()

Unnamed: 0,card1_freq,card2_freq,card3_freq,card4_freq,card5_freq,card6_freq
0,56,17587,956845,9524,309,267648
1,1338,5593,956845,347386,49491,267648
2,1794,70496,956845,719649,102930,824959
3,7635,11287,956845,347386,47061,824959
4,30,27225,956845,347386,49491,267648


In [29]:
for df in [train_feat, test_feat]:
    df['card1_card2'] = df['card1'].astype(str)+ '_' + df['card2'].astype(str)
    df['card4_card6'] = df['card4'].astype(str)+ '_' + df['card6'].astype(str)
    
for col in ['card1_card2','card4_card6']:
    temp_df = pd.concat([train_feat[[col]], test_feat[[col]]])
    fq_encode = temp_df[col].value_counts(dropna=False).to_dict()   
    train_feat[col+'_freq'] = train_feat[col].map(fq_encode)
    test_feat[col+'_freq']  = test_feat[col].map(fq_encode)

In [30]:
i_cols = ['card1','card2','card3','card5']

for col in i_cols:
    for agg_type in ['mean','std']:
        new_col_name = col+'_TransactionAmt_'+agg_type
        temp_df = pd.concat([train_feat[[col, 'TransactionAmt']], test_feat[[col,'TransactionAmt']]])
        temp_df = temp_df.groupby([col])['TransactionAmt'].agg([agg_type]).reset_index().rename(
                                                columns={agg_type: new_col_name})
        
        temp_df.index = list(temp_df[col])
        temp_df = temp_df[new_col_name].to_dict()   
    
        train_feat[new_col_name] = train_feat[col].map(temp_df)
        test_feat[new_col_name]  = test_feat[col].map(temp_df)

Addresses
- combine addr1 and addr2
- frequency encode the combo

In [31]:
for df in [train_feat, test_feat]:
    df['addr1_addr2'] =  df['addr1'].astype(str)+ '_' + df['addr2'].astype(str)

In [32]:
temp_df = pd.concat([train_feat[['addr1_addr2']], test_feat[['addr1_addr2']]])
fq_encode = temp_df['addr1_addr2'].value_counts(dropna=False).to_dict()   
train_feat['addr1_addr2_freq'] = train_feat['addr1_addr2'].map(fq_encode)
test_feat['addr1_addr2_freq']  = test_feat['addr1_addr2'].map(fq_encode)

Distances
- multiplication of dist1*dist2
- addition of dist1+dist2

In [33]:
for df in [train_feat, test_feat]:
    df['dist1_x_dist2'] = df['dist1'] * df['dist2']
    df['dist1_plus_dist2'] = df['dist1'] + df['dist2']

Email Domains
- group up gmail, hotmail, yahoo, anonymous, live, outlook, aol etc
- group up suffixes with .net and .com

In [34]:
# for df in [train_feat, test_feat]:
#     m1 = df.P_emaildomain.str.contains('gmail',na=False)
#     m2 = df.P_emaildomain.str.contains('hotmail',na=False)
#     m3 = df.P_emaildomain.str.contains('yahoo',na=False)
#     m4 = df.P_emaildomain.str.contains('anonymous',na=False)
#     m5 = df.P_emaildomain.str.contains('live',na=False)
#     m6 = df.P_emaildomain.str.contains('outlook',na=False)
#     m7= df.P_emaildomain.str.contains('aol',na=False)
#     m8= df.P_emaildomain.str.contains('msn',na=False)
#     m9=df.P_emaildomain.str.contains('icloud',na=False)
#     m10 =df.P_emaildomain.str.contains('comcast',na=False)

#     df['P_emaildomain_knowngroups'] = np.select([m1,m2,m3,m4,m5,m6,m7,m8,m9,m10], ['gmail','hotmail','yahoo','anon','live','outlook','aol','msn','icloud','comcast'], default='other')

In [35]:
# for df in [train_feat, test_feat]:
#     m1 = df.R_emaildomain.str.contains('gmail',na=False)
#     m2 = df.R_emaildomain.str.contains('hotmail',na=False)
#     m3 = df.R_emaildomain.str.contains('yahoo',na=False)
#     m4 = df.R_emaildomain.str.contains('anonymous',na=False)
#     m5 = df.R_emaildomain.str.contains('live',na=False)
#     m6 = df.R_emaildomain.str.contains('outlook',na=False)
#     m7= df.R_emaildomain.str.contains('aol',na=False)
#     m8= df.R_emaildomain.str.contains('msn',na=False)
#     m9= df.R_emaildomain.str.contains('icloud',na=False)
#     m10 =df.R_emaildomain.str.contains('comcast',na=False)

#     df['R_emaildomain_knowngroups'] = np.select([m1,m2,m3,m4,m5,m6,m7,m8,m9,m10], ['gmail','hotmail','yahoo','anon','live','outlook','aol','msn','icloud','comcast'], default='other')

In [36]:
# for df in [train_feat, test_feat]:
#     m1 = df.P_emaildomain.str.contains('net',na=False)
#     m2 = df.P_emaildomain.str.contains('com',na=False)

#     df['P_emaildomain_net_com'] = np.select([m1,m2], ['net','com'], default='other')

# for df in [train_feat, test_feat]:
#     m1 = df.R_emaildomain.str.contains('net',na=False)
#     m2 = df.R_emaildomain.str.contains('com',na=False)

#     df['R_emaildomain_net_com'] = np.select([m1,m2], ['net','com'], default='other')

In [37]:
emails = {'gmail': 'google', 'att.net': 'att', 'twc.com': 'spectrum', 'scranton.edu': 'other', 'optonline.net': 'other', 'hotmail.co.uk': 'microsoft', 'comcast.net': 'other', 'yahoo.com.mx': 'yahoo', 'yahoo.fr': 'yahoo', 'yahoo.es': 'yahoo', 'charter.net': 'spectrum', 'live.com': 'microsoft', 'aim.com': 'aol', 'hotmail.de': 'microsoft', 'centurylink.net': 'centurylink', 'gmail.com': 'google', 'me.com': 'apple', 'earthlink.net': 'other', 'gmx.de': 'other', 'web.de': 'other', 'cfl.rr.com': 'other', 'hotmail.com': 'microsoft', 'protonmail.com': 'other', 'hotmail.fr': 'microsoft', 'windstream.net': 'other', 'outlook.es': 'microsoft', 'yahoo.co.jp': 'yahoo', 'yahoo.de': 'yahoo', 'servicios-ta.com': 'other', 'netzero.net': 'other', 'suddenlink.net': 'other', 'roadrunner.com': 'other', 'sc.rr.com': 'other', 'live.fr': 'microsoft', 'verizon.net': 'yahoo', 'msn.com': 'microsoft', 'q.com': 'centurylink', 'prodigy.net.mx': 'att', 'frontier.com': 'yahoo', 'anonymous.com': 'other', 'rocketmail.com': 'yahoo', 'sbcglobal.net': 'att', 'frontiernet.net': 'yahoo', 'ymail.com': 'yahoo', 'outlook.com': 'microsoft', 'mail.com': 'other', 'bellsouth.net': 'other', 'embarqmail.com': 'centurylink', 'cableone.net': 'other', 'hotmail.es': 'microsoft', 'mac.com': 'apple', 'yahoo.co.uk': 'yahoo', 'netzero.com': 'other', 'yahoo.com': 'yahoo', 'live.com.mx': 'microsoft', 'ptd.net': 'other', 'cox.net': 'other', 'aol.com': 'aol', 'juno.com': 'other', 'icloud.com': 'apple'}
us_emails = ['gmail', 'net', 'edu']

In [38]:
for c in ['P_emaildomain', 'R_emaildomain']:
    train_feat[c + '_bin'] = train_feat[c].map(emails)
    test_feat[c + '_bin'] = test_feat[c].map(emails)
    
    train_feat[c + '_suffix'] = train_feat[c].map(lambda x: str(x).split('.')[-1])
    test_feat[c + '_suffix'] = test_feat[c].map(lambda x: str(x).split('.')[-1])
    
    train_feat[c + '_suffix'] = train_feat[c + '_suffix'].map(lambda x: x if str(x) not in us_emails else 'us')
    test_feat[c + '_suffix'] = test_feat[c + '_suffix'].map(lambda x: x if str(x) not in us_emails else 'us')

M Features
- count of M features == true
- convert T and F categories to numeric
- count number of NA in the M features

In [39]:
M_cols = ['M1','M2','M3','M5','M6','M7','M8','M9']
for c in M_cols:
    if train_feat[c].dtype == 'O':
        train_feat[c] = train_feat[c].map({"T": 1.0, "F": 0.0})
    if test_feat[c].dtype == 'O':
        test_feat[c] = test_feat[c].map({"T": 1.0, "F": 0.0})

# Sum each M together except for M4 which is catgeorical
for df in [train_feat, test_feat]:
    df['M_sum'] = df[M_cols].sum(axis=1).astype(np.int8)
    df['M_na'] = df[M_cols].isna().sum(axis=1).astype(np.int8)

IDs
- all ID's are categorical
- ID 14 is a timezone. Encode as categorical
- proxy vs not proxy (nan)
- split OS, browser and screen resolution

In [40]:
for df in [train_feat, test_feat]:
    df['id_proxy'] = np.where(df['id_23'].isin(['IP_PROXY:TRANSPARENT', 'IP_PROXY:ANONYMOUS', 'IP_PROXY:HIDDEN']),1,0)

In [41]:
for df in [train_feat, test_feat]:
    df['id_OS'] = df['id_30'].str.split(' ').str[0]
    df['id_browser'] = df['id_31'].str.split(' ').str[0]
    df['id_screen_width'] = df['id_33'].str.split('x').str[0]
    df['id_screen_height'] = df['id_33'].str.split('x').str[1]
    df['device_name'] = df['DeviceInfo'].str.split('/', expand=True)[0]
    df['device_version'] = df['DeviceInfo'].str.split('/', expand=True)[1]
    
    df['id_34'] = df['id_34'].str.split(':', expand=True)[1]
    df['id_23'] = df['id_23'].str.split(':', expand=True)[1]

In [42]:
browser_counts = train_feat['id_browser'].value_counts().to_dict()
browsers =[]
for key, value in browser_counts.items():
    if value>300:
        browsers.append(key)

In [43]:
for df in [train_feat, test_feat]:
    df['id_browser'] = np.where(df['id_browser'].isin(browsers), df['id_browser'], 'other')

In [44]:
new_browsers = ["samsung browser 7.0","opera 53.0","mobile safari 10.0","google search application 49.0",
               "firefox 60.0","edge 17.0","chrome 69.0","chrome 67.0 for android","chrome 63.0 for android",
               "chrome 63.0 for ios","chrome 64.0","chrome 64.0 for android",
               "chrome 64.0 for ios","chrome 65.0","chrome 65.0 for android","chrome 65.0 for ios",
                "chrome 66.0","chrome 66.0 for android","chrome 66.0 for ios"]

for df in [train_feat, test_feat]:
    df['new_browser'] = np.where(df['id_31'].isin(new_browsers),1,0)

C Features
- simple rowwise sum of C1-C9

In [45]:
i_cols = ['C1','C2', 'C3','C4','C5','C6','C7','C8','C9']
for df in [train_feat, test_feat]:
    df['C_sum'] = df[i_cols].sum(axis=1).astype(np.int8)

Target encoding for ProductCD and M4
- careful of leakage

In [46]:
# ProductCD and M4 Target mean
for col in ['ProductCD','M4']:
    temp_dict = train_feat.groupby([col])['isFraud'].agg(['mean']).reset_index().rename(
                                                        columns={'mean': col+'_target_mean'})
    temp_dict.index = temp_dict[col].values
    temp_dict = temp_dict[col+'_target_mean'].to_dict()

    train_feat[col+'_target_mean'] = train_feat[col].map(temp_dict)
    test_feat[col+'_target_mean']  = test_feat[col].map(temp_dict)

Aggregations by groups

In [47]:
columns_a = ['TransactionAmt', 'id_02', 'D15']
columns_b = ['card1', 'card4', 'addr1']

for col_a in columns_a:
    for col_b in columns_b:
        for df in [train_feat, test_feat]:
            df[f'{col_a}_to_mean_{col_b}'] = df[col_a] / df.groupby([col_b])[col_a].transform('mean')
            df[f'{col_a}_to_std_{col_b}'] = df[col_a] / df.groupby([col_b])[col_a].transform('std')


Add some more features around transaction amount

In [48]:
# TransactionAmt Features

# Let's add some kind of client uID based on cardID ad addr columns
# The value will be very specific for each client so we need to remove it
# from final feature. But we can use it for aggregations.
train_feat['uid'] = train_feat['card1'].astype(str)+'_'+train_feat['card2'].astype(str)
test_feat['uid'] = test_feat['card1'].astype(str)+'_'+test_feat['card2'].astype(str)

train_feat['uid2'] = train_feat['uid'].astype(str)+'_'+train_feat['card3'].astype(str)+'_'+train_feat['card5'].astype(str)
test_feat['uid2'] = test_feat['uid'].astype(str)+'_'+test_feat['card3'].astype(str)+'_'+test_feat['card5'].astype(str)

train_feat['uid3'] = train_feat['uid2'].astype(str)+'_'+train_feat['addr1'].astype(str)+'_'+train_feat['addr2'].astype(str)
test_feat['uid3'] = test_feat['uid2'].astype(str)+'_'+test_feat['addr1'].astype(str)+'_'+test_feat['addr2'].astype(str)

# Check if the Transaction Amount is common or not (we can use freq encoding here)
# In our dialog with a model we are telling to trust or not to these values   
train_feat['TransactionAmt_check'] = np.where(train_feat['TransactionAmt'].isin(test_feat['TransactionAmt']), 1, 0)
test_feat['TransactionAmt_check']  = np.where(test_feat['TransactionAmt'].isin(train_feat['TransactionAmt']), 1, 0)

# For our model current TransactionAmt is a noise
# https://www.kaggle.com/kyakovlev/ieee-check-noise
# (even if features importances are telling contrariwise)
# There are many unique values and model doesn't generalize well
# Lets do some aggregations
i_cols = ['card1','card2','card3','card5','uid','uid2','uid3']

for col in i_cols:
    for agg_type in ['mean','std']:
        new_col_name = col+'_TransactionAmt_'+agg_type
        temp_df = pd.concat([train_feat[[col, 'TransactionAmt']], test_feat[[col,'TransactionAmt']]])
        #temp_df['TransactionAmt'] = temp_df['TransactionAmt'].astype(int)
        temp_df = temp_df.groupby([col])['TransactionAmt'].agg([agg_type]).reset_index().rename(
                                                columns={agg_type: new_col_name})
        
        temp_df.index = list(temp_df[col])
        temp_df = temp_df[new_col_name].to_dict()   
    
        train_feat[new_col_name] = train_feat[col].map(temp_df)
        test_feat[new_col_name]  = test_feat[col].map(temp_df)

More Frequency Encoding
- frequency encode for device info
- device type seems to be simply mobile or desktop so leave alone

In [49]:
i_cols = ['C1','C2','C3','C4','C5','C6','C7','C8','C9','C10','C11','C12','C13','C14',
          'D1','D2','D3','D4','D5','D6','D7','D8',
          'addr1','addr2',
          'dist1','dist2',
          'P_emaildomain', 'R_emaildomain',
          'DeviceInfo', 'id_proxy','id_OS','id_browser','id_screen_width','id_screen_height',
          'id_30','id_33','uid','uid2','uid3'
         ]

for col in i_cols:
    temp_df = pd.concat([train_feat[[col]], test_feat[[col]]])
    fq_encode = temp_df[col].value_counts(dropna=False).to_dict()   
    train_feat[col+'_freq'] = train_feat[col].map(fq_encode)
    test_feat[col+'_freq']  = test_feat[col].map(fq_encode)

V Features
- rowwise sum of NA counts
- rowwise sum (lets try anyways)

In [50]:
V_cols = list(train_feat.columns[train_feat.columns.str.contains('V')])
for df in [train_feat, test_feat]:
    df['V_sum'] = df[V_cols].sum(axis=1).astype(np.int8)
    df['V_na'] = df[V_cols].isna().sum(axis=1).astype(np.int8)

Leave Alone Because there's too many (and i don't have enough time)
- V features
- D features

Clean up
- make sure correct column types
- do label encoding for the categoricals

In [51]:
REMOVE_COLS += ['DT','DT_M','DT_W','DT_D','DT_hour','DT_day_week','DT_day_month','uid','uid2','uid3']
ALL_FEATURES_train = [col for col in list(train_feat) if col not in REMOVE_COLS]
ALL_FEATURES_test = [col for col in list(test_feat) if col not in REMOVE_COLS]

In [52]:
pd.set_option('display.max_rows', 500)

In [53]:
train_feat=train_feat[ALL_FEATURES_train]
print(train_feat.shape)

test_feat=test_feat[ALL_FEATURES_test]
print(test_feat.shape)

(590540, 550)
(506691, 549)


In [54]:
id_vars = []
for i in range(12,39):
    id_vars.append('id_'+str(i))

In [55]:
# make sure the categoricals are correct types
for df in [train_feat,test_feat]:
    for col in train_feat.columns:
        if col in ['card1','card2','card3','card4','card5','card6']:
            df[col]=df[col].astype(object)
        if col in ['addr1','addr2']:
            df[col]=df[col].astype(object)
        if col in id_vars:
            df[col]=df[col].astype(object)
        

In [56]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
#         else:
#             df[col] = df[col].astype('category') # try converting objects into categorical and compare performance too

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [57]:
train_feat = reduce_mem_usage(train_feat)
test_feat = reduce_mem_usage(test_feat)

Memory usage of dataframe is 1158.19 MB




Memory usage after optimization is: 967.27 MB
Decreased by 16.5%
Memory usage of dataframe is 983.83 MB
Memory usage after optimization is: 822.92 MB
Decreased by 16.4%


In [58]:
# Save to pickle format
train_feat.to_pickle("data/train_feat.pkl")
test_feat.to_pickle("data/test_feat.pkl")

#### Remove Correlated features?

In [59]:
train_full = pd.read_pickle('data/train_feat.pkl')
test_full = pd.read_pickle('data/test_feat.pkl')

In [60]:
for f in test_full.columns:
    if train_full[f].dtype=='object' or test_full[f].dtype=='object': 
        train_full[f] = train_full[f].fillna('unseen_before_label')
        test_full[f]  = test_full[f].fillna('unseen_before_label')
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(train_full[f].values) + list(test_full[f].values))
        train_full[f] = lbl.transform(list(train_full[f].values))
        test_full[f] = lbl.transform(list(test_full[f].values)) 

# Fill NA's for numerics
train_full = train_full.fillna(-999)
test_full = test_full.fillna(-999)

KeyboardInterrupt: 

In [None]:
# Get list of 98% correlated features
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numeric_df = train_full.select_dtypes(include=numerics)

# calculate the correlation matrix
corr_matrix = numeric_df.corr()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

# Find index of feature columns with correlation greater than 0.98
correlated_features = [column for column in upper.columns if any(upper[column] > 0.98)]

In [19]:
# save features
with open('data/corr_feat.pkl', 'wb') as f:
    pickle.dump(correlated_features, f)