In [28]:
import pandas as pd 
import numpy as np 
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import VarianceThreshold

In this notebook, we will concentrate on both removing redundant features using feature selection methods and, where useful, creating new features through feature engineering. We will first turn our attention to the missing values in our dataframe.

In [42]:
# Load the cleaned train/test frames. Pickle files can execute arbitrary code
# when loaded, so these should only ever be files produced by this project
# (presumably by an earlier cleaning notebook -- confirm).
df_train, df_test = (
    pd.read_pickle(path)
    for path in ('data/df_train_cleaned.pkl', 'data/df_test_cleaned.pkl')
)

In [43]:
# Bucket every training column by its share of missing values (rounded to
# three decimals). The three buckets form a complete partition: the previous
# `< 0.1` / `> 0.1` pair silently skipped any column whose rounded share was
# exactly 0.1, so such a column was never imputed afterwards.
LOW_MISSING = 0.1    # below this share a column is treated as almost complete
HIGH_MISSING = 0.7   # above this share a column is treated as mostly missing

missing_share = (df_train.isna().sum() / len(df_train)).round(3)

no_missing = []
few_missing = []
lots_missing = []
for col, prop in missing_share.items():
    if prop < LOW_MISSING:
        no_missing.append(col)
    elif prop <= HIGH_MISSING:
        # Includes the boundary value LOW_MISSING itself.
        few_missing.append(col)
    else:
        lots_missing.append(col)

print(len(no_missing), len(few_missing), len(lots_missing))

112 114 134


We observe that 112 columns have almost no missing data, with fewer than 10% of their values missing. These observations could be dropped or simply imputed from the available data; here, I choose to impute. However, 114 columns have between 10% and 70% of their values missing, and 134 columns have more than 70% missing. For the columns with a high proportion of missing values, a different technique has to be used.

In [44]:
# Learn one fill value per column from the TRAINING data only, then apply the
# same values to both frames so train and test are preprocessed identically
# (previously df_test was left with its NaNs).
#   * no_missing / few_missing columns: mode for text columns, median otherwise
#   * lots_missing columns: explicit sentinels ('Unknown' / -1000) so that
#     "was missing" stays visible as its own value
fill_values = {}
for col in no_missing + few_missing:
    if df_train[col].dtype == 'object':
        fill_values[col] = df_train[col].mode()[0]
    else:
        fill_values[col] = df_train[col].median()

for col in lots_missing:
    fill_values[col] = 'Unknown' if df_train[col].dtype == 'object' else -1000

# A single dict-based fillna replaces the chained `df[col].fillna(..., inplace=True)`
# calls, which are deprecated and do not modify the frame under Copy-on-Write.
df_train = df_train.fillna(value=fill_values)

# Only pass columns that exist in the test frame (e.g. the target does not).
df_test = df_test.fillna(
    value={col: val for col, val in fill_values.items() if col in df_test.columns}
)


In [27]:
# Count the NaNs left in each training column; every entry should be 0 once
# imputation has run.
df_train.isna().sum(axis=0)

TransactionID     0
isFraud           0
TransactionDT     0
TransactionAmt    0
ProductCD         0
                 ..
id_36             0
id_37             0
id_38             0
DeviceType        0
DeviceInfo        0
Length: 360, dtype: int64

Now that we have no missing values in any of the columns, we can move on to feature engineering. However, in order to reduce the sheer number of variables present, I will first apply a feature selection method, variance thresholding. This removes near-constant numeric features, which are unlikely to carry much predictive power.

In [45]:
# Drop numeric features whose variance on the training data is at or below
# `threshold`. Candidate columns are taken from the frame the selector is
# actually fitted on (df_train); the target is excluded explicitly instead of
# relying on it being absent from df_test, and only columns present in both
# frames are considered so the same drop can be applied to each.
# NOTE(review): the variance is computed on raw, unscaled values, so the
# threshold is not comparable across features with different units.
threshold = 0.1
target_col = 'isFraud'

train_numeric = df_train.select_dtypes(include = 'number').columns
numerical_cols = pd.Index(
    [col for col in train_numeric if col != target_col and col in df_test.columns]
)

selector = VarianceThreshold(threshold = threshold)
selector.fit(df_train[numerical_cols])

# get_support() is True for kept features; invert it to list the ones to drop.
columns_to_drop = numerical_cols[~selector.get_support()]
df_train = df_train.drop(columns = columns_to_drop)
df_test = df_test.drop(columns = columns_to_drop)

In [47]:
# Transaction count and fraud rate (mean of isFraud) per product code.
fraud_by_product = (
    df_train.groupby('ProductCD')['isFraud']
    .agg(count='count', mean='mean')
    .round(3)
)
print(fraud_by_product)

            count   mean
ProductCD               
C           68519  0.117
H           33024  0.048
R           37699  0.038
S           11628  0.059
W          439670  0.020


In [50]:
# Transaction count and fraud rate for each (card6, card4) combination.
fraud_by_card = (
    df_train.groupby(['card6', 'card4'])['isFraud']
    .agg(count='count', mean='mean')
    .round(3)
)
print(fraud_by_card)

                                   count   mean
card6           card4                          
charge card     american express       3  0.000
                visa                  12  0.000
credit          american express    8175  0.029
                discover            6304  0.079
                mastercard         50772  0.069
                visa               83735  0.068
debit           american express     150  0.033
                discover             347  0.040
                mastercard        138415  0.022
                visa              302597  0.025
debit or credit mastercard            30  0.000


In [54]:
# Transaction count and fraud rate per purchaser e-mail domain, then show the
# ten domains with the highest fraud rate. Small counts make the top rates
# noisy, so read the `count` column alongside `mean`.
fraud_by_domain = (
    df_train.groupby('P_emaildomain')['isFraud']
    .agg(count='count', mean='mean')
    .round(3)
)
fraud_by_domain.sort_values(by='mean', ascending=False).head(10)

Unnamed: 0_level_0,count,mean
P_emaildomain,Unnamed: 1_level_1,Unnamed: 2_level_1
protonmail.com,76,0.408
mail.com,559,0.19
outlook.es,438,0.13
aim.com,315,0.127
outlook.com,5096,0.095
hotmail.es,305,0.066
live.com.mx,749,0.055
hotmail.com,45250,0.053
gmail.com,322811,0.039
yahoo.fr,143,0.035


Feature engineering

In [78]:
SECONDS_PER_HOUR = 3600
SECONDS_PER_DAY = 24 * SECONDS_PER_HOUR


def add_time_features(df):
    """Return a copy of ``df`` with ``hour`` and ``day_of_week`` columns added.

    Both features are derived from ``TransactionDT``, which the arithmetic
    below treats as a number of elapsed seconds.

    * ``hour``        -- whole hour within the day, 0..23
    * ``day_of_week`` -- whole days elapsed modulo 7, 0..6. The offset is
      relative to wherever ``TransactionDT`` starts counting, so 0 is not
      necessarily Monday.

    The previous ``round(..., ndigits=-1)`` rounded to the nearest ten, which
    collapsed ``hour`` to just 0, 10 and 20; floor division keeps all 24 hours
    and matches how ``day_of_week`` is computed.
    """
    elapsed_seconds = df['TransactionDT']
    return df.assign(
        hour=(elapsed_seconds // SECONDS_PER_HOUR) % 24,
        day_of_week=(elapsed_seconds // SECONDS_PER_DAY) % 7,
    )


# assign() returns new frames, so no explicit .copy() is needed beforehand.
df_train = add_time_features(df_train)
df_test = add_time_features(df_test)

Interaction terms

In [80]:
# Categorical interaction feature: product code joined with card network,
# e.g. "<ProductCD> + <card4>". The same expression is applied to both frames.
# Because of the final astype(str), a row where either part is missing ends up
# as the literal string 'nan' rather than a real NaN.
for frame in (df_train, df_test):
    product_and_card = frame['ProductCD'] + ' + ' + frame['card4']
    frame['Product_plus_card_type'] = product_and_card.astype(str)

In [85]:
# Persist both feature-enriched frames as pickles under data/.
for frame, out_path in (
    (df_train, 'data/df_train_w_feats.pkl'),
    (df_test, 'data/df_test_w_feats.pkl'),
):
    frame.to_pickle(out_path)