In [None]:
import numpy as np 
import pandas as pd 
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import matplotlib.pyplot as plt

<h2> 2 | Function for reducing memory usage of a pandas dataframe </h2>

In [2]:
def reduce_mem_usage(props):
    start_mem_usg = props.memory_usage().sum() / 1024**2 
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    NAlist = [] # Keeps track of columns that have missing values filled in. 
    for col in props.columns:
        if props[col].dtype != object:  # Exclude strings
            
            # Print current column type
            print("******************************")
            print("Column: ",col)
            print("dtype before: ",props[col].dtype)
            
            # make variables for Int, max and min
            IsInt = False
            mx = props[col].max()
            mn = props[col].min()
            
            # Integer does not support NA, therefore, NA needs to be filled
            if not np.isfinite(props[col]).all(): 
                NAlist.append(col)
                props[col].fillna(mn-1,inplace=True)  
                   
            # test if column can be converted to an integer
            asint = props[col].fillna(0).astype(np.int64)
            result = (props[col] - asint)
            result = result.sum()
            if result > -0.01 and result < 0.01:
                IsInt = True

            
            # Make Integer/unsigned Integer datatypes
            if IsInt:
                if mn >= 0:
                    if mx < 255:
                        props[col] = props[col].astype(np.uint8)
                    elif mx < 65535:
                        props[col] = props[col].astype(np.uint16)
                    elif mx < 4294967295:
                        props[col] = props[col].astype(np.uint32)
                    else:
                        props[col] = props[col].astype(np.uint64)
                else:
                    if mn > np.iinfo(np.int8).min and mx < np.iinfo(np.int8).max:
                        props[col] = props[col].astype(np.int8)
                    elif mn > np.iinfo(np.int16).min and mx < np.iinfo(np.int16).max:
                        props[col] = props[col].astype(np.int16)
                    elif mn > np.iinfo(np.int32).min and mx < np.iinfo(np.int32).max:
                        props[col] = props[col].astype(np.int32)
                    elif mn > np.iinfo(np.int64).min and mx < np.iinfo(np.int64).max:
                        props[col] = props[col].astype(np.int64)    
            
            # Make float datatypes 32 bit
            else:
                props[col] = props[col].astype(np.float32)
            
            # Print new column type
            print("dtype after: ",props[col].dtype)
            print("******************************")
    
    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2 
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return props, NAlist

#One Hot encoding for the specified columns
def one_hot_encode(train,test,one_hot_list):
    OH_cols_train = pd.get_dummies(train, dummy_na=True, prefix=one_hot_list, columns=one_hot_list)
    train_no_categorical = train.drop(one_hot_list,axis=1) # Drop categorical columns
    OH_train = pd.concat([train_no_categorical, OH_cols_train], axis=1) # Add one hot columns

    OH_cols_test = pd.get_dummies(test, dummy_na=True, prefix=one_hot_list, columns=one_hot_list)
    test_no_categorical = test.drop(one_hot_list,axis=1) # Drop categorical columns
    OH_test = pd.concat([test_no_categorical, OH_cols_test], axis=1) # Add one hot columns
    
    return OH_train, OH_test

#Plots the fraud rate for the specified column within the train dataset

def graph_fraud_rate(col,df=train,fraud='isFraud'):    
    x = df[col].unique()
    x_counts = df[col].value_counts(normalize=True,dropna=False)
    total_fraud_rate = df[fraud].mean()
    if len(x) > 25:
        print("Too many columns: {}".format(len(x)))
    else:
        fraud_rate = df['isFraud'].groupby(by=df[col].astype('str')).mean()
        y_pos = np.arange(len(x))
            
        plt.bar(y_pos,fraud_rate)
        #plt.plot(y_pos,x_counts)
        plt.xticks(y_pos,list(fraud_rate.index))
        plt.axhline(total_fraud_rate)
        plt.ylabel('Fraud Rate')
        plt.title(col)
        plt.show()

<h2> 3 | Load Data </h2>

In [3]:
train_identity = pd.read_csv('../../data/train_identity.csv')
train_transaction = pd.read_csv('../../data/train_transaction.csv')
train = pd.merge(train_transaction, train_identity, on='TransactionID', how='left')
del train_identity, train_transaction

test_identity = pd.read_csv('../../data/test_identity.csv')
test_transaction = pd.read_csv('../../data/test_transaction.csv')
test = pd.merge(test_transaction, test_identity, on='TransactionID', how='left')
del test_identity, test_transaction

#sub = pd.read_csv('../../data/sample_submission.csv')

<h2> 4 | Run function </h2>

In [4]:
train, NAlist_train = reduce_mem_usage(train)
print("_________________")
print("")
print("Warning: the following columns have missing values filled with 'df['column_name'].min() -1': ")
print("_________________")
print("")
print(NAlist)

Memory usage of properties dataframe is : 1959.8762512207031  MB
******************************
Column:  TransactionID
dtype before:  int64
dtype after:  uint32
******************************
******************************
Column:  isFraud
dtype before:  int64
dtype after:  uint8
******************************
******************************
Column:  TransactionDT
dtype before:  int64
dtype after:  uint32
******************************
******************************
Column:  TransactionAmt
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  card1
dtype before:  int64
dtype after:  uint16
******************************
******************************
Column:  card2
dtype before:  float64
dtype after:  uint16
******************************
******************************
Column:  card3
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  card5
dtype before:  float64
d

dtype after:  uint8
******************************
******************************
Column:  V29
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  V30
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  V31
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  V32
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  V33
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  V34
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  V35
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  V36
dtype before:  float64
dtype after:  uint8
******************************
*****

dtype after:  uint8
******************************
******************************
Column:  V99
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  V100
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  V101
dtype before:  float64
dtype after:  uint16
******************************
******************************
Column:  V102
dtype before:  float64
dtype after:  uint16
******************************
******************************
Column:  V103
dtype before:  float64
dtype after:  uint16
******************************
******************************
Column:  V104
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  V105
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  V106
dtype before:  float64
dtype after:  uint8
**************************

dtype after:  uint16
******************************
******************************
Column:  V168
dtype before:  float64
dtype after:  uint16
******************************
******************************
Column:  V169
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  V170
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  V171
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  V172
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  V173
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  V174
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  V175
dtype before:  float64
dtype after:  uint8
**************************

dtype after:  uint8
******************************
******************************
Column:  V237
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  V238
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  V239
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  V240
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  V241
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  V242
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  V243
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  V244
dtype before:  float64
dtype after:  uint8
****************************

dtype after:  uint8
******************************
******************************
Column:  V306
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  V307
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  V308
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  V309
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  V310
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  V311
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  V312
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  V313
dtype before:  float64
dtype after:  float32
************

NameError: name 'NAlist' is not defined

In [5]:
test, NAlist_test = reduce_mem_usage(test)
print("_________________")
print("")
print("Warning: the following columns have missing values filled with 'df['column_name'].min() -1': ")
print("_________________")
print("")
print(NAlist)

Memory usage of properties dataframe is : 1677.7335662841797  MB
******************************
Column:  TransactionID
dtype before:  int64
dtype after:  uint32
******************************
******************************
Column:  TransactionDT
dtype before:  int64
dtype after:  uint32
******************************
******************************
Column:  TransactionAmt
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  card1
dtype before:  int64
dtype after:  uint16
******************************
******************************
Column:  card2
dtype before:  float64
dtype after:  uint16
******************************
******************************
Column:  card3
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  card5
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  addr1
dtype before:  float64
d

dtype after:  uint8
******************************
******************************
Column:  V30
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  V31
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  V32
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  V33
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  V34
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  V35
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  V36
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  V37
dtype before:  float64
dtype after:  uint8
******************************
*****

dtype after:  uint8
******************************
******************************
Column:  V100
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  V101
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  V102
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  V103
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  V104
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  V105
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  V106
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  V107
dtype before:  float64
dtype after:  uint8
****************************

dtype after:  uint16
******************************
******************************
Column:  V169
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  V170
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  V171
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  V172
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  V173
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  V174
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  V175
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  V176
dtype before:  float64
dtype after:  uint8
***************************

dtype after:  uint8
******************************
******************************
Column:  V238
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  V239
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  V240
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  V241
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  V242
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  V243
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  V244
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  V245
dtype before:  float64
dtype after:  uint8
****************************

dtype after:  float32
******************************
******************************
Column:  V307
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  V308
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  V309
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  V310
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  V311
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  V312
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  V313
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  V314
dtype before:  float64
dtype after:  float32
**********

NameError: name 'NAlist' is not defined

In [18]:
# Label Encoding for categoricals
one_hot_list = ['ProductCD', 'card4', 'card6', 'id_23']
label_encode_list = ['M1', 'M2', 'M3', 'M5', 'M6', 'M7', 'M8', 'M9','id_12', 'id_15', 'id_16', 'id_27', 
                     'id_28', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'DeviceType']
for f in test:
    if (train[f].dtype=='object' or test[f].dtype=='object' or hasattr(train[f], 'cat')) and f not in one_hot_list: 
        lbl = LabelEncoder()
        lbl.fit(list(train[f].values) + list(test[f].values))
        train[f] = lbl.transform(list(train[f].values))
        test[f] = lbl.transform(list(test[f].values))

#One Hot encoding for a handful of categoricals
OH_train, OH_test = one_hot_encode(train,test,one_hot_list)
OH_test['card6_debit or credit'] = np.zeros(len(OH_test))

In [25]:
# Look at low cardinality categoricals 
low_cardinality_categorical = []
for col, values in test.iteritems():
    num_uniques = values.nunique()
    if num_uniques <25:
        low_cardinality_categorical.append(col)
        print ('{name}: {num_unique}'.format(name=col, num_unique=num_uniques))
        print (values.unique()[:min(20,len(values.unique()))])
        print ('\n')

ProductCD: 5
['W' 'C' 'S' 'H' 'R']


card4: 4
['visa' 'mastercard' 'american express' 'discover' nan]


card6: 3
['debit' 'credit' nan 'charge card']


M1: 3
[1 2 0]


M2: 3
[1 0 2]


M3: 3
[0 1 2]


M4: 4
[3 0 1 2]


M5: 3
[2 0 1]


M6: 3
[0 1 2]


M7: 3
[1 2 0]


M8: 3
[1 2 0]


M9: 3
[1 2 0]


V1: 3
[  1 255   0]


V2: 13
[  1 255   2   3   0   5   4   6  10   7   9  11   8]


V3: 13
[  1 255   2   3   0   5   4   7   6  10   9  11   8]


V4: 11
[  1   0 255   2   3   4   5   6  10   8   9]


V5: 12
[  1   0 255   2   4   3   5   6  10   8   9   7]


V6: 14
[  1   2 255   3   4   6   5  11   7   8  10  12  13   0]


V7: 14
[  1   2 255   3   4   5   6   7  11   8  10  12  13   0]


V8: 13
[  1 255   2   3   4   5   6   0  10   7   9  11   8]


V9: 13
[  1 255   2   3   4   5   6   7   0  10   9  11   8]


V10: 7
[  1   0 255   2   3   4   5]


V11: 9
[  1   0 255   2   3   4   5   6   7]


V12: 6
[  0   1 255   2   3   4]


V13: 8
[  0   1 255   2   3   5   4   6]


V14: 3
[  1 255 

id_03: 23
[-13   0  -7   1  -2  -5   2   3  -6   4   6   5 -10  -4  -8  -9  -1  -3
 -12 -11]


id_04: 16
[-20   0 -11  -7  -6  -5  -8 -13 -12  -4  -2  -9 -10  -1  -3 -19]


id_12: 3
[2 1 0]


id_15: 4
[3 1 0 2]


id_16: 3
[2 1 0]


id_18: 18
[10 15 13 17 12 26 18 11 27 20 21 23 24 14 28 29 25 19]


id_23: 3
[nan 'IP_PROXY:TRANSPARENT' 'IP_PROXY:ANONYMOUS' 'IP_PROXY:HIDDEN']


id_24: 16
[ 9 11 21 15 26 24 19 16 18 25 12 10 20 22 14 13]


id_27: 3
[2 0 1]


id_28: 3
[2 1 0]


id_29: 3
[2 1 0]


id_32: 6
[ 7 24 32 16 48  8]


id_34: 3
[4 3 2]


id_35: 3
[2 0 1]


id_36: 3
[2 0 1]


id_37: 3
[2 1 0]


id_38: 3
[2 0 1]


DeviceType: 3
[2 1 0]




In [None]:
# Graph out all the low cardinality columns
for col in low_cardinality_categorical:
    graph_fraud_rate(col)