In [4]:
import numpy as np 
import pandas as pd 
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import matplotlib.pyplot as plt
import gc

<h2> 2 | Function for reducing memory usage of a pandas dataframe </h2>

In [2]:
def reduce_mem_usage(props,verbose=False):
    start_mem_usg = props.memory_usage().sum() / 1024**2 
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    NAlist = [] # Keeps track of columns that have missing values filled in. 
    for col in props.columns:
        if props[col].dtype != object:  # Exclude strings
            
            # Print current column type
            if verbose:
                print("******************************")
                print("Column: ",col)
                print("dtype before: ",props[col].dtype)
            
            # make variables for Int, max and min
            IsInt = False
            mx = props[col].max()
            mn = props[col].min()
            
            # Integer does not support NA, therefore, NA needs to be filled
            if not np.isfinite(props[col]).all(): 
                NAlist.append(col)
                props[col].fillna(mn-1,inplace=True)  
                   
            # test if column can be converted to an integer
            asint = props[col].fillna(0).astype(np.int64)
            result = (props[col] - asint)
            result = result.sum()
            if result > -0.01 and result < 0.01:
                IsInt = True

            
            # Make Integer/unsigned Integer datatypes
            if IsInt:
                if mn >= 0:
                    if mx < 255:
                        props[col] = props[col].astype(np.uint8)
                    elif mx < 65535:
                        props[col] = props[col].astype(np.uint16)
                    elif mx < 4294967295:
                        props[col] = props[col].astype(np.uint32)
                    else:
                        props[col] = props[col].astype(np.uint64)
                else:
                    if mn > np.iinfo(np.int8).min and mx < np.iinfo(np.int8).max:
                        props[col] = props[col].astype(np.int8)
                    elif mn > np.iinfo(np.int16).min and mx < np.iinfo(np.int16).max:
                        props[col] = props[col].astype(np.int16)
                    elif mn > np.iinfo(np.int32).min and mx < np.iinfo(np.int32).max:
                        props[col] = props[col].astype(np.int32)
                    elif mn > np.iinfo(np.int64).min and mx < np.iinfo(np.int64).max:
                        props[col] = props[col].astype(np.int64)    
            
            # Make float datatypes 32 bit
            else:
                props[col] = props[col].astype(np.float32)
            
            # Print new column type
            if verbose:
                print("dtype after: ",props[col].dtype)
                print("******************************")
    
    # Print final result
    if verbose:
        print("___MEMORY USAGE AFTER COMPLETION:___")
        mem_usg = props.memory_usage().sum() / 1024**2 
        print("Memory usage is: ",mem_usg," MB")
        print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
        print("_________________")
        print("")
        print("Warning: the following columns have missing values filled with 'df['column_name'].min() -1': ")
        print("_________________")
        print("")
        print(NAlist)
    return props, NAlist

#One Hot encoding for the specified columns
def one_hot_encode(train,test,one_hot_list):
    OH_cols_train = pd.get_dummies(train, dummy_na=True, prefix=one_hot_list, columns=one_hot_list)
    train_no_categorical = train.drop(one_hot_list,axis=1) # Drop categorical columns
    OH_train = pd.concat([train_no_categorical, OH_cols_train], axis=1) # Add one hot columns

    OH_cols_test = pd.get_dummies(test, dummy_na=True, prefix=one_hot_list, columns=one_hot_list)
    test_no_categorical = test.drop(one_hot_list,axis=1) # Drop categorical columns
    OH_test = pd.concat([test_no_categorical, OH_cols_test], axis=1) # Add one hot columns
    
    return OH_train, OH_test

#Plots the fraud rate for the specified column within the train dataset

def graph_fraud_rate(col,df,fraud='isFraud'):    
    x = df[col].unique()
    x_counts = df[col].value_counts(normalize=True,dropna=False)
    total_fraud_rate = df[fraud].mean()
    if len(x) > 25:
        print("Too many columns: {}".format(len(x)))
    else:
        fraud_rate = df['isFraud'].groupby(by=df[col].astype('str')).mean()
        y_pos = np.arange(len(x))
            
        plt.bar(y_pos,fraud_rate)
        #plt.plot(y_pos,x_counts)
        plt.xticks(y_pos,list(fraud_rate.index))
        plt.axhline(total_fraud_rate)
        plt.ylabel('Fraud Rate')
        plt.title(col)
        plt.show()

<h2> 3 | Load Data </h2>

In [3]:
#TRAIN
train_identity = pd.read_csv('../../data/train_identity.csv')
train_transaction = pd.read_csv('../../data/train_transaction.csv')
train = pd.merge(train_transaction, train_identity, on='TransactionID', how='left')
del train_identity, train_transaction

train, NAlist_train = reduce_mem_usage(train,verbose=False)


# TEST
test_identity = pd.read_csv('../../data/test_identity.csv')
test_transaction = pd.read_csv('../../data/test_transaction.csv')
test = pd.merge(test_transaction, test_identity, on='TransactionID', how='left')
del test_identity, test_transaction

test, NAlist_test = reduce_mem_usage(test,verbose=False)

gc.collect()

Memory usage of properties dataframe is : 1959.8762512207031  MB
Memory usage of properties dataframe is : 1677.7335662841797  MB


In [6]:
# Label Encoding for categoricals
one_hot_list = ['ProductCD', 'card4', 'card6', 'id_23']
label_encode_list = ['M1', 'M2', 'M3', 'M5', 'M6', 'M7', 'M8', 'M9','id_12', 'id_15', 'id_16', 'id_27', 
                     'id_28', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'DeviceType']
for f in test:
    if (train[f].dtype=='object' or test[f].dtype=='object' or hasattr(train[f], 'cat')) and f not in one_hot_list: 
        lbl = LabelEncoder()
        lbl.fit(list(train[f].values) + list(test[f].values))
        train[f] = lbl.transform(list(train[f].values))
        test[f] = lbl.transform(list(test[f].values))

#One Hot encoding for a handful of categoricals
OH_train, OH_test = one_hot_encode(train,test,one_hot_list)
OH_test['card6_debit or credit'] = np.zeros(len(OH_test))

In [7]:
# Remove duplicate columns
OH_train = OH_train.loc[:,~OH_train.columns.duplicated()]
OH_test = OH_test.loc[:,~OH_test.columns.duplicated()]

In [12]:
print("OH_train shape: {}".format(OH_train.shape))
print("OH_test shape: {}".format(OH_test.shape))

OH_train shape: (590540, 450)
OH_test shape: (506691, 449)


In [13]:
OH_train.to_pickle("train.pkl")
OH_test.to_pickle("test.pkl")

In [25]:
# Look at low cardinality categoricals 
low_cardinality_categorical = []
for col, values in test.iteritems():
    num_uniques = values.nunique()
    if num_uniques <25:
        low_cardinality_categorical.append(col)
        print ('{name}: {num_unique}'.format(name=col, num_unique=num_uniques))
        print (values.unique()[:min(20,len(values.unique()))])
        print ('\n')

ProductCD: 5
['W' 'C' 'S' 'H' 'R']


card4: 4
['visa' 'mastercard' 'american express' 'discover' nan]


card6: 3
['debit' 'credit' nan 'charge card']


M1: 3
[1 2 0]


M2: 3
[1 0 2]


M3: 3
[0 1 2]


M4: 4
[3 0 1 2]


M5: 3
[2 0 1]


M6: 3
[0 1 2]


M7: 3
[1 2 0]


M8: 3
[1 2 0]


M9: 3
[1 2 0]


V1: 3
[  1 255   0]


V2: 13
[  1 255   2   3   0   5   4   6  10   7   9  11   8]


V3: 13
[  1 255   2   3   0   5   4   7   6  10   9  11   8]


V4: 11
[  1   0 255   2   3   4   5   6  10   8   9]


V5: 12
[  1   0 255   2   4   3   5   6  10   8   9   7]


V6: 14
[  1   2 255   3   4   6   5  11   7   8  10  12  13   0]


V7: 14
[  1   2 255   3   4   5   6   7  11   8  10  12  13   0]


V8: 13
[  1 255   2   3   4   5   6   0  10   7   9  11   8]


V9: 13
[  1 255   2   3   4   5   6   7   0  10   9  11   8]


V10: 7
[  1   0 255   2   3   4   5]


V11: 9
[  1   0 255   2   3   4   5   6   7]


V12: 6
[  0   1 255   2   3   4]


V13: 8
[  0   1 255   2   3   5   4   6]


V14: 3
[  1 255 

id_03: 23
[-13   0  -7   1  -2  -5   2   3  -6   4   6   5 -10  -4  -8  -9  -1  -3
 -12 -11]


id_04: 16
[-20   0 -11  -7  -6  -5  -8 -13 -12  -4  -2  -9 -10  -1  -3 -19]


id_12: 3
[2 1 0]


id_15: 4
[3 1 0 2]


id_16: 3
[2 1 0]


id_18: 18
[10 15 13 17 12 26 18 11 27 20 21 23 24 14 28 29 25 19]


id_23: 3
[nan 'IP_PROXY:TRANSPARENT' 'IP_PROXY:ANONYMOUS' 'IP_PROXY:HIDDEN']


id_24: 16
[ 9 11 21 15 26 24 19 16 18 25 12 10 20 22 14 13]


id_27: 3
[2 0 1]


id_28: 3
[2 1 0]


id_29: 3
[2 1 0]


id_32: 6
[ 7 24 32 16 48  8]


id_34: 3
[4 3 2]


id_35: 3
[2 0 1]


id_36: 3
[2 0 1]


id_37: 3
[2 1 0]


id_38: 3
[2 0 1]


DeviceType: 3
[2 1 0]




In [None]:
# Graph out all the low cardinality columns
for col in low_cardinality_categorical:
    graph_fraud_rate(col,train)

In [15]:
for col in OH_train.columns:
    print(col)

TransactionID
isFraud
TransactionDT
TransactionAmt
card1
card2
card3
card5
addr1
addr2
dist1
dist2
P_emaildomain
R_emaildomain
C1
C2
C3
C4
C5
C6
C7
C8
C9
C10
C11
C12
C13
C14
D1
D2
D3
D4
D5
D6
D7
D8
D9
D10
D11
D12
D13
D14
D15
M1
M2
M3
M4
M5
M6
M7
M8
M9
V1
V2
V3
V4
V5
V6
V7
V8
V9
V10
V11
V12
V13
V14
V15
V16
V17
V18
V19
V20
V21
V22
V23
V24
V25
V26
V27
V28
V29
V30
V31
V32
V33
V34
V35
V36
V37
V38
V39
V40
V41
V42
V43
V44
V45
V46
V47
V48
V49
V50
V51
V52
V53
V54
V55
V56
V57
V58
V59
V60
V61
V62
V63
V64
V65
V66
V67
V68
V69
V70
V71
V72
V73
V74
V75
V76
V77
V78
V79
V80
V81
V82
V83
V84
V85
V86
V87
V88
V89
V90
V91
V92
V93
V94
V95
V96
V97
V98
V99
V100
V101
V102
V103
V104
V105
V106
V107
V108
V109
V110
V111
V112
V113
V114
V115
V116
V117
V118
V119
V120
V121
V122
V123
V124
V125
V126
V127
V128
V129
V130
V131
V132
V133
V134
V135
V136
V137
V138
V139
V140
V141
V142
V143
V144
V145
V146
V147
V148
V149
V150
V151
V152
V153
V154
V155
V156
V157
V158
V159
V160
V161
V162
V163
V164
V165
V166
V167
V168
V169
V170
V171
V

In [21]:
train['R_emaildomain'].value_counts()

32    453249
16     57147
19     27509
1      20529
55     11842
2       3701
36      2507
9       1812
56      1508
23      1398
31       852
25       762
26       754
50       620
30       556
44       552
10       459
37       433
3        430
4        422
22       293
21       292
51       237
28       218
60       207
38       207
35       187
17       147
59       137
8        127
       ...  
18       105
15        95
11        79
57        75
43        69
12        68
46        63
58        57
27        55
42        53
24        53
13        52
52        47
20        42
39        41
54        39
7         37
0         36
47        35
53        33
49        29
5         27
40        27
48        25
41        25
14        14
33        14
6         12
34         9
45         8
Name: R_emaildomain, Length: 61, dtype: int64

Interactive namespace is empty.
