In [1]:
# CAP 5610 Project
# Dataset Link: https://www.kaggle.com/c/ieee-fraud-detection
# Feature Engineering: https://www.kaggle.com/c/ieee-fraud-detection/discussion/108575
# https://www.kaggle.com/kenjee/titanic-project-example 
# evaluation metrics: https://towardsdatascience.com/metrics-to-evaluate-your-machine-learning-algorithm-f10ba6e38234

import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import os

train_identity = pd.read_csv(os.getcwd() + "\\train_identity.csv")
train_transaction = pd.read_csv(os.getcwd() + "\\train_transaction.csv")

# print(train_identity.head())
# print(train_transaction.head())

'''

We are predicting whether a given transaction is considered fraud or not, as indicated by the isFraud column
in train_transaction.

Not all transactions have corresponding identity information.


General information about the meaning of the data features from the competition discussion:

Transaction Table *
TransactionDT: timedelta from a given reference datetime (not an actual timestamp)
TransactionAMT: transaction payment amount in USD
ProductCD: product code, the product for each transaction
card1 - card6: payment card information, such as card type, card category, issue bank, country, etc.
addr: address
dist: distance
P_ and (R__) emaildomain: purchaser and recipient email domain
C1-C14: counting, such as how many addresses are found to be associated with the payment card, etc. The actual meaning is masked.
D1-D15: timedelta, such as days between previous transaction, etc.
M1-M9: match, such as names on card and address, etc.
Vxxx: Vesta engineered rich features, including ranking, counting, and other entity relations.
Categorical Features:
ProductCD
card1 - card6
addr1, addr2
P_emaildomain
R_emaildomain
M1 - M9

Identity Table *
Variables in this table are identity information – network connection information (IP, ISP, Proxy, etc) and digital signature (UA/browser/os/version, etc) associated with transactions.
They're collected by Vesta’s fraud protection system and digital security partners.
(The field names are masked and pairwise dictionary will not be provided for privacy protection and contract agreement)

Categorical Features:
DeviceType
DeviceInfo
id_12 - id_38

'''


"\n\nWe are predicting whether a given transaction is considered fraud or not, as indicated by the isFraud column\nin train_transaction.\n\nNot all transactions have corresponding identity information.\n\n\nGeneral information about the meaning of the data features from the competition discussion:\n\nTransaction Table *\nTransactionDT: timedelta from a given reference datetime (not an actual timestamp)\nTransactionAMT: transaction payment amount in USD\nProductCD: product code, the product for each transaction\ncard1 - card6: payment card information, such as card type, card category, issue bank, country, etc.\naddr: address\ndist: distance\nP_ and (R__) emaildomain: purchaser and recipient email domain\nC1-C14: counting, such as how many addresses are found to be associated with the payment card, etc. The actual meaning is masked.\nD1-D15: timedelta, such as days between previous transaction, etc.\nM1-M9: match, such as names on card and address, etc.\nVxxx: Vesta engineered rich feat

In [2]:
train_identity.info()

'''

All columns in train_identity seem like categorical data. More specifically, they are nominal. However, the description 
provided of the features in train_identity states that DeviceType, DeviceInfo, and id_12 - id_38 are the only features 
that are categorical. Will inspect id_01-id_11 to confirm their type.

'''

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144233 entries, 0 to 144232
Data columns (total 41 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   TransactionID  144233 non-null  int64  
 1   id_01          144233 non-null  float64
 2   id_02          140872 non-null  float64
 3   id_03          66324 non-null   float64
 4   id_04          66324 non-null   float64
 5   id_05          136865 non-null  float64
 6   id_06          136865 non-null  float64
 7   id_07          5155 non-null    float64
 8   id_08          5155 non-null    float64
 9   id_09          74926 non-null   float64
 10  id_10          74926 non-null   float64
 11  id_11          140978 non-null  float64
 12  id_12          144233 non-null  object 
 13  id_13          127320 non-null  float64
 14  id_14          80044 non-null   float64
 15  id_15          140985 non-null  object 
 16  id_16          129340 non-null  object 
 17  id_17          139369 non-nul

'\n\nAll columns in train_identity seem like categorical data. More specifically, they are nominal. However, the description \nprovided of the features in train_identity states that DeviceType, DeviceInfo, and id_12 - id_38 are the only features \nthat are categorical. Will inspect id_01-id_11 to confirm their type.\n\n'

In [3]:
train_transaction.info()
'''

train_transaction contains a mix of categorical and numeric data. 

'''

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 590540 entries, 0 to 590539
Columns: 394 entries, TransactionID to V339
dtypes: float64(376), int64(4), object(14)
memory usage: 1.7+ GB


'\n\ntrain_transaction contains a mix of categorical and numeric data. \n\n'

In [4]:
# Data Preprocessing

# replace null values with -999
train_identity = train_identity.fillna(value = -999)
train_transaction = train_transaction.fillna(value = -999)

from sklearn import preprocessing

# convert non-numeric columns to numeric by assigning each class to a number
number = preprocessing.LabelEncoder()

for col in list(train_identity):
    if (train_identity[col].dtype == "object"):
        train_identity[col] = train_identity[col].astype(str)
        train_identity[col] = number.fit_transform(train_identity[col])
        
for col in list(train_transaction):
    if (train_transaction[col].dtype == "object"):
        train_transaction[col] = train_transaction[col].astype(str)
        train_transaction[col] = number.fit_transform(train_transaction[col])
        


In [5]:
# not all transactions have corresponding identity data, so only include those with identity data
# df = pd.merge(train_identity, train_transaction, on = "TransactionID", how = "left")

# not all transactions have corresponding identity data, so for instances with no corresponding idenity data, fill with 
# dummy values
df = pd.merge(train_identity, train_transaction, on = "TransactionID", how = "outer")
df = df.fillna(value = -999)

In [6]:
from sklearn.model_selection import train_test_split

X = df.drop(["isFraud"], axis = 1).values

y = df["isFraud"].values

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0, test_size = 0.25)

In [7]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(criterion = "entropy")
model.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy')

In [8]:
predicted = model.predict(X_test)

from sklearn.metrics import accuracy_score

print("The accuracy score is " + str(accuracy_score(y_test, predicted)))

from sklearn.metrics import confusion_matrix

print("Confusion matrix: ")
print(confusion_matrix(y_test, predicted))

from sklearn.metrics import precision_score

print("The precision score is " + str(precision_score(y_test, predicted)))

from sklearn.metrics import recall_score

print("The recall score is " + str(recall_score(y_test, predicted)))


from sklearn.metrics import f1_score

print("The F1 score is " + str(f1_score(y_test, predicted)))


The accuracy score is 0.9795915602668744
Confusion matrix: 
[[142296    129]
 [  2884   2326]]
The precision score is 0.9474541751527495
The recall score is 0.44644913627639154
The F1 score is 0.6069145466405741
