# Fraud Detection with RAPIDS

In [1]:
import time
import xgboost as xgb
from tqdm import tqdm
import cudf


## 1. Load Data

Dataset from: https://www.kaggle.com/c/ieee-fraud-detection/

In [2]:
# Wall-clock start for the whole notebook; the final timing cell subtracts this value.
start_time = time.time()

In [3]:
# Read both competition tables into cuDF DataFrames, indexed by TransactionID.
# For a CPU-only run, the same two calls work with pandas (pd.read_csv).
train_transaction = cudf.read_csv(
    "./data/train_transaction.csv",
    index_col="TransactionID",
)
train_identity = cudf.read_csv(
    "./data/train_identity.csv",
    index_col="TransactionID",
)

In [4]:
# Preview the first rows of the transaction table (contains the isFraud label).
train_transaction.head()

Unnamed: 0_level_0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,credit,...,,,,,,,,,,
2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,...,,,,,,,,,,
2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,debit,...,,,,,,,,,,
2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,debit,...,,,,,,,,,,
2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,credit,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Preview the first rows of the identity table (id_* and device columns).
train_identity.head()

Unnamed: 0_level_0,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,id_10,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2987004,0.0,70787.0,,,,,,,,,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M
2987008,-5.0,98945.0,,,0.0,-5.0,,,,,...,mobile safari 11.0,32.0,1334x750,match_status:1,T,F,F,T,mobile,iOS Device
2987010,-5.0,191631.0,0.0,0.0,0.0,0.0,,,0.0,0.0,...,chrome 62.0,,,,F,F,T,T,desktop,Windows
2987011,-5.0,221832.0,,,0.0,-6.0,,,,,...,chrome 62.0,,,,F,F,T,T,desktop,
2987016,0.0,7460.0,0.0,0.0,1.0,0.0,,,0.0,0.0,...,chrome 62.0,24.0,1280x800,match_status:2,T,F,T,T,desktop,MacOS


In [6]:
# Left-join the identity table onto the transactions via the TransactionID
# index, so every transaction is kept even when it has no identity record.
# NOTE(review): the merged rows come back in a different order than
# train_transaction (compare the first index values displayed before and after
# this cell) - presumably cuDF joins do not preserve row order; confirm if any
# later step relies on ordering.
train_data = train_transaction.merge(
    train_identity,
    how="left",
    left_index=True,
    right_index=True,
)

In [7]:
# label
y_all = train_data["isFraud"].astype("int32")
y_all.head()

TransactionID
2990216    0
2990217    0
2990218    0
2990219    0
2990220    0
Name: isFraud, dtype: int32

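Drop the label (target) column and fill in the missing values. A very low negative number such as `-999.0` works well for decision trees as a marker for an absent feature, because it cannot be confused with a genuine feature value such as `0.0`. Columns that cannot be converted to `float32` are dropped.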

In [8]:
# features (data)
x_all = train_data.drop("isFraud", axis=1)
x_all = x_all.fillna(-999.0)

converted_to_float = 0
converted_to_cat = 0
dropped = 0

for col in tqdm(x_all.columns):
    try:
        x_all[col] = x_all[col].astype("float32")
        converted_to_float += 1
    except:
        # only works for XGBoost GPU
        # convert column type to category
    #try:
            #
        #x_all[col] = x_all[col].astype("category")
        #converted_to_cat += 1
    #except:
        x_all.drop([col], axis=1,inplace=True)
        dropped += 1

x_all.head()

100%|██████████| 432/432 [00:00<00:00, 861.84it/s] 


Unnamed: 0_level_0,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,dist1,dist2,...,id_17,id_18,id_19,id_20,id_21,id_22,id_24,id_25,id_26,id_32
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2990216,153466.0,108.949997,4293.0,112.0,150.0,117.0,205.0,87.0,0.0,-999.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
2990217,153477.0,45.0,18132.0,567.0,150.0,117.0,476.0,87.0,-999.0,-999.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
2990218,153486.0,200.0,7919.0,194.0,150.0,166.0,330.0,87.0,-999.0,-999.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
2990219,153503.0,100.0,4503.0,298.0,150.0,226.0,327.0,87.0,-999.0,-999.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
2990220,153511.0,150.0,16865.0,145.0,150.0,226.0,433.0,87.0,-999.0,-999.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0


In [9]:
# Display (columns cast to float32, columns cast to category, columns dropped).
converted_to_float, converted_to_cat, dropped

(401, 0, 31)

In [10]:
# Summarize the feature frame: index range, column count, dtypes and memory usage.
x_all.info()

<class 'cudf.core.dataframe.DataFrame'>
Int64Index: 590540 entries, 2990216 to 3577478
Columns: 401 entries, TransactionDT to id_32
dtypes: float32(401)
memory usage: 907.9 MB


## 2. Train Model

Split the data into training and test sets.

In [11]:
from cuml.model_selection import train_test_split

In [12]:
# NOTE(review): both assignments are no-ops (each name is rebound to itself);
# presumably a leftover hook for sub-sampling the data. Safe to delete.
x_all = x_all
y_all = y_all

In [13]:
# Sanity check: features and labels should have the same number of rows.
print(x_all.shape)
print(y_all.shape)

(590540, 401)
(590540,)


In [14]:
# Hold out 10% of the rows for testing, stratified on the label so both
# partitions keep the same fraud / non-fraud ratio. A fixed random_state makes
# the split - and therefore every metric reported below - reproducible across
# Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    x_all,
    y_all,
    test_size=0.1,
    stratify=y_all,
    random_state=42,
)

In [15]:
# Confirm the partition sizes and that features and labels stay aligned in each.
print("Train:", X_train.shape, y_train.shape)
print(" Test:", X_test.shape, y_test.shape)

Train: (531486, 401) (531486,)
 Test: (59054, 401) (59054,)


Construct the XGBoost Classifier. You can also try out [additional arguments](https://xgboost.readthedocs.io/en/latest/parameter.html) to perform parameter tuning.

In [16]:
clf = xgb.XGBClassifier(
    # Histogram-based tree construction on the GPU.
    tree_method="gpu_hist",
    # Ensemble size and capacity: 100 boosting rounds of trees up to depth 10,
    # each contributing with a shrinkage of 0.05.
    n_estimators=100,
    max_depth=10,
    learning_rate=0.05,
    # Weight positive (isFraud == 1) samples twice as heavily as negatives.
    scale_pos_weight=2.0,
    # Labels are already integer-encoded, so the built-in label encoder is off.
    use_label_encoder=False,
    # NOTE(review): all remaining features are float32 after preprocessing, so
    # this flag presumably has no effect here - confirm before removing.
    enable_categorical=True,
)

In [17]:
# Fit the classifier and time the fit. Passing both partitions as eval_set
# makes XGBoost log the per-round metric for the training data
# (validation_0) and the held-out data (validation_1).
train_start_time = time.time()

clf.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    verbose=True,
)

train_end_time = time.time()
train_seconds = int(train_end_time - train_start_time)

print("Training taken:", train_seconds, "seconds")

[0]	validation_0-logloss:0.65023	validation_1-logloss:0.65039
[1]	validation_0-logloss:0.61137	validation_1-logloss:0.61171
[2]	validation_0-logloss:0.57591	validation_1-logloss:0.57640
[3]	validation_0-logloss:0.54354	validation_1-logloss:0.54416
[4]	validation_0-logloss:0.51387	validation_1-logloss:0.51462
[5]	validation_0-logloss:0.48661	validation_1-logloss:0.48748
[6]	validation_0-logloss:0.46141	validation_1-logloss:0.46241
[7]	validation_0-logloss:0.43810	validation_1-logloss:0.43923
[8]	validation_0-logloss:0.41651	validation_1-logloss:0.41778
[9]	validation_0-logloss:0.39651	validation_1-logloss:0.39791
[10]	validation_0-logloss:0.37774	validation_1-logloss:0.37931
[11]	validation_0-logloss:0.36018	validation_1-logloss:0.36188
[12]	validation_0-logloss:0.34397	validation_1-logloss:0.34580
[13]	validation_0-logloss:0.32881	validation_1-logloss:0.33076
[14]	validation_0-logloss:0.31474	validation_1-logloss:0.31682
[15]	validation_0-logloss:0.30148	validation_1-logloss:0.30367
[1

## 3. Evaluation

In [18]:
# Predict hard class labels for the held-out rows and score them as a
# percentage of exact matches against the true labels.
# NOTE(review): fraud labels are presumably heavily imbalanced, in which case
# accuracy alone can look high even for a weak model - consider also reporting
# ROC AUC or precision/recall.
preds = clf.predict(X_test)
y_test_host = y_test.to_numpy()
n_correct = (preds == y_test_host).sum()
acc_xgb = n_correct.astype(float) / len(preds) * 100

In [19]:
# Stop the notebook-wide timer that was started before the data was loaded.
end_time = time.time()

# Elapsed wall-clock time since start_time, truncated to whole seconds.
time_taken = int(end_time - start_time)

In [20]:
# Report the final numbers: acc_xgb is already a percentage, time_taken is in whole seconds.
print("XGBoost's prediction accuracy is: %3.2f" % (acc_xgb))
print("Total time taken:", time_taken)

XGBoost's prediction accuracy is: 98.04
Total time taken: 17


<br>
<div align="center"><h2>Please Restart the Kernel</h2></div>

In [21]:
# Optional: uncomment the lines below to shut the kernel down from code instead
# of using the Jupyter menu.
# NOTE(review): the True argument is presumably the "restart" flag - confirm
# against the ipykernel documentation.
#import IPython
#app = IPython.Application.instance()
#app.kernel.do_shutdown(True)