In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, recall_score, classification_report, roc_auc_score

In [29]:
cols = ['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig',
        'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud', 'isFlaggedFraud']
df = pd.read_csv('datasets/PS_20174392719_1491204439457_log.csv', header = 0, names = cols)
print('df.shape:', df.shape)
df.head()

df.shape: (6362620, 11)


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [30]:
pd.value_counts(df.isFraud, normalize = True)

0    0.998709
1    0.001291
Name: isFraud, dtype: float64

In [31]:
majority_class = df.isFraud.mode()[0]
y_pred = np.full(shape = df.isFraud.shape, fill_value = majority_class)
accuracy_score(df.isFraud, y_pred)

0.9987091795518198

In [32]:
print(classification_report(df.isFraud, y_pred))

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

           0       1.00      1.00      1.00   6354407
           1       0.00      0.00      0.00      8213

    accuracy                           1.00   6362620
   macro avg       0.50      0.50      0.50   6362620
weighted avg       1.00      1.00      1.00   6362620



In [33]:
roc_auc_score(df.isFraud, y_pred)

0.5

This gives us a value of 0.5. A value of 1.0 for the ROC AUC score is the best anyone can get with any model. Why did we get 0.5? This is because we can predict all the isFraud=0 cases perfectly, but none of the isFraud=1 cases. So out of the two classes, we can predict only 1 (which gives us an ROC AUC of 0.5).

# Undersampling

In [44]:
from imblearn.under_sampling import RandomUnderSampler
X = df.drop(['isFraud', 'type', 'nameOrig', 'nameDest'], axis = 1)
y = df.isFraud
rus = RandomUnderSampler(sampling_strategy=0.8)
X_res, y_res = rus.fit_resample(X, y)
print(X_res.shape, y_res.shape)
print(pd.value_counts(y_res))

(18479, 7) (18479,)
0    10266
1     8213
dtype: int64


Table data after sampling

In [45]:
cols_numeric = ['step', 'amount', 'oldbalanceOrg', 'newbalanceOrig',
                'oldbalanceDest', 'newbalanceDest', 'isFlaggedFraud']
df_rus = pd.DataFrame(X_res, columns = cols_numeric)
df_rus.head()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFlaggedFraud
0,235.0,8969.84,61014.0,52044.16,0.0,0.0,0.0
1,202.0,46583.31,19419848.29,19466431.59,310678.05,264094.75,0.0
2,331.0,30449.46,12651.0,0.0,0.0,0.0,0.0
3,594.0,179546.88,73580.0,0.0,181475.46,361022.34,0.0
4,374.0,45158.65,10164.0,0.0,1960338.18,2005496.84,0.0


In [46]:
from sklearn.model_selection import train_test_split

def train_validation_test_split(X, y, train_size=0.8, val_size=0.1, test_size=0.1, random_state=None, shuffle=True):
    
    assert int(train_size + val_size + test_size + 1e-7) == 1
    
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, shuffle=shuffle)
    
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val,    test_size=val_size/(train_size+val_size), 
        random_state=random_state, shuffle=shuffle)
    
    return X_train, X_val, X_test, y_train, y_val, y_test



X_train, X_val, X_test, y_train, y_val, y_test = train_validation_test_split(
    X_res, y_res, train_size=0.8, val_size=0.1, test_size=0.1, random_state=1)

class_weight = {0: 1, 1: 5}

model = LogisticRegression(class_weight=class_weight)
model.fit(X_train, y_train)
y_pred = model.predict(X_val)

print(classification_report(y_val, y_pred))
print('accuracy', accuracy_score(y_val, y_pred))

roc_auc_score(y_val, y_pred)

              precision    recall  f1-score   support

           0       0.98      0.57      0.72      1035
           1       0.65      0.99      0.78       813

    accuracy                           0.76      1848
   macro avg       0.81      0.78      0.75      1848
weighted avg       0.83      0.76      0.75      1848

accuracy 0.7559523809523809




0.7808064602385154

Testing on test dataset

In [47]:
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
print('Accuracy', accuracy_score(y_test, y_pred))
print('ROC AUC score:', roc_auc_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.61      0.75      1019
           1       0.67      0.99      0.80       829

    accuracy                           0.78      1848
   macro avg       0.83      0.80      0.78      1848
weighted avg       0.84      0.78      0.77      1848

Accuracy 0.7797619047619048
ROC AUC score: 0.7991698145370647


Testing on full unsampled dataset

In [48]:
y_pred = model.predict(X)
print(classification_report(y, y_pred))
print('Accuracy:', accuracy_score(y, y_pred))
print('ROC AUC score:', roc_auc_score(y, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.59      0.74   6354407
           1       0.00      0.99      0.01      8213

    accuracy                           0.59   6362620
   macro avg       0.50      0.79      0.38   6362620
weighted avg       1.00      0.59      0.74   6362620

Accuracy: 0.5932763547092236
ROC AUC score: 0.7914505013783757


# OverSampling

In [40]:
from imblearn.over_sampling import RandomOverSampler
X = df.drop(['isFraud', 'type', 'nameOrig', 'nameDest'], axis = 1)
y = df.isFraud
ros = RandomOverSampler(sampling_strategy=0.8)
X_res, y_res = ros.fit_resample(X, y)
print(X_res.shape, y_res.shape)
print(pd.value_counts(y_res))

(11437932, 7) (11437932,)
0    6354407
1    5083525
dtype: int64


In [41]:
from sklearn.model_selection import train_test_split

def train_validation_test_split(X, y, train_size=0.8, val_size=0.1, test_size=0.1, random_state=None, shuffle=True):
    
    assert int(train_size + val_size + test_size + 1e-7) == 1
    
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, shuffle=shuffle)
    
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val,    test_size=val_size/(train_size+val_size), 
        random_state=random_state, shuffle=shuffle)
    
    return X_train, X_val, X_test, y_train, y_val, y_test



X_train, X_val, X_test, y_train, y_val, y_test = train_validation_test_split(
    X_res, y_res, train_size=0.8, val_size=0.1, test_size=0.1, random_state=1)

class_weight = {0: 4, 1: 5}

model = LogisticRegression(class_weight=class_weight)
model.fit(X_train, y_train)
y_pred = model.predict(X_val)

print(classification_report(y_val, y_pred))
print('accuracy', accuracy_score(y_val, y_pred))

roc_auc_score(y_val, y_pred)



              precision    recall  f1-score   support

           0       0.92      0.91      0.91    635474
           1       0.89      0.90      0.89    508320

    accuracy                           0.91   1143794
   macro avg       0.90      0.90      0.90   1143794
weighted avg       0.91      0.91      0.91   1143794

accuracy 0.9050904271223664


0.9044278259968608

In [42]:
# Testing on test data set

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
print('Accuracy', accuracy_score(y_test, y_pred))
print('ROC AUC score:', roc_auc_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.91      0.91    634887
           1       0.89      0.90      0.89    508907

    accuracy                           0.91   1143794
   macro avg       0.90      0.90      0.90   1143794
weighted avg       0.91      0.91      0.91   1143794

Accuracy 0.9055371858918652
ROC AUC score: 0.9048469999642428


In [43]:
# Testing on full unsampled dataset
y_pred = model.predict(X)
print(classification_report(y, y_pred))
print('Accuracy:', accuracy_score(y, y_pred))
print('ROC AUC score:', roc_auc_score(y, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.91      0.95   6354407
           1       0.01      0.90      0.03      8213

    accuracy                           0.91   6362620
   macro avg       0.51      0.90      0.49   6362620
weighted avg       1.00      0.91      0.95   6362620

Accuracy: 0.9106243968679569
ROC AUC score: 0.9044252983850717
