# LoRAS "Credit Fraud" Tutorial

## Prepare dataset

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the transactions CSV and keep only the underlying NumPy matrix.
filename = 'creditcard.csv'
data = pd.read_csv(filename).values
# Trailing expression: the notebook displays (n_rows, n_columns).
data.shape

(284807, 31)

In [3]:
# Column 30 is the class label; columns 0..29 are kept as the feature matrix.
labels = data[:, 30]
features = data[:, list(range(30))]

In [4]:
# Split each class in half: the first half of its rows goes to training, the
# remainder to testing. The split points and label-vector lengths are derived
# from the data instead of being hard-coded, so slices and labels cannot drift
# out of sync if the CSV changes.

# Class 1 (minority) rows.
label_1 = list(np.where(labels == 1)[0])
features_1 = features[label_1]
n_trn_1 = len(features_1) // 2
features_1_trn = features_1[:n_trn_1]
features_1_tst = features_1[n_trn_1:]

# Class 0 (majority) rows.
label_0 = list(np.where(labels == 0)[0])
features_0 = features[label_0]
n_trn_0 = len(features_0) // 2
features_0_trn = features_0[:n_trn_0]
features_0_tst = features_0[n_trn_0:]

# Stack class-1 rows first, then class-0 rows; labels follow the same order.
training_data = np.concatenate((features_1_trn, features_0_trn))
test_data = np.concatenate((features_1_tst, features_0_tst))
training_labels = np.concatenate((np.ones(len(features_1_trn)), np.zeros(len(features_0_trn))))
test_labels = np.concatenate((np.ones(len(features_1_tst)), np.zeros(len(features_0_tst))))

## LoRAS oversampling

In [5]:
import loras

In [6]:
# Point sets handed to LoRAS: class-1 training rows are the minority,
# class-0 training rows are the majority.
min_class_points, maj_class_points = features_1_trn, features_0_trn

# Hyper-parameter values for the LoRAS run.
k = 30
num_shadow_points = 100
sigma = [0.005] * min_class_points.shape[1]  # one entry per feature column
# (class-0 count - class-1 count) floor-divided by the class-1 count.
num_generated_points = (len(features_0) - len(features_1)) // len(features_1)
num_aff_comb = 300
seed = 42

In [7]:
# NOTE(review): only the two point sets are passed here; k, num_shadow_points,
# sigma, num_generated_points, num_aff_comb and seed defined in the previous
# cell are not forwarded to fit_resample — confirm whether that is intended.
loras_min_class_points = loras.fit_resample(maj_class_points, min_class_points)
print(loras_min_class_points.shape)

# Resampled minority points first (label 1), then the majority points (label 0).
LoRAS_feat = np.concatenate((loras_min_class_points, maj_class_points))
LoRAS_labels = np.concatenate((
    np.ones(len(loras_min_class_points)),
    np.zeros(len(maj_class_points)),
))
print(LoRAS_feat.shape, LoRAS_labels.shape, sep="\n")

(142680, 30)
(284837, 30)
(284837,)


## Oversampling with SMOTE and its extensions

In [8]:
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, SVMSMOTE, ADASYN

In [9]:
# Plain SMOTE with a 1:1 minority/majority target after resampling.
# `sampling_strategy` replaces the `ratio` keyword, which was deprecated in
# imbalanced-learn 0.4 and removed in 0.6.
sm = SMOTE(random_state=42, k_neighbors=30, sampling_strategy=1.0)
SMOTE_feat, SMOTE_labels = sm.fit_resample(training_data, training_labels)
print(SMOTE_feat.shape)
print(SMOTE_labels.shape)

(284314, 30)
(284314,)


In [10]:
# Borderline-SMOTE, variant 1, resampling the training split.
smb = BorderlineSMOTE(kind='borderline-1', k_neighbors=30, random_state=42)
SMOTEb_feat, SMOTEb_labels = smb.fit_resample(training_data, training_labels)
print(SMOTEb_feat.shape, SMOTEb_labels.shape, sep="\n")

(284314, 30)
(284314,)


In [11]:
# Borderline-SMOTE, variant 2, resampling the training split.
# The resampling must be done with `smbt` (borderline-2); calling the
# borderline-1 sampler `smb` here would just reproduce the previous cell.
smbt = BorderlineSMOTE(random_state=42, k_neighbors=30, kind='borderline-2')
SMOTEbt_feat, SMOTEbt_labels = smbt.fit_resample(training_data, training_labels)
print(SMOTEbt_feat.shape)
print(SMOTEbt_labels.shape)

(284314, 30)
(284314,)


In [12]:
# SVM-SMOTE resampling of the training split.
sms = SVMSMOTE(k_neighbors=30, random_state=42)
SMOTEs_feat, SMOTEs_labels = sms.fit_resample(training_data, training_labels)
print(SMOTEs_feat.shape, SMOTEs_labels.shape, sep="\n")

(220447, 30)
(220447,)


In [13]:
# ADASYN resampling of the training split.
ada = ADASYN(n_neighbors=30, random_state=42)
ADA_feat, ADA_labels = ada.fit_resample(training_data, training_labels)
print(ADA_feat.shape, ADA_labels.shape, sep="\n")

(284330, 30)
(284330,)


## Defining ML models and metrics

In [14]:
from sklearn.metrics import f1_score, balanced_accuracy_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [15]:
def get_metrics(y_test, y_pred):
    """Score predictions against the true labels.

    Returns a list ``[f1, balanced_accuracy, precision, recall]``, each
    computed by calling the corresponding scorer as ``scorer(y_test, y_pred)``.
    """
    scorers = (f1_score, balanced_accuracy_score, precision_score, recall_score)
    return [scorer(y_test, y_pred) for scorer in scorers]

In [16]:
def lr(X_train, y_train, X_test, y_test):
    """Fit a logistic-regression classifier and score it on the test split.

    Trains ``LogisticRegression`` with fixed hyper-parameters on
    ``(X_train, y_train)``, predicts ``X_test`` and returns
    ``get_metrics(y_test, predictions)``.
    """
    hyperparams = dict(
        random_state=42,
        C=.005,
        solver='lbfgs',
        multi_class='multinomial',
        max_iter=685,
    )
    classifier = LogisticRegression(**hyperparams)
    classifier.fit(X_train, y_train)
    return get_metrics(y_test, classifier.predict(X_test))

In [17]:
def rf(X_train, y_train, X_test, y_test):
    """Fit a random-forest classifier and score it on the test split.

    Trains ``RandomForestClassifier`` (100 trees, depth 2, fixed seed) on
    ``(X_train, y_train)``, predicts ``X_test`` and returns
    ``get_metrics(y_test, predictions)``.
    """
    forest = RandomForestClassifier(random_state=42, n_estimators=100, max_depth=2)
    forest.fit(X_train, y_train)
    predictions = forest.predict(X_test)
    return get_metrics(y_test, predictions)

## Training

In [18]:
# For every training set, fit logistic regression first and the random forest
# second, always scoring on the same held-out test split.
results_normal_lr, results_normal_rf = [
    fit(training_data, training_labels, test_data, test_labels) for fit in (lr, rf)
]
results_loras_lr, results_loras_rf = [
    fit(LoRAS_feat, LoRAS_labels, test_data, test_labels) for fit in (lr, rf)
]
results_sm_lr, results_sm_rf = [
    fit(SMOTE_feat, SMOTE_labels, test_data, test_labels) for fit in (lr, rf)
]
results_sms_lr, results_sms_rf = [
    fit(SMOTEs_feat, SMOTEs_labels, test_data, test_labels) for fit in (lr, rf)
]
results_smb_lr, results_smb_rf = [
    fit(SMOTEb_feat, SMOTEb_labels, test_data, test_labels) for fit in (lr, rf)
]
results_smbt_lr, results_smbt_rf = [
    fit(SMOTEbt_feat, SMOTEbt_labels, test_data, test_labels) for fit in (lr, rf)
]
results_ada_lr, results_ada_rf = [
    fit(ADA_feat, ADA_labels, test_data, test_labels) for fit in (lr, rf)
]

# Collected in the order the report cell expects: one LR/RF pair per sampler.
results = [
    results_normal_lr, results_normal_rf,
    results_loras_lr, results_loras_rf,
    results_sm_lr, results_sm_rf,
    results_sms_lr, results_sms_rf,
    results_smb_lr, results_smb_rf,
    results_smbt_lr, results_smbt_rf,
    results_ada_lr, results_ada_rf,
]



In [19]:
# Display labels, in the same order as the entries of `results`.
res_names = [
    'Normal LR', 'Normal RF',
    'LoRAS LR', 'LORAS RF',
    'SMOTE LR', 'SMOTE RF',
    'SMOTE SVM LR', 'SMOTE SVM RF',
    'SMOTE BORDELINE-1 LR', 'SMOTE BORDELINE-1 RF',
    'SMOTE BORDELINE-2 LR', 'SMOTE BORDELINE-2 RF',
    'ADASYN LR', 'ADASYN RF',
]
# Metric labels, in the order the per-run metric lists are printed against.
met_names = ['f1_score', 'balanced_accuracy_score', 'precision_score', 'recall_score']

# Print one bracketed block per run, with one "name : value" line per metric.
for metric_values, run_label in zip(results, res_names):
    print(run_label, " : [")
    for value, metric_label in zip(metric_values, met_names):
        print(metric_label, " : ", value)
    print("]")

Normal LR  : [
f1_score  :  0.469135802469
balanced_accuracy_score  :  0.654464510289
precision_score  :  0.974358974359
recall_score  :  0.308943089431
]
Normal RF  : [
f1_score  :  0.644808743169
balanced_accuracy_score  :  0.739830363948
precision_score  :  0.983333333333
recall_score  :  0.479674796748
]
LoRAS LR  : [
f1_score  :  0.809322033898
balanced_accuracy_score  :  0.88808827965
precision_score  :  0.845132743363
recall_score  :  0.776422764228
]
LORAS RF  : [
f1_score  :  0.807174887892
balanced_accuracy_score  :  0.865783314272
precision_score  :  0.9
recall_score  :  0.731707317073
]
SMOTE LR  : [
f1_score  :  0.685714285714
balanced_accuracy_score  :  0.914124150421
precision_score  :  0.58452722063
recall_score  :  0.829268292683
]
SMOTE RF  : [
f1_score  :  0.420081967213
balanced_accuracy_score  :  0.914820129715
precision_score  :  0.280821917808
recall_score  :  0.833333333333
]
SMOTE SVM LR  : [
f1_score  :  0.767772511848
balanced_accuracy_score  :  0.82921905169