# LoRAS "Credit Fraud" Tutorial

## Load dataset

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the CSV and keep only its raw NumPy matrix; later cells index it by position.
filename = 'creditcard.csv'
data = pd.read_csv(filename).values
data.shape

(284807, 31)

## Prepare dataset

In [3]:
# Column 30 supplies the label vector; columns 0-29 are used as the model inputs.
labels = data[:, 30]
features = data[:, list(range(30))]

In [4]:
# Split each class in half: the first half of its rows goes to training, the
# rest to test. Split points are derived from the class sizes rather than
# hard-coded, so the cell keeps working if the input file changes.

# Rows whose label is 1 (the smaller class in this dataset).
label_1 = list(np.where(labels == 1)[0])
features_1 = features[label_1]
n_trn_1 = len(features_1) // 2
features_1_trn = features_1[:n_trn_1]
features_1_tst = features_1[n_trn_1:]

# Rows whose label is 0 (the larger class in this dataset).
label_0 = list(np.where(labels == 0)[0])
features_0 = features[label_0]
n_trn_0 = len(features_0) // 2
features_0_trn = features_0[:n_trn_0]
features_0_tst = features_0[n_trn_0:]

# Each split stacks the original-label-1 rows first, then the original-label-0 rows.
training_data = np.concatenate((features_1_trn, features_0_trn))
test_data = np.concatenate((features_1_tst, features_0_tst))

# Label convention used by the rest of the notebook: rows that were labelled 1
# in the file become 0 here, and rows that were labelled 0 become 1.
# NOTE(review): scikit-learn's f1/precision/recall default to pos_label=1, so
# with this convention they score the larger class -- confirm this is intended.
training_labels = np.concatenate((np.zeros(len(features_1_trn)), np.ones(len(features_0_trn))))
test_labels = np.concatenate((np.zeros(len(features_1_tst)), np.ones(len(features_0_tst))))

## LoRAS oversampling

In [5]:
import loras

In [6]:
# LoRAS is called with the two classes as separate arrays instead of an (X, y) pair.
maj_class_points = features_0_trn
min_class_points = features_1_trn
# Candidate tuning values, kept for reference only -- none of them is passed to
# the LoRAS call in this notebook:
# k = 10
# num_shadow_points = 100
# sigma = [.005]*min_class_points.shape[1]
# num_generated_points = (len(features_0)-len(features_1))//len(features_1) #300
# num_aff_comb = 300
# seed = 42

In [7]:
# Run LoRAS on the two class arrays (presumably returning the oversampled
# minority class -- verify against the loras module), then stack the result on
# top of the unchanged majority rows to form the LoRAS training set.
# NOTE(review): no seed is passed here; confirm whether the result is reproducible.
loras_min_class_points = loras.fit_resample(maj_class_points, min_class_points)
print(loras_min_class_points.shape)
LoRAS_feat = np.concatenate((loras_min_class_points, maj_class_points))
# Same label convention as the training labels: LoRAS rows -> 0, majority rows -> 1.
LoRAS_labels = np.concatenate((
    np.zeros(len(loras_min_class_points)),
    np.ones(len(maj_class_points)),
))
print(LoRAS_feat.shape, LoRAS_labels.shape, sep="\n")

(142680, 30)
(284837, 30)
(284837,)


## SMOTE and its extensions oversampling

In [8]:
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, SVMSMOTE, ADASYN

Using TensorFlow backend.


In [9]:
# Plain SMOTE (k_neighbors=10) applied to the combined training set.
sm = SMOTE(k_neighbors=10, random_state=42)
SMOTE_feat, SMOTE_labels = sm.fit_resample(training_data, training_labels)
print(SMOTE_feat.shape, SMOTE_labels.shape, sep="\n")

(284314, 30)
(284314,)


In [10]:
# Borderline-SMOTE, variant 1 (k_neighbors=10), applied to the combined training set.
smb = BorderlineSMOTE(kind='borderline-1', k_neighbors=10, random_state=42)
SMOTEb_feat, SMOTEb_labels = smb.fit_resample(training_data, training_labels)
print(SMOTEb_feat.shape, SMOTEb_labels.shape, sep="\n")

(284314, 30)
(284314,)


In [11]:
# Borderline-SMOTE, variant 2 (k_neighbors=10), applied to the combined training set.
smbt = BorderlineSMOTE(random_state=42, k_neighbors=10, kind='borderline-2')
# Resample with `smbt`, the borderline-2 sampler built above. Calling the
# borderline-1 sampler `smb` here would only reproduce SMOTEb_feat/SMOTEb_labels.
SMOTEbt_feat, SMOTEbt_labels = smbt.fit_resample(training_data, training_labels)
print(SMOTEbt_feat.shape)
print(SMOTEbt_labels.shape)

(284314, 30)
(284314,)


In [12]:
# SVM-SMOTE (k_neighbors=10) applied to the combined training set.
sms = SVMSMOTE(k_neighbors=10, random_state=42)
SMOTEs_feat, SMOTEs_labels = sms.fit_resample(training_data, training_labels)
print(SMOTEs_feat.shape, SMOTEs_labels.shape, sep="\n")

(220447, 30)
(220447,)


In [13]:
# ADASYN (n_neighbors=10) applied to the combined training set.
ada = ADASYN(n_neighbors=10, random_state=42)
ADA_feat, ADA_labels = ada.fit_resample(training_data, training_labels)
print(ADA_feat.shape, ADA_labels.shape, sep="\n")

(284303, 30)
(284303,)


## Defining ML models and metrics

In [14]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [15]:
def get_metrics(y_test, y_pred):
    """Score predictions against the ground-truth labels.

    Args:
        y_test: True labels.
        y_pred: Predicted labels, aligned with ``y_test``.

    Returns:
        A list ``[f1, accuracy, precision, recall]``, each computed by the
        corresponding scikit-learn scorer with its default arguments.
    """
    scorers = (f1_score, accuracy_score, precision_score, recall_score)
    return [scorer(y_test, y_pred) for scorer in scorers]

In [16]:
def lr(X_train, y_train, X_test, y_test):
    """Fit a logistic regression on the training split and score it on the test split.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        X_test: Test feature matrix.
        y_test: Test labels used for scoring.

    Returns:
        The metric list produced by ``get_metrics`` for the test predictions.
    """
    classifier = LogisticRegression(random_state=42, solver='lbfgs', C=1e5)
    predictions = classifier.fit(X_train, y_train).predict(X_test)
    return get_metrics(y_test, predictions)

In [17]:
def rf(X_train, y_train, X_test, y_test):
    """Fit a random forest on the training split and score it on the test split.

    The forest uses 100 trees of depth at most 2 and a fixed random state.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        X_test: Test feature matrix.
        y_test: Test labels used for scoring.

    Returns:
        The metric list produced by ``get_metrics`` for the test predictions.
    """
    forest = RandomForestClassifier(random_state=42, max_depth=2, n_estimators=100)
    predictions = forest.fit(X_train, y_train).predict(X_test)
    return get_metrics(y_test, predictions)

## Training

In [18]:
# Train both classifiers on every version of the training set and score each
# on the same held-out test split. Each result is the list returned by
# get_metrics: [f1, accuracy, precision, recall].

# Baseline: the original training data with no oversampling.
results_normal_lr = lr(training_data, training_labels, test_data, test_labels)
results_normal_rf = rf(training_data, training_labels, test_data, test_labels)

# LoRAS
results_loras_lr = lr(LoRAS_feat, LoRAS_labels, test_data, test_labels)
results_loras_rf = rf(LoRAS_feat, LoRAS_labels, test_data, test_labels)

# SMOTE
results_sm_lr = lr(SMOTE_feat, SMOTE_labels, test_data, test_labels)
results_sm_rf = rf(SMOTE_feat, SMOTE_labels, test_data, test_labels)

# Borderline-SMOTE (sampler `smb`)
results_smb_lr = lr(SMOTEb_feat, SMOTEb_labels, test_data, test_labels)
results_smb_rf = rf(SMOTEb_feat, SMOTEb_labels, test_data, test_labels)

# Borderline-SMOTE (sampler `smbt`)
results_smbt_lr = lr(SMOTEbt_feat, SMOTEbt_labels, test_data, test_labels)
results_smbt_rf = rf(SMOTEbt_feat, SMOTEbt_labels, test_data, test_labels)

# SVM-SMOTE: this training set is built earlier in the notebook and is
# evaluated here alongside the other oversamplers.
results_sms_lr = lr(SMOTEs_feat, SMOTEs_labels, test_data, test_labels)
results_sms_rf = rf(SMOTEs_feat, SMOTEs_labels, test_data, test_labels)

# ADASYN
results_ada_lr = lr(ADA_feat, ADA_labels, test_data, test_labels)
results_ada_rf = rf(ADA_feat, ADA_labels, test_data, test_labels)

