# LoRAS "car_eval_4" Tutorial

## Prepare dataset

In [1]:
import pandas as pd
import numpy as np

In [2]:
from collections import Counter
from imblearn.datasets import fetch_datasets
data = fetch_datasets()['car_eval_4']
data.data.shape

Using TensorFlow backend.


(1728, 21)

In [3]:
labels=data.target
print(labels.shape)
features=data.data
print(features.shape)

(1728,)
(1728, 21)


In [5]:
label_1=np.where(labels == 1)[0]
label_1=list(label_1)
features_1=features[label_1]
features_1_trn=features_1[list(range(0,32))]
features_1_tst=features_1[list(range(32,65))]
label_0=np.where(labels == -1)[0]
label_0=list(label_0)
features_0=features[label_0]
features_0_trn=features_0[list(range(0,831))]
features_0_tst=features_0[list(range(831,1663))]
training_data=np.concatenate((features_1_trn,features_0_trn))
test_data=np.concatenate((features_1_tst,features_0_tst))
training_labels=np.concatenate((np.zeros(len(features_1_trn))+1, 
                                np.zeros(len(features_0_trn))))
test_labels=np.concatenate((np.zeros(len(features_1_tst))+1, 
                            np.zeros(len(features_0_tst))))

## LoRAS oversampling

In [6]:
import loras

In [7]:
min_class_points = features_1_trn
maj_class_points = features_0_trn
k = 20
num_shadow_points = 40
num_generated_points=(len(features_0)-len(features_1))//len(features_1)
num_aff_comb = 21

In [8]:
loras_min_class_points = loras.fit_resample(maj_class_points, 
                                            min_class_points, k=k, 
                                            num_shadow_points=num_shadow_points, 
                                            num_generated_points=num_generated_points,
                                            num_aff_comb=num_aff_comb)
print(loras_min_class_points.shape)
LoRAS_feat = np.concatenate((loras_min_class_points, maj_class_points))
LoRAS_labels = np.concatenate((np.zeros(len(loras_min_class_points))+1, 
                               np.zeros(len(maj_class_points))))
print(LoRAS_feat.shape)
print(LoRAS_labels.shape)

(800, 21)
(1631, 21)
(1631,)


## SMOTE and its extensions oversampling

In [9]:
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, SVMSMOTE, ADASYN

In [10]:
sm = SMOTE(random_state=42, k_neighbors=30, ratio=1)
SMOTE_feat, SMOTE_labels = sm.fit_resample(training_data,training_labels)
print(SMOTE_feat.shape)
print(SMOTE_labels.shape)

(1662, 21)
(1662,)


In [11]:
smb = BorderlineSMOTE(random_state=42, k_neighbors=30, kind='borderline-1')
SMOTEb_feat, SMOTEb_labels = smb.fit_resample(training_data,training_labels)
print(SMOTEb_feat.shape)
print(SMOTEb_labels.shape)

(1662, 21)
(1662,)


In [12]:
smbt = BorderlineSMOTE(random_state=42, k_neighbors=30, kind='borderline-2')
SMOTEbt_feat, SMOTEbt_labels = smb.fit_resample(training_data,training_labels)
print(SMOTEbt_feat.shape)
print(SMOTEbt_labels.shape)

(1662, 21)
(1662,)


In [13]:
sms = SVMSMOTE(random_state=42, k_neighbors=30)
SMOTEs_feat, SMOTEs_labels = sms.fit_resample(training_data,training_labels)
print(SMOTEs_feat.shape)
print(SMOTEs_labels.shape)

(1662, 21)
(1662,)


In [20]:
ada = ADASYN(random_state=42,n_neighbors=30)
ADA_feat, ADA_labels = ada.fit_resample(training_data,training_labels)
print(ADA_feat.shape)
print(ADA_labels.shape)

(1658, 21)
(1658,)


## Defining ML models and metrics

In [21]:
from sklearn.metrics import f1_score, balanced_accuracy_score, average_precision_score
from sklearn.neighbors import KNeighborsClassifier

In [22]:
def get_metrics(y_test, y_pred, y_prob):
    metrics = []
    metrics.append(f1_score(y_test, y_pred))
    metrics.append(balanced_accuracy_score(y_test, y_pred))
    metrics.append(average_precision_score(y_test, y_prob[:,1]))
    return metrics

In [23]:
def knn(X_train,y_train,X_test,y_test):
    knn = KNeighborsClassifier(n_neighbors=30)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    y_prob = knn.predict_proba(X_test)
    return get_metrics(y_test, y_pred, y_prob)

## Training

In [24]:
results_normal_knn = knn(training_data, training_labels, test_data, test_labels)

results_loras_knn = knn(LoRAS_feat, LoRAS_labels, test_data, test_labels)

results_sm_knn = knn(SMOTE_feat, SMOTE_labels, test_data, test_labels)

results_sms_knn = knn(SMOTEs_feat, SMOTEs_labels, test_data, test_labels)

results_smb_knn = knn(SMOTEb_feat, SMOTEb_labels, test_data, test_labels)

results_smbt_knn = knn(SMOTEbt_feat, SMOTEbt_labels, test_data, test_labels)

results_ada_knn = knn(ADA_feat, ADA_labels, test_data, test_labels)

results = [results_normal_knn, results_loras_knn, results_sm_knn, results_sms_knn, results_smb_knn, results_smbt_knn, results_ada_knn]

In [25]:
res_names = ['Normal KNN', 'LoRAS KNN', 'SMOTE KNN', 'SMOTE SVM KNN', 'SMOTE BORDELINE-1 KNN', 'SMOTE BORDELINE-2 KNN', 'ADASYN KNN']
met_names = ['f1_score', 'balanced_accuracy_score', 'average_precision_score']
for res, r_name in zip(results, res_names):
    print(r_name, " : [")
    for met,r in zip(res,met_names):
        print(r, " : ", met)
    print("]")

Normal KNN  : [
f1_score  :  0.0
balanced_accuracy_score  :  0.49939903846153844
average_precision_score  :  0.19120856795480828
]
LoRAS KNN  : [
f1_score  :  0.21639344262295082
balanced_accuracy_score  :  0.8563701923076923
average_precision_score  :  0.14980901451489687
]
SMOTE KNN  : [
f1_score  :  0.13953488372093023
balanced_accuracy_score  :  0.7554086538461539
average_precision_score  :  0.15200831365604092
]
SMOTE SVM KNN  : [
f1_score  :  0.16624685138539044
balanced_accuracy_score  :  0.8010817307692308
average_precision_score  :  0.37006655915736963
]
SMOTE BORDELINE-1 KNN  : [
f1_score  :  0.14410480349344978
balanced_accuracy_score  :  0.7644230769230769
average_precision_score  :  0.36569904097759254
]
SMOTE BORDELINE-2 KNN  : [
f1_score  :  0.14410480349344978
balanced_accuracy_score  :  0.7644230769230769
average_precision_score  :  0.36569904097759254
]
ADASYN KNN  : [
f1_score  :  0.14072494669509597
balanced_accuracy_score  :  0.7578125
average_precision_score  :  0