# LoRAS "ozone_level" Tutorial

## Prepare dataset

In [1]:
import pandas as pd
import numpy as np

In [2]:
from collections import Counter
from imblearn.datasets import fetch_datasets
# Only the 'ozone_level' benchmark is used in this notebook, so ask
# fetch_datasets() for that single entry instead of the whole collection.
data = fetch_datasets(filter_data=('ozone_level',))['ozone_level']
# Last expression of the cell: shows (n_samples, n_features).
data.data.shape

Using TensorFlow backend.


(2536, 72)

In [3]:
# Unpack the fetched dataset into its feature matrix and label vector,
# then report both shapes (labels first, features second).
features, labels = data.data, data.target
print(labels.shape)
print(features.shape)

(2536,)
(2536, 72)


In [4]:
# Build a per-class 50/50 train/test split.
# Rows labelled 1 form the minority class, rows labelled -1 the majority class.
label_1 = list(np.where(labels == 1)[0])
label_0 = list(np.where(labels == -1)[0])
features_1 = features[label_1]
features_0 = features[label_0]

# Split points are derived from the class sizes instead of being hard-coded.
# For this dataset (73 minority / 2463 majority rows) they evaluate to 36 and
# 1231, i.e. exactly the split the notebook used before.
n_trn_1 = len(features_1) // 2
n_trn_0 = len(features_0) // 2
features_1_trn, features_1_tst = features_1[:n_trn_1], features_1[n_trn_1:]
features_0_trn, features_0_tst = features_0[:n_trn_0], features_0[n_trn_0:]

# Minority rows come first in both splits; the labels are re-encoded as
# 1.0 (minority) and 0.0 (majority) in the same order.
training_data = np.concatenate((features_1_trn, features_0_trn))
test_data = np.concatenate((features_1_tst, features_0_tst))
training_labels = np.concatenate((np.ones(len(features_1_trn)),
                                  np.zeros(len(features_0_trn))))
test_labels = np.concatenate((np.ones(len(features_1_tst)),
                              np.zeros(len(features_0_tst))))

## LoRAS oversampling

In [5]:
import loras

In [6]:
# Inputs for LoRAS: the minority / majority halves of the training split.
min_class_points, maj_class_points = features_1_trn, features_0_trn

# Settings forwarded unchanged to loras.fit_resample in the next cell.
k = 12
num_shadow_points = 40
num_aff_comb = 72

# Integer imbalance surplus per minority sample.
# NOTE(review): this is computed from the full class arrays (features_0 /
# features_1), not from the training halves - confirm that is intended.
num_generated_points = (len(features_0) - len(features_1)) // len(features_1)

In [7]:
# Oversample the minority class with LoRAS using the settings defined in the
# previous cell.
loras_min_class_points = loras.fit_resample(
    maj_class_points,
    min_class_points,
    k=k,
    num_shadow_points=num_shadow_points,
    num_generated_points=num_generated_points,
    num_aff_comb=num_aff_comb,
)
print(loras_min_class_points.shape)

# Assemble the LoRAS training set: resampled minority rows (label 1) first,
# followed by the untouched majority rows (label 0).
LoRAS_feat = np.concatenate((loras_min_class_points, maj_class_points))
LoRAS_labels = np.concatenate((np.ones(len(loras_min_class_points)),
                               np.zeros(len(maj_class_points))))
print(LoRAS_feat.shape)
print(LoRAS_labels.shape)

(1188, 72)
(2419, 72)
(2419,)


## SMOTE and its extensions oversampling

In [8]:
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, SVMSMOTE, ADASYN

In [9]:
# Plain SMOTE, oversampling the minority class up to a 1:1 class ratio.
# `sampling_strategy` replaces the deprecated (and later removed) `ratio`
# keyword; the float 1.0 requests minority/majority == 1 after resampling.
sm = SMOTE(random_state=42, k_neighbors=12, sampling_strategy=1.0)
SMOTE_feat, SMOTE_labels = sm.fit_resample(training_data, training_labels)
print(SMOTE_feat.shape)
print(SMOTE_labels.shape)

(2462, 72)
(2462,)


In [10]:
# Borderline-SMOTE, 'borderline-1' variant, fitted on the training split.
smb = BorderlineSMOTE(
    kind='borderline-1',
    k_neighbors=12,
    random_state=42,
)
SMOTEb_feat, SMOTEb_labels = smb.fit_resample(training_data, training_labels)
print(SMOTEb_feat.shape)
print(SMOTEb_labels.shape)

(2462, 72)
(2462,)


In [11]:
# Borderline-SMOTE, 'borderline-2' variant, fitted on the training split.
smbt = BorderlineSMOTE(random_state=42, k_neighbors=12, kind='borderline-2')
# Resample with `smbt`. The cell previously called `smb.fit_resample`, which
# silently reused the borderline-1 sampler and made both variants identical.
SMOTEbt_feat, SMOTEbt_labels = smbt.fit_resample(training_data, training_labels)
print(SMOTEbt_feat.shape)
print(SMOTEbt_labels.shape)

(2462, 72)
(2462,)


In [12]:
# SVM-SMOTE fitted on the training split.
sms = SVMSMOTE(k_neighbors=12, random_state=42)
SMOTEs_feat, SMOTEs_labels = sms.fit_resample(training_data, training_labels)
print(SMOTEs_feat.shape)
print(SMOTEs_labels.shape)

(1924, 72)
(1924,)


In [13]:
# ADASYN, oversampling the minority class towards a 1:1 class ratio.
# `sampling_strategy` replaces the deprecated (and later removed) `ratio`
# keyword; the float 1.0 requests minority/majority == 1 after resampling.
ada = ADASYN(random_state=111, n_neighbors=12, sampling_strategy=1.0)
ADA_feat, ADA_labels = ada.fit_resample(training_data, training_labels)
print(ADA_feat.shape)
print(ADA_labels.shape)

(2458, 72)
(2458,)


## Defining ML models and metrics

In [14]:
from sklearn.metrics import f1_score, balanced_accuracy_score, average_precision_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

In [15]:
def get_metrics(y_test, y_pred, y_prob):
    """Score one classifier's predictions.

    Parameters
    ----------
    y_test : true labels of the test samples.
    y_pred : hard class predictions for the same samples.
    y_prob : 2-D array of class probabilities; column 1 is used as the
        score for average precision.

    Returns
    -------
    list
        ``[f1_score, balanced_accuracy_score, average_precision_score]``,
        in that order.
    """
    return [
        f1_score(y_test, y_pred),
        balanced_accuracy_score(y_test, y_pred),
        average_precision_score(y_test, y_prob[:, 1]),
    ]

In [16]:
def knn(X_train, y_train, X_test, y_test, n_neighbors=29):
    """Fit a k-nearest-neighbours classifier and score it on the test split.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_test, y_test : test features and labels.
    n_neighbors : int, default 29
        Number of neighbours handed to ``KNeighborsClassifier``. The default
        keeps the value this notebook has always used.

    Returns
    -------
    list
        The metrics list produced by ``get_metrics``.
    """
    # Named `classifier` rather than `knn` so the local does not shadow this
    # function's own name.
    classifier = KNeighborsClassifier(n_neighbors=n_neighbors)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    y_prob = classifier.predict_proba(X_test)
    return get_metrics(y_test, y_pred, y_prob)

In [23]:
def lr(X_train, y_train, X_test, y_test):
    """Fit a logistic-regression classifier and score it on the test split.

    The estimator configuration (C=1e5, lbfgs solver, multinomial mode,
    equal class weights for labels 0 and 1) is fixed inside the function.

    Returns the metrics list produced by ``get_metrics``.
    """
    model = LogisticRegression(
        C=1e5,
        solver='lbfgs',
        multi_class='multinomial',
        class_weight={0: 1, 1: 1},
    )
    model.fit(X_train, y_train)
    # Hard predictions first, then class probabilities, as before.
    return get_metrics(y_test, model.predict(X_test), model.predict_proba(X_test))

## Training

In [26]:
def _knn_then_lr(X_train, y_train):
    """Train KNN, then LR, on one training set; score both on the shared test split."""
    return (knn(X_train, y_train, test_data, test_labels),
            lr(X_train, y_train, test_data, test_labels))


# One (KNN, LR) pair of metric lists per training set: the original data
# first, then each oversampled variant.
results_normal_knn, results_normal_lr = _knn_then_lr(training_data, training_labels)
results_loras_knn, results_loras_lr = _knn_then_lr(LoRAS_feat, LoRAS_labels)
results_sm_knn, results_sm_lr = _knn_then_lr(SMOTE_feat, SMOTE_labels)
results_sms_knn, results_sms_lr = _knn_then_lr(SMOTEs_feat, SMOTEs_labels)
results_smb_knn, results_smb_lr = _knn_then_lr(SMOTEb_feat, SMOTEb_labels)
results_smbt_knn, results_smbt_lr = _knn_then_lr(SMOTEbt_feat, SMOTEbt_labels)
results_ada_knn, results_ada_lr = _knn_then_lr(ADA_feat, ADA_labels)

# Flat list in the order the report cell expects: KNN then LR for each method.
results = [
    results_normal_knn, results_normal_lr,
    results_loras_knn, results_loras_lr,
    results_sm_knn, results_sm_lr,
    results_sms_knn, results_sms_lr,
    results_smb_knn, results_smb_lr,
    results_smbt_knn, results_smbt_lr,
    results_ada_knn, results_ada_lr,
]

  'precision', 'predicted', average, warn_for)


In [27]:
# Display names, in the same order as the entries of `results`.
res_names = [
    'Normal KNN', 'Normal LR',
    'LoRAS KNN', 'LoRAS LR',
    'SMOTE KNN', 'SMOTE LR',
    'SMOTE SVM KNN', 'SMOTE SVM LR',
    'SMOTE BORDELINE-1 KNN', 'SMOTE BORDELINE-1 LR',
    'SMOTE BORDELINE-2 KNN', 'SMOTE BORDELINE-2 LR',
    'ADASYN KNN', 'ADASYN LR',
]
# Metric names, in the order get_metrics() returns the values.
met_names = ['f1_score', 'balanced_accuracy_score', 'average_precision_score']

# Print one bracketed block per model, one "name : value" line per metric.
for res, r_name in zip(results, res_names):
    print(r_name, " : [")
    for metric_name, metric_value in zip(met_names, res):
        print(metric_name, " : ", metric_value)
    print("]")

Normal KNN  : [
f1_score  :  0.0
balanced_accuracy_score  :  0.5
average_precision_score  :  0.05796703413379864
]
Normal LR  : [
f1_score  :  0.052631578947368425
balanced_accuracy_score  :  0.5135135135135135
average_precision_score  :  0.3118388055667834
]
LoRAS KNN  : [
f1_score  :  0.10972568578553617
balanced_accuracy_score  :  0.6584985959985961
average_precision_score  :  0.05549443844806338
]
LoRAS LR  : [
f1_score  :  0.2254901960784314
balanced_accuracy_score  :  0.7523692523692523
average_precision_score  :  0.2535720324522699
]
SMOTE KNN  : [
f1_score  :  0.10722610722610723
balanced_accuracy_score  :  0.6610543173043173
average_precision_score  :  0.06426709638423649
]
SMOTE LR  : [
f1_score  :  0.21374045801526717
balanced_accuracy_score  :  0.7984270796770797
average_precision_score  :  0.2911811334083407
]
SMOTE SVM KNN  : [
f1_score  :  0.14166666666666666
balanced_accuracy_score  :  0.6542427167427167
average_precision_score  :  0.06538971052269957
]
SMOTE SVM LR  : 