In [1]:
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.over_sampling import SVMSMOTE 
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
from collections import Counter
from sklearn import metrics
import pandas as pd

In [2]:
train_csv = '../../dataset/train_N10.csv'
test_csv = '../../dataset/test_N10.csv'

train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

x_train = train.drop(columns = ['osm_id', 'class'])
y_train = train['class']

print(x_train.shape)
print(y_train.shape)

x_test = test.drop(columns = ['osm_id', 'class'])
y_test = test['class']

print(x_test.shape)
print(y_test.shape)

print('Original training dataset shape %s' % Counter(y_train))

(4481441, 22)
(4481441,)
(1920618, 22)
(1920618,)
Original training dataset shape Counter({0: 4480444, 1: 997})


In [3]:
print("sampling_strategy=auto")

sm = SVMSMOTE()

x_res, y_res = sm.fit_resample(x_train, y_train)

print('Resampled dataset shape %s' % Counter(y_res))

clf = BalancedRandomForestClassifier()

print("Training started")
clf.fit(x_res, y_res)

print("Making Predictions")
y_pred = clf.predict(x_test)

print("AUC Score:")
print(roc_auc_score(y_test, y_pred))

print("Accuracy Score")
print(accuracy_score(y_test, y_pred))

print("Precision Score")
print(precision_score(y_test, y_pred))

print("Recall Score")
print(recall_score(y_test, y_pred))

print("f1 Score")
print(f1_score(y_test, y_pred))

print("f1.5 Score")
print(fbeta_score(y_test, y_pred, beta=1.5, pos_label=1))

print("f2 Score")
print(fbeta_score(y_test, y_pred, beta=2, pos_label=1))

sampling_strategy=auto
Resampled dataset shape Counter({0: 4480444, 1: 1251438})
Training started
Making Predictions
AUC Score:
0.5702664194457042
Accuracy Score
0.999534004159078
Precision Score
0.09932659932659933
Recall Score
0.14081145584725538
f1 Score
0.11648568608094767
f1.5 Score
0.12477631364893445
f2 Score
0.1299559471365639


In [4]:
print("sampling_strategy = 0.2")

sm = SVMSMOTE(sampling_strategy = 0.2)

x_res, y_res = sm.fit_resample(x_train, y_train)

print('Resampled dataset shape %s' % Counter(y_res))

clf = BalancedRandomForestClassifier()

print("Training started")
clf.fit(x_res, y_res)

print("Making Predictions")
y_pred = clf.predict(x_test)

print("AUC Score:")
print(roc_auc_score(y_test, y_pred))

print("Accuracy Score")
print(accuracy_score(y_test, y_pred))

print("Precision Score")
print(precision_score(y_test, y_pred))

print("Recall Score")
print(recall_score(y_test, y_pred))

print("f1 Score")
print(f1_score(y_test, y_pred))

print("f1.5 Score")
print(fbeta_score(y_test, y_pred, beta=1.5, pos_label=1))

print("f2 Score")
print(fbeta_score(y_test, y_pred, beta=2, pos_label=1))

sampling_strategy = 0.2
Resampled dataset shape Counter({0: 4480444, 1: 548396})
Training started
Making Predictions
AUC Score:
0.6011567490168933
Accuracy Score
0.9992757539500307
Precision Score
0.07443082311733801
Recall Score
0.20286396181384247
f1 Score
0.10890454836643178
f1.5 Score
0.13250989327257465
f2 Score
0.15081618168914124


In [5]:
print("sampling_strategy = 0.4")

sm = SVMSMOTE(sampling_strategy = 0.4)

x_res, y_res = sm.fit_resample(x_train, y_train)

print('Resampled dataset shape %s' % Counter(y_res))

clf = BalancedRandomForestClassifier()

print("Training started")
clf.fit(x_res, y_res)

print("Making Predictions")
y_pred = clf.predict(x_test)

print("AUC Score:")
print(roc_auc_score(y_test, y_pred))

print("Accuracy Score")
print(accuracy_score(y_test, y_pred))

print("Precision Score")
print(precision_score(y_test, y_pred))

print("Recall Score")
print(recall_score(y_test, y_pred))

print("f1 Score")
print(f1_score(y_test, y_pred))

print("f1.5 Score")
print(fbeta_score(y_test, y_pred, beta=1.5, pos_label=1))

print("f2 Score")
print(fbeta_score(y_test, y_pred, beta=2, pos_label=1))

sampling_strategy = 0.4
Resampled dataset shape Counter({0: 4480444, 1: 741857})
Training started
Making Predictions
AUC Score:
0.5976179383187712
Accuracy Score
0.9993564571403579
Precision Score
0.0835881753312946
Recall Score
0.1957040572792363
f1 Score
0.11714285714285716
f1.5 Score
0.1385315139701105
f2 Score
0.1543093714715845


In [6]:
print("sampling_strategy = 0.6")

sm = SVMSMOTE(sampling_strategy = 0.6)

x_res, y_res = sm.fit_resample(x_train, y_train)

print('Resampled dataset shape %s' % Counter(y_res))

clf = BalancedRandomForestClassifier()

print("Training started")
clf.fit(x_res, y_res)

print("Making Predictions")
y_pred = clf.predict(x_test)

print("AUC Score:")
print(roc_auc_score(y_test, y_pred))

print("Accuracy Score")
print(accuracy_score(y_test, y_pred))

print("Precision Score")
print(precision_score(y_test, y_pred))

print("Recall Score")
print(recall_score(y_test, y_pred))

print("f1 Score")
print(f1_score(y_test, y_pred))

print("f1.5 Score")
print(fbeta_score(y_test, y_pred, beta=1.5, pos_label=1))

print("f2 Score")
print(fbeta_score(y_test, y_pred, beta=2, pos_label=1))

sampling_strategy = 0.6
Resampled dataset shape Counter({0: 4480444, 1: 1231067})
Training started
Making Predictions
AUC Score:
0.5750287527689506
Accuracy Score
0.9995142188608042
Precision Score
0.0984375
Recall Score
0.15035799522673032
f1 Score
0.11898016997167138
f1.5 Score
0.1293634496919918
f2 Score
0.13601036269430053


In [7]:
print("sampling_strategy = 0.8")

sm = SVMSMOTE(sampling_strategy = 0.8)

x_res, y_res = sm.fit_resample(x_train, y_train)

print('Resampled dataset shape %s' % Counter(y_res))

clf = BalancedRandomForestClassifier()

print("Training started")
clf.fit(x_res, y_res)

print("Making Predictions")
y_pred = clf.predict(x_test)

print("AUC Score:")
print(roc_auc_score(y_test, y_pred))

print("Accuracy Score")
print(accuracy_score(y_test, y_pred))

print("Precision Score")
print(precision_score(y_test, y_pred))

print("Recall Score")
print(recall_score(y_test, y_pred))

print("f1 Score")
print(f1_score(y_test, y_pred))

print("f1.5 Score")
print(fbeta_score(y_test, y_pred, beta=1.5, pos_label=1))

print("f2 Score")
print(fbeta_score(y_test, y_pred, beta=2, pos_label=1))

sampling_strategy = 0.8
Resampled dataset shape Counter({0: 4480444, 1: 2156366})
Training started
Making Predictions
AUC Score:
0.5380945003511166
Accuracy Score
0.9996152280151492
Precision Score
0.08333333333333333
Recall Score
0.07637231503579953
f1 Score
0.07970112079701121
f1.5 Score
0.07838703599020161
f2 Score
0.07766990291262137


In [8]:
print("sampling_strategy = 1.0")

sm = SVMSMOTE(sampling_strategy = 1.0)

x_res, y_res = sm.fit_resample(x_train, y_train)

print('Resampled dataset shape %s' % Counter(y_res))

clf = BalancedRandomForestClassifier()

print("Training started")
clf.fit(x_res, y_res)

print("Making Predictions")
y_pred = clf.predict(x_test)

print("AUC Score:")
print(roc_auc_score(y_test, y_pred))

print("Accuracy Score")
print(accuracy_score(y_test, y_pred))

print("Precision Score")
print(precision_score(y_test, y_pred))

print("Recall Score")
print(recall_score(y_test, y_pred))

print("f1 Score")
print(f1_score(y_test, y_pred))

print("f1.5 Score")
print(fbeta_score(y_test, y_pred, beta=1.5, pos_label=1))

print("f2 Score")
print(fbeta_score(y_test, y_pred, beta=2, pos_label=1))

sampling_strategy = 1.0
Resampled dataset shape Counter({0: 4480444, 1: 2044928})
Training started
Making Predictions
AUC Score:
0.5404754066230613
Accuracy Score
0.9996048147002683
Precision Score
0.08333333333333333
Recall Score
0.081145584725537
f1 Score
0.08222490931076179
f1.5 Score
0.0818064038497131
f2 Score
0.08157389635316699
