In [1]:
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.under_sampling import NearMiss
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
from collections import Counter
from sklearn import metrics
import pandas as pd

In [2]:
train_csv = '../../dataset/train_N10.csv'
test_csv = '../../dataset/test_N10.csv'

train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

x_train = train.drop(columns = ['osm_id', 'class'])
y_train = train['class']

print(x_train.shape)
print(y_train.shape)

x_test = test.drop(columns = ['osm_id', 'class'])
y_test = test['class']

print(x_test.shape)
print(y_test.shape)

print('Original training dataset shape %s' % Counter(y_train))

(4481441, 22)
(4481441,)
(1920618, 22)
(1920618,)
Original training dataset shape Counter({0: 4480444, 1: 997})


In [3]:
print("sampling_strategy=auto")

nm = NearMiss()

x_res, y_res = nm.fit_resample(x_train, y_train)

print('Resampled dataset shape %s' % Counter(y_res))

clf = BalancedRandomForestClassifier()

print("Training started")
clf.fit(x_res, y_res)

print("Making Predictions")
y_pred = clf.predict(x_test)

print("AUC Score:")
print(roc_auc_score(y_test, y_pred))

print("Accuracy Score")
print(accuracy_score(y_test, y_pred))

print("Precision Score")
print(precision_score(y_test, y_pred))

print("Recall Score")
print(recall_score(y_test, y_pred))

print("f1 Score")
print(f1_score(y_test, y_pred))

print("f1.5 Score")
print(fbeta_score(y_test, y_pred, beta=1.5, pos_label=1))

print("f2 Score")
print(fbeta_score(y_test, y_pred, beta=2, pos_label=1))

sampling_strategy=auto
Resampled dataset shape Counter({0: 997, 1: 997})
Training started
Making Predictions
AUC Score:
0.49376639849831794
Accuracy Score
0.0068384238823128805
Precision Score
0.00021542156322675295
Recall Score
0.9809069212410502
f1 Score
0.00043074852775183854
f1.5 Score
0.0006997742988865298
f2 Score
0.0010761624518279837


In [4]:
print("sampling_strategy=0.2")

nm = NearMiss(sampling_strategy=0.2)

x_res, y_res = nm.fit_resample(x_train, y_train)

print('Resampled dataset shape %s' % Counter(y_res))

clf = BalancedRandomForestClassifier()

print("Training started")
clf.fit(x_res, y_res)

print("Making Predictions")
y_pred = clf.predict(x_test)

print("AUC Score:")
print(roc_auc_score(y_test, y_pred))

print("Accuracy Score")
print(accuracy_score(y_test, y_pred))

print("Precision Score")
print(precision_score(y_test, y_pred))

print("Recall Score")
print(recall_score(y_test, y_pred))

print("f1 Score")
print(f1_score(y_test, y_pred))

print("f1.5 Score")
print(fbeta_score(y_test, y_pred, beta=1.5, pos_label=1))

print("f2 Score")
print(fbeta_score(y_test, y_pred, beta=2, pos_label=1))

sampling_strategy=0.2
Resampled dataset shape Counter({0: 4985, 1: 997})
Training started
Making Predictions
AUC Score:
0.5090945942517362
Accuracy Score
0.042259314449828125
Precision Score
0.0002223003685620536
Recall Score
0.9761336515513126
f1 Score
0.00044449950876826905
f1.5 Score
0.0007221061869574612
f2 Score
0.0011104902502214193


In [5]:
print("sampling_strategy=0.4")

nm = NearMiss(sampling_strategy=0.4)

x_res, y_res = nm.fit_resample(x_train, y_train)

print('Resampled dataset shape %s' % Counter(y_res))

clf = BalancedRandomForestClassifier()

print("Training started")
clf.fit(x_res, y_res)

print("Making Predictions")
y_pred = clf.predict(x_test)

print("AUC Score:")
print(roc_auc_score(y_test, y_pred))

print("Accuracy Score")
print(accuracy_score(y_test, y_pred))

print("Precision Score")
print(precision_score(y_test, y_pred))

print("Recall Score")
print(recall_score(y_test, y_pred))

print("f1 Score")
print(f1_score(y_test, y_pred))

print("f1.5 Score")
print(fbeta_score(y_test, y_pred, beta=1.5, pos_label=1))

print("f2 Score")
print(fbeta_score(y_test, y_pred, beta=2, pos_label=1))


sampling_strategy=0.4
Resampled dataset shape Counter({0: 2492, 1: 997})
Training started
Making Predictions
AUC Score:
0.49571238751170554
Accuracy Score
0.015500739866022292
Precision Score
0.00021625954529447782
Recall Score
0.9761336515513126
f1 Score
0.00043242328847866823
f1.5 Score
0.0007024933426963879
f2 Score
0.001080340341546033


In [6]:
print("sampling_strategy=0.6")

nm = NearMiss(sampling_strategy=0.6)

x_res, y_res = nm.fit_resample(x_train, y_train)

print('Resampled dataset shape %s' % Counter(y_res))

clf = BalancedRandomForestClassifier()

print("Training started")
clf.fit(x_res, y_res)

print("Making Predictions")
y_pred = clf.predict(x_test)

print("AUC Score:")
print(roc_auc_score(y_test, y_pred))

print("Accuracy Score")
print(accuracy_score(y_test, y_pred))

print("Precision Score")
print(precision_score(y_test, y_pred))

print("Recall Score")
print(recall_score(y_test, y_pred))

print("f1 Score")
print(f1_score(y_test, y_pred))

print("f1.5 Score")
print(fbeta_score(y_test, y_pred, beta=1.5, pos_label=1))

print("f2 Score")
print(fbeta_score(y_test, y_pred, beta=2, pos_label=1))


sampling_strategy=0.6
Resampled dataset shape Counter({0: 1661, 1: 997})
Training started
Making Predictions
AUC Score:
0.49681993978296657
Accuracy Score
0.010558580623528468
Precision Score
0.00021675722769610217
Recall Score
0.9832935560859188
f1 Score
0.0004334189125288048
f1.5 Score
0.0007041117576278008
f2 Score
0.0010828313411339243


In [7]:
print("sampling_strategy=0.8")

nm = NearMiss(sampling_strategy=0.8)

x_res, y_res = nm.fit_resample(x_train, y_train)

print('Resampled dataset shape %s' % Counter(y_res))

clf = BalancedRandomForestClassifier()

print("Training started")
clf.fit(x_res, y_res)

print("Making Predictions")
y_pred = clf.predict(x_test)

print("AUC Score:")
print(roc_auc_score(y_test, y_pred))

print("Accuracy Score")
print(accuracy_score(y_test, y_pred))

print("Precision Score")
print(precision_score(y_test, y_pred))

print("Recall Score")
print(recall_score(y_test, y_pred))

print("f1 Score")
print(f1_score(y_test, y_pred))

print("f1.5 Score")
print(fbeta_score(y_test, y_pred, beta=1.5, pos_label=1))

print("f2 Score")
print(fbeta_score(y_test, y_pred, beta=2, pos_label=1))

sampling_strategy=0.8
Resampled dataset shape Counter({0: 1246, 1: 997})
Training started
Making Predictions
AUC Score:
0.49753231746946736
Accuracy Score
0.009597431660017764
Precision Score
0.00021707228875134555
Recall Score
0.9856801909307876
f1 Score
0.0004340489886816209
f1.5 Score
0.000705135537977439
f2 Score
0.0010844061865241657


In [8]:
print("sampling_strategy=1.0")

nm = NearMiss(sampling_strategy=1.0)

x_res, y_res = nm.fit_resample(x_train, y_train)

print('Resampled dataset shape %s' % Counter(y_res))

clf = BalancedRandomForestClassifier()

print("Training started")
clf.fit(x_res, y_res)

print("Making Predictions")
y_pred = clf.predict(x_test)

print("AUC Score:")
print(roc_auc_score(y_test, y_pred))

print("Accuracy Score")
print(accuracy_score(y_test, y_pred))

print("Precision Score")
print(precision_score(y_test, y_pred))

print("Recall Score")
print(recall_score(y_test, y_pred))

print("f1 Score")
print(f1_score(y_test, y_pred))

print("f1.5 Score")
print(fbeta_score(y_test, y_pred, beta=1.5, pos_label=1))

print("f2 Score")
print(fbeta_score(y_test, y_pred, beta=2, pos_label=1))

sampling_strategy=1.0
Resampled dataset shape Counter({0: 997, 1: 997})
Training started
Making Predictions
AUC Score:
0.4958934248338603
Accuracy Score
0.006320361466986147
Precision Score
0.00021635655560363478
Recall Score
0.9856801909307876
f1 Score
0.0004326181516310909
f1.5 Score
0.000702811704976166
f2 Score
0.0010808338070159458
