In [1]:
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.under_sampling import InstanceHardnessThreshold
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
from collections import Counter
from sklearn import metrics
import pandas as pd

In [2]:
train_csv = '../../dataset/train_N10.csv'
test_csv = '../../dataset/test_N10.csv'

train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

x_train = train.drop(columns = ['osm_id', 'class'])
y_train = train['class']

print(x_train.shape)
print(y_train.shape)

x_test = test.drop(columns = ['osm_id', 'class'])
y_test = test['class']

print(x_test.shape)
print(y_test.shape)

print('Original training dataset shape %s' % Counter(y_train))

(4481441, 22)
(4481441,)
(1920618, 22)
(1920618,)
Original training dataset shape Counter({0: 4480444, 1: 997})


In [3]:
print("sampling_strategy=auto")

iht = InstanceHardnessThreshold()

x_res, y_res = iht.fit_resample(x_train, y_train)

print('Resampled dataset shape %s' % Counter(y_res))

clf = BalancedRandomForestClassifier()

print("Training started")
clf.fit(x_res, y_res)

print("Making Predictions")
y_pred = clf.predict(x_test)

print("AUC Score:")
print(roc_auc_score(y_test, y_pred))

print("Accuracy Score")
print(accuracy_score(y_test, y_pred))

print("Precision Score")
print(precision_score(y_test, y_pred))

print("Recall Score")
print(recall_score(y_test, y_pred))

print("f1 Score")
print(f1_score(y_test, y_pred))

print("f1.5 Score")
print(fbeta_score(y_test, y_pred, beta=1.5, pos_label=1))

print("f2 Score")
print(fbeta_score(y_test, y_pred, beta=2, pos_label=1))

sampling_strategy=auto
Resampled dataset shape Counter({0: 4456326, 1: 997})
Training started
Making Predictions
AUC Score:
0.738975052233952
Accuracy Score
0.6999241910676668
Precision Score
0.0005654185831928459
Recall Score
0.7780429594272077
f1 Score
0.0011300159623420449
f1.5 Score
0.001834610591131619
f2 Score
0.0028188987271721084


In [4]:
print("sampling_strategy=0.2")

iht = InstanceHardnessThreshold(sampling_strategy=0.2)

x_res, y_res = iht.fit_resample(x_train, y_train)

print('Resampled dataset shape %s' % Counter(y_res))

clf = BalancedRandomForestClassifier()

print("Training started")
clf.fit(x_res, y_res)

print("Making Predictions")
y_pred = clf.predict(x_test)

print("AUC Score:")
print(roc_auc_score(y_test, y_pred))

print("Accuracy Score")
print(accuracy_score(y_test, y_pred))

print("Precision Score")
print(precision_score(y_test, y_pred))

print("Recall Score")
print(recall_score(y_test, y_pred))

print("f1 Score")
print(f1_score(y_test, y_pred))

print("f1.5 Score")
print(fbeta_score(y_test, y_pred, beta=1.5, pos_label=1))

print("f2 Score")
print(fbeta_score(y_test, y_pred, beta=2, pos_label=1))

sampling_strategy=0.2
Resampled dataset shape Counter({0: 4456091, 1: 997})
Training started
Making Predictions
AUC Score:
0.7339200893857236
Accuracy Score
0.7017444385088549
Precision Score
0.000560152724506071
Recall Score
0.766109785202864
f1 Score
0.0011194869192661609
f1.5 Score
0.0018175063360108853
f2 Score
0.0027925962271242


In [5]:
print("sampling_strategy=0.4")

iht = InstanceHardnessThreshold(sampling_strategy=0.4)

x_res, y_res = iht.fit_resample(x_train, y_train)

print('Resampled dataset shape %s' % Counter(y_res))

clf = BalancedRandomForestClassifier()

print("Training started")
clf.fit(x_res, y_res)

print("Making Predictions")
y_pred = clf.predict(x_test)

print("AUC Score:")
print(roc_auc_score(y_test, y_pred))

print("Accuracy Score")
print(accuracy_score(y_test, y_pred))

print("Precision Score")
print(precision_score(y_test, y_pred))

print("Recall Score")
print(recall_score(y_test, y_pred))

print("f1 Score")
print(f1_score(y_test, y_pred))

print("f1.5 Score")
print(fbeta_score(y_test, y_pred, beta=1.5, pos_label=1))

print("f2 Score")
print(fbeta_score(y_test, y_pred, beta=2, pos_label=1))


sampling_strategy=0.4
Resampled dataset shape Counter({0: 4456398, 1: 997})
Training started
Making Predictions
AUC Score:
0.7384622964837619
Accuracy Score
0.6965133097784151
Precision Score
0.0005607793976166018
Recall Score
0.7804295942720764
f1 Score
0.001120753475363988
f1.5 Score
0.0018195912318902305
f2 Score
0.002795861099568566


In [6]:
print("sampling_strategy=0.6")

iht = InstanceHardnessThreshold(sampling_strategy=0.6)

x_res, y_res = iht.fit_resample(x_train, y_train)

print('Resampled dataset shape %s' % Counter(y_res))

clf = BalancedRandomForestClassifier()

print("Training started")
clf.fit(x_res, y_res)

print("Making Predictions")
y_pred = clf.predict(x_test)

print("AUC Score:")
print(roc_auc_score(y_test, y_pred))

print("Accuracy Score")
print(accuracy_score(y_test, y_pred))

print("Precision Score")
print(precision_score(y_test, y_pred))

print("Recall Score")
print(recall_score(y_test, y_pred))

print("f1 Score")
print(f1_score(y_test, y_pred))

print("f1.5 Score")
print(fbeta_score(y_test, y_pred, beta=1.5, pos_label=1))

print("f2 Score")
print(fbeta_score(y_test, y_pred, beta=2, pos_label=1))

sampling_strategy=0.6
Resampled dataset shape Counter({0: 4456398, 1: 997})
Training started
Making Predictions
AUC Score:
0.7502286578712685
Accuracy Score
0.7009561505723678
Precision Score
0.0005830142707970762
Recall Score
0.7995226730310262
f1 Score
0.0011651788897410345
f1.5 Score
0.0018916926674864727
f2 Score
0.002906593368455393


In [7]:
print("sampling_strategy=0.8")

iht = InstanceHardnessThreshold(sampling_strategy=0.8)

x_res, y_res = iht.fit_resample(x_train, y_train)

print('Resampled dataset shape %s' % Counter(y_res))

clf = BalancedRandomForestClassifier()

print("Training started")
clf.fit(x_res, y_res)

print("Making Predictions")
y_pred = clf.predict(x_test)

print("AUC Score:")
print(roc_auc_score(y_test, y_pred))

print("Accuracy Score")
print(accuracy_score(y_test, y_pred))

print("Precision Score")
print(precision_score(y_test, y_pred))

print("Recall Score")
print(recall_score(y_test, y_pred))

print("f1 Score")
print(f1_score(y_test, y_pred))

print("f1.5 Score")
print(fbeta_score(y_test, y_pred, beta=1.5, pos_label=1))

print("f2 Score")
print(fbeta_score(y_test, y_pred, beta=2, pos_label=1))

sampling_strategy=0.8
Resampled dataset shape Counter({0: 4456399, 1: 997})
Training started
Making Predictions
AUC Score:
0.7314163456812858
Accuracy Score
0.691966856501397
Precision Score
0.0005457546874920798
Recall Score
0.7708830548926014
f1 Score
0.0010907371762401649
f1.5 Score
0.001770881877244442
f2 Score
0.0027210678042920424


In [8]:
print("sampling_strategy=1.0")

iht = InstanceHardnessThreshold(sampling_strategy=1.0)

x_res, y_res = iht.fit_resample(x_train, y_train)

print('Resampled dataset shape %s' % Counter(y_res))

clf = BalancedRandomForestClassifier()

print("Training started")
clf.fit(x_res, y_res)

print("Making Predictions")
y_pred = clf.predict(x_test)

print("AUC Score:")
print(roc_auc_score(y_test, y_pred))

print("Accuracy Score")
print(accuracy_score(y_test, y_pred))

print("Precision Score")
print(precision_score(y_test, y_pred))

print("Recall Score")
print(recall_score(y_test, y_pred))

print("f1 Score")
print(f1_score(y_test, y_pred))

print("f1.5 Score")
print(fbeta_score(y_test, y_pred, beta=1.5, pos_label=1))

print("f2 Score")
print(fbeta_score(y_test, y_pred, beta=2, pos_label=1))

sampling_strategy=1.0
Resampled dataset shape Counter({0: 4456449, 1: 997})
Training started
Making Predictions
AUC Score:
0.7421547855656134
Accuracy Score
0.6991254898162987
Precision Score
0.0005691017533177421
Recall Score
0.7852028639618138
f1 Score
0.0011373791534649443
f1.5 Score
0.0018465693832147403
f2 Score
0.00283728310738556
