In [1]:
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.combine import SMOTETomek
from collections import Counter
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
from sklearn import metrics
import pandas as pd

In [2]:
train_csv = '../../dataset/train_N10.csv'
test_csv = '../../dataset/test_N10.csv'

train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

x_train = train.drop(columns = ['osm_id', 'class'])
y_train = train['class']

print(x_train.shape)
print(y_train.shape)

x_test = test.drop(columns = ['osm_id', 'class'])
y_test = test['class']

print(x_test.shape)
print(y_test.shape)

print('Original training dataset shape %s' % Counter(y_train))

(4481441, 22)
(4481441,)
(1920618, 22)
(1920618,)
Original training dataset shape Counter({0: 4480444, 1: 997})


In [3]:
print("sampling strategy auto")

sme = SMOTETomek()

x_res, y_res = sme.fit_resample(x_train, y_train)

print('Resampled dataset shape %s' % Counter(y_res))

clf = BalancedRandomForestClassifier()

print("Training started")
clf.fit(x_res, y_res)

print("Making Predictions")
y_pred = clf.predict(x_test)

print("AUC Score:")
print(roc_auc_score(y_test, y_pred))

print("Accuracy Score")
print(accuracy_score(y_test, y_pred))

print("Precision Score")
print(precision_score(y_test, y_pred))

print("Recall Score")
print(recall_score(y_test, y_pred))

print("f1 Score")
print(f1_score(y_test, y_pred))

print("f1.5 Score")
print(fbeta_score(y_test, y_pred, beta=1.5, pos_label=1))

print("f2 Score")
print(fbeta_score(y_test, y_pred, beta=2, pos_label=1))

sampling strategy auto
Resampled dataset shape Counter({0: 4480443, 1: 4480443})
Training started
Making Predictions
AUC Score:
0.5249838924747185
Accuracy Score
0.9996412613023516
Precision Score
0.0673076923076923
Recall Score
0.050119331742243436
f1 Score
0.057455540355677154
f1.5 Score
0.05439330543933055
f2 Score
0.0528169014084507


In [4]:
print("sampling strategy 0.2")

sme = SMOTETomek(sampling_strategy = 0.2)

x_res, y_res = sme.fit_resample(x_train, y_train)

print('Resampled dataset shape %s' % Counter(y_res))

clf = BalancedRandomForestClassifier()

print("Training started")
clf.fit(x_res, y_res)

print("Making Predictions")
y_pred = clf.predict(x_test)

print("AUC Score:")
print(roc_auc_score(y_test, y_pred))

print("Accuracy Score")
print(accuracy_score(y_test, y_pred))

print("Precision Score")
print(precision_score(y_test, y_pred))

print("Recall Score")
print(recall_score(y_test, y_pred))

print("f1 Score")
print(f1_score(y_test, y_pred))

print("f1.5 Score")
print(fbeta_score(y_test, y_pred, beta=1.5, pos_label=1))

print("f2 Score")
print(fbeta_score(y_test, y_pred, beta=2, pos_label=1))

sampling strategy 0.2
Resampled dataset shape Counter({0: 4480443, 1: 896087})
Training started
Making Predictions
AUC Score:
0.5880961359587406
Accuracy Score
0.99940175506009
Precision Score
0.08428246013667426
Recall Score
0.1766109785202864
f1 Score
0.1141094834232845
f1.5 Score
0.13208842509954688
f2 Score
0.14487079091620989


In [5]:
print("sampling strategy 0.4")

sme = SMOTETomek(sampling_strategy = 0.4)

x_res, y_res = sme.fit_resample(x_train, y_train)

print('Resampled dataset shape %s' % Counter(y_res))

clf = BalancedRandomForestClassifier()

print("Training started")
clf.fit(x_res, y_res)

print("Making Predictions")
y_pred = clf.predict(x_test)

print("AUC Score:")
print(roc_auc_score(y_test, y_pred))

print("Accuracy Score")
print(accuracy_score(y_test, y_pred))

print("Precision Score")
print(precision_score(y_test, y_pred))

print("Recall Score")
print(recall_score(y_test, y_pred))

print("f1 Score")
print(f1_score(y_test, y_pred))

print("f1.5 Score")
print(fbeta_score(y_test, y_pred, beta=1.5, pos_label=1))

print("f2 Score")
print(fbeta_score(y_test, y_pred, beta=2, pos_label=1))

sampling strategy 0.4
Resampled dataset shape Counter({0: 4480443, 1: 1792176})
Training started
Making Predictions
AUC Score:
0.5500086661289398
Accuracy Score
0.9995824260732743
Precision Score
0.08993576017130621
Recall Score
0.10023866348448687
f1 Score
0.09480812641083522
f1.5 Score
0.0968256783117574
f2 Score
0.09799346710219319


In [6]:
print("sampling strategy 0.6")

sme = SMOTETomek(sampling_strategy = 0.6)

x_res, y_res = sme.fit_resample(x_train, y_train)

print('Resampled dataset shape %s' % Counter(y_res))

clf = BalancedRandomForestClassifier()

print("Training started")
clf.fit(x_res, y_res)

print("Making Predictions")
y_pred = clf.predict(x_test)

print("AUC Score:")
print(roc_auc_score(y_test, y_pred))

print("Accuracy Score")
print(accuracy_score(y_test, y_pred))

print("Precision Score")
print(precision_score(y_test, y_pred))

print("Recall Score")
print(recall_score(y_test, y_pred))

print("f1 Score")
print(f1_score(y_test, y_pred))

print("f1.5 Score")
print(fbeta_score(y_test, y_pred, beta=1.5, pos_label=1))

print("f2 Score")
print(fbeta_score(y_test, y_pred, beta=2, pos_label=1))

sampling strategy 0.6
Resampled dataset shape Counter({0: 4480443, 1: 2688265})
Training started
Making Predictions
AUC Score:
0.5392912028393696
Accuracy Score
0.999622517335566
Precision Score
0.08870967741935484
Recall Score
0.07875894988066826
f1 Score
0.08343868520859672
f1.5 Score
0.08157444381061038
f2 Score
0.08056640625


In [7]:
print("sampling strategy 0.8")

sme = SMOTETomek(sampling_strategy = 0.8)

x_res, y_res = sme.fit_resample(x_train, y_train)

print('Resampled dataset shape %s' % Counter(y_res))

clf = BalancedRandomForestClassifier()

print("Training started")
clf.fit(x_res, y_res)

print("Making Predictions")
y_pred = clf.predict(x_test)

print("AUC Score:")
print(roc_auc_score(y_test, y_pred))

print("Accuracy Score")
print(accuracy_score(y_test, y_pred))

print("Precision Score")
print(precision_score(y_test, y_pred))

print("Recall Score")
print(recall_score(y_test, y_pred))

print("f1 Score")
print(f1_score(y_test, y_pred))

print("f1.5 Score")
print(fbeta_score(y_test, y_pred, beta=1.5, pos_label=1))

print("f2 Score")
print(fbeta_score(y_test, y_pred, beta=2, pos_label=1))

sampling strategy 0.8
Resampled dataset shape Counter({0: 4480443, 1: 3584354})
Training started
Making Predictions
AUC Score:
0.5333311254691568
Accuracy Score
0.9996329306504469
Precision Score
0.08187134502923976
Recall Score
0.06682577565632458
f1 Score
0.0735873850197109
f1.5 Score
0.07083090095349291
f2 Score
0.06937561942517344


In [8]:
print("sampling strategy 1.0")

sme = SMOTETomek(sampling_strategy = 1.0)

x_res, y_res = sme.fit_resample(x_train, y_train)

print('Resampled dataset shape %s' % Counter(y_res))

clf = BalancedRandomForestClassifier()

print("Training started")
clf.fit(x_res, y_res)

print("Making Predictions")
y_pred = clf.predict(x_test)

print("AUC Score:")
print(roc_auc_score(y_test, y_pred))

print("Accuracy Score")
print(accuracy_score(y_test, y_pred))

print("Precision Score")
print(precision_score(y_test, y_pred))

print("Recall Score")
print(recall_score(y_test, y_pred))

print("f1 Score")
print(f1_score(y_test, y_pred))

print("f1.5 Score")
print(fbeta_score(y_test, y_pred, beta=1.5, pos_label=1))

print("f2 Score")
print(fbeta_score(y_test, y_pred, beta=2, pos_label=1))

sampling strategy 1.0
Resampled dataset shape Counter({0: 4480443, 1: 4480443})
Training started
Making Predictions
AUC Score:
0.5273694857608737
Accuracy Score
0.9996402199708635
Precision Score
0.07232704402515723
Recall Score
0.05489260143198091
f1 Score
0.06241519674355494
f1.5 Score
0.05929010509617292
f2 Score
0.057673019057171516
