In [1]:
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.over_sampling import ADASYN 
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
from collections import Counter
from sklearn import metrics
import pandas as pd

In [2]:
train_csv = '../../dataset/train_N10.csv'
test_csv = '../../dataset/test_N10.csv'

train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

x_train = train.drop(columns = ['osm_id', 'class'])
y_train = train['class']

print(x_train.shape)
print(y_train.shape)

x_test = test.drop(columns = ['osm_id', 'class'])
y_test = test['class']

print(x_test.shape)
print(y_test.shape)

print('Original training dataset shape %s' % Counter(y_train))

(4481441, 22)
(4481441,)
(1920618, 22)
(1920618,)
Original training dataset shape Counter({0: 4480444, 1: 997})


In [3]:
print("sampling_strategy=auto")

ada = ADASYN()

x_res, y_res = ada.fit_resample(x_train, y_train)

print('Resampled dataset shape %s' % Counter(y_res))

clf = BalancedRandomForestClassifier()

print("Training started")
clf.fit(x_res, y_res)

print("Making Predictions")
y_pred = clf.predict(x_test)

print("AUC Score:")
print(roc_auc_score(y_test, y_pred))

print("Accuracy Score")
print(accuracy_score(y_test, y_pred))

print("Precision Score")
print(precision_score(y_test, y_pred))

print("Recall Score")
print(recall_score(y_test, y_pred))

print("f1 Score")
print(f1_score(y_test, y_pred))

print("f1.5 Score")
print(fbeta_score(y_test, y_pred, beta=1.5, pos_label=1))

print("f2 Score")
print(fbeta_score(y_test, y_pred, beta=2, pos_label=1))

sampling_strategy=auto
Resampled dataset shape Counter({0: 4480444, 1: 4480359})
Training started
Making Predictions
AUC Score:
0.5261779910661879
Accuracy Score
0.9996433439653278
Precision Score
0.07096774193548387
Recall Score
0.05250596658711217
f1 Score
0.06035665294924554
f1.5 Score
0.0570744362402714
f2 Score
0.0553877139979859


In [4]:
print("sampling_strategy=0.2")

ada = ADASYN(sampling_strategy=0.2)

x_res, y_res = ada.fit_resample(x_train, y_train)

print('Resampled dataset shape %s' % Counter(y_res))

clf = BalancedRandomForestClassifier()

print("Training started")
clf.fit(x_res, y_res)

print("Making Predictions")
y_pred = clf.predict(x_test)

print("AUC Score:")
print(roc_auc_score(y_test, y_pred))

print("Accuracy Score")
print(accuracy_score(y_test, y_pred))

print("Precision Score")
print(precision_score(y_test, y_pred))

print("Recall Score")
print(recall_score(y_test, y_pred))

print("f1 Score")
print(f1_score(y_test, y_pred))

print("f1.5 Score")
print(fbeta_score(y_test, y_pred, beta=1.5, pos_label=1))

print("f2 Score")
print(fbeta_score(y_test, y_pred, beta=2, pos_label=1))

sampling_strategy=0.2
Resampled dataset shape Counter({0: 4480444, 1: 896017})
Training started
Making Predictions
AUC Score:
0.5916753070570085
Accuracy Score
0.99940175506009
Precision Score
0.08710407239819004
Recall Score
0.18377088305489261
f1 Score
0.11818879508825786
f1.5 Score
0.13699192555084166
f2 Score
0.150390625


In [5]:
print("sampling_strategy=0.4")

ada = ADASYN(sampling_strategy=0.4)

x_res, y_res = ada.fit_resample(x_train, y_train)

print('Resampled dataset shape %s' % Counter(y_res))

clf = BalancedRandomForestClassifier()

print("Training started")
clf.fit(x_res, y_res)

print("Making Predictions")
y_pred = clf.predict(x_test)

print("AUC Score:")
print(roc_auc_score(y_test, y_pred))

print("Accuracy Score")
print(accuracy_score(y_test, y_pred))

print("Precision Score")
print(precision_score(y_test, y_pred))

print("Recall Score")
print(recall_score(y_test, y_pred))

print("f1 Score")
print(f1_score(y_test, y_pred))

print("f1.5 Score")
print(fbeta_score(y_test, y_pred, beta=1.5, pos_label=1))

print("f2 Score")
print(fbeta_score(y_test, y_pred, beta=2, pos_label=1))

sampling_strategy=0.4
Resampled dataset shape Counter({0: 4480444, 1: 1792034})
Training started
Making Predictions
AUC Score:
0.5512030251100877
Accuracy Score
0.9995850294019946
Precision Score
0.09267241379310345
Recall Score
0.1026252983293556
f1 Score
0.09739524348810873
f1.5 Score
0.09934245601563887
f2 Score
0.10046728971962617


In [6]:
print("sampling_strategy=0.6")

ada = ADASYN(sampling_strategy=0.6)

x_res, y_res = ada.fit_resample(x_train, y_train)

print('Resampled dataset shape %s' % Counter(y_res))

clf = BalancedRandomForestClassifier()

print("Training started")
clf.fit(x_res, y_res)

print("Making Predictions")
y_pred = clf.predict(x_test)

print("AUC Score:")
print(roc_auc_score(y_test, y_pred))

print("Accuracy Score")
print(accuracy_score(y_test, y_pred))

print("Precision Score")
print(precision_score(y_test, y_pred))

print("Recall Score")
print(recall_score(y_test, y_pred))

print("f1 Score")
print(f1_score(y_test, y_pred))

print("f1.5 Score")
print(fbeta_score(y_test, y_pred, beta=1.5, pos_label=1))

print("f2 Score")
print(fbeta_score(y_test, y_pred, beta=2, pos_label=1))

sampling_strategy=0.6
Resampled dataset shape Counter({0: 4480444, 1: 2688395})
Training started
Making Predictions
AUC Score:
0.5369043076048226
Accuracy Score
0.9996209553383338
Precision Score
0.08355795148247978
Recall Score
0.07398568019093078
f1 Score
0.07848101265822785
f1.5 Score
0.07668886774500475
f2 Score
0.07572056668295066


In [7]:
print("sampling_strategy=0.8")

ada = ADASYN(sampling_strategy=0.8)

x_res, y_res = ada.fit_resample(x_train, y_train)

print('Resampled dataset shape %s' % Counter(y_res))

clf = BalancedRandomForestClassifier()

print("Training started")
clf.fit(x_res, y_res)

print("Making Predictions")
y_pred = clf.predict(x_test)

print("AUC Score:")
print(roc_auc_score(y_test, y_pred))

print("Accuracy Score")
print(accuracy_score(y_test, y_pred))

print("Precision Score")
print(precision_score(y_test, y_pred))

print("Recall Score")
print(recall_score(y_test, y_pred))

print("f1 Score")
print(f1_score(y_test, y_pred))

print("f1.5 Score")
print(fbeta_score(y_test, y_pred, beta=1.5, pos_label=1))

print("f2 Score")
print(fbeta_score(y_test, y_pred, beta=2, pos_label=1))

sampling_strategy=0.8
Resampled dataset shape Counter({0: 4480444, 1: 3584345})
Training started
Making Predictions
AUC Score:
0.5285599388968463
Accuracy Score
0.9996350133134231
Precision Score
0.07272727272727272
Recall Score
0.057279236276849645
f1 Score
0.06408544726301735
f1.5 Score
0.061284619917501476
f2 Score
0.05982053838484547


In [8]:
print("sampling_strategy=1.0")

ada = ADASYN(sampling_strategy=1.0)

x_res, y_res = ada.fit_resample(x_train, y_train)

print('Resampled dataset shape %s' % Counter(y_res))

clf = BalancedRandomForestClassifier()

print("Training started")
clf.fit(x_res, y_res)

print("Making Predictions")
y_pred = clf.predict(x_test)

print("AUC Score:")
print(roc_auc_score(y_test, y_pred))

print("Accuracy Score")
print(accuracy_score(y_test, y_pred))

print("Precision Score")
print(precision_score(y_test, y_pred))

print("Recall Score")
print(recall_score(y_test, y_pred))

print("f1 Score")
print(f1_score(y_test, y_pred))

print("f1.5 Score")
print(fbeta_score(y_test, y_pred, beta=1.5, pos_label=1))

print("f2 Score")
print(fbeta_score(y_test, y_pred, beta=2, pos_label=1))

sampling_strategy=1.0
Resampled dataset shape Counter({0: 4480444, 1: 4480359})
Training started
Making Predictions
AUC Score:
0.5249836320850402
Accuracy Score
0.9996407406366076
Precision Score
0.0670926517571885
Recall Score
0.050119331742243436
f1 Score
0.05737704918032787
f1.5 Score
0.05434999004578938
f2 Score
0.05279034690799397
