In [1]:
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
from collections import Counter
from sklearn import metrics
import pandas as pd

In [2]:
train_csv = '../../dataset/train_N10.csv'
test_csv = '../../dataset/test_N10.csv'

train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

x_train = train.drop(columns = ['osm_id', 'class'])
y_train = train['class']

print(x_train.shape)
print(y_train.shape)

x_test = test.drop(columns = ['osm_id', 'class'])
y_test = test['class']

print(x_test.shape)
print(y_test.shape)

print('Original training dataset shape %s' % Counter(y_train))

(4481441, 22)
(4481441,)
(1920618, 22)
(1920618,)
Original training dataset shape Counter({0: 4480444, 1: 997})


In [3]:
print("Sampling Strategy = auto")

rus = RandomUnderSampler()

x_res, y_res = rus.fit_resample(x_train, y_train)

print('Resampled dataset shape %s' % Counter(y_res))

clf = BalancedRandomForestClassifier()

print("Training started")
clf.fit(x_res, y_res)

print("Making Predictions")
y_pred = clf.predict(x_test)

print("AUC Score:")
print(roc_auc_score(y_test, y_pred))

print("Accuracy Score")
print(accuracy_score(y_test, y_pred))

print("Precision Score")
print(precision_score(y_test, y_pred))

print("Recall Score")
print(recall_score(y_test, y_pred))

print("f1 Score")
print(f1_score(y_test, y_pred))

print("f1.5 Score")
print(fbeta_score(y_test, y_pred, beta=1.5, pos_label=1))

print("f2 Score")
print(fbeta_score(y_test, y_pred, beta=2, pos_label=1))

Sampling Strategy = auto
Resampled dataset shape Counter({0: 997, 1: 997})
Training started
Making Predictions
AUC Score:
0.7334231689324174
Accuracy Score
0.6959796273907669
Precision Score
0.0005529553150475747
Recall Score
0.7708830548926014
f1 Score
0.0011051179277157075
f1.5 Score
0.001794209050351086
f2 Score
0.0027568665608303034


In [4]:
print("Sampling Strategy = 0.2")

rus = RandomUnderSampler(sampling_strategy=0.2)

x_res, y_res = rus.fit_resample(x_train, y_train)

print('Resampled dataset shape %s' % Counter(y_res))

clf = BalancedRandomForestClassifier()

print("Training started")
clf.fit(x_res, y_res)

print("Making Predictions")
y_pred = clf.predict(x_test)

print("AUC Score:")
print(roc_auc_score(y_test, y_pred))

print("Accuracy Score")
print(accuracy_score(y_test, y_pred))

print("Precision Score")
print(precision_score(y_test, y_pred))

print("Recall Score")
print(recall_score(y_test, y_pred))

print("f1 Score")
print(f1_score(y_test, y_pred))

print("f1.5 Score")
print(fbeta_score(y_test, y_pred, beta=1.5, pos_label=1))

print("f2 Score")
print(fbeta_score(y_test, y_pred, beta=2, pos_label=1))

Sampling Strategy = 0.2
Resampled dataset shape Counter({0: 4985, 1: 997})
Training started
Making Predictions
AUC Score:
0.7383424992095184
Accuracy Score
0.7082017350665255
Precision Score
0.000574325167927685
Recall Score
0.7684964200477327
f1 Score
0.0011477925489727793
f1.5 Score
0.0018634234373796686
f2 Score
0.0028630671451486127


In [5]:
print("Sampling Strategy = 0.4")

rus = RandomUnderSampler(sampling_strategy=0.4)

x_res, y_res = rus.fit_resample(x_train, y_train)

print('Resampled dataset shape %s' % Counter(y_res))

clf = BalancedRandomForestClassifier()

print("Training started")
clf.fit(x_res, y_res)

print("Making Predictions")
y_pred = clf.predict(x_test)

print("AUC Score:")
print(roc_auc_score(y_test, y_pred))

print("Accuracy Score")
print(accuracy_score(y_test, y_pred))

print("Precision Score")
print(precision_score(y_test, y_pred))

print("Recall Score")
print(recall_score(y_test, y_pred))

print("f1 Score")
print(f1_score(y_test, y_pred))

print("f1.5 Score")
print(fbeta_score(y_test, y_pred, beta=1.5, pos_label=1))

print("f2 Score")
print(fbeta_score(y_test, y_pred, beta=2, pos_label=1))

Sampling Strategy = 0.4
Resampled dataset shape Counter({0: 2492, 1: 997})
Training started
Making Predictions
AUC Score:
0.7350702970907396
Accuracy Score
0.6945019780091616
Precision Score
0.0005536853295535081
Recall Score
0.7756563245823389
f1 Score
0.0011065807506022353
f1.5 Score
0.0017965917909575086
f2 Score
0.002760544430325557


In [6]:
print("Sampling Strategy = 0.6")

rus = RandomUnderSampler(sampling_strategy=0.6)

x_res, y_res = rus.fit_resample(x_train, y_train)

print('Resampled dataset shape %s' % Counter(y_res))

clf = BalancedRandomForestClassifier()

print("Training started")
clf.fit(x_res, y_res)

print("Making Predictions")
y_pred = clf.predict(x_test)

print("AUC Score:")
print(roc_auc_score(y_test, y_pred))

print("Accuracy Score")
print(accuracy_score(y_test, y_pred))

print("Precision Score")
print(precision_score(y_test, y_pred))

print("Recall Score")
print(recall_score(y_test, y_pred))

print("f1 Score")
print(f1_score(y_test, y_pred))

print("f1.5 Score")
print(fbeta_score(y_test, y_pred, beta=1.5, pos_label=1))

print("f2 Score")
print(fbeta_score(y_test, y_pred, beta=2, pos_label=1))

Sampling Strategy = 0.6
Resampled dataset shape Counter({0: 1661, 1: 997})
Training started
Making Predictions
AUC Score:
0.7368953198728789
Accuracy Score
0.6957656337699636
Precision Score
0.0005576929985578675
Recall Score
0.7780429594272077
f1 Score
0.0011145870728413423
f1.5 Score
0.0018095837914580643
f2 Score
0.002780492880744149


In [7]:
print("Sampling Strategy = 0.8")

rus = RandomUnderSampler(sampling_strategy=0.8)

x_res, y_res = rus.fit_resample(x_train, y_train)

print('Resampled dataset shape %s' % Counter(y_res))

clf = BalancedRandomForestClassifier()

print("Training started")
clf.fit(x_res, y_res)

print("Making Predictions")
y_pred = clf.predict(x_test)

print("AUC Score:")
print(roc_auc_score(y_test, y_pred))

print("Accuracy Score")
print(accuracy_score(y_test, y_pred))

print("Precision Score")
print(precision_score(y_test, y_pred))

print("Recall Score")
print(recall_score(y_test, y_pred))

print("f1 Score")
print(f1_score(y_test, y_pred))

print("f1.5 Score")
print(fbeta_score(y_test, y_pred, beta=1.5, pos_label=1))

print("f2 Score")
print(fbeta_score(y_test, y_pred, beta=2, pos_label=1))

Sampling Strategy = 0.8
Resampled dataset shape Counter({0: 1246, 1: 997})
Training started
Making Predictions
AUC Score:
0.733708410599413
Accuracy Score
0.6893932057285728
Precision Score
0.0005462558018733557
Recall Score
0.7780429594272077
f1 Score
0.001091745100961305
f1.5 Score
0.0017725312859300413
f2 Score
0.0027236300809068514


In [8]:
print("Sampling Strategy = 1.0")

rus = RandomUnderSampler(sampling_strategy=1.0)

x_res, y_res = rus.fit_resample(x_train, y_train)

print('Resampled dataset shape %s' % Counter(y_res))

clf = BalancedRandomForestClassifier()

print("Training started")
clf.fit(x_res, y_res)

print("Making Predictions")
y_pred = clf.predict(x_test)

print("AUC Score:")
print(roc_auc_score(y_test, y_pred))

print("Accuracy Score")
print(accuracy_score(y_test, y_pred))

print("Precision Score")
print(precision_score(y_test, y_pred))

print("Recall Score")
print(recall_score(y_test, y_pred))

print("f1 Score")
print(f1_score(y_test, y_pred))

print("f1.5 Score")
print(fbeta_score(y_test, y_pred, beta=1.5, pos_label=1))

print("f2 Score")
print(fbeta_score(y_test, y_pred, beta=2, pos_label=1))

Sampling Strategy = 1.0
Resampled dataset shape Counter({0: 997, 1: 997})
Training started
Making Predictions
AUC Score:
0.7308872158326579
Accuracy Score
0.7028367952398655
Precision Score
0.000556962557403924
Recall Score
0.7589498806682577
f1 Score
0.0011131082497772909
f1.5 Score
0.0018071443840168703
f2 Score
0.002776662068002026
