In [1]:
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
from collections import Counter
from sklearn import metrics
import pandas as pd

In [2]:
train_csv = '../../dataset/train_N10.csv'
test_csv = '../../dataset/test_N10.csv'

train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

x_train = train.drop(columns = ['osm_id', 'class'])
y_train = train['class']

print(x_train.shape)
print(y_train.shape)

x_test = test.drop(columns = ['osm_id', 'class'])
y_test = test['class']

print(x_test.shape)
print(y_test.shape)

print('Original training dataset shape %s' % Counter(y_train))

(4481441, 22)
(4481441,)
(1920618, 22)
(1920618,)
Original training dataset shape Counter({0: 4480444, 1: 997})


In [3]:
print("sampling strategy = auto")

ros = RandomOverSampler()

x_res, y_res = ros.fit_resample(x_train, y_train)

print('Resampled dataset shape %s' % Counter(y_res))

clf = RandomForestClassifier()

print("Training started")
clf.fit(x_res, y_res)

print("Making Predictions")
y_pred = clf.predict(x_test)

print("AUC Score:")
print(roc_auc_score(y_test, y_pred))

print("Accuracy Score")
print(accuracy_score(y_test, y_pred))

print("Precision Score")
print(precision_score(y_test, y_pred))

print("Recall Score")
print(recall_score(y_test, y_pred))

print("f1 Score")
print(f1_score(y_test, y_pred))

print("f1.5 Score")
print(fbeta_score(y_test, y_pred, beta=1.5, pos_label=1))

print("f2 Score")
print(fbeta_score(y_test, y_pred, beta=2, pos_label=1))

sampling strategy = auto
Resampled dataset shape Counter({0: 4480444, 1: 4480444})
Training started
Making Predictions
AUC Score:
0.6035222922978146
Accuracy Score
0.999234621356251
Precision Score
0.07102040816326531
Recall Score
0.20763723150357996
f1 Score
0.10583941605839417
f1.5 Score
0.13043478260869568
f2 Score
0.1499482936918304


In [4]:
print("sampling strategy = 0.2")

ros = RandomOverSampler(sampling_strategy=0.2)

x_res, y_res = ros.fit_resample(x_train, y_train)

print('Resampled dataset shape %s' % Counter(y_res))

clf = RandomForestClassifier()

print("Training started")
clf.fit(x_res, y_res)

print("Making Predictions")
y_pred = clf.predict(x_test)

print("AUC Score:")
print(roc_auc_score(y_test, y_pred))

print("Accuracy Score")
print(accuracy_score(y_test, y_pred))

print("Precision Score")
print(precision_score(y_test, y_pred))

print("Recall Score")
print(recall_score(y_test, y_pred))

print("f1 Score")
print(f1_score(y_test, y_pred))

print("f1.5 Score")
print(fbeta_score(y_test, y_pred, beta=1.5, pos_label=1))

print("f2 Score")
print(fbeta_score(y_test, y_pred, beta=2, pos_label=1))

sampling strategy = 0.2
Resampled dataset shape Counter({0: 4480444, 1: 896088})
Training started
Making Predictions
AUC Score:
0.6047075376402198
Accuracy Score
0.9992190013839295
Precision Score
0.07000795544948289
Recall Score
0.2100238663484487
f1 Score
0.10501193317422435
f1.5 Score
0.1300147744061825
f2 Score
0.15001704739174906


In [5]:
print("sampling strategy = 0.4")

ros = RandomOverSampler(sampling_strategy=0.4)

x_res, y_res = ros.fit_resample(x_train, y_train)

print('Resampled dataset shape %s' % Counter(y_res))

clf = RandomForestClassifier()

print("Training started")
clf.fit(x_res, y_res)

print("Making Predictions")
y_pred = clf.predict(x_test)

print("AUC Score:")
print(roc_auc_score(y_test, y_pred))

print("Accuracy Score")
print(accuracy_score(y_test, y_pred))

print("Precision Score")
print(precision_score(y_test, y_pred))

print("Recall Score")
print(recall_score(y_test, y_pred))

print("f1 Score")
print(f1_score(y_test, y_pred))

print("f1.5 Score")
print(fbeta_score(y_test, y_pred, beta=1.5, pos_label=1))

print("f2 Score")
print(fbeta_score(y_test, y_pred, beta=2, pos_label=1))

sampling strategy = 0.4
Resampled dataset shape Counter({0: 4480444, 1: 1792177})
Training started
Making Predictions
AUC Score:
0.6035178656732825
Accuracy Score
0.9992257700386021
Precision Score
0.07004830917874397
Recall Score
0.20763723150357996
f1 Score
0.10475617098133656
f1.5 Score
0.12941984208719534
f2 Score
0.1490747087045922


In [6]:
print("sampling strategy = 0.6")

ros = RandomOverSampler(sampling_strategy=0.6)

x_res, y_res = ros.fit_resample(x_train, y_train)

print('Resampled dataset shape %s' % Counter(y_res))

clf = RandomForestClassifier()

print("Training started")
clf.fit(x_res, y_res)

print("Making Predictions")
y_pred = clf.predict(x_test)

print("AUC Score:")
print(roc_auc_score(y_test, y_pred))

print("Accuracy Score")
print(accuracy_score(y_test, y_pred))

print("Precision Score")
print(precision_score(y_test, y_pred))

print("Recall Score")
print(recall_score(y_test, y_pred))

print("f1 Score")
print(f1_score(y_test, y_pred))

print("f1.5 Score")
print(fbeta_score(y_test, y_pred, beta=1.5, pos_label=1))

print("f2 Score")
print(fbeta_score(y_test, y_pred, beta=2, pos_label=1))

sampling strategy = 0.6
Resampled dataset shape Counter({0: 4480444, 1: 2688266})
Training started
Making Predictions
AUC Score:
0.6035209903494227
Accuracy Score
0.9992320180275307
Precision Score
0.07073170731707316
Recall Score
0.20763723150357996
f1 Score
0.1055184960582171
f1.5 Score
0.13013462202278217
f2 Score
0.14969029593943564


In [7]:
print("sampling strategy = 0.8")

ros = RandomOverSampler(sampling_strategy=0.8)

x_res, y_res = ros.fit_resample(x_train, y_train)

print('Resampled dataset shape %s' % Counter(y_res))

clf = RandomForestClassifier()

print("Training started")
clf.fit(x_res, y_res)

print("Making Predictions")
y_pred = clf.predict(x_test)

print("AUC Score:")
print(roc_auc_score(y_test, y_pred))

print("Accuracy Score")
print(accuracy_score(y_test, y_pred))

print("Precision Score")
print(precision_score(y_test, y_pred))

print("Recall Score")
print(recall_score(y_test, y_pred))

print("f1 Score")
print(f1_score(y_test, y_pred))

print("f1.5 Score")
print(fbeta_score(y_test, y_pred, beta=1.5, pos_label=1))

print("f2 Score")
print(fbeta_score(y_test, y_pred, beta=2, pos_label=1))

sampling strategy = 0.8
Resampled dataset shape Counter({0: 4480444, 1: 3584355})
Training started
Making Predictions
AUC Score:
0.6047158701099273
Accuracy Score
0.999235662687739
Precision Score
0.07183673469387755
Recall Score
0.2100238663484487
f1 Score
0.1070559610705596
f1.5 Score
0.13193403298350825
f2 Score
0.15167183729748362


In [8]:
print("sampling strategy = 1.0")

ros = RandomOverSampler(sampling_strategy=1.0)

x_res, y_res = ros.fit_resample(x_train, y_train)

print('Resampled dataset shape %s' % Counter(y_res))

clf = RandomForestClassifier()

print("Training started")
clf.fit(x_res, y_res)

print("Making Predictions")
y_pred = clf.predict(x_test)

print("AUC Score:")
print(roc_auc_score(y_test, y_pred))

print("Accuracy Score")
print(accuracy_score(y_test, y_pred))

print("Precision Score")
print(precision_score(y_test, y_pred))

print("Recall Score")
print(recall_score(y_test, y_pred))

print("f1 Score")
print(f1_score(y_test, y_pred))

print("f1.5 Score")
print(fbeta_score(y_test, y_pred, beta=1.5, pos_label=1))

print("f2 Score")
print(fbeta_score(y_test, y_pred, beta=2, pos_label=1))

sampling strategy = 1.0
Resampled dataset shape Counter({0: 4480444, 1: 4480444})
Training started
Making Predictions
AUC Score:
0.6035233338565281
Accuracy Score
0.9992367040192272
Precision Score
0.07125307125307126
Recall Score
0.20763723150357996
f1 Score
0.10609756097560975
f1.5 Score
0.13067590987868286
f2 Score
0.15015533310321022
