# Testing with sklearn models

In [66]:
import sys
from pathlib import Path

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, ConfusionMatrixDisplay, precision_score, recall_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from tqdm.notebook import tqdm

# Make the repository root importable so the project-local `src` package resolves.
sys.path.append("..")
from src.models.data import CropDataset, BoundingBox
from src.datasets_labeled import labeled_datasets

In [7]:
# Load the *training* split built from every labeled dataset.
# `train.normalizing_dict` is passed to the validation dataset later on.
datasets = list(labeled_datasets)
train = CropDataset(
    # Which samples to load
    data_folder=Path("../data"),
    subset="training",
    datasets=datasets,
    # Bounding box covers the full longitude/latitude range
    target_bbox=BoundingBox(min_lon=-180, max_lon=180, min_lat=-90, max_lat=90),
    is_local_only=False,
    is_global_only=False,
    # How samples are processed (no cache, no added noise, no upsampling)
    cache=None,
    noise_factor=0,
    probability_threshold=0.5,
    remove_b1_b10=True,
    upsample=False,
)

100%|██████████| 61818/61818 [00:15<00:00, 4085.72it/s]


In [19]:
# Load the validation split restricted to a single labeled dataset,
# reusing the normalizing_dict taken from the training dataset.
dataset_name = "Uganda"
datasets = [labeled for labeled in labeled_datasets if labeled.dataset == dataset_name]
val = CropDataset(
    # Which samples to load
    data_folder=Path("../data"),
    subset="validation",
    datasets=datasets,
    # Bounding box covers the full longitude/latitude range
    target_bbox=BoundingBox(min_lon=-180, max_lon=180, min_lat=-90, max_lat=90),
    is_local_only=True,
    is_global_only=False,
    # How samples are processed (no cache, no added noise, no upsampling)
    cache=None,
    noise_factor=0,
    probability_threshold=0.5,
    remove_b1_b10=True,
    normalizing_dict=train.normalizing_dict,
    upsample=False,
)

100%|██████████| 458/458 [00:00<00:00, 3785.63it/s]


In [31]:
# Convert the training set into flat NumPy feature vectors and float labels
# for scikit-learn. Features and labels are collected in a single pass so each
# sample is loaded once instead of iterating the whole dataset twice.
# Both results stay plain Python lists (later cells call `y_train.count`).
X_train, y_train = [], []
for sample in tqdm(train):
    X_train.append(sample[0].numpy().flatten())  # element 0: feature tensor
    y_train.append(float(sample[1]))  # element 1: label

  0%|          | 0/61818 [00:00<?, ?it/s]

  0%|          | 0/61818 [00:00<?, ?it/s]

In [32]:
# Convert the validation set into flat NumPy feature vectors and float labels
# for scikit-learn. Features and labels are collected in a single pass so each
# sample is loaded once instead of iterating the dataset twice.
X_val, y_val = [], []
for sample in tqdm(val):
    X_val.append(sample[0].numpy().flatten())  # element 0: feature tensor
    y_val.append(float(sample[1]))  # element 1: label

  0%|          | 0/458 [00:00<?, ?it/s]

  0%|          | 0/458 [00:00<?, ?it/s]

In [48]:
# Fraction of training labels equal to 1.0.
sum(label == 1.0 for label in y_train) / len(y_train)

0.3102009123556246

In [49]:
# Fraction of training labels equal to 0.0.
sum(label == 0.0 for label in y_train) / len(y_train)

0.6897990876443755

In [54]:
def test_classifier(clf):
    """Fit ``clf`` on the training data and report its validation metrics.

    Reads the notebook-level ``X_train``/``y_train`` for fitting and
    ``X_val``/``y_val`` for evaluation, then prints accuracy, precision,
    recall and F1 of the validation predictions.

    Args:
        clf: Estimator object exposing ``fit(X, y)`` and ``predict(X)``.

    Returns:
        The value returned by ``clf.predict(X_val)``.
    """
    print("Fitting...")
    clf.fit(X_train, y_train)

    print("Predicting...")
    y_val_pred = clf.predict(X_val)

    # Each metric is computed and printed in turn, in this fixed order.
    metric_table = (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1", f1_score),
    )
    for label, metric in metric_table:
        print(f"{label}: {metric(y_val, y_val_pred)}")

    return y_val_pred

In [55]:
# Random forest with default hyperparameters. The seed is fixed so the metrics
# are reproducible across kernel restarts; unseeded fits vary from run to run,
# so previously recorded numbers may differ slightly after re-running.
y_val_rf = test_classifier(RandomForestClassifier(random_state=42))

Fitting...
Predicting...
Accuracy: 0.8187772925764192
Precision: 0.3333333333333333
Recall: 0.38333333333333336
F1: 0.3565891472868217


In [56]:
# Support vector classifier constructed with no arguments (library defaults).
# The result name `y_val_scv` is kept as-is because the ensemble cell reads it.
y_val_scv = test_classifier(clf=SVC())

Fitting...
Predicting...
Accuracy: 0.7947598253275109
Precision: 0.3425925925925926
Recall: 0.6166666666666667
F1: 0.4404761904761905


In [59]:
# k-nearest-neighbours classifier constructed with no arguments (library defaults).
y_val_knn = test_classifier(clf=KNeighborsClassifier())

Fitting...
Predicting...
Accuracy: 0.759825327510917
Precision: 0.2767857142857143
Recall: 0.5166666666666667
F1: 0.3604651162790698


In [63]:
# Linear SVM with default hyperparameters. The seed is fixed so repeated runs
# give the same metrics; previously recorded numbers came from an unseeded fit
# and may differ slightly after re-running.
y_val_linear_svc = test_classifier(LinearSVC(random_state=42))

Fitting...
Predicting...
Accuracy: 0.7270742358078602
Precision: 0.24806201550387597
Recall: 0.5333333333333333
F1: 0.3386243386243386




In [64]:
# Linear model trained with stochastic gradient descent (default settings).
# The seed is fixed so the metrics are reproducible; unseeded fits vary from
# run to run, so previously recorded numbers may differ after re-running.
y_val_sgd = test_classifier(SGDClassifier(random_state=42))

Fitting...
Predicting...
Accuracy: 0.648471615720524
Precision: 0.22099447513812154
Recall: 0.6666666666666666
F1: 0.33195020746887965


In [69]:
# Stack the validation predictions of the five classifiers into one array,
# one row per classifier.
y_val_ensemble = np.stack(
    [y_val_sgd, y_val_linear_svc, y_val_knn, y_val_scv, y_val_rf]
)

In [76]:
# Average the classifiers' predictions per sample and predict 1 when that
# average exceeds 0.5 (a majority vote when the predictions are 0/1).
y_val_ensemble_pred = np.greater(np.mean(y_val_ensemble, axis=0), 0.5).astype(int)

In [77]:
# F1 of the averaged ensemble on the validation set. In the recorded run it
# stays below the F1 of the standalone SVC, so SVC is still better.
f1_score(y_true=y_val, y_pred=y_val_ensemble_pred)

0.43037974683544306