# Testing with sklearn models

In [29]:
from pathlib import Path
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, ConfusionMatrixDisplay, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from tqdm.notebook import tqdm

import numpy as np
import pandas as pd
import sys 

sys.path.append("..")
from src.models.data import CropDataset, BoundingBox
from src.datasets_labeled import labeled_datasets

In [6]:
# Load the *training* split (tensors) from every labeled dataset except Ethiopia.
# The bounding box spans all longitudes/latitudes, so no sample is filtered out by location.
# `train.normalizing_dict` is reused later when the validation splits are built.
datasets = [d for d in labeled_datasets if d.dataset != "Ethiopia"]
train = CropDataset(
    data_folder=Path("../data"),
    subset="training",
    datasets=datasets,
    cache=None,
    noise_factor=0,
    probability_threshold=0.5,
    remove_b1_b10=True,
    upsample=False,
    target_bbox=BoundingBox(min_lon=-180, max_lon=180, min_lat=-90, max_lat=90),
    is_local_only=False,
    is_global_only=False,
)

100%|██████████| 61818/61818 [00:20<00:00, 3050.19it/s]


In [24]:
def generate_validation_X_y(labeled_dataset):
    """Return ``(X, y)`` for the validation split of a single labeled dataset.

    A ``CropDataset`` restricted to ``labeled_dataset`` is built with
    ``subset="validation"`` and normalized with ``train.normalizing_dict``
    (the notebook-level training dataset), then passed to ``generate_X_y``.
    """
    # Bounding box covering every longitude and latitude.
    whole_globe = BoundingBox(min_lon=-180, max_lon=180, min_lat=-90, max_lat=90)
    dataset_options = dict(
        data_folder=Path("../data"),
        subset="validation",
        datasets=[labeled_dataset],
        cache=None,
        noise_factor=0,
        probability_threshold=0.5,
        remove_b1_b10=True,
        normalizing_dict=train.normalizing_dict,
        upsample=False,
        target_bbox=whole_globe,
        is_local_only=False,
        is_global_only=False,
    )
    return generate_X_y(CropDataset(**dataset_options))

In [9]:
def generate_X_y(dataset):
    """Convert a dataset of samples into flat feature vectors and float labels.

    Each item yielded by ``dataset`` must be indexable: item ``[0]`` is an
    object exposing ``.numpy()`` (its array is flattened into one feature
    vector) and item ``[1]`` is the label, converted with ``float``.

    The dataset is walked exactly once, so every sample is loaded a single
    time and single-pass iterables (e.g. generators) yield matching X and y.

    Returns:
        (X, y): a list of flattened arrays and a list of floats, in dataset order.
    """
    X, y = [], []
    # Index rather than unpack: samples may carry more than two fields.
    for sample in tqdm(dataset):
        X.append(sample[0].numpy().flatten())
        y.append(float(sample[1]))
    return X, y

In [8]:
# Turn the training split into flattened feature vectors and float labels.
X_train, y_train = generate_X_y(dataset=train)

  0%|          | 0/61818 [00:00<?, ?it/s]

  0%|          | 0/61818 [00:00<?, ?it/s]

In [10]:
# `val` only exists as a local inside generate_validation_X_y, so referencing it
# at notebook scope raises NameError on a fresh kernel. Build the validation
# split explicitly instead.
# NOTE(review): Uganda is assumed because the recorded 458-sample progress bar
# matches the Uganda split in the evaluation loop further down; confirm the
# intended dataset.
uganda_dataset = next(d for d in labeled_datasets if d.dataset == "Uganda")
X_val, y_val = generate_validation_X_y(uganda_dataset)

  0%|          | 0/458 [00:00<?, ?it/s]

  0%|          | 0/458 [00:00<?, ?it/s]

In [16]:
# Fixed seed: RandomForest is randomized, so without it the reported scores
# change on every Restart & Run All.
SEED = 42

# Models under comparison, keyed by the name used in the results table.
classifiers = {
    "RandomForest": RandomForestClassifier(random_state=SEED),
    "SVC": SVC(),
}

# Fit classifiers
for name, clf in classifiers.items():
    print(f"Fitting: {name}")
    clf.fit(X_train, y_train)

RandomForest
SVC


In [26]:
# F1 score of every fitted classifier on the validation split of each
# evaluation country: results[dataset name][classifier name] -> score.
results = {}
for d in labeled_datasets:
    if d.dataset in ["Kenya", "Togo", "Uganda", "Rwanda"]:
        # Register the dataset's entry first, then fill it in per classifier.
        dataset_scores = results[d.dataset] = {}
        X_val, y_val = generate_validation_X_y(d)
        for name, clf in classifiers.items():
            y_pred = clf.predict(X_val)
            dataset_scores[name] = f1_score(y_val, y_pred)

        

  0%|          | 0/861 [00:00<?, ?it/s]

  0%|          | 0/861 [00:00<?, ?it/s]

  0%|          | 0/277 [00:00<?, ?it/s]

  0%|          | 0/277 [00:00<?, ?it/s]

  0%|          | 0/538 [00:00<?, ?it/s]

  0%|          | 0/538 [00:00<?, ?it/s]

  0%|          | 0/458 [00:00<?, ?it/s]

  0%|          | 0/458 [00:00<?, ?it/s]

In [30]:
# Results table: one column per dataset, one row per classifier (F1 scores).
pd.DataFrame.from_dict(results, orient="columns")

Unnamed: 0,Kenya,Togo,Rwanda,Uganda
RandomForest,0.929553,0.669231,0.514793,0.375
SVC,0.925226,0.60241,0.578431,0.440476
