In [11]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
import pandas as pd
import numpy as np
from tqdm import tqdm
from bench_utils import *

In [12]:
# define n iterations
n = 5

# load metadata

metadata = pd.read_csv('../data/l1/GLC24_PA_metadata_train.csv')

# get survey ids
survey_ids = metadata['surveyId'].unique()
survey_ids

# define train ratio
train_ratio = 0.8

# shuffle survey ids


def get_train_val_survey_ids(survey_ids, train_ratio):
    
    # shuffle survey ids
    np.random.shuffle(survey_ids)
    # split survey ids into train and val
    n_train = int(train_ratio * len(survey_ids))
    train_survey_ids = survey_ids[:n_train]
    val_survey_ids = survey_ids[n_train:]
    return train_survey_ids, val_survey_ids

# get train and val survey ids
train_survey_ids, val_survey_ids = get_train_val_survey_ids(survey_ids, train_ratio)

train_survey_ids

array([ 491140,  658107,  224167, ..., 2388091, 3554093, 3552189])

In [13]:
# group by surveyId and aggregate speciesId into list
metadata_grouped = metadata.groupby('surveyId')['speciesId'].apply(list).reset_index()

# create X

X_raster = pd.read_csv('../data/l2/raster_data.csv', header=None)
X = X_raster.iloc[:,1:].to_numpy()
X.shape

scaler = StandardScaler()
X = scaler.fit_transform(X)
X.shape

(88987, 46)

In [14]:
# create speciesIds ranked

speciesIds_ranked = metadata['speciesId'].value_counts().index

# change data type to int

speciesIds_ranked = speciesIds_ranked.astype(int)

speciesIds_ranked

Index([  540,  4397,   254,  4499, 10317,  2885,  1964, 10600, 10073, 11140,
       ...
        4953, 10045,  9210, 11065,   345,  8312,  8381,  7572,  9807,  8119],
      dtype='int64', name='speciesId', length=5016)

In [15]:
# loop over speciesIds_ranked and create a model for each speciesId

f1s = []

for i in range(n):

    models_raster = {}
    top_k = 10
    y_preds_raster = {}
    y_trues_raster = {}
    cv_scores_raster = []

    for species_id in tqdm(speciesIds_ranked[:top_k]):

        # prepare X and y
        y = metadata_grouped['speciesId'].apply(lambda x: 1 if species_id in x else 0).to_numpy()
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=1-train_ratio) # no test data needed as all data is training data and CV is used

        # undersample train data
        rus = RandomUnderSampler()
        X_train, y_train = rus.fit_resample(X_train, y_train)

        # train species classifier
        xgb = XGBClassifier()
        xgb.fit(X_train, y_train)

        # cross validation
        cv_score = cross_val_score(xgb, X_train, y_train, cv=5, scoring='f1').mean()
        cv_scores_raster.append(cv_score)
        
        # print f1 score
        #print(f'{species_id} f1 score: {cv_score}')

        # save model
        models_raster[species_id] = xgb

        # predict on test data
        y_pred = xgb.predict(X_val)

        # save true values
        y_trues_raster[species_id] = y_val

        # save results
        y_preds_raster[species_id] = y_pred


    # compute f1 score for each iteration
    y_preds_raster = pd.DataFrame(y_preds_raster).values

    y_trues_raster = pd.DataFrame(y_trues_raster).values

    TP = (y_trues_raster & y_preds_raster).sum(axis=1)  # True Positives per sample
    FP = (y_trues_raster & ~y_preds_raster).sum(axis=1)  # False Positives per sample
    FN = (~y_trues_raster & y_preds_raster).sum(axis=1)  # False Negatives per sample

    # compute f1 score for each sample
    f1 = 2 * TP / (2 * TP + FP + FN)

    # Handle division by zero
    f1 = np.nan_to_num(f1)

    # compute micro-average f1 score
    micro_f1 = np.mean(f1)

    f1s.append(micro_f1)

    print(f'Iteration {i+1} micro f1 score: {micro_f1}')

print(f'Average micro f1 score: {np.mean(f1s)}')




100%|██████████| 10/10 [00:29<00:00,  2.97s/it]
  f1 = 2 * TP / (2 * TP + FP + FN)


Iteration 1 micro f1 score: 0.4380274783213316


100%|██████████| 10/10 [00:33<00:00,  3.40s/it]
  f1 = 2 * TP / (2 * TP + FP + FN)


Iteration 2 micro f1 score: 0.43763470784624847


100%|██████████| 10/10 [00:33<00:00,  3.34s/it]
  f1 = 2 * TP / (2 * TP + FP + FN)


Iteration 3 micro f1 score: 0.434562352095221


100%|██████████| 10/10 [00:31<00:00,  3.19s/it]
  f1 = 2 * TP / (2 * TP + FP + FN)


Iteration 4 micro f1 score: 0.4384257989578812


100%|██████████| 10/10 [00:30<00:00,  3.08s/it]

Iteration 5 micro f1 score: 0.43659488087794635
Average micro f1 score: 0.43704904361972574



  f1 = 2 * TP / (2 * TP + FP + FN)


In [16]:
f1s

[0.4380274783213316,
 0.43763470784624847,
 0.434562352095221,
 0.4384257989578812,
 0.43659488087794635]

In [17]:
# save results with timestamp

import datetime

now = datetime.datetime.now()
timestamp = now.strftime('%Y-%m-%d_%H-%M-%S')

results = pd.DataFrame(f1s, columns=['top_k_bc_f1'])
results.to_csv(f'{timestamp}_top_k_bc_benchmark_results_{n}.csv', index=False)