# Top

1. __Imports__ and auxiliary scripts - [here](#Imports-and-auxiliar-scripts)
2. Run __BvSB__, and __store__ dataset to classify
    * Auxiliary functions - [here](#Run-BvSB,-and-store-dataset-to-classify)
    * Runnable cells - [here](#BvSVB-get-and-store-dataset---runnable)
3. __Update__ datasets and trained model
    * Auxiliary functions - [here](#Update-datasets-and-trained-model)
    * Runnable cells - [here](#Update-datasets-and-trained-model---runnable)
4. AL iter performance - [here](#AL-iter-performance)
    * [Stats per iteration](#Stats-per-iteration)

[Bottom](#Bottom)

______________

# Imports and auxiliar scripts

[Back to top](#Top)

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from tabulate import tabulate

import os
from joblib import load
import re

import geopandas as gpd
import rioxarray
import rasterio as rio
from shapely.geometry import Point, box

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV, StratifiedGroupKFold, cross_val_score
from sklearn.metrics import f1_score, accuracy_score, balanced_accuracy_score

import random
# Seed Python's built-in RNG so that calls to the `random` module are repeatable.
random.seed(42)

# Every path in this notebook is resolved relative to the current working directory.
base_directory = os.getcwd()

# Table of per-band normalisation parameters. Despite the "_directory" suffix, this
# value is handed straight to pd.read_csv, so it has to point at a CSV file.
# NOTE(review): the relative part of the path is an empty string here - fill it in before running.
normalizer_df_directory = os.path.join(base_directory, "")
norm = pd.read_csv(normalizer_df_directory)
# Keep only the columns describing each band; get_normalized_X reads "band", "min" and "max".
norm = norm[["band","mean","std","min","max"]]

# Plotting constants: colour palette, display names for the two classes, and the
# keyword arguments (colormap, value range 1..2, transparency) bundled for plotting calls.
palette = ['#006600', '#99ff33', '#2d8659', '#c6538c', '#808000', '#804000', '#0000ff']
class_names = ['Cashew', 'Non-cashew']
vis_params = {
        "cmap": ListedColormap(palette),
        "vmin": 1,
        "vmax": 2,
        "alpha": 0.9,
}

In [None]:
def get_normalized_X(X, norm):
    """
    Min-max scale the feature columns of X with per-band parameters.

    X: feature dataframe; must contain one column per entry of norm["band"].
    norm: dataframe with (at least) the columns "band", "min" and "max".
    Returns: a deep copy of X in which every listed band is replaced by
    (value - min) / (max - min). X itself is left untouched.
    """
    scaled = X.copy(deep=True)
    band_names = norm["band"]
    for band_name in band_names:
        selector = band_names == band_name
        #first matching row holds the parameters for this band
        lower = norm.loc[selector, "min"].iloc[0]
        upper = norm.loc[selector, "max"].iloc[0]
        scaled[band_name] = (scaled[band_name] - lower) / (upper - lower)
    return scaled

##############################################

def transform_y_2classes(y):
    """
    Collapse the 7-class labels into a binary labelling.

    Every label equal to 5 (cashew) becomes 1; every other label becomes 2 (non-cashew).
    Returns an integer pandas Series.
    """
    cashew_flag = pd.Series(y == 5, dtype="int") #1 where cashew, 0 elsewhere
    return 2 - cashew_flag #1 stays 1, 0 turns into 2

##############################################

def balanced_accuracy_scorer(estimator, X_true, y_true):
    """
    Scorer (estimator, X, y) returning the balanced accuracy of the estimator's predictions.

    Prints a warning when the set of predicted classes differs from the set of true classes.
    """
    y_pred = estimator.predict(X_true)
    pred_classes = np.unique(y_pred)
    true_classes = np.unique(y_true)
    #array_equal copes with class sets of different sizes, where an elementwise != would
    #either fail to broadcast or only flag the case in which *every* class differs
    if not np.array_equal(pred_classes, true_classes):
        print("Classes in true labels and predictions mismatch!")
        print("Unique preds: ", pred_classes)
        print("Unique trues", true_classes)
    return balanced_accuracy_score(y_true,y_pred)

def f1_cashew_scorer(estimator, X_true, y_true):
    """
    Return F1 score for cashew. Adapted to the 7-class system where cashew is class 5, or to the binary system where cashew is class 1

    The per-class F1 array is indexed by position: position 0 when y_true holds exactly two
    classes, position 4 otherwise. Returns -1 when position 4 does not exist.
    Prints a warning when the set of predicted classes differs from the set of true classes.
    """
    y_pred = estimator.predict(X_true)
    pred_classes = np.unique(y_pred)
    true_classes = np.unique(y_true)
    #array_equal copes with class sets of different sizes, where an elementwise != would
    #either fail to broadcast or only flag the case in which *every* class differs
    if not np.array_equal(pred_classes, true_classes):
        print("Classes in true labels and predictions mismatch!")
        print("Unique preds: ", pred_classes)
        print("Unique trues", true_classes)
    per_class_f1 = f1_score(y_true,y_pred,average=None)
    if true_classes.shape[0] == 2:
        return per_class_f1[0] #assumes 1st class is cashew, 2nd is non
    try:
        return per_class_f1[4] #in the 7 total classes, cashew is 5th
    except IndexError: #other number of classes, could be just 1, if cashew doesn't appear for example
        return -1

##############################################

def plot_multipolygon_boundaries(multi,color="black",linewidth=0.2,alpha=1):
    """
    Works with polygons or multipolygons, plots just the outlines of the shape

    multi: geometry exposing either `.geoms` (multi-part) or `.exterior` (single polygon).
    color, linewidth, alpha: forwarded to plt.plot for every outline.
    """
    #a single polygon has no `.geoms`; treat it as a one-element collection
    parts = getattr(multi, "geoms", None)
    if parts is None:
        parts = [multi]
    for geom in parts:
        xs, ys = geom.exterior.xy
        plt.plot(xs,ys, color=color, linewidth=linewidth,alpha=alpha)

##############################################

def filterOverlappingPoints(df1, df2):
    """
    Returns: the index labels of the df1 rows whose (x, y) position does not appear in df2.

    df1: dataframe with "x" and "y" columns.
    df2: dataframe with "x" and "y" columns, or an empty dataframe (then every df1 label is returned).
    """
    if df2.empty:
        return df1.index
    #de-duplicate the reference positions so the left merge yields exactly one row per df1 row
    known_positions = df2[["x","y"]].drop_duplicates()
    merged_df = df1[["x","y"]].merge(known_positions, on=["x","y"], how="left", indicator=True)
    #merge() renumbers rows 0..n-1, so map the positional mask back onto df1's own labels
    is_new = (merged_df["_merge"]=="left_only").to_numpy()
    return df1.index[is_new]

##############################################

def find_latest_file(directory):
    '''
    Get directory+filename of a file in a directory with highest number.
    Filenames follow the form: someName_nº.extension
    Generalization of savePickle.py's find_latest_pickle()

    Entries whose name does not end in "_<integer>" before the first dot are ignored.
    Returns: (full path, number), or (None, -1) when no entry carries a number.
    '''
    numbered = []
    for file in os.listdir(directory):
        try:
            suffix = int(file.split("_")[-1].split(".")[0])
        except ValueError:
            #hidden files, checkpoint folders, etc. carry no numeric suffix
            continue
        numbered.append((suffix, file))
    if not numbered:
        return None, -1

    #max() keeps the first entry among equal suffixes
    latest_suffix, latest_name = max(numbered, key=lambda item: item[0])
    return os.path.join(directory, latest_name), latest_suffix

##############################################

def find_n_lowest_difference_index(df, n_points):
    """
    Given a pandas dataframe df with column "Difference", return the indexes of points with lowest "Difference" between first and second classes

    df is not modified. The returned index is ordered from the lowest "Difference" upwards.
    """
    #nsmallest avoids flipping the sign of the caller's column to reuse nlargest
    return df.nsmallest(n_points, "Difference").index

###############################################

def get_files_in_directory(directory_path):
    """
    List the names of the regular files sitting directly inside directory_path.
    Sub-directories are left out; names are returned without the directory prefix.
    """
    only_files = []
    for entry in os.listdir(directory_path):
        full_entry = os.path.join(directory_path, entry)
        if os.path.isfile(full_entry):
            only_files.append(entry)
    return only_files

###############################################

def extract_number_from_filename(file_name):
    """
    Return the integer formed by the digits that close the file name, ignoring one
    trailing ".extension" if present. Returns None when there are no such digits.
    """
    found = re.search(r'(\d+)(\.\w+)?$', file_name)
    return int(found.group(1)) if found else None

* __Call and pre-process Random Sampling and RSeT datasets__

In [None]:
# Load the stored random-sampling (RS) dataframes from pickle files.
# NOTE(review): the relative part of every path below is an empty string - fill in before running.
clean_rs_df = pd.read_pickle(os.path.join(base_directory, ""))
rs_train = pd.read_pickle(os.path.join(base_directory, ""))
rs_test = pd.read_pickle(os.path.join(base_directory, "")) #random sampling test set (which is the test set utilized for AL performance)
init_rs_df = pd.read_pickle(os.path.join(base_directory, "")) #initial seed set, that initializes the training dataset for both RS and MS heuristics

#process stored datasets
#for each dataframe: X_* = feature columns only (label, coordinates and bookkeeping columns dropped),
#y_* = binary labels (1 cashew / 2 non-cashew), y7c_* = original "Class" labels, groups_* = "polygonID" column

X_rs = clean_rs_df.drop(columns=["Class", "x", "y", "polygonID", "geotiff", "Pred certainty"])
y_rs = transform_y_2classes(clean_rs_df["Class"])
y7c_rs = clean_rs_df["Class"]
groups_rs = clean_rs_df["polygonID"]

init_X_rs = init_rs_df.drop(columns=["Class", "x", "y", "polygonID", "geotiff", "Pred certainty"])
init_y_rs = transform_y_2classes(init_rs_df["Class"])
init_y7c_rs = init_rs_df["Class"]
init_groups_rs = init_rs_df["polygonID"]

X_rs_train = rs_train.drop(columns=["Class", "x", "y", "polygonID", "geotiff", "Pred certainty"])
y_rs_train = transform_y_2classes(rs_train["Class"])
y7c_rs_train = rs_train["Class"]
groups_rs_train = rs_train["polygonID"]

X_rs_test = rs_test.drop(columns=["Class", "x", "y", "polygonID", "geotiff", "Pred certainty"])
y_rs_test = transform_y_2classes(rs_test["Class"])
y7c_rs_test = rs_test["Class"]
groups_rs_test = rs_test["polygonID"]

#normalize feature dataframes (min-max, using the parameters in `norm`)
X_rs_norm = get_normalized_X(X_rs, norm)
init_X_rs_norm = get_normalized_X(init_X_rs, norm)
X_rs_train_norm = get_normalized_X(X_rs_train, norm)
X_rs_test_norm = get_normalized_X(X_rs_test, norm)

#rset dataset
#read from CSV; "Unnamed: 0" is the index column written out when the CSV was saved

rset_df = pd.read_csv(os.path.join(base_directory, ""))
rset_df.drop(columns=["Unnamed: 0"], inplace=True)

#same split as above; here the label column is "C_ID_1" and the group column is "groupID"
X_rset = rset_df.drop(columns=["C_ID_1", "x", "y", "groupID"])
y_rset = transform_y_2classes(rset_df["C_ID_1"])
y7c_rset = rset_df["C_ID_1"]
groups_rset = rset_df["groupID"]

X_rset_norm = get_normalized_X(X_rset, norm)

____

# Run BvSB, and store dataset to classify

[Back to top](#Top)

___

# processDirectory.py

* __Main part of processDirectory:__
    * Read each geotiff iteratively and get the uncertainties for each point.
    * Store the points with highest uncertainty.
    * For that list, cut the points that were already discarded (labeled -1), or that appeared in a previous dataset.
    * __Only after that, get the patches needed!__

[Back to top.](#Top)

In [None]:
def geotiff_to_df(image_path, column_order=None, standardize=False, norm_dataframe=None):
    """
    Converts a geotiff file into a pandas dataframe with one row per pixel.
    CORRECTED v3 - changed standardization to be global (using min max parameters from the entire region of interest) instead of batch-based normalization.
    Assumes the images are on crs "EPSG:4326".

    image_path: path to the GeoTIFF.
    column_order: the order of bands in xds is different than the band order in model training. This parameter is to give the correct training column order.
    standardize: if True, min-max scale each band with the parameters in norm_dataframe.
    norm_dataframe: dataframe with columns "band", "min" and "max"; required when standardize is True.
    Returns: dataframe with one column per band plus "x" and "y". Single-band rasters return
    the columns "x", "y", "band_0" and are never normalized.
    Raises: ValueError if standardize is True and norm_dataframe is None (multi-band rasters only).
    """
    xds = rioxarray.open_rasterio(image_path, masked=True)
    x_coords, y_coords = xds.coords['x'].values, xds.coords['y'].values

    #for geotiffs with only one band, typically the class band, simpler transformation, without normalization
    if xds.shape[0] == 1:
        values = xds[0].values.flatten()
        df = pd.DataFrame({'x': list(x_coords)*len(y_coords),
                           'y': y_coords.repeat(len(x_coords)),
                           'band_0': values})
        return df

    if standardize and norm_dataframe is None:
        raise ValueError("standardize=True requires norm_dataframe (columns 'band', 'min', 'max')")

    names = xds.attrs["long_name"]
    df_final = pd.DataFrame(columns=names)

    #correct column order if needed
    if column_order is not None:
        df_final = df_final[column_order]

    for i, name in enumerate(names):
        values = xds[i].values.flatten() #each band has format (y_size, x_size); flatten() transform into 1d array with size (y_size * x_size)
        if standardize:
            #look the band up in the table that was passed in, not in the notebook-level `norm`
            norm_params = norm_dataframe.loc[norm_dataframe["band"]==name]
            band_min = norm_params["min"].iloc[0] #iloc[0] due to FutureWarning; though pd.Series only has 1 element
            band_max = norm_params["max"].iloc[0]
            values = (values - band_min) / (band_max - band_min)
        df_final[name] = values

    #merge lon and lat columns; x varies fastest, matching the row-major flatten() above
    if "x" not in df_final.columns:
        df_final["x"] = list(x_coords)*len(y_coords)
        df_final["y"] = y_coords.repeat(len(x_coords))

    return df_final

###############################################

def get_margins(df, model, lon=None, lat=None, n=2):
    """
    Compute class estimates and signed margins for every row of df.

    df: DataFrame with the training features (no coordinate columns).
    model: fitted binary model exposing decision_function.
    lon, lat: optional coordinate sequences with one entry per row of df; they are attached
              positionally as columns "x" and "y".
    n: unused, kept for backward compatibility.
    Returns: dataframe with columns "1st class" (1 for a positive margin, 2 for a negative or
    zero margin) and "Margin", plus "x"/"y" if provided. Rows of df containing NaN get NaN
    in both result columns.
    """
    margins = np.full(df.shape[0], np.nan)

    #rows WITHOUT nans get a margin; rows WITH nans stay nan
    valid_rows = ~df.isna().any(axis=1).to_numpy()
    if valid_rows.any(): #a tile with no valid pixel would hand the model an empty frame
        margins[valid_rows] = model.decision_function(df[valid_rows])

    classes = np.sign(margins)
    classes[classes==-1] = 2 #class 1 is cashew, class -1 is non-cashew, change to class 2
    classes[classes==0] = 2 #margin 0 could be either class; one is picked just to avoid errors

    df_results = pd.DataFrame({"1st class": classes, "Margin": margins}, dtype="float")
    #add coordinates by position; assigning a Series directly would align on index labels
    #and produce NaN whenever the caller's index is not 0..n-1
    if lon is not None:
        df_results["x"] = np.asarray(lon)
    if lat is not None:
        df_results["y"] = np.asarray(lat)

    return df_results

###############################################

def process_file_margins(file_path, model, column_order=None, standardize=False, norm_dataframe=None):
    """
    Turn one GeoTIFF tile into a per-pixel dataframe and score it with a margin-based model.

    file_path: path to the GeoTIFF tile.
    column_order, standardize, norm_dataframe: forwarded to geotiff_to_df.
    Returns: (uncertain, df) - first the class/margin estimates with "x" and "y" attached,
    then the tile converted to a dataframe (features plus "x" and "y").
    """
    tile_df = geotiff_to_df(file_path, column_order=column_order, standardize=standardize, norm_dataframe=norm_dataframe)
    #the model only sees the feature columns; coordinates travel alongside
    features_only = tile_df.drop(columns=["x", "y"])
    margins_df = get_margins(features_only, model, tile_df["x"], tile_df["y"])
    return margins_df, tile_df

###############################################


def process_directory_margins(model, directory_tif_coefficients=None, column_order=None, n_points=500,
                         standardize=False, norm_dataframe=None, save_tif=False, directory_tif_predictions=None, random_seed=0):
    """
    Walk a directory tree of GeoTIFF tiles and keep the n_points pixels with the smallest |margin|.
    Deals with a directory where there are multiple folders, each indicating a subregion, which is partitioned into multiple GeoTIFFs.

    model: margin-based model used to obtain class estimates and margins
    directory_tif_coefficients: path to folder with the GeoTIFF coefficient files
    column_order: since order of features might change, depending on the training of each model
    n_points: maximum number of uncertain points returned
    standardize: boolean, if we standardize data right after reading the GeoTIFF file
    norm_dataframe: pandas dataframe with the min/max of each training feature, for GLOBAL normalization
    save_tif: save predictions tif (1 band image) for every tile
    directory_tif_predictions: where to save the predictions tiff
    random_seed: seed used when ties at the selection boundary have to be broken at random
    Returns: (final_df_uncertain, final_df_coefs), row-aligned; both empty if no GeoTIFF was found.
    """
    final_df_uncertain = pd.DataFrame() #store the margins, probable classes
    final_df_coefs = pd.DataFrame() #store the training coefficients of the most uncertain points

    #stipulate directories if needed
    base_directory = os.getcwd()
    if directory_tif_coefficients is None:
        directory_tif_coefficients = os.path.join(base_directory, "")
    if directory_tif_predictions is None:
        training_set_folder = os.path.join(base_directory, "")
        _, lastTrainingNum = find_latest_file(directory = training_set_folder)
        directory_tif_predictions = os.path.join(base_directory, "", str(f"Iteration {lastTrainingNum}"))
        os.makedirs(directory_tif_predictions, exist_ok=True)

    #walk through the folders and files in the directory_tif_coefficients
    for root, _dirs, files in os.walk(directory_tif_coefficients):
        for file_name in files:
            #skip sidecar/hidden files, same extension rule as get_patches_from_centroids
            if not file_name.lower().endswith((".tif", ".tiff")):
                continue
            file = os.path.join(root,file_name)
            print("Currently analyzing file: ", file)

            uncertain, coefs = process_file_margins(file, model, column_order, standardize, norm_dataframe)
            uncertain["geotiff"] = extract_number_from_filename(file_name) #to identify from which .tiff these results are

            #append this tile, then immediately trim back to the n_points smallest |margin| (ties kept)
            final_df_uncertain = pd.concat([final_df_uncertain, uncertain], axis=0, ignore_index=True)
            final_df_coefs = pd.concat([final_df_coefs, coefs], axis=0, ignore_index=True)

            final_df_uncertain["absMargin"] = -np.abs(final_df_uncertain["Margin"]) #minus sign to select the smallest margins
            final_df_uncertain = final_df_uncertain.nlargest(n_points,"absMargin", keep="all")
            final_df_uncertain.drop(columns=["absMargin"], inplace=True)
            final_df_coefs = final_df_coefs.loc[final_df_uncertain.index]

            #reset indexes
            final_df_uncertain.reset_index(drop=True,inplace=True)
            final_df_coefs.reset_index(drop=True,inplace=True)

            #save GeoTIFF with the predictions for all the pixels in the current tile
            if save_tif:
                save_tif_folder = directory_tif_predictions

                #create tif folder if it does not exist
                os.makedirs(save_tif_folder, exist_ok=True)

                with rio.open(file) as src:
                    ras_meta = src.profile
                    ras_meta['dtype'] = "float32"
                    ras_meta["count"] = 1

                    #read the original data to get the mask
                    original_data = src.read(1)
                    mask = np.isnan(original_data)

                #reshape predictions to raster
                raster_array = uncertain["1st class"].to_numpy(dtype="float32")
                raster_array = raster_array.reshape((ras_meta["height"], ras_meta["width"]))
                raster_array[mask] = ras_meta['nodata']
                #save as GeoTIFF
                save_tif_path = os.path.join(save_tif_folder, file_name)
                with rio.open(save_tif_path, 'w', **ras_meta) as dst:
                    dst.write(raster_array, 1)
                print("Saving file in: ", save_tif_path)

        print("\n #################################################### \n")

    #nothing was processed: there is no "Margin" column to inspect
    if final_df_uncertain.empty:
        return final_df_uncertain, final_df_coefs

    if final_df_uncertain.shape[0] > n_points:
        #keep="all" leaves every row tied at the boundary margin; keep all rows strictly below the
        #boundary and draw the remaining slots at random among the tied ones
        abs_margin = final_df_uncertain["Margin"].abs()
        boundary = abs_margin.max()
        below_boundary = final_df_uncertain.index[abs_margin < boundary]
        tied = final_df_uncertain.loc[abs_margin == boundary]
        n_from_ties = min(max(n_points - len(below_boundary), 0), len(tied))
        sampled_indices = tied.sample(n=n_from_ties, random_state=random_seed).index
        selected_indices = below_boundary.append(sampled_indices)
        final_df_uncertain = final_df_uncertain.loc[selected_indices]
        final_df_coefs = final_df_coefs.loc[selected_indices]
        final_df_uncertain.reset_index(drop=True,inplace=True)
        final_df_coefs.reset_index(drop=True,inplace=True)

    return final_df_uncertain, final_df_coefs

______________

# getPatches.py

In [None]:
def point_inside_multi_polygon(x, y, multi_polygon):
    """
    Tell whether the location (x, y) is contained in multi_polygon.
    """
    return multi_polygon.contains(Point(x, y))

###############################################

def _axis_patch_values(axis_values, central_index, low_border_msg, high_border_msg, verbose):
    """
    Return the central coordinate followed by its in-bounds neighbours (next, then previous) along one axis.
    """
    picked = [axis_values[central_index]]
    last_index = axis_values.shape[0] - 1
    if last_index == 0:
        #axis is a single pixel wide: there is no neighbour on either side
        return picked
    if central_index == 0:
        if verbose:
            print(low_border_msg)
        picked.append(axis_values[central_index+1])
    elif central_index == last_index:
        if verbose:
            print(high_border_msg)
        picked.append(axis_values[central_index-1])
    else:
        picked.append(axis_values[central_index+1])
        picked.append(axis_values[central_index-1])
    return picked


def obtain_patch_coordinates(tif_to_df_unique_x, tif_to_df_unique_y, point_coordinates, polygonID, verbose):
    """
    Given coordinates for a point (central pixel), returns a dataframe with the coordinates of the pixel and its direct neighboring pixels (up to 8). Works for a list of point_coordinates.

    tif_to_df_unique_x, tif_to_df_unique_y: arrays with the possible raster x and y coordinates.
    point_coordinates: dataframe with "x" and "y" columns, one row per central pixel.
    polygonID: first patch identifier; every patch gets a unique polygonID for identification and data split purposes.
    verbose: print a short message whenever a central pixel lies on a raster border.
    Returns: (dataframe with columns "x", "y", "polygonID", next unused polygonID).
    Raises: IndexError if a central pixel does not match any raster coordinate exactly.
    """
    final_x = []
    final_y = []
    final_polygonID = []
    for point_x, point_y in zip(point_coordinates["x"], point_coordinates["y"]):
        x_matches = np.where(tif_to_df_unique_x == point_x)[0]
        y_matches = np.where(tif_to_df_unique_y == point_y)[0]
        if x_matches.size == 0 or y_matches.size == 0:
            raise IndexError(f"Point ({point_x}, {point_y}) does not lie on the raster grid")
        #on a border only the existing neighbour is added, otherwise both
        x_values = _axis_patch_values(tif_to_df_unique_x, x_matches[0], "entered 1", "entered 2", verbose)
        y_values = _axis_patch_values(tif_to_df_unique_y, y_matches[0], "entered 4", "entered 5", verbose)
        #all (x, y) combinations, from https://stackoverflow.com/questions/1208118/using-numpy-to-build-an-array-of-all-combinations-of-two-arrays
        xy_coords = np.array(np.meshgrid(x_values, y_values)).T.reshape(-1,2)
        final_x.append(xy_coords[:,0])
        final_y.append(xy_coords[:,1])
        final_polygonID.append(np.full(xy_coords.shape[0], polygonID))
        polygonID += 1

    if not final_x:
        #no central pixels: np.concatenate would fail on an empty list
        empty_df = pd.DataFrame({"x": np.array([], dtype=float), "y": np.array([], dtype=float), "polygonID": np.array([], dtype=int)})
        return empty_df, polygonID

    df_return = pd.DataFrame({"x": np.concatenate(final_x), "y": np.concatenate(final_y), "polygonID": np.concatenate(final_polygonID)})
    return df_return, polygonID #polygonID is also returned to keep track of the change in unique patches

#########################################

def get_patches_from_centroids(geotiffs_path, initial_df, num_indexes_to_return=None, check_inside_multipolygon=True, multi_polygon=None,
                               column_order=None, standardize=None, norm_dataframe=None, random_seed=0, polygonID=5000, verbose=False):
    """
    Adaptation from random_coefs_from_geotiffs()
    Given a list of points (centroids), e.g. from process_directory_margins(), obtain the patch around each one, that is, the pixel and its neighboring pixels.

    geotiffs_path (str): directory (walked recursively) containing the GeoTIFF files.
    initial_df: dataframe of centroids with columns "x", "y" and "geotiff" (number of the tile each point came from).
    num_indexes_to_return (int): if given, number of rows randomly kept from the final dataframe (capped at the rows available).
    check_inside_multipolygon, multi_polygon: when both are set, rows outside multi_polygon are dropped.
    column_order: unused, kept for backward compatibility.
    standardize, norm_dataframe: when standardize is truthy, the bands listed in norm_dataframe are min-max normalized.
    random_seed: seed for the `random` module (set globally).
    polygonID: identifier given to the first patch; incremented for every centroid.
    verbose: print intermediate sizes.
    Returns: dataframe of patch pixels sorted by polygonID, x, y; empty dataframe if no tile matched.
    Raises: ValueError if standardize is truthy and norm_dataframe is None.
    """
    if standardize and norm_dataframe is None:
        raise ValueError("standardize=True requires norm_dataframe (columns 'band', 'min', 'max')")

    random.seed(random_seed)
    #tile numbers that actually hold centroids; computed once instead of once per file
    wanted_tiles = set(initial_df["geotiff"].unique())
    patch_frames = []
    for root, _dirs, files in os.walk(geotiffs_path):
        for file in files:
            file_num = extract_number_from_filename(file)
            #check if file is a GeoTIFF that contains centroids
            if (file_num not in wanted_tiles) or not file.endswith((".tif", ".tiff")):
                continue
            file_path = os.path.join(root, file)
            print(f"Processing GeoTIFF: {file}")
            #transform geotiff in dataframe
            tif_to_df = geotiff_to_df(file_path)
            #see which initial_df points are in this geotiff
            in_geo_df = initial_df.loc[initial_df["geotiff"]==file_num]
            #obtain 3x3 patch dataset
            xypatches, polygonID = obtain_patch_coordinates(np.unique(tif_to_df["x"]), np.unique(tif_to_df["y"]), in_geo_df, polygonID, verbose)
            tif_to_df_patches = pd.merge(tif_to_df, xypatches[["x","y","polygonID"]], on=["x","y"], how="inner")
            tif_to_df_patches["geotiff"] = file_num
            patch_frames.append(tif_to_df_patches)

            if verbose:
                print("OG tif file size: ", tif_to_df.shape)
                print("in_geo_df size: ", in_geo_df.shape)
                print("Size with patches: ", tif_to_df_patches.shape)
                print("polygonID: ", polygonID)

    if not patch_frames:
        #no tile matched: the steps below all need the patch columns to exist
        print("FINAL polygonID: ", polygonID)
        return pd.DataFrame()

    final_df = pd.concat(patch_frames, ignore_index=True)

    #check if they are all inside the multipolygon
    if (check_inside_multipolygon) and (multi_polygon is not None):
        if verbose:
            print("Before inside_multi_polygon: ", final_df.shape)
        inside_multi_polygon = [point_inside_multi_polygon(x, y, multi_polygon) for x, y in zip(final_df["x"], final_df["y"])]
        final_df = final_df.loc[inside_multi_polygon]
        if verbose:
            print("After inside_multi_polygon: ", final_df.shape)

    #final cut of points from final_df
    final_df = final_df.reset_index(drop=True)
    if num_indexes_to_return is not None:
        if verbose:
            print("num_indexes_to_return is not None, cutting until desired number")
        #random.sample raises when asked for more items than exist, so cap the request
        n_to_keep = min(num_indexes_to_return, len(final_df))
        selected_indexes = random.sample(range(len(final_df)), n_to_keep)
        final_df = final_df.iloc[selected_indexes].reset_index(drop=True)

    if standardize:
        band_columns = list(norm_dataframe["band"])
        final_df[band_columns] = get_normalized_X(final_df[band_columns], norm_dataframe)

    #ORDER ACCORDING TO COLUMNS - polygonID, then x, then y
    final_df = final_df.sort_values(by=['polygonID', 'x', 'y'], ignore_index=True)

    print("FINAL polygonID: ", polygonID)
    return final_df

_____

# savePickle.py

[Back to top.](#Top)

In [None]:
def savePickle(new_uncertain, new_coefs, max_num_points=None, save=False, training_set_folder=None, discarded_file_dir=None,
               save_uncertain_folder=None, save_coefs_folder=None):
    """
    Filter the newly selected uncertain points and optionally store them.

    Updated - the stored pickle file has the unlabeled points, that do not appear in the previously existing training dataset, instead of just previous pickle uncertain file
    Also, in each iteration, the saved pickle only contains the newly highest uncertain points, not all the uncertain selected up to this iteration.

    new_uncertain, new_coefs: row-aligned dataframes (uncertainty estimates / features), both with "x" and "y". Not modified.
    max_num_points: if given, keep only that many rows, those with the lowest "Difference" (or lowest |"Margin"| when there is no "Difference" column).
    save: write new_coefs as .pkl and new_uncertain as .csv, named after the latest training set number.
    training_set_folder: folder with the numbered training sets.
    discarded_file_dir: pickle file with the coordinates of previously discarded samples.
    save_uncertain_folder, save_coefs_folder: output folders.
    Returns: (new_uncertain, new_coefs) after filtering, with fresh 0..n-1 indexes.
    """

    #stipulate directories if needed
    base_directory = os.getcwd()
    if training_set_folder is None:
        training_set_folder = os.path.join(base_directory, "")
    lastTrainingSet, lastTrainingNum = find_latest_file(directory = training_set_folder)
    file_to_save = str(f"train_dataset_{lastTrainingNum}")
    #file that stores the discarded samples (those that were already selected in a previous AL iteration but could not be annotated)
    if discarded_file_dir is None:
        discarded_file_dir = os.path.join(base_directory, "")
    #where I want to save
    if save_uncertain_folder is None:
        save_uncertain_folder = os.path.join(base_directory, "")
    if save_coefs_folder is None:
        save_coefs_folder = os.path.join(base_directory, "")

    directory_uncertain = os.path.join(save_uncertain_folder, file_to_save)
    directory_coefs = os.path.join(save_coefs_folder, file_to_save)

    #the files I want to compare to - most recently stored training set, and discarded file
    storedTrainingSet = pd.DataFrame()
    if lastTrainingSet is not None: #find_latest_file returns None when the folder holds no numbered file
        try:
            storedTrainingSet = pd.read_pickle(lastTrainingSet)
        except FileNotFoundError:
            storedTrainingSet = pd.DataFrame()
    try:
        discarded_file = pd.read_pickle(discarded_file_dir)
    except FileNotFoundError:
        discarded_file = pd.DataFrame()

    print(lastTrainingSet)
    print(discarded_file_dir)

    #work on re-indexed copies so the caller's dataframes are left untouched
    new_uncertain = new_uncertain.reset_index(drop=True)
    new_coefs = new_coefs.reset_index(drop=True)

    #drop, in turn, points already in the training set and points that were already discarded
    for known_points in (storedTrainingSet, discarded_file):
        keep_index = filterOverlappingPoints(new_uncertain, known_points)
        print(keep_index)
        #reset after every pass, otherwise indices might work weird with filterOverlappingPoints
        new_uncertain = new_uncertain.loc[keep_index].reset_index(drop=True)
        new_coefs = new_coefs.loc[keep_index].reset_index(drop=True)

    #perform a final cut/selection of points if needed
    if (max_num_points is not None) and (new_uncertain.shape[0] > max_num_points):
        if "Difference" in new_uncertain.columns:
            ranking = new_uncertain["Difference"]
        else:
            #margin-based pipeline: the most uncertain points are those closest to the boundary
            ranking = new_uncertain["Margin"].abs()
        index = ranking.nsmallest(max_num_points).index
        new_uncertain = new_uncertain.loc[index].reset_index(drop=True)
        new_coefs = new_coefs.loc[index].reset_index(drop=True)

    #save updated dataframes to specified or original pickle file
    if save:
        new_coefs.to_pickle(directory_coefs + ".pkl")
        new_uncertain.to_csv(directory_uncertain + ".csv")

    return new_uncertain, new_coefs

________

# BvSVB get and store dataset - runnable

[Back to top](#Top)

* __Run BvSB__

Atenção: não estou a usar corte de pontos/polígonos em mais nenhum lado, a única seleção de nº de pontos é em process_directory. Então, o nº de pontos aqui selecionado será o upper bound do nº de patches que vou ter de classificar (alguns podem ser removidos ou assim).

In [None]:
# Locate and load the classifier with the highest numeric suffix in clf_folder.
# NOTE(review): the relative part of clf_folder is an empty string - fill in before running.
base_directory = os.getcwd()
clf_folder = os.path.join(base_directory, "") #call a specific model to get the column order. The model is not important, but consistent column order between AL iterations is
latest_classifier, latest_classifier_num = find_latest_file(clf_folder) #latest classifier - trained with the latest, most upgraded training set

print("Latest classifier:")
print(latest_classifier)

# joblib.load deserialises the stored model; only load files you trust.
clf = load(latest_classifier)

In [None]:
%%time

# Read the region outline and merge all of its geometries into a single shape.
# NOTE(review): geojson_file is an empty string - fill in before running.
geojson_file = ""
gdf = gpd.read_file(geojson_file)
multi_polygon = gdf.unary_union

# Score every tile with the loaded classifier and keep the 35 pixels with the smallest |margin|;
# a prediction GeoTIFF is written for each tile (save_tif=True).
uncertain, coefs = process_directory_margins(model=clf, column_order=clf.feature_names_in_, n_points=35, save_tif=True, standardize=True, norm_dataframe=norm,
                                     directory_tif_predictions=None)# max_difference=1e-5,# standardize=True, norm_dataframe=norm,
# Both frames are row-aligned, so the tile number can be copied across by index.
coefs["geotiff"] = uncertain["geotiff"]

In [None]:
uncertain

* __Get patches__

In [None]:
%%time
# Expand every selected centroid into its patch of neighbouring pixels.
# NOTE(review): the relative part of geotiffs_path is an empty string - fill in before running.
geotiffs_path = os.path.join(base_directory, "")

# NOTE(review): num_indexes_to_return is assigned here but not passed to the call below,
# so the function's own default (None, i.e. no final cut) applies.
num_indexes_to_return = None
# polygonID=5245 is the identifier given to the first new patch; the multipolygon check is disabled.
patch_coefs = get_patches_from_centroids(geotiffs_path, coefs, random_seed=0, polygonID=5245, check_inside_multipolygon=False, standardize=True, norm_dataframe=norm)

* __Cut patches (if needed) and get uncertainties__

In [None]:
# Score every patch pixel: the model receives only the feature columns, while x/y are attached to the result.
patch_uncertain = get_margins(patch_coefs.drop(columns=["x","y", "polygonID", "geotiff"]), clf, patch_coefs["x"], patch_coefs["y"])
# Carry the bookkeeping columns over so each estimate can be traced back to its tile and patch.
patch_uncertain["geotiff"] = patch_coefs["geotiff"]
patch_uncertain["polygonID"] = patch_coefs["polygonID"]
patch_uncertain

* __Store points__

In [None]:
patch_uncertain

In [None]:
patch_coefs

In [None]:
# Drop points already present in the latest training set or in the discarded file, then write
# the remaining features (.pkl) and estimates (.csv) to disk (save=True).
patch_uncertain2, patch_coefs2 = savePickle(patch_uncertain, patch_coefs, save=True)

__________

# Update datasets and trained model

[Back to top](#Top)

_____

# updatePickleAndTrainingSet.py

[Back to top.](#Top)

In [None]:
import os
import pandas as pd

def concatenateClassToPickle(pickle_file, csv_file):
    """
    Copy the annotated "Class" and "Pred certainty" columns from a .csv into the matching .pkl.

    Labels are typed into the Uncertain .csv file, while the python side always works with
    the pickle, so the two columns are transferred and the pickle is overwritten in place.
    Should be used for both the Data/Coefs and Data/Uncertain files.
    Note: the csv rows may be in any order, as long as the "Unnamed: 0" column still holds
    the original row numbers - rows are matched through it.
    """
    stored_df = pd.read_pickle(pickle_file)
    annotated = pd.read_csv(csv_file, encoding="latin-1") #because of ? characters
    #index the annotations by their original row number and put them back in order
    annotated = annotated.set_index(annotated["Unnamed: 0"]).sort_index()

    for label_column in ("Class", "Pred certainty"):
        stored_df[label_column] = annotated[label_column]
    stored_df.to_pickle(pickle_file)
    return

###############################################

def find_latest_pickle(directory):
    """
    Given directory, gets the name of the latest pickle file and the number that appears right before the .extension (someName_"number".extension)

    Only ".pkl" files are considered; those without a numeric suffix are ignored.
    Returns: (file name WITHOUT the directory, number), or (None, -1) when there is no numbered pickle.
    Callers join the returned name onto whichever directory they need.
    """
    numbered = []
    for file in os.listdir(directory):
        if not file.endswith(".pkl"):
            continue
        try:
            suffix = int(file.split("_")[-1].split(".")[0])
        except ValueError:
            #pickle without a numeric suffix
            continue
        numbered.append((suffix, file))
    if not numbered:
        return None, -1

    #return the name that is really on disk instead of rebuilding it from a fixed prefix
    latest_suffix, latest_pickle_file = max(numbered, key=lambda item: item[0])
    return latest_pickle_file, latest_suffix

###############################################

def filter_by_column_value(df, column_name="Class", value=-1):
    """
    Split df in two according to whether `column_name` equals `value`.

    Returns a pair:
      - the rows whose column differs from `value`, with all their columns;
      - the "x"/"y" columns of the rows whose column equals `value`.
    Both frames are re-indexed from 0; df itself is left untouched.
    """
    column = df[column_name]
    kept_rows = df.loc[column != value].reset_index(drop=True)
    # we only need the positions of the matching rows
    matching_positions = df.loc[column == value][["x", "y"]].reset_index(drop=True)
    return kept_rows, matching_positions

###############################################

def concatenateTrainingSet(train_set_directory=None, train_features_directory=None, discard_file=None, save=False):
    """
    Build the next training set from the newest labelled features file.

    Steps:
      1. find the newest numbered pickle in `train_features_directory` (Data/Coefs,
         with the newly added labels);
      2. split it into kept samples and discarded ones (Class == -1 marks a sample
         rejected during annotation);
      3. append the kept samples to the training set of the same name found in
         `train_set_directory`; when that file does not exist, the kept samples
         alone form the new training set;
      4. optionally append the coordinates of the discarded samples to `discard_file`;
      5. optionally save the result as train_dataset_<n+1>.pkl in `train_set_directory`.

    Parameters
    ----------
    train_set_directory : directory of the training sets; defaults to a path under os.getcwd().
    train_features_directory : directory of the labelled features; defaults to a path under os.getcwd().
    discard_file : pickle that accumulates the x/y of discarded points; None skips this step.
    save : when True, the merged training set is written to disk.

    Returns
    -------
    The merged training set (DataFrame).

    Raises
    ------
    FileNotFoundError : when `train_features_directory` holds no numbered pickle file.
    """

    #stipulate directories if needed
    base_directory = os.getcwd()
    if train_set_directory is None:
        train_set_directory = os.path.join(base_directory, "")
    if train_features_directory is None:
        train_features_directory = os.path.join(base_directory, "")

    latest_pickle_file, latest_suffix = find_latest_pickle(train_features_directory)
    if latest_pickle_file is None:
        # fail with a clear message instead of a TypeError from os.path.join(..., None)
        raise FileNotFoundError(f"No numbered .pkl file found in {train_features_directory}")
    latest_suffix += 1

    features_path = os.path.join(train_features_directory, latest_pickle_file)
    previous_train_path = os.path.join(train_set_directory, latest_pickle_file)

    #from the last saved pickle, which contains a column with the indication of the class,
    #filter which samples are kept and added to the train set, and those that are discarded
    #SAMPLES TO DISCARD HAD THEIR CLASS ASSIGNED AS -1 DURING ANNOTATION
    df1 = pd.read_pickle(features_path)
    print(features_path)
    print(previous_train_path)
    df1_keep, df1_discard = filter_by_column_value(df1)

    if os.path.exists(previous_train_path):
        df2 = pd.read_pickle(previous_train_path)
        result_df = pd.concat([df2, df1_keep], axis=0, ignore_index=True)
    else:
        #no previous training set: the kept samples are the whole training set
        print(f"No previous training set at {previous_train_path}; starting from the kept samples only")
        result_df = df1_keep

    #store coordinates of the discarded points
    if discard_file is not None:
        try:
            discard_df = pd.read_pickle(discard_file)
        except FileNotFoundError:
            #first time points are discarded: the file is created below
            discard_df = df1_discard
        else:
            # filterOverlappingPoints is defined elsewhere in this notebook; its result
            # is used as index labels of df1_discard (presumably the points that are
            # not already stored in discard_df - confirm against its definition)
            df1_discard_index = filterOverlappingPoints(df1_discard, discard_df)
            print(discard_df.index)
            print(df1_discard.index)
            print(df1_discard_index)
            df1_discard = df1_discard.loc[df1_discard_index]
            discard_df = pd.concat([discard_df,df1_discard], axis=0, ignore_index=True)
        discard_df.reset_index(drop=True, inplace=True)
        discard_df.to_pickle(discard_file)

    if save:
        new_pickle_file = os.path.join(train_set_directory, f"train_dataset_{latest_suffix}.pkl")
        result_df.to_pickle(new_pickle_file)
    return result_df

###############################################

def updateSets(training_directory=None, uncertain_directory=None, coefs_directory=None, discard_file=None, save=False):
    """
    Run one update step after an annotation round.

    1. Identify the last run number from the newest pickle in `training_directory`.
    2. Copy the annotated classes of train_dataset_<n>.csv (in `uncertain_directory`)
       into train_dataset_<n>.pkl (in `coefs_directory`).
    3. Merge the kept samples into a new training set via concatenateTrainingSet,
       reading the labelled features from the same `coefs_directory`.

    Parameters
    ----------
    training_directory, uncertain_directory, coefs_directory :
        directories of the training sets, annotated csv files and feature pickles;
        each defaults to a path under os.getcwd() when None.
    discard_file : pickle collecting discarded points; defaults to "Discarded.pkl"
        under os.getcwd().
    save : forwarded to concatenateTrainingSet (write the new training set to disk).

    Returns
    -------
    The updated training set (DataFrame).

    Raises
    ------
    FileNotFoundError : when `training_directory` holds no numbered pickle file.
    """
    base_directory = os.getcwd()
    if training_directory is None:
        training_directory = os.path.join(base_directory, "")
    if uncertain_directory is None:
        uncertain_directory = os.path.join(base_directory, "")
    if coefs_directory is None:
        coefs_directory = os.path.join(base_directory, "")

    _, last_run_num = find_latest_pickle(training_directory)
    if last_run_num < 0:
        # otherwise the code below would look for "train_dataset_-1.pkl"
        raise FileNotFoundError(f"No numbered .pkl file found in {training_directory}")

    coefs_pickle_file = os.path.join(coefs_directory, f"train_dataset_{last_run_num}.pkl")
    uncertain_csv_file = os.path.join(uncertain_directory, f"train_dataset_{last_run_num}.csv")

    #pass the classes on the annotated csv file to the training features file
    concatenateClassToPickle(coefs_pickle_file,uncertain_csv_file)

    #stipulate the discard directory/file
    if discard_file is None:
        discard_file = os.path.join(base_directory, "Discarded.pkl")

    print(coefs_pickle_file)
    print(uncertain_csv_file)
    #update training set with kept samples; the features must be read from the very
    #directory whose pickle has just received the labels
    result_df = concatenateTrainingSet(
        training_directory,
        train_features_directory=coefs_directory,
        discard_file=discard_file,
        save=save,
    )
    return result_df

______________

# Update datasets and trained model - runnable

[Back to top](#Top)

In [None]:
from joblib import dump, load
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score

# Directories used by this update step (all resolved under the current working directory).
# These are notebook globals: later cells read base_directory, last_run_num and result_df.
base_directory = os.getcwd()
training_directory = os.path.join(base_directory, "")
uncertain_directory = os.path.join(base_directory, "")
coefs_directory = os.path.join(base_directory, "")
discard_file = os.path.join(base_directory, "")

#identify last AL iteration
# (taken BEFORE updateSets runs, so last_run_num+1 is the number of the set/model created now)
last_run, last_run_num = find_latest_pickle(directory = training_directory)

# NOTE(review): uncertain_directory, coefs_directory and discard_file defined above are
# not passed on - None is given instead, so updateSets falls back to its own defaults.
# save=True is forwarded to concatenateTrainingSet, which writes the merged set to disk.
result_df = updateSets(training_directory, uncertain_directory=None, coefs_directory=None, discard_file=None, save=True)

* __Retrain model with updated training set and save__

In [None]:
# Features: every column of the updated training set except the label, the point
# coordinates, the polygon/geotiff identifiers and the prediction certainty.
X_train_update = result_df.drop(columns=["Class","x","y", "polygonID", "geotiff", "Pred certainty"])
# transform_y_2classes is defined earlier in the notebook
# (presumably maps the annotated classes to Cashew / Non-cashew - confirm there)
y_train_update = transform_y_2classes(result_df["Class"])

svm_params = {'C': 0.0117, 'class_weight': "balanced", 'degree': 4, 'gamma': 0.6439, 'kernel': "poly"} #best params after HP tuning
clf = SVC(**svm_params, random_state=42)

clf.fit(X_train_update,y_train_update)

# The model is stored with joblib under the number that follows the last run
# (last_run_num comes from the previous cell); the file name has no extension.
filename = os.path.join(base_directory, "", f"svm_{last_run_num+1}") 
dump(clf, filename)

____

# AL iter performance

[Back to top](#Top)

* __MS-AL results__

In [None]:
def _iter_number(file_name):
    """Return the integer after the last "_" of file_name (extension ignored), or -1 if there is none."""
    try:
        return int(file_name.split("_")[-1].split(".")[0])
    except ValueError:
        return -1

# Reference model trained on the full RS train set
# (X_rs_*/y_rs_* and the two scorers are defined in earlier cells).
svm_params = {'C': 0.0117, 'class_weight': "balanced", 'degree': 4, 'gamma': 0.6439, 'kernel': "poly"}
clf = SVC(**svm_params, random_state=42)
clf.fit(X_rs_train_norm, y_rs_train)

base_directory = os.getcwd()
models_directory = os.path.join(base_directory, "")
training_sets_directory = os.path.join(base_directory, "")
line1_values = [balanced_accuracy_scorer(clf, X_rs_test_norm, y_rs_test), f1_cashew_scorer(clf, X_rs_test_norm, y_rs_test)]

# os.listdir returns names in arbitrary order: sort both listings by iteration number
# so that row i really is iteration i and models[i] belongs to the i-th training set
# (a plain alphabetical sort would put "_10" before "_2").
train_files = sorted(os.listdir(training_sets_directory), key=_iter_number)
models = sorted(os.listdir(models_directory), key=_iter_number)
n_iters = len(train_files)

iters=np.arange(n_iters)
train_sizes=[]
f1_train=[]
balacc_train=[]
f1_test=[]
balacc_test=[]
ovacc_train=[]
ovacc_test=[]

for i,train_file in enumerate(train_files):
    #get train sizes
    train_path = os.path.join(training_sets_directory, train_file)
    train_set = pd.read_pickle(train_path)
    train_set = train_set.loc[train_set["Class"]>0]
    train_sizes.append(train_set.shape[0])
    #get model results
    model_path = os.path.join(models_directory, models[i])
    clf = load(model_path)
    #features/labels of this iteration, extracted once and shared by the three train metrics
    X_iter_train = train_set[norm["band"]]
    y_iter_train = transform_y_2classes(train_set["Class"])
    f1_train.append(f1_cashew_scorer(clf, X_iter_train, y_iter_train))
    balacc_train.append(balanced_accuracy_scorer(clf, X_iter_train, y_iter_train))
    f1_test.append(f1_cashew_scorer(clf, X_rs_test_norm, y_rs_test))
    balacc_test.append(balanced_accuracy_scorer(clf, X_rs_test_norm, y_rs_test))
    ovacc_train.append(clf.score(X_iter_train, y_iter_train))
    ovacc_test.append(clf.score(X_rs_test_norm,y_rs_test))

#fill results_ms dataframe (one row per AL iteration)
results_ms = pd.DataFrame()
results_ms["iterID"] = iters
results_ms["trainSize"] = train_sizes
results_ms["ovacc_train"] = ovacc_train
results_ms["balacc_train"] = balacc_train
results_ms["f1_train"] = f1_train
results_ms["ovacc_test"] = ovacc_test
results_ms["balacc_test"] = balacc_test
results_ms["f1_test"] = f1_test

results_ms

* __RS results__

In [None]:
# Baseline: one model trained on the full RS train set
# (X_rs_*/y_rs_* and the two scorers are defined in earlier cells).
svm_params = {'C': 0.0117, 'class_weight': "balanced", 'degree': 4, 'gamma': 0.6439, 'kernel': "poly"}
clf = SVC(**svm_params, random_state=42)
clf.fit(X_rs_train_norm, y_rs_train)

# Each metric is evaluated once; the value is then repeated on both rows below.
full_rs_scores = {
    "ovacc_train": clf.score(X_rs_train_norm, y_rs_train),
    "balacc_train": balanced_accuracy_scorer(clf, X_rs_train_norm, y_rs_train),
    "f1_train": f1_cashew_scorer(clf, X_rs_train_norm, y_rs_train),
    "ovacc_test": clf.score(X_rs_test_norm, y_rs_test),
    "balacc_test": balanced_accuracy_scorer(clf, X_rs_test_norm, y_rs_test),
    "f1_test": f1_cashew_scorer(clf, X_rs_test_norm, y_rs_test),
}

# Two rows only: the smallest and the largest iterID / trainSize found in the RS and
# MS-AL results, so that plotting this frame draws a horizontal line across the curves.
results_maxrs = pd.DataFrame()
results_maxrs["iterID"] = [min(np.min(results_rs["iterID"]), np.min(results_ms["iterID"])), max(np.max(results_rs["iterID"]), np.max(results_ms["iterID"]))]
results_maxrs["trainSize"] = [min(np.min(results_rs["trainSize"]), np.min(results_ms["trainSize"])), max(np.max(results_rs["trainSize"]), np.max(results_ms["trainSize"]))]
for metric_name, metric_value in full_rs_scores.items():
    results_maxrs[metric_name] = [metric_value, metric_value]

* __Plots__

In [None]:
#compare RS and MS-AL curves, one subplot per metric
plot_maxrs=True #set to False to hide the horizontal line with the performance of the full RS train set model
results = [results_rs, results_ms, results_maxrs]
columns_to_plot = ['ovacc_train', 'balacc_train', 'f1_train', 'ovacc_test', 'balacc_test', 'f1_test']
titles_to_plot = ["Ov. Accuracy train", "Bal. Accuracy train", "F1 train", "Ov. Accuracy test", "Bal. Accuracy test", "F1 test"]

rs_curve, al_curve, full_rs_line = results

plt.figure(figsize = (18,8))
for i, (col, subplot_title) in enumerate(zip(columns_to_plot, titles_to_plot)):
    plt.subplot(2,3,i+1) #6 plots, 2x3 matrix

    plt.plot(rs_curve["trainSize"], rs_curve[col], marker='o', linestyle='-', label="RS")
    plt.plot(al_curve["trainSize"], al_curve[col], marker='o', linestyle='-', label="MS-AL")
    if plot_maxrs:
        plt.plot(full_rs_line["trainSize"], full_rs_line[col], linestyle='--', label="Full RS train set")

    plt.xlabel('trainSize')
    plt.ylabel('Performance')
    plt.title(subplot_title)
    plt.grid(True)
    plt.legend()

plt.tight_layout()
plt.show()

In [None]:
# Show the per-iteration MS-AL metrics table in the cell output.
results_ms

In [None]:
# Show the RS metrics table in the cell output (results_rs is built in another cell).
results_rs

________________

# Stats per iteration

[Back to top](#Top)

* __Class proportions per AL iteration__

In [None]:
def _iter_number(file_name):
    """Return the integer after the last "_" of file_name (extension ignored), or -1 if there is none."""
    try:
        return int(file_name.split("_")[-1].split(".")[0])
    except ValueError:
        return -1

plt.figure(figsize=(15, 10))

#bar color per annotated class; -1 is the class given to discarded samples
colors = {-1: "black", 1: '#006600', 2: '#99ff33', 3: "#2d8659", 4: '#c6538c', 5: '#808000', 6: '#804000', 7: '#0000ff', 8:'orange'}

base_directory = os.getcwd()
models_directory = os.path.join(base_directory, "")
training_sets_directory = os.path.join(base_directory, "")
coef_sets_directory = os.path.join(base_directory, "")

#os.listdir returns names in arbitrary order: sort by iteration number so that
#"Iteration k" below really is the k-th file
coef_files = sorted(os.listdir(coef_sets_directory), key=_iter_number)
n_iters = len(coef_files)

#subplot grid: 4 columns and just enough rows (ceiling division), e.g. 8 iterations -> 2 rows
n_cols = 4
n_rows = max(1, -(-n_iters // n_cols))

proportions = []

iters=np.arange(n_iters)
for i,train_file in enumerate(coef_files):
    train_path = os.path.join(coef_sets_directory, train_file)
    train_set = pd.read_pickle(train_path)

    plt.subplot(n_rows,n_cols,i+1)

    #barplots of pixel counts per class
    class_counts = train_set['Class'].value_counts()
    class_counts = class_counts.sort_index()
    class_counts.plot(kind='bar', color=[colors[cls] for cls in class_counts.index])

    plt.title(f'Iteration {i+1}')
    plt.xlabel('Class')
    plt.ylabel('Count')
    plt.xticks(rotation=0)
    plt.grid(axis='y', linestyle='--', alpha=0.7)

    #share of discarded samples (class -1); np.round also accepts the plain Python
    #float obtained when no sample was discarded (.get falls back to 0)
    proportion = class_counts.get(-1, 0) / len(train_set)
    proportions.append([i+1, len(train_set), np.round(proportion, 3)])

plt.tight_layout()
plt.show()

headers = ["Iteration", "Train Set Size", "Proportion of Class -1"]
print(tabulate(proportions, headers=headers, tablefmt="grid"))

* __Percentage of discarded points per AL iteration__

In [None]:
#rows of `proportions` (built in the previous cell): [iteration, set size, fraction of class -1]
proportions = np.array(proportions)
plt.grid()
plt.gca().set_axisbelow(True)
plt.axvline(5, linestyle="--", color="orange", label="Peak performance iteration")
#the stored values are fractions in [0, 1]; the axis label and title announce
#percentages, so scale by 100 before plotting
#NOTE(review): column 0 already holds i+1, so the extra +1 shifts the x positions to
#2..n+1 - confirm that this offset is intended (kept unchanged here)
plt.scatter(proportions[:,0]+1, proportions[:,2]*100)
plt.legend()
plt.xlabel("Iteration")
plt.ylabel("% of discarded points")
plt.title("Percentage of Discarded Points per Iteration");
plt.tight_layout()
plt.savefig("ms_percentage_discarded_points_per_iteration.png")
plt.show()

________________

# Bottom

[Back to Top.](#Top)