# Análise dos Resultados Obtidos durante as Iterações do Processo de Active Learning
Neste notebook estão algumas métricas de avaliação do processo iterativo.

- Evolução da quantidade de instâncias e da área segmentada durante as iterações
- Evolução de algumas métricas, como: Acurácia, F1-Score, KappaScore e IoU

## Setup

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import sys

sys.path.append("..")

import os
from PIL import Image
from collections.abc import Iterable
from os.path import dirname, join

import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation

import numpy as np
import pandas as pd
import seaborn as sns
import torch.optim
from joblib import Parallel, delayed
from matplotlib.ticker import FuncFormatter
from skimage.color import label2rgb
# load label
from skimage.measure import label
import subprocess

from utils import autolabel
from pred2raster import pred2raster
from sample_selection import get_components_stats
from src.io_operations import fix_relative_paths, load_args, read_tiff, read_yaml

from IPython.display import HTML, display

from millify import  millify

from matplotlib import rc

In [None]:
import matplotlib.font_manager

In [None]:
# Global plotting look: seaborn "whitegrid" theme with a Times New Roman font.
sns.set_theme(style="whitegrid", font="Times New Roman")

# Matplotlib overrides applied after the seaborn theme: default figure size
# (inches), rendering resolution and font family.
plt.rcParams.update(
    {
        "figure.figsize": (8, 5),
        "figure.dpi": 120,
        "font.family": "times new roman",
    }
)

# Colour palettes, filled in later and keyed by the column used as hue.
dict_colors = dict()

# Number of joblib workers used when loading the per-iteration statistics.
N_JOBS = 4

## Defina a pasta com a saída do processo de Active Learning

In [None]:
# Folder holding the output of one Active Learning run: the "iter_*"
# sub-folders and the run's args.yaml are read from it below.
# NOTE: VERSION_FOLDER is an absolute path here, so os.path.join discards the
# dirname(os.getcwd()) prefix and DATA_PATH ends up equal to VERSION_FOLDER.
# The prefix only takes effect when VERSION_FOLDER is a relative path.
VERSION_FOLDER = "/media/dariossh/Extreme SSD/Arquivos SSH/2.8.5_version_data"
DATA_PATH = join(dirname(os.getcwd()), VERSION_FOLDER)

# Load the arguments stored with this run (args.yaml inside DATA_PATH).
args = load_args(join(DATA_PATH, "args.yaml"))

In [None]:
# Folder with the input data; id_trees.csv is read from here.
INPUT_PATH = "../amazon_input_data"

In [None]:
def get_iter_stats(iter_folder, labels_file="all_labels_set.tif"):
    """Compute per-component statistics for the label raster of one iteration.

    Parameters
    ----------
    iter_folder : str
        Iteration folder name in the form ``iter_<number>``; the number is
        parsed from the text between the first and second underscore.
    labels_file : str
        Raster file name looked up inside ``<iter_folder>/new_labels``.
        Ignored for iteration 0, where ``args.train_segmentation_path`` is
        read instead.

    Returns
    -------
    The table returned by ``get_components_stats`` with its index reset and
    two extra columns: ``iter`` (``"iter_NNN"``, zero padded to 3 digits)
    and ``iter_num`` (the integer iteration number).
    """
    # Parse the iteration number once; it both selects the raster to read and
    # tags the resulting rows.
    iter_num = int(iter_folder.split("_")[1])

    if iter_num == 0:
        label_path = args.train_segmentation_path
    else:
        label_path = join(DATA_PATH, iter_folder, "new_labels", labels_file)

    label_img = read_tiff(label_path)

    # Label the connected regions of the raster (skimage.measure.label).
    components = label(label_image=label_img)

    components_stats = get_components_stats(components, label_img)
    components_stats.reset_index(inplace=True)

    components_stats["iter"] = f"iter_{iter_num:03d}"
    components_stats["iter_num"] = iter_num

    return components_stats

## Data loading

In [None]:
# Series indexed by tree ID holding the popular name, sorted by name.
tree_names_table = pd.read_csv(join(INPUT_PATH, "id_trees.csv"), index_col="ID")
id_tree = tree_names_table["Nome popular"].sort_values()

In [None]:
# Per-iteration output folders ("iter_*") found in DATA_PATH, in ascending
# name order.
iter_folders = sorted(
    entry for entry in os.listdir(DATA_PATH) if entry.startswith("iter_")
)

# Drop the last folder (presumably an unfinished iteration - TODO confirm).
# The trailing ";" keeps the notebook from echoing the popped value.
iter_folders.pop();

In [None]:
OUTPUT_PATH_ALL_LABELS_STATS = join(DATA_PATH, "all_labels_stats.parquet")

# The statistics are cached as parquet and only recomputed when the cache
# file does not exist yet.
all_labels_cache_missing = not os.path.exists(OUTPUT_PATH_ALL_LABELS_STATS)

if all_labels_cache_missing:
    # Statistics of "all_labels_set.tif" for every iteration, in parallel.
    all_labels_jobs = [
        delayed(get_iter_stats)(iter_folder, "all_labels_set.tif")
        for iter_folder in iter_folders
    ]
    all_iter_stats = Parallel(n_jobs=N_JOBS)(all_labels_jobs)

    # all_iter_stats holds the data generated for the image bank; stack the
    # per-iteration tables into a single one.
    all_labels_stats = pd.concat(all_iter_stats)

    # Attach the name mapped from each tree type ID.
    all_labels_stats["tree_name"] = all_labels_stats["tree_type"].map(id_tree)

    all_labels_stats.to_parquet(OUTPUT_PATH_ALL_LABELS_STATS)

# Always read the table back from the cache file.
all_labels_stats = pd.read_parquet(OUTPUT_PATH_ALL_LABELS_STATS)

In [None]:
OUTPUT_PATH_SELECTED_LABELS_STATS = join(DATA_PATH, "selected_labels_stats.parquet")

# Same parquet caching scheme: compute only when the cache file is missing.
selected_labels_cache_missing = not os.path.exists(OUTPUT_PATH_SELECTED_LABELS_STATS)

if selected_labels_cache_missing:
    # Statistics of "selected_labels_set.tif" for every iteration, in parallel.
    selected_labels_jobs = [
        delayed(get_iter_stats)(iter_folder, "selected_labels_set.tif")
        for iter_folder in iter_folders
    ]
    selected_iter_stats = Parallel(n_jobs=N_JOBS)(selected_labels_jobs)

    # selected_iter_stats holds the data generated for the training set;
    # stack the per-iteration tables into a single one.
    selected_labels_stats = pd.concat(selected_iter_stats)

    # Attach the name mapped from each tree type ID.
    selected_labels_stats["tree_name"] = selected_labels_stats["tree_type"].map(id_tree)

    selected_labels_stats.to_parquet(OUTPUT_PATH_SELECTED_LABELS_STATS)

# Always read the table back from the cache file.
selected_labels_stats = pd.read_parquet(OUTPUT_PATH_SELECTED_LABELS_STATS)


In [None]:
# Test metrics of every iteration except the first folder in iter_folders:
# one row per iteration, one column per key found in test_metrics.yaml.
# The loop variable is deliberately not called `iter`, which would shadow the
# builtin for every later cell of the notebook session.
metrics_evo = pd.DataFrame(
    [
        read_yaml(join(DATA_PATH, iter_folder, "test_metrics.yaml"))
        for iter_folder in iter_folders[1:]
    ]
)

In [None]:
# Palette used when the data has 17 distinct tree types: colours are taken
# from seaborn's "tab20" palette and paired, in order, with the keys.
if all_labels_stats["tree_type"].nunique() == 17:
    tab20_colors = sns.color_palette("tab20")

    # Iterating id_tree yields its values, i.e. the tree names.
    dict_colors["tree_name"] = dict(zip(id_tree, tab20_colors))

    # zip stops at the shorter input, so at most 18 IDs (0..17) get a colour.
    dict_colors["tree_type"] = dict(zip(range(0, 18), tab20_colors))

In [None]:

# Palette used when the data has 14 distinct tree types: a fixed tuple of
# named colours paired, in order, with the keys.
if all_labels_stats["tree_type"].nunique() == 14:
    DEFAULT_COLORS = (
        "silver",
        "blue",
        "yellow",
        "magenta",
        "green",
        "indigo",
        "darkorange",
        "cyan",
        "pink",
        "yellowgreen",
        "red",
        "darkgreen",
        "gold",
        "teal",
    )

    # Iterating id_tree yields its values, i.e. the tree names.
    dict_colors["tree_name"] = dict(zip(id_tree, DEFAULT_COLORS))

    # zip stops at the shorter input: only IDs 0..13 of range(0, 18) get a colour.
    dict_colors["tree_type"] = dict(zip(range(0, 18), DEFAULT_COLORS))

## Evolução da Quantidade de Instâncias e Área Segmentada
Para essa análise, serão consideradas instâncias presentes no inventário de amostras. Nesse inventário estão as amostras em que o modelo teve alta confiança na predição e que passaram pelo filtro de seleção com base nas heurísticas.

In [None]:
# Grab the current figure (created if none exists) for seaborn to draw on.
fig = plt.gcf()

# Number of distinct component labels in the inventory at each iteration.
count_tree_by_iter = (
    all_labels_stats
    .groupby(["iter_num"], as_index=False, sort=True)["label"]
    .nunique()
    .rename(columns={"label": "n_trees"})
)

ax = sns.lineplot(x="iter_num", y="n_trees", data=count_tree_by_iter)
ax.set_title("Number of trees per iteration")

plt.show()

In [None]:
# Segmentation area per iteration: sum of the component areas in the
# inventory at each iteration.
fig = plt.gcf()
fig.set_size_inches(12, 5)

area_tree_by_iter = all_labels_stats.groupby(
    ["iter_num"], as_index=False, sort=True
)["area"].sum()

ax = sns.lineplot(
    data=area_tree_by_iter,
    x="iter_num",
    y="area",
)

ax.set_title("Segmented area per iteration")
ax.set_ylabel("Area (pxl²)")
ax.set_xlabel("Iteration")

# No legend here: the plot is a single line without hue, so there are no
# labelled artists and a legend would only be an empty box plus a warning.

plt.show()



In [None]:
# Number of distinct component labels per (iteration, species) pair.
count_tree_type_by_iter = (
    all_labels_stats
    .groupby(["iter_num", "tree_name"], as_index=False, sort=True)["label"]
    .nunique()
    .rename(columns={"label": "n_trees"})
)

In [None]:
# Widen the current figure before drawing.
fig = plt.gcf()
fig.set_size_inches(12, 5)

# One line per species: number of trees at each iteration.
ax = sns.lineplot(
    x="iter_num",
    y="n_trees",
    hue="tree_name",
    data=count_tree_type_by_iter,
    palette=dict_colors["tree_name"],
    errorbar=None,
)
ax.set_title("Number of trees per iteration")

# Legend placed outside the plotting area, to the right.
ax.legend(title="Tree Types", bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0)

plt.show()

# Same numbers as a species x iteration table, shaded along each row.
print("Evolução por espécies")
species_by_iter = count_tree_type_by_iter.pivot(
    index="tree_name", columns="iter_num", values="n_trees"
)
display(species_by_iter.style.background_gradient(axis=1))

In [None]:
# Total segmented area per species at each iteration (inventory data).
fig = plt.gcf()
fig.set_size_inches(12, 5)

ax = sns.lineplot(
    data=all_labels_stats,
    x="iter_num",
    y="area",
    hue="tree_name",
    estimator="sum",
    errorbar=None,
    palette=dict_colors["tree_name"],
)

ax.set_title("Segmentation Area per iteration")

ax.set_ylabel("Area (pxl)")
ax.set_xlabel("Iteration")

# Abbreviate large tick values with millify (2 decimal places).
ax.yaxis.set_major_formatter(FuncFormatter(lambda y, _: f"{millify(y, 2)}"))

# Legend placed outside the plotting area, to the right.
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0, title="Tree Types")

plt.show()

In [None]:
# ---- First iteration: number of distinct trees per species ----
first_iter_stats = all_labels_stats[all_labels_stats["iter_num"] == 0]

# Bars ordered from the most to the least frequent species.
first_iter_order = (
    first_iter_stats.groupby("tree_name")["label"]
    .nunique()
    .sort_values(ascending=False)
    .index
)

fig = plt.gcf()
fig.set_size_inches(17, 3)

ax = sns.barplot(
    data=first_iter_stats,
    x="tree_name",
    y="label",
    hue="tree_name",
    estimator=lambda group: group.nunique(),
    errorbar=None,
    order=first_iter_order,
    palette=dict_colors["tree_name"],
)

ax.set_title("Number of Trees at the Start")
ax.yaxis.set_major_formatter(FuncFormatter(lambda y, _: f"{millify(y, 2)}"))
ax.legend(loc="upper left", bbox_to_anchor=(1, 1))
ax.tick_params(axis="x", rotation=45)

# Write each bar's value on the plot using the project's autolabel helper.
autolabel(ax, "{:,.0f}")

plt.show()


# ---- Last iteration: same plot for the highest iter_num ----
last_iter = all_labels_stats["iter_num"].max()
last_iter_stats = all_labels_stats[all_labels_stats["iter_num"] == last_iter]

last_iter_order = (
    last_iter_stats.groupby("tree_name")["label"]
    .nunique()
    .sort_values(ascending=False)
    .index
)

fig = plt.gcf()
fig.set_size_inches(17, 3)

ax = sns.barplot(
    data=last_iter_stats,
    x="tree_name",
    y="label",
    hue="tree_name",
    estimator=lambda group: group.nunique(),
    errorbar=None,
    order=last_iter_order,
    palette=dict_colors["tree_name"],
)

ax.set_title(f"Number of Trees at {last_iter}th Iteration")

autolabel(ax, "{:,.0f}")

ax.yaxis.set_major_formatter(FuncFormatter(lambda y, _: f"{millify(y, 2)}"))
ax.tick_params(axis="x", rotation=45)

plt.show()

In [None]:
# ---- First iteration: total segmented area per species ----
first_iter_stats = all_labels_stats[all_labels_stats["iter_num"] == 0]

# Bars ordered from the largest to the smallest total area.
first_iter_area_order = (
    first_iter_stats.groupby("tree_name")["area"]
    .sum()
    .sort_values(ascending=False)
    .index
)

fig = plt.gcf()
fig.set_size_inches(17, 4)

ax = sns.barplot(
    data=first_iter_stats,
    x="tree_name",
    y="area",
    hue="tree_name",
    estimator=lambda group: group.sum(),
    errorbar=None,
    order=first_iter_area_order,
    palette=dict_colors["tree_name"],
)

ax.set_title("Segmentation Area at the Start")
ax.yaxis.set_major_formatter(FuncFormatter(lambda y, _: f"{millify(y, 2)}"))
ax.legend(loc="upper left", bbox_to_anchor=(1, 1))
ax.tick_params(axis="x", rotation=45)

# Write each bar's value on the plot using the project's autolabel helper.
autolabel(ax, "{:,.0f}")

plt.show()


# ---- Last iteration: same plot for the highest iter_num ----
last_iter = all_labels_stats["iter_num"].max()
last_iter_stats = all_labels_stats[all_labels_stats["iter_num"] == last_iter]

last_iter_area_order = (
    last_iter_stats.groupby("tree_name")["area"]
    .sum()
    .sort_values(ascending=False)
    .index
)

fig = plt.gcf()
fig.set_size_inches(17, 4)

ax = sns.barplot(
    data=last_iter_stats,
    x="tree_name",
    y="area",
    hue="tree_name",
    estimator=lambda group: group.sum(),
    errorbar=None,
    order=last_iter_area_order,
    palette=dict_colors["tree_name"],
)

ax.set_title(f"Segmentation Area at {last_iter}th Iteration")

autolabel(ax, "{:,.0f}")

ax.yaxis.set_major_formatter(FuncFormatter(lambda y, _: f"{millify(y, 2)}"))
ax.tick_params(axis="x", rotation=45)

plt.show()

## Evolução Temporal do Conjunto de Treino

O conjunto de treino corresponde apenas às labels usadas para treinar o modelo, ou seja, esses dados são apenas um subconjunto do inventário com amostras de alta confiança.

Essas análises têm como objetivo identificar o balanceamento das amostras usadas no treino.

In [None]:
# Total area per species at each iteration, for the training set
# (selected_labels_stats).
ax = sns.lineplot(
    data=selected_labels_stats,
    x="iter_num",
    y="area",
    hue="tree_name",
    estimator="sum",
    errorbar=None,
    palette=dict_colors["tree_name"],
)

ax.set_title("Segmentation Area per iteration")

# Log scale on the y axis.
ax.set_yscale("log")

ax.set_ylabel("Area (pxl)")
ax.set_xlabel("Iteration")

# Legend placed outside the plotting area, to the right.
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0, title="Tree Types")

plt.show()

In [None]:
# Number of distinct component labels per (species, iteration) pair in the
# training set.
count_by_iter = (
    selected_labels_stats
    .groupby(["tree_name", "iter_num"], as_index=False, sort=True)["label"]
    .nunique()
    .rename(columns={"label": "n_trees"})
)

# Resize the current figure before drawing.
fig = plt.gcf()
fig.set_size_inches(10, 5)

ax = sns.lineplot(
    x="iter_num",
    y="n_trees",
    hue="tree_name",
    data=count_by_iter,
    palette=dict_colors["tree_name"],
    errorbar=None,
)
ax.set_title("Number of trees per iteration")

# Legend placed outside the plotting area, to the right.
ax.legend(title="Tree Types", bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0)

plt.show()

# Evolução da Distribuição da Área Durante as Iterações

In [None]:

def format_y_ticks(value, _):
    """Return the tick label for ``value`` as produced by millify (precision 1).

    The second argument is accepted and ignored; the two-argument signature
    is the one ``FuncFormatter`` callbacks receive.
    """
    tick_label = millify(value, precision=1)
    return tick_label



In [None]:
# Group the inventory rows by species.
grouped_data = all_labels_stats.groupby('tree_name')

# One figure per species: box plot of component area at each iteration,
# with outliers hidden.
for name, group in grouped_data:

    fig, axes = plt.subplots(figsize=(7, 3))

    sns.boxplot(
        group,
        x="iter_num",
        y="area",
        hue="tree_name",
        palette=dict_colors["tree_name"],
        ax=axes,
        showfliers=False,
    )

    # Set the labels and title
    axes.set_xlabel('Iteration')
    axes.set_ylabel('Area (pxl²)')
    axes.set_title(f'{name}')

    # Format the y ticks of the axes created in this loop pass. This must be
    # `axes`: the global `ax` still points at an axes from an earlier cell.
    axes.yaxis.set_major_formatter(FuncFormatter(format_y_ticks))

    # Show the plot
    plt.show()



## Evolução das Métricas Durante as Iterações

In [None]:
# "Accuracy" column of the test metrics, row by row; y axis fixed to 40-100.
ax = sns.lineplot(data=metrics_evo["Accuracy"])
ax.set(ylim=(40, 100), title="Accuracy")
plt.show()

In [None]:
# "avgF1" column of the test metrics, row by row; y axis fixed to 40-100.
ax = sns.lineplot(data=metrics_evo["avgF1"])
ax.set(ylim=(40, 100), title="F1 Score")
plt.show()

In [None]:
# "KappaScore" column of the test metrics, row by row; y axis fixed to 40-100.
ax = sns.lineplot(data=metrics_evo["KappaScore"])
ax.set(ylim=(40, 100), title="KappaScore")
plt.show()

In [None]:
# "avgIOU" column of the test metrics, row by row; y axis fixed to 0-100.
ax = sns.lineplot(data=metrics_evo["avgIOU"])
ax.set(ylim=(0, 100), title="Intersection Over True Label Area")
plt.show()

## Evolução do F1-Score para Cada Espécie

In [None]:
# Expand each row's "F1" entry into separate columns (one per position/key).
metrics_evo_tree = metrics_evo["F1"].apply(pd.Series)

# Long format: one row per (metrics row, F1 column) pair.
metrics_evo_tree_stack = (
    metrics_evo_tree.stack()
    .reset_index()
    .rename(columns={"level_0": "iter", "level_1": "tree_type", 0: "f1_score"})
)

# Shift both indices by one. The metrics rows start at the second iteration
# folder; the class shift presumably matches 1-based tree IDs - TODO confirm.
metrics_evo_tree_stack["iter"] = metrics_evo_tree_stack["iter"] + 1
metrics_evo_tree_stack["tree_type"] = metrics_evo_tree_stack["tree_type"] + 1

# Attach the name mapped from each tree type ID.
metrics_evo_tree_stack["tree_name"] = metrics_evo_tree_stack["tree_type"].map(id_tree)

In [None]:
# Small multiples: one panel per species (3 per row) with its F1 score
# across iterations, coloured with the species palette.
g = sns.FacetGrid(
    data=metrics_evo_tree_stack,
    col="tree_name",
    hue="tree_name",
    col_wrap=3,
    palette=dict_colors["tree_name"],
)

g.map(sns.lineplot, "iter", "f1_score")

# Panel titles show only the species name.
g.set_titles("{col_name}")

plt.show()

In [None]:
# Folder with the "synthetic_all_labels" visualizations of the run. Built from
# DATA_PATH (the resolved run folder) rather than VERSION_FOLDER so it stays
# correct when VERSION_FOLDER is a relative path; for an absolute
# VERSION_FOLDER both are the same location.
VIEW_CONTOUR_PATH = join(DATA_PATH, "visualization", "synthetic_all_labels")

# File names sorted ascending; the last one is the view that gets loaded.
views_path = os.listdir(VIEW_CONTOUR_PATH)
views_path.sort()

LAST_VIEW = join(VIEW_CONTOUR_PATH, views_path[-1])

VIEW_SEG_CONTOUR = plt.imread(LAST_VIEW)

In [None]:
# Show the loaded view on a high-resolution (600 dpi) figure without axes.
plt.figure(dpi=600)
contour_axes = plt.gca()
contour_axes.imshow(VIEW_SEG_CONTOUR)
contour_axes.axis("off")
plt.show()

## Animação da Evolução de Amostras

In [None]:
# Folder with the "all_labels" visualizations of the run. Built from DATA_PATH
# (the resolved run folder) rather than VERSION_FOLDER so it stays correct
# when VERSION_FOLDER is a relative path; for an absolute VERSION_FOLDER both
# are the same location.
VIEW_SEG_PATH = join(DATA_PATH, "visualization", "all_labels")

# Full path of every file in that folder, sorted ascending.
image_paths = os.listdir(VIEW_SEG_PATH)
image_paths = [join(VIEW_SEG_PATH, image_path) for image_path in image_paths]
image_paths.sort()

In [None]:
# Frames of the animation: one image per entry of image_paths, in order.
images = [Image.open(path) for path in image_paths]

# Create a figure and axis
fig, ax = plt.subplots()


def init():
    """Draw the first image as the initial state of the animation."""
    # Nothing to draw when there are no images.
    if not images:
        return
    ax.imshow(images[0])
    ax.axis('off')  # Turn off axis


def update(frame):
    """Redraw the axes with image number ``frame`` and put it in the title."""
    # Start from an empty axes so the images of earlier frames do not pile up
    # as extra artists that get re-rendered on every frame.
    ax.clear()
    ax.imshow(images[frame])
    ax.axis('off')  # Turn off axis
    ax.set_title(f"Iter: {frame}")


# Create the animation: one frame per image, 1000 ms between frames.
num_frames = len(images)
ani = FuncAnimation(
    fig,
    update,
    frames=num_frames,
    init_func=init,
    blit=False,
    interval=1000,
)

# Display the animation in the notebook as an interactive JS/HTML widget.
display(HTML(ani.to_jshtml()))

# Close the figure so the notebook does not also render it as a static plot.
plt.close()

In [None]:
# Static version: one figure per image, titled with its 1-based position.
for num, path in enumerate(image_paths):
    # Open one image at a time and close it once drawn, instead of opening
    # every file up front and leaving the handles open.
    with Image.open(path) as image:
        fig, ax = plt.subplots()
        ax.set_title(f"Iter {num+1}")

        ax.imshow(image)
        ax.axis("off")
        plt.show()
    

# Exportando Relatório

In [None]:
# Echo the run folder; the HTML report below is written into it.
DATA_PATH

In [None]:
# Convert report_metrics.ipynb to HTML with nbconvert and write the result
# into the run folder (DATA_PATH). Output is captured as text.
result = subprocess.run(
    [
        "jupyter",
        "nbconvert",
        "--to",
        "html",
        "report_metrics.ipynb",
        f"--output-dir={DATA_PATH}",
    ],
    capture_output=True,
    text=True,
)

print(result.stdout)

# nbconvert writes its log messages to stderr, so show that stream too;
# otherwise a failed export would go unnoticed.
if result.stderr:
    print(result.stderr)

if result.returncode != 0:
    print(f"nbconvert exited with return code {result.returncode}")