In [1]:
# Load the autoreload extension so that modules edited on disk are picked up.
%load_ext autoreload
# With mode 2, modules are reloaded automatically, without the need to restart the kernel.
%autoreload 2

In [6]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import os
from tqdm import tqdm
from glob import glob
from agrilearn.crop_classification import evaluate_utils as eval_util

In [7]:
# Dataset name plus the input/output locations used by this evaluation run.
# Every path is derived from DATASET_NAME so that switching datasets is a
# one-line change and the paths cannot drift apart.
DATASET_NAME = "teste_pre_safra_2024_2025"
GEOPACKAGE_PATH = (
    f"/agrilearn_app/datasets/{DATASET_NAME}/geopackage/processed/"
    "data_merged_crop_120_06_08_2024_input_mvp_processing.gpkg"
)

EOPATCH_PATH = f"/agrilearn_app/datasets/{DATASET_NAME}/eopatch/input_train/"
OUTPUT_PREDICTIONS = f"/agrilearn_app/output/experiment_01/predictions/{DATASET_NAME}"
OUTPUT_RESULTS = f"/agrilearn_app/output/experiment_01/results/{DATASET_NAME}"

In [None]:
## 1. Read Dataset

In [5]:
# Read every prediction file found under OUTPUT_PREDICTIONS.
# The helper module is imported under the alias `eval_util`; the bare name
# `evaluate_utils` is never bound, so using it fails on a fresh kernel.
df = eval_util.read_multiplies_files_from_path(OUTPUT_PREDICTIONS)

There are 8340 files to read


Reading files:: 100%|██████████████████████████| 8340/8340 [00:34<00:00, 238.61it/s]


In [None]:
# Preview the first rows of the frame read from OUTPUT_PREDICTIONS.
df.head()

In [None]:
import plotly.graph_objects as go

# NOTE(review): `create_sankey_figure_html` is neither defined nor imported in the
# visible notebook — presumably it lives in `eval_util`; confirm and qualify the call.
# 'monitoring_class' is passed as the true label and 'crop_class_rnn' as the predicted one.
fig_html = create_sankey_figure_html(df_=df,
                                     label_true='monitoring_class',
                                     label_pred='crop_class_rnn')

In [None]:
# Display the object returned by create_sankey_figure_html.
fig_html

In [None]:
# NOTE(review): `prepate_data_to_sankey` is neither defined nor imported in the
# visible notebook — presumably it lives in `eval_util`; confirm and qualify the call.
# Later cells read the columns 'source', 'target' and 'count' from its result.
source_target = prepate_data_to_sankey(df_=df,
                                       label_true='monitoring_class',
                                       label_pred='crop_class_rnn')


In [None]:
# Inspect the table returned by prepate_data_to_sankey.
source_target

In [None]:
# Inspect the 'target' column on its own.
source_target['target']

In [None]:
# Re-display the previously built fig_html.
fig_html

In [None]:
# Column names holding the true and the predicted class. Until now they were
# only passed as keyword arguments, so the bare names used from here on were
# unbound on a fresh kernel.
# Assumes source_target carries columns with these names — TODO confirm.
label_true = 'monitoring_class'
label_pred = 'crop_class_rnn'

# Identify every unique class (true and predicted), in order of first
# appearance with the true labels scanned first.
unique_classes = pd.Index(
    pd.concat([source_target[label_true], source_target[label_pred]]).unique())
unique_classes

In [None]:
# Give every class an integer id equal to its position in unique_classes.
classes_index = dict(zip(unique_classes, range(len(unique_classes))))
classes_index

In [None]:
# Map the class names to numeric indices: 'source' comes from the true label
# column and 'target' from the predicted label column.
for endpoint, label_column in (('source', label_true), ('target', label_pred)):
    source_target[endpoint] = source_target[label_column].map(classes_index)

In [None]:
# Inspect source_target after 'source' and 'target' were replaced by numeric indices.
source_target

In [None]:
# Hex colour used for each crop class in the Sankey diagrams.
# Every class needs its own colour: PASTURE previously reused WHEAT's
# '#1bf7e2', which made the two classes indistinguishable in the plots.
colorDict = {
    'SOYBEAN': '#f71b1b',
    'CORN': '#1b7ef7',
    'COTTON': '#f3f71b',
    'RICE': '#12e23f',
    'SUGAR_CANE': '#f78c1b',
    'WHEAT': '#1bf7e2',
    'PASTURE': '#8c1bf7',
}

In [None]:
# Colours of the classes seen as true labels, followed by the colours of the
# classes seen as predictions. A class missing from colorDict raises KeyError.
# NOTE(review): this order (true uniques + predicted uniques) is not the order
# of unique_classes — confirm before using it as node colours.
true_side_colors = [colorDict[cls] for cls in source_target[label_true].unique()]
pred_side_colors = [colorDict[cls] for cls in source_target[label_pred].unique()]
class_colors = true_side_colors + pred_side_colors
class_colors

In [None]:
# Show the class names as a plain list (used as node labels in the next figure).
unique_classes.tolist()

In [None]:
# Build the Sankey diagram of true vs. predicted classes.
import plotly.graph_objects as go

# True classes get their own nodes and predicted classes a second set of nodes.
# Sharing one node per class (as the 'source'/'target' columns do) turns every
# correct prediction into a link from a node to itself.
true_classes = source_target[label_true].unique().tolist()
pred_classes = source_target[label_pred].unique().tolist()
true_node = {cls: pos for pos, cls in enumerate(true_classes)}
pred_node = {cls: len(true_classes) + pos for pos, cls in enumerate(pred_classes)}
node_labels = true_classes + pred_classes

fig = go.Figure(data=[go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=node_labels,
        # Colour each node by its crop class; grey for classes missing from colorDict.
        color=[colorDict.get(cls, '#888888') for cls in node_labels]
    ),
    link=dict(
        source=source_target[label_true].map(true_node).tolist(),
        target=source_target[label_pred].map(pred_node).tolist(),
        value=source_target['count'].tolist(),
        hovercolor=["midnightblue", "lightskyblue", "gold", "mediumturquoise", "lightgreen", "cyan"]
    )
)])
fig

In [None]:
# Re-display source_target (the data behind the figure above).
source_target

In [None]:
# Figure layout: title, font size and canvas dimensions.
layout_options = dict(
    title_text="Gráfico Sankey: Relação entre Real e Predito",
    font_size=12,
    width=700,
    height=800,
)
fig.update_layout(**layout_options)

fig

In [None]:
from pysankey import sankey

In [None]:
# Show the true-label values of gdf, skipping the first element.
# NOTE(review): `gdf` is never assigned in the visible notebook (only `df` is loaded) —
# confirm where it comes from; this cell fails on a fresh kernel otherwise.
gdf[label_true].values[1:]

In [None]:
# Static Sankey diagram (pysankey) of true vs. predicted classes.
# Each row of source_target is one (true, predicted) pair, so both sides are
# weighted by that pair's 'count'. The 'source'/'target' columns hold class
# indices, not sample counts, and must not be used as weights.
ax = sankey(
    left=source_target[label_true],
    right=source_target[label_pred],
    leftWeight=source_target['count'],
    rightWeight=source_target['count'],
    colorDict=colorDict,
    fontsize=12,
    aspect=20,
    color_gradient=True
)

In [None]:
# Show the results directory configured at the top of the notebook.
OUTPUT_RESULTS

In [None]:
# Save the diagram into the configured results folder instead of repeating the
# absolute path, and create the folder first so a fresh run does not fail.
os.makedirs(OUTPUT_RESULTS, exist_ok=True)
ax.figure.savefig(os.path.join(OUTPUT_RESULTS, 'sankey.png'))

In [None]:
# gdf.period.unique()

# # select only the ones of first season
# gdf = gdf[gdf.period.isin(['2021/2022', '2020/2021', '2019/2020', '2023/2024',
#                                               '2017/2018', '2018/2019', '2022/2023'])]

In [None]:
# gdf[(gdf["gt_class"] == "SUGAR_CANE")].crop_class_rnn.value_counts().to_frame()

In [None]:
# gdf_valid = gdf[['crop_class_gt', 'crop_class_rnn']].dropna()
# gdf_valid = gdf_valid[~gdf_valid.crop_class_gt.isin(["WHEAT","SUGAR_CANE"])] 

In [None]:
# gdf_valid.crop_class_rnn.value_counts()

In [None]:
# colorDict

In [None]:
# gdf_valid.crop_class_gt.value_counts()

In [None]:
# Percentage of each (ground truth, prediction) pair within its ground-truth class.
# NOTE(review): `gdf_valid` is only assigned in a commented-out cell above — confirm its origin.
pair_columns = ['crop_class_gt', 'crop_class_rnn']
combination_counts = gdf_valid.groupby(pair_columns).size().reset_index(name='count')

# Number of samples per ground-truth class (the denominator).
total_counts = (
    gdf_valid['crop_class_gt']
    .value_counts()
    .rename_axis('crop_class_gt')
    .reset_index(name='total_count')
)

# Attach the per-class total to every pair and convert the count into a percentage.
merged_counts = pd.merge(combination_counts, total_counts, on='crop_class_gt')
merged_counts['value'] = merged_counts['count'].div(merged_counts['total_count']).mul(100)

# Final table with the shorter column names used by the Sankey cells below.
final_df = merged_counts[pair_columns + ['value']].rename(
    columns={'crop_class_gt': 'class_gt', 'crop_class_rnn': 'class_pred'}
)

print(final_df)

In [None]:
# Inputs for the Sankey diagram: the known class names and the link values.
label_list = [*colorDict]
# source_indices = final_df['class_gt'].apply(lambda x: label_list.index(x)).tolist()
# target_indices = final_df['class_pred'].apply(lambda x: label_list.index(x) + len(colorDict)).tolist()
values = final_df['value'].to_list()

In [None]:
# Despite its name, this maps class name -> position among the unique ground-truth classes.
gt_class_order = final_df['class_gt'].unique()
index_to_class = dict(zip(gt_class_order, range(len(gt_class_order))))

In [None]:
# Node ids of the link start points: position of the ground-truth class.
source_indices = final_df['class_gt'].map(index_to_class).tolist()

# Node ids of the link end points. The figure lists the ground-truth classes
# first and the predicted classes after them, so a predicted class sits at
# (number of ground-truth classes) + (its position among the predicted classes).
# The previous hard-coded offset of 4 combined with the ground-truth index was
# only right for exactly four classes appearing in the same order on both
# sides, and produced NaN for classes that are predicted but never true.
pred_offset = len(final_df['class_gt'].unique())
pred_node_index = {
    cls: pred_offset + pos
    for pos, cls in enumerate(final_df['class_pred'].unique())
}
target_indices = final_df['class_pred'].map(pred_node_index).tolist()

In [None]:
import plotly.graph_objects as go


# Node order: ground-truth classes first, then predicted classes.
gt_nodes = final_df['class_gt'].unique().tolist()
pred_nodes = final_df['class_pred'].unique().tolist()

# Create a color for each class
class_colors = [colorDict[cls] for cls in gt_nodes] + [colorDict[cls] for cls in pred_nodes]

# Link colour: the source node's colour as-is when the prediction differs from
# the ground truth, and the same colour at 0.4 alpha when they agree.
link_colors = []
for src, gt_cls, pred_cls in zip(source_indices, final_df['class_gt'], final_df['class_pred']):
    hex_color = class_colors[src]
    if gt_cls != pred_cls:
        link_colors.append(hex_color)
    else:
        red, green, blue = (int(hex_color[pos:pos + 2], 16) for pos in (1, 3, 5))
        link_colors.append(f"rgba({red}, {green}, {blue}, 0.4)")

sankey_trace = go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=gt_nodes + pred_nodes,
        color=class_colors,
    ),
    link=dict(
        source=source_indices,
        target=target_indices,
        value=values,
        color=link_colors,
    ),
)
fig = go.Figure(data=[sankey_trace])

fig.update_layout(width=700, height=800)
# fig.update_layout(title_text="Basic Sankey Diagram", font_size=10)
fig.show()

# TODO: fix the plot

In [None]:
# List the distinct values of gdf's `period` column.
# NOTE(review): `gdf` is never assigned in the visible notebook — confirm its origin.
gdf.period.unique()

In [None]:
# Confusion matrix over every sample that received a prediction.
# NOTE(review): `confusion_matrix` / `ConfusionMatrixDisplay` are not imported in the
# visible notebook — presumably from sklearn.metrics; add the import to the top cell.

# Reset matplotlib settings to default
plt.rcdefaults()

# Keep only the rows where the model produced a class.
gdf_not_null = gdf[gdf.crop_class_rnn.notnull()]

# Define the true labels and predicted labels
true_labels = gdf_not_null['crop_class_gt']
predicted_labels = gdf_not_null['crop_class_rnn']

# Union of the classes seen on either side. It is sorted because a plain set
# has no stable order for strings between kernel sessions, which would shuffle
# the matrix axes from run to run.
labels = sorted(set(true_labels.unique()) | set(predicted_labels.unique()), key=str)

# Create the confusion matrix
cm = confusion_matrix(true_labels, predicted_labels, labels=labels)

# Display the confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
disp.plot(cmap="Reds", text_kw={'size': 14})
plt.xticks(fontsize=8)
plt.yticks(fontsize=8)

plt.title("Confusion matrix considering all samples", fontdict={'fontsize': 10})

plt.show()

In [None]:
# Confusion matrix restricted to samples whose prediction score is above 0.6.
# NOTE(review): `confusion_matrix` / `ConfusionMatrixDisplay` are not imported in the
# visible notebook — presumably from sklearn.metrics; add the import to the top cell.

# Reset matplotlib settings to default
plt.rcdefaults()

# NOTE(review): this uses a strict `> 0.6`, while the HIGH bucket further down is
# built with `>= 0.6` — confirm which threshold is intended.
gdf_not_null_score = gdf_not_null[pd.to_numeric(gdf_not_null['crop_score_rnn']) > 0.6]

# Define the true labels and predicted labels
true_labels = gdf_not_null_score['crop_class_gt']
predicted_labels = gdf_not_null_score['crop_class_rnn']

# Deliberately reuse `labels` from the all-samples matrix instead of recomputing
# it on the subset. A class that disappears after filtering would otherwise
# change the shape or order of this matrix, and the element-wise comparison
# with `cm` in the next cell would be wrong or fail.
cm_score = confusion_matrix(true_labels, predicted_labels, labels=labels)

# Display the confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=cm_score, display_labels=labels)
disp.plot(cmap="Reds", text_kw={'size': 14})
plt.xticks(fontsize=8)
plt.yticks(fontsize=8)

plt.title("Confusion matrix considering samples with score above 0.6 (HIGH)", fontdict={'fontsize': 10})

plt.show()

In [None]:
# Calculate the difference between the two confusion matrices
# (high-score matrix minus all-samples matrix).
cm_diff = cm_score - cm

# Relative difference in percent. Cells that are empty in the all-samples
# matrix are left at 0 instead of dividing by zero, which used to emit
# warnings and relied on nan_to_num to clean up NaN/inf afterwards.
cm_diff_percentage = np.divide(
    cm_diff,
    cm,
    out=np.zeros(cm.shape, dtype=float),
    where=cm != 0,
) * 100

# Display the difference in percentage confusion matrix
# NOTE(review): `ConfusionMatrixDisplay` is not imported in the visible notebook —
# presumably from sklearn.metrics.
disp_diff_percentage = ConfusionMatrixDisplay(confusion_matrix=cm_diff_percentage, display_labels=labels)
disp_diff_percentage.plot(cmap="Blues_r", text_kw={'size': 14})
plt.xticks(fontsize=8)
plt.yticks(fontsize=8)

# The title states the subtraction order actually computed above.
plt.title("Relative difference between confusion matrices \n(samples with score > 0.6 - all samples)", fontdict={'fontsize': 10})


plt.show()

In [None]:
# Percentage of rows in gdf without a predicted class (crop_class_rnn is null).
gdf['crop_class_rnn'].isnull().mean() * 100

In [None]:
# Total number of rows in gdf.
gdf.shape[0]

In [None]:
# Number of rows in gdf whose crop_class_rnn is null.
gdf['crop_class_rnn'].isnull().sum()

In [None]:
# Number of rows per ground-truth class, shown as a frame.
gdf[["crop_class_gt"]].value_counts().to_frame()

In [None]:
# Split gdf into confidence buckets by prediction score:
# HIGH >= 0.6, MEDIUM in [0.5, 0.6), LOW < 0.5.
# The score is converted to numeric once and reused for all three masks.
score_numeric = pd.to_numeric(gdf.crop_score_rnn)
is_high = score_numeric >= 0.6
is_low = score_numeric < 0.5
is_medium = (score_numeric < 0.6) & (score_numeric >= 0.5)

gdf_high = gdf[is_high]
gdf_medium = gdf[is_medium]
gdf_low = gdf[is_low]

In [None]:
# Share of gdf (in %) that falls into the HIGH, MEDIUM and LOW buckets, in that order.
tuple(len(bucket) / len(gdf) * 100 for bucket in (gdf_high, gdf_medium, gdf_low))

In [None]:
# Predicted-class counts for the HIGH-bucket rows whose ground truth is SOYBEAN.
gdf_high[(gdf_high["crop_class_gt"] == "SOYBEAN")].crop_class_rnn.value_counts().to_frame()

In [None]:
# Omission for one class in the HIGH bucket: the percentage of its samples
# that were predicted as some other class.

interest_class = "RICE"

# HIGH-bucket rows whose ground truth is the class of interest.
filtered_df = gdf_high[gdf_high["crop_class_gt"] == interest_class]

# Non-null predictions that differ from the class of interest.
wrong_prediction = filtered_df["crop_class_rnn"] != interest_class
sum_diff = filtered_df.loc[wrong_prediction, "crop_class_rnn"].notna().sum()

# All non-null predictions for the class of interest.
total_sum = filtered_df["crop_class_rnn"].notna().sum()

# Express the mismatches as a percentage of the total.
ratio = sum_diff / total_sum * 100

print(f"Ratio of different crop_class_rnn to total: {ratio}")