In [None]:
import os
import libs.data as data

In [None]:
# Show the working directory: the relative paths used in this notebook
# (".env", "static/...", "images/...") are resolved against it.
os.getcwd()

In [None]:
from importlib import reload
import libs.plot_context as pc
import localizable_resources as lr

def reload_libs_env():
    """Refresh the environment variables and the helper modules.

    Reads ``.env`` from the current directory with ``override=True`` (values in
    the file replace variables that are already set) and then reloads the
    ``pc`` and ``lr`` modules so that edits made to them on disk are picked up
    without restarting the kernel.
    """
    from dotenv import load_dotenv

    load_dotenv(".env", override=True)

    # Same reload order as before: plot context first, then the resources.
    for module in (pc, lr):
        reload(module)


# Run once so the environment and modules are fresh before first use.
reload_libs_env()

In [None]:
# Size settings shared by every figure in the notebook.
# NOTE(review): the positional arguments are presumably font sizes followed by
# a default figure size -- confirm against libs/plot_context.rc_sizes.
global_sizes = pc.rc_sizes(16, 21, 24, [8, 8])
# Factory for the `with MyPlot(path, figsize=...)` blocks used below; it also
# receives `reload_libs_env` as a callback.
MyPlot = pc.create_plot_context(global_sizes, reload_libs_env)

In [None]:
# Main table used by the exploration cells below. Those cells read the columns
# 'review', 'canceled', 'name', 'email', 'filename', 'folder_name',
# 'experience', 'clicked_x' and 'clicked_y' from it.
df = data.load_data_from_files(debug=True)

In [None]:
# Display the loaded table.
df

# User comments

Nessa seção vou analisar as principais reclamações e sugestões dos usuários.

In [None]:
# Frequency of each distinct review text (value_counts ignores missing values by default).
df["review"].value_counts()

## Principais Reclamações e Sugestões dos Usuários

Esse resumo foi gerado pelo Chat GPT 5, copiando os reviews dos usuários encontrados e exibidos anteriormente.

---

### 1. Reconhecimento e Conexões de Poros
- Poros grandes às vezes são divididos indevidamente em dois.  
- Poros menores nem sempre são identificados.  
- Alguns poros desconectados aparecem como conectados na segmentação.  
- Microporosidade não detectada.  
- Falta detecção de poros muito pequenos (< 100 pixels).  

---

### 2. Interface e Compreensão dos Parâmetros
- Dificuldade para entender as cores no espaço de parâmetros.  
- Falta de clareza sobre o significado dos eixos X e Y da segmentação.  
- Texto explicativo para seleção de pixels pouco claro.  

---

### 3. Problemas de Recorte e Correspondência de Imagens
- Cortes na fotomicrografia que geram discrepâncias com a imagem segmentada.  
- Diferença entre imagens recortadas e segmentadas dependendo do formato do recorte (quadrado, retangular horizontal/vertical).  
- Barra lateral impedindo clique em parte da imagem.  

---

### 4. Considerações sobre Bolhas
- Bolhas foram bem reconhecidas, mas também representam porosidade e precisam ser consideradas na análise para evitar subestimativa.  

---

### 5. Ajustes Técnicos Sugeridos
- Permitir modificar o tamanho mínimo de pixels para detecção.  
- Melhorar isolamento de bolhas sem perder detecção de poros menores. 

---

# Data exploration

In [None]:
# Working subset reused by later cells: rows that are not canceled and whose
# 'name' does not contain "Test" (case-insensitive; missing names are kept).
filtered_df = df
filtered_df = filtered_df[
            ~filtered_df['canceled']
            & ~filtered_df["name"].str.contains("Test", case=False, na=False)]
# Number of remaining rows per e-mail address, most active first.
items_by_email = filtered_df.groupby("email").size().sort_values(ascending=False)
print(f"Number of unique emails: {len(items_by_email)}")
items_by_email

In [None]:
print("Unique reviews per file:")
# Keep only rows whose review is present and not blank after stripping whitespace.
mask = df['review'].notna() & df['review'].astype(str).str.strip().ne('')
df_clean = df[mask]
# Count distinct review texts for each source file.
df_clean.groupby("filename")["review"].nunique()

In [None]:
print("Unique users per file:")
# Users are identified by e-mail address; this uses the unfiltered table.
df.groupby("filename")["email"].nunique()

In [None]:
print("Total non-canceled 'clicks' per file:")
# Rebinds `mask` / `df_clean` from the reviews cell: here they select every row
# whose 'canceled' flag is False.
mask = ~df['canceled']
df_clean = df[mask]
df_clean["filename"].value_counts()

In [None]:
print("Total canceled items, and associated files (should not exist?):")
# Rows flagged as canceled, counted per source file.
df[df["canceled"]]["filename"].value_counts()

In [None]:
# The label previously repeated the "canceled items" text of the cell above,
# although this cell counts distinct folder names for each source file.
print("Unique folder names per file:")
df.groupby("filename")["folder_name"].nunique()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

def compute_group_stats(df):
    """Compute per-experience-level means and 95% CI half-widths of the clicks.

    Args:
        df: DataFrame with the columns ``experience``, ``clicked_x`` and
            ``clicked_y``.

    Returns:
        Tuple ``(groups, K_means, K_ci, C_means, C_ci)`` where ``groups`` is the
        sorted list of distinct, non-missing experience levels and the other
        four lists are aligned with it:

        * ``K_means`` / ``C_means``: mean of ``clicked_x`` / ``clicked_y``.
        * ``K_ci`` / ``C_ci``: ``1.96 * std(ddof=1) / sqrt(n)``, the
          normal-approximation 95% confidence half-width. It is ``nan`` for a
          group with fewer than two samples, where the sample standard
          deviation is undefined.
    """
    # Missing experience values would otherwise form a NaN "group" that matches
    # no row (NaN != NaN) and yields empty-array statistics.
    groups = sorted(df["experience"].dropna().unique())
    K_means, K_ci = [], []
    C_means, C_ci = [], []

    def _ci_half_width(values):
        """Return the 95% CI half-width of ``values`` (nan when n < 2)."""
        n = len(values)
        if n < 2:
            # Avoids NumPy's "degrees of freedom <= 0" warning; result stays nan.
            return float("nan")
        return 1.96 * values.std(ddof=1) / np.sqrt(n)

    for g in groups:
        sub = df[df["experience"] == g]
        K = sub["clicked_x"].to_numpy()
        C = sub["clicked_y"].to_numpy()

        # Means
        K_means.append(K.mean())
        C_means.append(C.mean())

        # Standard errors (for CI)
        K_ci.append(_ci_half_width(K))
        C_ci.append(_ci_half_width(C))

    return groups, K_means, K_ci, C_means, C_ci


def plot_group_bars(ax, groups, means, ci, ylabel, title):
    """Draw one bar per group on ``ax`` with symmetric error bars.

    Args:
        ax: Axes-like object to draw on.
        groups: Bar positions (the experience levels).
        means: Bar heights, aligned with ``groups``.
        ci: Error-bar half-widths, aligned with ``groups``.
        ylabel: Label of the y axis.
        title: Title of the axes.
    """
    ax.bar(groups, means, yerr=ci, capsize=6)

    # Labelling calls are independent of each other.
    ax.set_title(title)
    ax.set_xlabel("Experience Level")
    ax.set_ylabel(ylabel)


In [None]:
# Per-experience-level means and 1.96*SE half-widths of both click coordinates
# (clicked_x is plotted as "K_max", clicked_y as "C_min").
groups, K_means, K_ci, C_means, C_ci = compute_group_stats(filtered_df)
name = "K_bar_chart"
# NOTE(review): MyPlot receives the output PDF path; presumably it saves the
# current figure on exit -- confirm in libs/plot_context.
with MyPlot(f"images/{name}.pdf", figsize=[10, 6]) as mp:

    fig, ax = plt.subplots()
    
    plot_group_bars(
        ax,
        groups, K_means, K_ci,
        ylabel="K_max",
        title="Inter-group variability in K_max",
    )

# Same chart for the second coordinate.
name = "C_bar_chart"
with MyPlot(f"images/{name}.pdf", figsize=[10, 6]) as mp:
    
    fig, ax = plt.subplots()
    
    plot_group_bars(
        ax,
        groups, C_means, C_ci,
        ylabel="C_min",
        title="Inter-group variability in C_min",
    )

In [None]:
def compute_group_distance_matrix(df):
    """Mean pairwise Euclidean distance between the clicks of experience groups.

    Args:
        df: DataFrame with the columns ``experience``, ``clicked_x`` and
            ``clicked_y``.

    Returns:
        Tuple ``(groups, D)``. ``groups`` is the sorted list of distinct,
        non-missing experience levels. ``D`` is a ``(G, G)`` array where
        ``D[i, j]`` is the mean distance over every pair made of one click from
        ``groups[i]`` and one click from ``groups[j]``. The diagonal is the
        mean within-group distance, self-pairs included.
    """
    # Missing experience values would otherwise form a NaN "group" that matches
    # no row and fills a row/column of D with nan.
    groups = sorted(df["experience"].dropna().unique())
    G = len(groups)
    D = np.zeros((G, G))

    # Extract each group's (n, 2) point array once instead of re-filtering the
    # frame for every (i, j) pair.
    points = [
        df.loc[df["experience"] == g, ["clicked_x", "clicked_y"]].to_numpy(dtype=float)
        for g in groups
    ]

    for i in range(G):
        # The measure is symmetric, so only the upper triangle is computed.
        for j in range(i, G):
            # pairwise distances via broadcasting: (n_i, 1, 2) - (1, n_j, 2)
            diff = points[i][:, None, :] - points[j][None, :, :]
            mean_dist = np.sqrt((diff ** 2).sum(axis=2)).mean()
            D[i, j] = mean_dist
            D[j, i] = mean_dist

    return groups, D


def plot_distance_heatmap(ax, groups, D):
    """Render the group distance matrix ``D`` as a labelled heatmap on ``ax``.

    Args:
        ax: Axes-like object to draw on; its figure receives the colorbar.
        groups: Labels used for the ticks of both axes.
        D: Square matrix shown with the "viridis" colormap.
    """
    heatmap = ax.imshow(D, cmap="viridis")
    ax.get_figure().colorbar(heatmap, ax=ax)

    # One tick per group on both axes, labelled with the group value.
    positions = range(len(groups))
    for set_ticks, set_labels in (
        (ax.set_xticks, ax.set_xticklabels),
        (ax.set_yticks, ax.set_yticklabels),
    ):
        set_ticks(positions)
        set_labels(groups)

    ax.set_title("Inter-group Parameter Distance")
    ax.set_xlabel("Experience Level")
    ax.set_ylabel("Experience Level")


In [None]:
# Mean pairwise click distance between experience levels, on the filtered
# (non-canceled, non-test) rows.
groups, D = compute_group_distance_matrix(filtered_df)
name = "group_distances"
with MyPlot(f"images/{name}.pdf", figsize=[10, 6]) as mp:

    fig, ax = plt.subplots()
    
    plot_distance_heatmap(
        ax,
        groups, D,
    )


In [None]:
# Number of distinct e-mail addresses in the filtered rows.
print(len(filtered_df["email"].value_counts()))
# Rows per e-mail among users with experience level 5
# (get_group raises KeyError when no row has that level).
display(filtered_df.groupby("experience").get_group(5)["email"].value_counts())
# Total number of filtered rows.
print(len(filtered_df))

In [None]:
import matplotlib.pyplot as plt
import libs.plots as plots

# Display switches forwarded to plots.plot_best_color_params.
USE_COLOR_BAR = True
SHOW_MEANS = False
SHOW_DISPERSION = True
SHOW_AGGREGATION = True
SHOW_EXP_LEVELS = [1, 2, 3, 4, 5]
# When False, canceled rows and rows whose name contains "Test" are dropped.
USE_CANCELED_OR_TESTS = False

name = "best_color_params"
with MyPlot(f"images/{name}.pdf", figsize=[12, 8]) as mp:

    # Filter data based on user input
    # NOTE: this rebinds the notebook-level `filtered_df` used by earlier cells;
    # from here on it is sorted by experience and, when USE_CANCELED_OR_TESTS is
    # True, it is no longer filtered.
    filtered_df = df.sort_values(by=["experience"])

    if not USE_CANCELED_OR_TESTS:
        filtered_df = filtered_df[
            ~filtered_df['canceled']
            & ~filtered_df["name"].str.contains("Test", case=False, na=False)]

    # Plot the clicked points
    fig, ax = plt.subplots()

    plots.plot_best_color_params(
        ax, filtered_df,
        use_color_bar=USE_COLOR_BAR,
        show_means=SHOW_MEANS,
        show_dispersion=SHOW_DISPERSION,
        show_exp_levels=SHOW_EXP_LEVELS,
        show_aggregation=SHOW_AGGREGATION,
    )


# Locating cropped regions in source images

The web application was not saving the information of where the cropped images came from.

What we have at the moment are:
- the cropped image saved in JPEG format, and resized to 1/8 of the size in each dimension
- the options.json file containing the name of the original image

What we need to do:
- load the cropped image, and upscale it by a factor of 8 in each dimension
- use a function to search the cropped image by image similarity inside the original image
- draw a box over the original image to indicate the source of the cropped image

In [None]:
import cv2

def search_subimage_in_main_image(cropped, original, scale_factor=8):
    """
    Locate a downscaled crop inside the image it was cut from.

    Both images are converted to grayscale, the crop is upscaled by
    ``scale_factor`` with bicubic interpolation, and normalized
    cross-correlation template matching (``cv2.TM_CCOEFF_NORMED``) is run over
    the original. The best-scoring position is returned.

    Args:
        cropped: The cropped image (cv2 BGR image), downscaled relative to
            ``original``.
        original: The original image (cv2 BGR image).
        scale_factor: Factor applied to both dimensions of ``cropped`` before
            matching. Defaults to 8, the downscale factor used when the crops
            were saved.

    Returns:
        Dictionary with match information:
            'max_val': best matching score,
            'top_left': (x, y) of the best match in ``original``,
            'cropped_shape': (height, width) of the upscaled crop.

    Raises:
        ValueError: If either image is ``None`` (for example a failed
            ``cv2.imread``) or the upscaled crop does not fit in ``original``.
    """
    # cv2.imread returns None instead of raising, so fail early with a clear
    # message rather than with an opaque OpenCV assertion.
    if cropped is None or original is None:
        raise ValueError("Both the cropped and the original image must be loaded (got None).")

    # Convert both to grayscale for matching
    original_gray = cv2.cvtColor(original, cv2.COLOR_BGR2GRAY)
    crop_small_gray = cv2.cvtColor(cropped, cv2.COLOR_BGR2GRAY)

    crop_upscaled = cv2.resize(crop_small_gray, None, fx=scale_factor, fy=scale_factor, interpolation=cv2.INTER_CUBIC)

    # matchTemplate requires the template to be no larger than the image.
    template_h, template_w = crop_upscaled.shape[:2]
    image_h, image_w = original_gray.shape[:2]
    if template_h > image_h or template_w > image_w:
        raise ValueError(
            f"Upscaled crop ({template_w}x{template_h}) is larger than the "
            f"original image ({image_w}x{image_h})."
        )

    # Template matching
    res = cv2.matchTemplate(original_gray, crop_upscaled, cv2.TM_CCOEFF_NORMED)
    _, max_val, _, max_loc = cv2.minMaxLoc(res)

    return {
        'max_val': max_val,
        'top_left': max_loc,
        'cropped_shape': crop_upscaled.shape
    }

In [None]:
import cv2

# Single worked example: locate one crop inside its source image.
cropped_file = "static/output/14ec62b9-586b-4e30-92ba-c75111a54212/0/cropped.jpg"
original_image_file = 'static/imgs_sections/2-SMC-1-AL_72,99m_n=_2,5x_cesc.jpg'
cropped_image = cv2.imread(cropped_file)
original_image = cv2.imread(original_image_file)

# cv2 loads BGR; reverse the channel axis to show RGB with matplotlib.
plt.imshow(cropped_image[:, :, ::-1])
plt.show()

result = search_subimage_in_main_image(cropped_image, original_image)
top_left = result['top_left']
cropped_shape = result['cropped_shape']

# Draw rectangle on the original image
# (in place; cropped_shape is (height, width), hence the swapped indices for x/y).
cv2.rectangle(original_image, top_left, (top_left[0] + cropped_shape[1], top_left[1] + cropped_shape[0]), (255, 255, 255), 20)

plt.imshow(original_image[:, :, ::-1])
plt.show()

In [None]:
import cv2

# Path to the original image
# Path to the original image
# (keyed by the file name stored in each session's options.json under
# "image_select.filename").
original_image_path = {
    "2-SMC-1-AL_72,99m_n=_2,5x_cesc.jpg": 'static/imgs_sections/2-SMC-1-AL_72,99m_n=_2,5x_cesc.jpg',
    "2-SMC-1-AL_73,97m_n=_2,5x_cesc.jpg": 'static/imgs_sections/2-SMC-1-AL_73,97m_n=_2,5x_cesc.jpg',
    "72.53_jpeg_escal.jpg": 'static/imgs_sections/72.53_jpeg_escal.jpg',
}
# NOTE: rebinds `original_image`, which the single-crop example above used for
# one image array; from here on it is a dict of key -> image.
original_image = {}

# Load the original image once
for k in original_image_path:
    try:
        original_image[k] = cv2.imread(original_image_path[k])
        # An unreadable file is reported here, but its None entry stays in the dict.
        if original_image[k] is None:
            print(f"Could not load original image: {original_image_path[k]}")
        else:
            print(f"Loaded original image: {k} with shape {original_image[k].shape}")
    except Exception as e:
        print(f"Error loading original image: {e}")


In [None]:
import glob
import cv2
import os
import json

# Use glob to find all cropped images
cropped_pattern = 'static/output/*/*/cropped.jpg'
cropped_files = glob.glob(cropped_pattern)

print(f"Found {len(cropped_files)} cropped images")

# Set to True to discard every cached cropped.json and recompute the matches.
RESET_CROPPED_INFO_FILES = False

results = []  # Store results for each cropped image
# Iterate over each cropped image
for cropped_file in cropped_files:
    try:
        path = os.path.dirname(cropped_file)
        cache_file = f'{path}/cropped.json'

        # Only remove the cache when it exists: an unconditional os.remove
        # raised FileNotFoundError for never-processed folders, and the except
        # below then skipped them instead of processing them.
        if RESET_CROPPED_INFO_FILES and os.path.exists(cache_file):
            os.remove(cache_file)
        if os.path.exists(cache_file):
            print(f"Skipping already processed file: {cropped_file}")
            with open(cache_file, 'r') as f:
                result = json.load(f)
            results.append(result)
            continue

        print(f"Processing file: {cropped_file}")

        # Load the cropped image (only needed when there is no cached result)
        cropped_image = cv2.imread(cropped_file)
        if cropped_image is None:
            print(f"Could not load cropped image: {cropped_file}")
            continue

        options_file = f'{path}/options.json'
        with open(options_file, 'r') as f:
            options = json.load(f)

        source_image_key = options["image_select.filename"]
        source_image = original_image.get(source_image_key, None)
        if source_image is None:
            # Unknown key or an image that failed to load earlier.
            print(f"Original image not available for {cropped_file}: {source_image_key}")
            continue

        # Apply the function
        result = search_subimage_in_main_image(cropped_image, source_image)

        # Store results with file path (normalized to forward slashes)
        result['cropped_file'] = cropped_file.replace('\\', '/')
        results.append(result)

        # Cache the match next to the crop so later runs can skip the search.
        with open(cache_file, 'w') as f:
            json.dump(result, f, indent=4)

        print(f"Processed {cropped_file}: match_value={result['max_val']:.4f}, position={result['top_left']}")

    except Exception as e:
        # Best effort: report the failure and continue with the next crop.
        print(f"Error processing {cropped_file}: {e}")

print(f"Processed (or loaded info about) {len(results)} cropped images.")

In [None]:
# Inspect the match records (one dict per crop: max_val, top_left, cropped_shape, cropped_file).
results

In [None]:
import json

# For every located crop, cut the matching region out of the full-resolution
# source image and save it next to the crop as cropped_original_size.jpg.
for it, result in enumerate(results):
    cropped_file = result["cropped_file"]
    path = os.path.dirname(cropped_file)
    
    subimage_file = f'{path}/cropped_original_size.jpg'
    # Existing outputs are never overwritten.
    if os.path.exists(subimage_file):
        print(f"Subimage already exists: {subimage_file}")
        continue
    
    options_file = f'{path}/options.json'
    with open(options_file, 'r') as f:
        options = json.load(f)
    source_image_key = options["image_select.filename"]
    
    top_left = result['top_left']
    cropped_shape = result['cropped_shape']
    
    # top_left is (x, y); cropped_shape starts with (height, width).
    x, y = top_left
    h, w = cropped_shape[:2]
    subimage = original_image[source_image_key][y:y+h, x:x+w]

    # Save the subimage
    cv2.imwrite(subimage_file, subimage)
    print(f"Saved subimage for {cropped_file} at {subimage_file}")

In [None]:
import json
from libs.colors import parse_color

# When True, each processed crop is also displayed inline.
SHOW_CROPPED_IMAGES = False

# 0 -> use white rectangles
# 1 -> use colored rectangles
# 2 -> use experience levels as colors from the colormap
USE_RECTANGLE_COLORS = 0

# Matplotlib colormap name, resampled below to len(results) colors.
USE_COLOR_MAP = 'rainbow'

# Use the notebook slim-section-rects-interactive-visualizer.ipynb
# to select the rectangles to show
# Maps an index into `results` to a color spec understood by parse_color.
SPECIAL_RECTANGLES = {
    # When more users are added, the numeric indices must be updated.
    # TODO: We should probably use guid and session number instead of indices.
    # 2: 'purple',
    # # 22: 'red',
    # 32: 'red',
    # 35: 'green',
}

LINE_WIDTH = 30#20  # Width of the rectangle lines

# Draw on copies so the loaded originals stay untouched.
original_image_copy = {}
for k in original_image_path:
    original_image_copy[k] = original_image[k].copy()

cmap = plt.get_cmap(USE_COLOR_MAP, len(results))

count_non_canceled = 0
for it, result in enumerate(results):
    cropped_file = result["cropped_file"]
    path = os.path.dirname(cropped_file)
    
    # First cancel check: a path literally named "params_select.state=Cancel"
    # inside the session folder.
    if os.path.exists(f'{path}/params_select.state=Cancel'):
        print(f"Skipping canceled image {cropped_file}.")
        continue
    
    options_file = f'{path}/options.json'
    with open(options_file, 'r') as f:
        options = json.load(f)
    source_image_key = options["image_select.filename"]
    
    # Second check, based on the options content (helpers from libs.data).
    canceled = data.is_option_canceled(options)
    test = data.is_option_test(options)
    if canceled or test:
        print(f"Skipping canceled or test image {cropped_file}.")
        continue

    # if source_image_key != "2-SMC-1-AL_72,99m_n=_2,5x_cesc.jpg":
    #     print(f"Skipping image {cropped_file} as it is not the target image.")
    #     continue

    top_left = result['top_left']
    cropped_shape = result['cropped_shape']
    
    # Draw rectangle on the original image
    if it in SPECIAL_RECTANGLES:
        rgba_color = parse_color(SPECIAL_RECTANGLES[it], cmap)  # Returns (R, G, B, A) in 0-1 range
    elif USE_RECTANGLE_COLORS == 1:
        # Multiplying by 97 modulo len(results) spreads consecutive indices over the colormap.
        rgba_color = cmap(((it * 97) % len(results)) / len(results))  # Returns (R, G, B, A) in 0-1 range
    elif USE_RECTANGLE_COLORS == 2:
        experience = options["user"]["experience"]
        # Levels 1..5 map to 0.0, 0.2, ..., 0.8 on the colormap.
        rgba_color = cmap((experience - 1) / 5)  # Returns (R, G, B, A) in 0-1 range
    else:
        rgba_color = (1, 1, 1, 1)  # White color in RGBA
    bgr_color = (int(rgba_color[2] * 255), int(rgba_color[1] * 255), int(rgba_color[0] * 255))  # Convert to BGR 0-255
    
    # cropped_shape is (height, width): index 1 extends x, index 0 extends y.
    cv2.rectangle(original_image_copy[source_image_key],
                  top_left,
                  (top_left[0] + cropped_shape[1], top_left[1] + cropped_shape[0]),
                  bgr_color,  # Use BGR color tuple
                  LINE_WIDTH)

    if SHOW_CROPPED_IMAGES:
        cropped_image = cv2.imread(result['cropped_file'])
        plt.imshow(cropped_image[:, :, ::-1])  # Convert BGR to RGB for display
        print(f"Cropped Image: {result['cropped_file']}")
        plt.show()
    
    count_non_canceled += 1
    # break

print(f"Processed {count_non_canceled} non-canceled cropped images.")

# Only this one source image is previewed here; rectangles are drawn on all of them.
plt.imshow(original_image_copy["2-SMC-1-AL_72,99m_n=_2,5x_cesc.jpg"][:, :, ::-1])  # Convert BGR to RGB for display
plt.title("Original Image")
plt.axis('off')
plt.show()

In [None]:
# Save every annotated source image at full size, plus a copy produced by
# images.resize_image_file with percentage=25.
for key in original_image_copy.keys():
    # Drop the extension of the source file name to build the output names.
    name, ext = os.path.splitext(key)
    cv2.imwrite(f'images/found_cropped_areas_{name}.jpg', original_image_copy[key])
    import libs.images as images
    images.resize_image_file(
        f'images/found_cropped_areas_{name}.jpg',
        f'images/found_cropped_areas_{name}_small.jpg',
        percentage=25)

# Creating large versions of the parameter-space and isolating the segmentation of each clicked point

1. Large version of the parameter-space
   - this will show where the user clicked
2. Isolating the segmentation of each clicked point
   - For each click in the parameter-space, save the associated segmentation

In [None]:
import json
from skimage.measure import label, regionprops
from libs.images import binarize_c_k, regions_df, rmm
import numpy as np
import os

# Set to True to delete and regenerate every main_image_big.png.
REPROCESS_BIG_IMAGES = False

SEGMENT_IN_ORIGINAL_SIZE = False  # If True, segment the original size image, otherwise use segmented small image

for result in results:
    cropped_file = result["cropped_file"]
    path = os.path.dirname(cropped_file)
    
    params_space_img_file = f'{path}/main_image.png'
    big_params_space_img_file = f'{path}/main_image_big.png'
    if REPROCESS_BIG_IMAGES:
        if os.path.exists(big_params_space_img_file):
            os.remove(big_params_space_img_file)
    # The big image doubles as the "already processed" marker for this folder.
    if os.path.exists(big_params_space_img_file):
        print(f"Big image already exists: {big_params_space_img_file}")
        continue
    
    print(f"Big image processing: {big_params_space_img_file}")
    
    clicks_file = f'{path}/clicked_points.json'
    if not os.path.exists(clicks_file):
        print(f"Clicked points file not found: {clicks_file}")
        continue
    with open(clicks_file, 'r') as f:
        clicks_data = json.load(f)
    
    params_space_image = cv2.imread(params_space_img_file)
    if params_space_image is None:
        # cv2.imread returns None for a missing/unreadable file; skip this
        # folder instead of letting cv2.resize abort the whole loop.
        print(f"Could not load parameter-space image: {params_space_img_file}")
        continue
    # Nearest-neighbour upscaling by 8 in each dimension.
    params_space_image_big = cv2.resize(params_space_image, None, fx=8, fy=8, interpolation=cv2.INTER_NEAREST)

    # Save the big parameter-space image with the clicked points marked
    plt.imshow(params_space_image_big[:, :, ::-1])  # Convert BGR to RGB for display
    ax = plt.gca()
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_xticklabels([])
    ax.set_yticklabels([])

    # Keep spines (the box) visible
    for spine in ax.spines.values():
        spine.set_visible(True)

    
    for click in clicks_data["points"]:
        # Click coordinates are in small-image units; scale to the big image.
        x = click['x']*8
        y = click['y']*8
        ax.plot(x, y, marker='x', color='orange', markersize=24, mew=6, linestyle='None')

    plt.tight_layout()
    plt.savefig(big_params_space_img_file)
    plt.close()
    
    # extracting the clicked points segmentation
    if SEGMENT_IN_ORIGINAL_SIZE:
        cropped_file = f'{path}/cropped_original_size.jpg'
        cropped_image = cv2.imread(cropped_file)
        if cropped_image is None:
            print(f"Could not load image: {cropped_file}")
            continue
        cropped_image = cropped_image.astype(np.uint8)  # Ensure correct dtype
        
        # Colors assigned to the kept regions.
        # NOTE(review): `rmm` presumably wraps the region counter into the
        # valid index range of this list -- confirm in libs/images.
        colors = [
                np.array([0, 0, 0]),
                np.array([0, 0, 255]),
                np.array([255, 0, 0]),
                np.array([0, 255, 0]),
                np.array([255, 255, 255]),
            ]

        thresh = clicks_data["min_pore_size"]*8*8 # Convert to pixels in the big image
        for click in clicks_data["points"]:
            k = click['x']*8
            c = click['y']*8
            
            label_img, _ = binarize_c_k(cropped_image, c, k)
            regions = regionprops(label_img)
            # Named `regions_table` (not `df`) so the notebook-level `df`
            # DataFrame loaded at the top is not overwritten by this cell.
            regions_table = regions_df(regions, label_img)
            
            bin_image_4 = np.zeros((*label_img.shape, 3), dtype=np.uint8)
            id_region = 0
            # `region_idx` avoids reusing the name of an outer loop counter.
            for region_idx, region in enumerate(regions):
                if regions_table["area"].iloc[region_idx] > thresh:
                    id_region += 1
                    color_value = colors[rmm(id_region, 0, len(colors)-1)]
                    bin_image_4[region.coords.T[0], region.coords.T[1]] = color_value
            # Save the segmentation, named after the click in small-image units
            cv2.imwrite(f'{path}/segmentation_x={int(k/8)}_y={int(c/8)}.png', bin_image_4)
    
    else:
        tiles_file = f'{path}/stitched_tiles.png'
        tiles_image = cv2.imread(tiles_file)
        
        cropped_file = f'{path}/cropped.jpg'
        cropped_image = cv2.imread(cropped_file)
        
        if tiles_image is None or cropped_image is None:
            print(f"Could not load tiles or cropped image in: {path}")
            continue
        
        # Each tile of the stitched image has the size of the small crop.
        tile_height, tile_width = cropped_image.shape[:2]
        
        for click in clicks_data["points"]:
            x = click['x']
            y = click['y']
            
            if x >= 0 and y >= 0:
                
                # The click coordinates index the tile grid: column x, row y.
                tile_image = tiles_image[y*tile_height:(y+1)*tile_height, x*tile_width:(x+1)*tile_width]
                # Save the subimage
                cv2.imwrite(f'{path}/segmentation_x={x}_y={y}.png', tile_image)


# Calculating porosity and number of pores for the segmentation of each click in the parameter-space

In [None]:
# Inspect the match records again before computing the per-click metrics.
results

In [None]:
import pandas as pd
from tqdm import tqdm
import hashlib

# Column schema (and order) of the per-click metrics table.
columns = [
    "source_image_key",
    "x",
    "y",
    "canceled",
    "sz_kind",
    "cut_kind",
    "porosity",
    "pore_count",
    "experience",
    "image_hash",
]

def compute_hash_from_image(image):
    """Return the SHA-256 hex digest of an image's raw bytes.

    Only ``image.tobytes()`` is hashed, so two arrays with the same buffer
    content but different shapes produce the same digest.
    """
    hasher = hashlib.sha256()
    hasher.update(image.tobytes())
    return hasher.hexdigest()

# Load the metrics cached by a previous run (if any) so that segmentations
# already computed for an (image hash, x, y) triplet are not recomputed.
if os.path.exists('static/output/clicks_data.csv'):
    print("Cached clicks data file found")
    df_clicks = pd.read_csv('static/output/clicks_data.csv')
    # Older cache files may lack some columns; add them as missing values.
    for col in columns:
        if col not in df_clicks.columns:
            df_clicks[col] = pd.NA
    df_clicks = df_clicks[columns]
else:
    df_clicks = pd.DataFrame(columns=columns)  # Initialize empty DataFrame

# Use glob to find all options files
options_file_pattern = 'static/output/*/*/options.json'
options_files = glob.glob(options_file_pattern)

print(f"Found {len(options_files)} options files")

# (source_image_key, sz_kind) -> hash of the full image, to avoid re-hashing it.
full_img_hashes = {}
# (image_hash, x, y) -> (porosity, pore_count) taken from the cache.
triplet_to_row = {}
for idx, row in df_clicks.iterrows():
    if row["cut_kind"] == "full" and isinstance(row["image_hash"], str) and row["image_hash"] != "":
        full_img_hashes[(row["source_image_key"], row["sz_kind"])] = row["image_hash"]
    # Rows whose metrics are missing (e.g. columns added above) must be
    # recomputed rather than reused as cached values.
    if pd.isna(row["porosity"]) or pd.isna(row["pore_count"]):
        continue
    key = (row["image_hash"], row["x"], row["y"])
    if key not in triplet_to_row:
        triplet_to_row[key] = (row["porosity"], row["pore_count"])


clicks_data = []
pbar = tqdm(options_files)
for options_file in pbar:
    path = os.path.dirname(options_file)
    
    with open(options_file, 'r') as f:
        options = json.load(f)
    
    if "image_select.filename" not in options:
        pbar.set_description(f"No image_select.filename in {options_file}")
        continue
    
    pbar.set_description(f"Processing file: {options_file}")
    
    source_image_key = options["image_select.filename"]
    
    # Keep only well-formed clicks; nothing has to be loaded when there are none.
    valid_clicks = [
        click for click in (options.get("params_select.clicked_points", []) or [])
        if isinstance(click, dict) and "x" in click and "y" in click
    ]
    if not valid_clicks:
        continue
    
    # Loading image files for segmentation. This is done once per options file:
    # the images do not depend on the click, so reading and hashing them inside
    # the click loop repeated the same work for every click.
    big_cropped_file = f'{path}/cropped_original_size.jpg'
    small_source_file = f'static/imgs_sections/12.5/{source_image_key}'
    small_cropped_file = f'{path}/cropped.jpg'
    
    all_files = [
        # .get: an unknown source image is reported below instead of raising KeyError.
        (original_image.get(source_image_key), "big", "full", f"{source_image_key}"),
        (cv2.imread(big_cropped_file), "big", "cropped", big_cropped_file),
        (cv2.imread(small_source_file), "small", "full", small_source_file),
        (cv2.imread(small_cropped_file), "small", "cropped", small_cropped_file),
        ]
    
    hashed_files = []
    for img, sz, cut, fname in all_files:
        if img is None:
            print(f"Could not load image: {fname}")
            continue
        
        if cut == "full" and (source_image_key, sz) in full_img_hashes:
            image_hash = full_img_hashes[(source_image_key, sz)]
        else:
            image_hash = compute_hash_from_image(img)
            if cut == "full":
                # Remember full-image hashes: they are shared by many sessions.
                full_img_hashes[(source_image_key, sz)] = image_hash
        hashed_files.append((img, sz, cut, image_hash))
    
    for click in valid_clicks:
        for img, sz, cut, image_hash in hashed_files:
            key = (image_hash, click['x'], click['y'])
            
            if key in triplet_to_row:
                pbar.set_description(f"Using cached data for key: {key}")
                porosity, pore_count = triplet_to_row[key]
            else:
                pbar.set_description(f"Computing segmentation for key: {key}")
                k = click['x']*8
                c = click['y']*8
                # NOTE(review): for a 3-channel image OpenCV appears to use
                # only the first three components of each bound, which would
                # leave `k` without effect here -- confirm the intended
                # thresholds against libs.images.binarize_c_k.
                binaryImage = cv2.inRange(
                    img,
                    ( c,   0,   0,   0),
                    (255, 255,  64, k))
                
                label_img = label(binaryImage)
                regions = regionprops(label_img)
                
                # Porosity = segmented area / image area; pores = connected regions.
                total_area = sum(region.area for region in regions)
                h, w = label_img.shape
                porosity = total_area / (h * w) if (h * w) > 0 else 0
                pore_count = len(regions)
                
                triplet_to_row[key] = (porosity, pore_count)

            # Append the click data and associated segmentation metrics
            clicks_data.append({
                'source_image_key': source_image_key,
                'x': click['x'],
                'y': click['y'],
                'canceled': options.get("params_select.state", "") == "Cancel",
                'sz_kind': sz,
                'cut_kind': cut,
                'porosity': porosity,
                'pore_count': pore_count,
                'experience': options["user"]["experience"],
                "image_hash": image_hash,
            })

# Passing `columns` keeps the schema even when no click was collected, so the
# plotting cells can still filter on the expected columns.
df_clicks = pd.DataFrame(clicks_data, columns=columns)

In [None]:
# Inspect the per-click metrics (one row per click and image variant).
df_clicks

In [None]:
# Persist the metrics; the metrics cell reads this file back as its cache.
# Floats are written with 15 decimal places.
df_clicks.to_csv('static/output/clicks_data.csv', float_format="%.15f", index=False)
df_clicks

In [None]:
from matplotlib import ticker
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def plot_porosity_distribution_arrow(df_clicks, data_field="porosity", sz_kind="small", cut_kind="full", 
                                       show_extension_lines=True, extension_factor=0.1,
                                       extension_style='--', extension_alpha=0.7,
                                       legends_offsets=None, exp_levels=None,
                                       arrow_locs=None,
                                       title=None, x_label=None,
                                       clip_min_x=0, plot_width=10, plot_height=6,
                                       remove_y_axis=False,
                                       source_image_key="2-SMC-1-AL_72,99m_n=_2,5x_cesc.jpg"):
    """
    Plot one empirical CDF of ``data_field`` per experience level, labelled
    with floating annotations (text box plus arrow) instead of a legend, and
    with optional horizontal lines extending each curve after it reaches 100%.

    The figure is produced inside a ``MyPlot`` context targeting
    ``images/cdf_{data_field}_by_experience_sz={sz_kind}_cut={cut_kind}_extended.pdf``.

    Parameters:
    - df_clicks: Table with the columns ``source_image_key``, ``canceled``,
      ``sz_kind``, ``cut_kind``, ``experience`` and ``data_field``.
    - data_field: Column whose distribution is plotted.
    - sz_kind, cut_kind: Values selected in the columns of the same name.
    - show_extension_lines: Whether to show dashed lines extending the CDF
    - extension_factor: How much to extend beyond max value (0.1 = 10%)
    - extension_style: Line style for extensions ('--', ':', '-', etc.)
    - extension_alpha: Transparency of extension lines
    - legends_offsets: Optional ``{experience: (dx, dy)}`` overrides for the
      text position relative to the annotated point, in data coordinates.
      Levels without an entry use built-in defaults.
    - exp_levels: Experience levels to plot (default ``[2, 3, 4, 5]``).
    - arrow_locs: Optional ``{experience: fraction}`` giving the cumulative
      fraction at which each curve is annotated (default 0.5).
    - title, x_label: Overrides for the localized default title and x label.
    - clip_min_x: Lower limit of the x axis.
    - plot_width, plot_height: Figure size passed to ``MyPlot``.
    - remove_y_axis: Hide the y label, tick labels and tick marks.
    - source_image_key: Source image whose clicks are plotted. ``None`` keeps
      the clicks of every source image.
    """
    # `== False` is kept on purpose: it also works when the column was read
    # back from CSV with object dtype, where `~` would not.
    selection = ((df_clicks["canceled"] == False)
            & (df_clicks["sz_kind"] == sz_kind)
            & (df_clicks["cut_kind"] == cut_kind))
    if source_image_key is None:
        selection = selection & df_clicks["source_image_key"].notna()
    else:
        selection = selection & (df_clicks["source_image_key"] == source_image_key)
    df_work = df_clicks[selection]
    
    name = f"cdf_{data_field}_by_experience_sz={sz_kind}_cut={cut_kind}_extended"
    print(f"FILE_NAME = images/{name}.pdf")
    with MyPlot(f"images/{name}.pdf", figsize=[plot_width, plot_height]) as mp:

        print(f"sz={sz_kind}\ncut={cut_kind}")

        # Plot CDF for different experience levels
        plt.figure()

        # Find the overall maximum value for extending the plot
        all_data = df_work[data_field]
        global_max = all_data.max() if len(all_data) > 0 else 1.0
        extended_max = global_max * (1 + extension_factor)

        # Store colors for consistent extension lines
        colors = {}
        
        if exp_levels is None:
            exp_levels = [2, 3, 4, 5]

        for exp in exp_levels:
            # Named `values` so the imported `data` module is not shadowed.
            values = df_work[df_work["experience"] == exp][data_field]
            
            if len(values) > 0:
                # Plot the main CDF
                sns.ecdfplot(values, label=lr.str.experience_level(exp), linewidth=3)
                
                # Get the color of the last plotted line
                color = plt.gca().lines[-1].get_color()
                colors[exp] = color
                
                if show_extension_lines:
                    # Get the maximum value for this experience level
                    exp_max = values.max()
                    
                    # Add horizontal extension line
                    plt.plot([exp_max, extended_max], [1.0, 1.0], 
                            color=color, linewidth=3, linestyle=extension_style, 
                            alpha=extension_alpha)

        # Large values (e.g. pore counts) are shown as "12k" instead of 12000.
        if global_max > 10000:
            def thousands_formatter(x, pos):
                return f"{int(x/1000)}k"
            plt.gca().xaxis.set_major_formatter(ticker.FuncFormatter(thousands_formatter))
        
        if legends_offsets is None:
            legends_offsets = {}
        default_legends_offsets = {
            2: (-0.2, 0.2),  # Offset for experience 2
            3: (-0.2, 0.1),  # Offset for experience 3
            4: (-0.2, 0.0),  # Offset for experience 4
            5: (-0.2, -0.1),  # Offset for experience 5
        }
        default_legends_offsets.update(legends_offsets)
        legends_offsets = default_legends_offsets
        # Used for levels that have neither a default nor a caller-supplied
        # offset (e.g. level 1), instead of raising KeyError.
        fallback_offset = (-0.2, 0.0)
    
        if arrow_locs is None:
            arrow_locs = {}
    
        plt.xlabel(x_label if x_label is not None else lr.str.porosity)
        plt.ylabel(lr.str.cumulative_probability)
        plt.title(title if title is not None else f"{lr.str.cdf_porosity_by_experience}")
        
        if remove_y_axis:
            plt.gca().tick_params(axis='y', which='both', left=False, labelleft=False)
            plt.gca().set_ylabel("")              # Remove axis label
            plt.gca().set_yticklabels([])         # Remove tick labels
            plt.gca().tick_params(axis='y', length=0)  # Hide tick marks
        # Replace legend with floating text annotations
        # plt.legend()  # Comment out the traditional legend
        
        # Add floating text labels with arrows pointing to the best points
        for exp in exp_levels:
            values = df_work[df_work["experience"] == exp][data_field]
            
            if len(values) > 0 and exp in colors:
                # Find a good point to annotate (e.g., median or a point around 50% CDF)
                sorted_data = np.sort(values)
                n_points = len(sorted_data)
                # Rank of the annotated sample, clamped to [1, n]. A rank of 0
                # (single sample, or a location of 0) used to index the *last*
                # sample while reporting a cumulative probability of 0.
                rank = min(max(int(n_points * arrow_locs.get(exp, 0.5)), 1), n_points)
                best_x = sorted_data[rank - 1]
                best_y = rank / n_points
                
                offset = legends_offsets.get(exp, fallback_offset)
                
                # Create floating text annotation with arrow
                plt.annotate(
                    lr.str.experience_level(exp),
                    xy=(best_x, best_y),  # Point on the curve to point to
                    xytext=(best_x + offset[0], best_y + offset[1]),  # Text position (offset)
                    bbox=dict(boxstyle="round,pad=0.3", facecolor=colors[exp], alpha=0.8, edgecolor='white'),
                    arrowprops=dict(
                        arrowstyle="->", 
                        color=colors[exp], 
                        lw=2,
                        connectionstyle="arc3,rad=0.2"  # Curved arrow
                    ),
                    fontsize=15,
                    fontweight='bold',
                    color='white'
                )
        
        plt.grid(True, alpha=0.3)
        plt.xlim(clip_min_x, extended_max)
        plt.ylim(0, 1.05)
        
        # # Add annotation about the extension
        # plt.text(0.98, 0.02, f"Extended to max + {extension_factor*100:.0f}%", 
        #         transform=plt.gca().transAxes, ha='right', va='bottom',
        #         bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.8),
        #         fontsize=10)

def plot_porosity_distribution(df_clicks, data_field="porosity", sz_kind="small", cut_kind="full", 
                               source_image_key="2-SMC-1-AL_72,99m_n=_2,5x_cesc.jpg",
                                       show_extension_lines=True, extension_factor=0.1,
                                       extension_style='--', extension_alpha=0.7,
                                       exp_levels=None,
                                       title=None, x_label=None,
                                       clip_min_x=0, plot_width=10, plot_height=6,
                                       remove_y_axis=False):
    """
    Plot one empirical CDF of ``data_field`` per experience level, with a legend.

    Rows of ``df_clicks`` are kept when ``canceled`` is False and ``sz_kind`` /
    ``cut_kind`` match the arguments. Each experience level's curve can be
    continued with a horizontal line at probability 1.0 up to the right edge of
    the plot, so that all curves end at the same x position. The figure is
    produced inside a ``MyPlot`` context targeting
    ``images/cdf_<data_field>_by_experience_sz=<sz_kind>_cut=<cut_kind>_extended.pdf``.

    Parameters:
    - df_clicks: DataFrame with at least the columns ``canceled``, ``sz_kind``,
      ``cut_kind``, ``source_image_key``, ``experience`` and ``data_field``
    - data_field: Name of the numeric column whose distribution is plotted
    - sz_kind: Value of the ``sz_kind`` column to keep
    - cut_kind: Value of the ``cut_kind`` column to keep
    - source_image_key: When None, rows with a missing ``source_image_key`` are
      dropped. Any other value is currently ignored (see the review note in the body)
    - show_extension_lines: Whether to show dashed lines extending the CDF
    - extension_factor: How much to extend beyond max value (0.1 = 10%)
    - extension_style: Line style for extensions ('--', ':', '-', etc.)
    - extension_alpha: Transparency of extension lines
    - exp_levels: Experience levels to plot, in order; defaults to [2, 3, 4, 5]
    - title: Plot title; defaults to ``lr.str.cdf_porosity_by_experience``
    - x_label: X axis label; defaults to ``lr.str.porosity``
    - clip_min_x: Lower bound of the x axis
    - plot_width, plot_height: Figure size passed to ``MyPlot`` as ``figsize``
    - remove_y_axis: Hide the y axis label, tick labels and tick marks

    Returns None; the function only draws.
    """
    row_mask = ((df_clicks["canceled"] == False)
                & (df_clicks["sz_kind"] == sz_kind)
                & (df_clicks["cut_kind"] == cut_kind))
    if source_image_key is None:
        row_mask = row_mask & df_clicks["source_image_key"].notna()
    # NOTE(review): a non-None source_image_key is not applied as a filter, so the
    # default image key has no effect on the selected rows. Confirm whether an
    # equality filter on "source_image_key" was intended before changing this,
    # because existing calls that rely on the default would plot different data.

    df_work = df_clicks[row_mask]

    if exp_levels is None:
        exp_levels = [2, 3, 4, 5]

    name = f"cdf_{data_field}_by_experience_sz={sz_kind}_cut={cut_kind}_extended"
    print(f"FILE_NAME = images/{name}.pdf")
    with MyPlot(f"images/{name}.pdf", figsize=[plot_width, plot_height]):

        print(f"sz={sz_kind}\ncut={cut_kind}")

        plt.figure()
        ax = plt.gca()

        # Right edge of the plot: overall maximum plus a relative margin.
        # Missing values are dropped first so that an all-NaN column falls back
        # to 1.0 instead of producing a NaN axis limit.
        all_values = df_work[data_field].dropna()
        global_max = all_values.max() if len(all_values) > 0 else 1.0
        extended_max = global_max * (1 + extension_factor)

        plotted_any = False
        for exp in exp_levels:
            # The ECDF ignores missing values anyway; dropping them here keeps
            # the emptiness check and the colour lookup below in sync with what
            # is actually drawn.
            level_values = df_work.loc[df_work["experience"] == exp, data_field].dropna()
            if len(level_values) == 0:
                continue

            sns.ecdfplot(level_values, label=lr.str.experience_level(exp), linewidth=3, ax=ax)
            plotted_any = True

            if show_extension_lines:
                # Reuse the colour of the curve just drawn for its extension.
                curve_color = ax.lines[-1].get_color()
                ax.plot([level_values.max(), extended_max], [1.0, 1.0],
                        color=curve_color, linewidth=3, linestyle=extension_style,
                        alpha=extension_alpha)

        if global_max > 10000:
            # Large values are shown as e.g. "25k" to keep tick labels short.
            def thousands_formatter(x, pos):
                return f"{int(x/1000)}k"
            ax.xaxis.set_major_formatter(ticker.FuncFormatter(thousands_formatter))

        ax.set_xlabel(x_label if x_label is not None else lr.str.porosity)
        ax.set_ylabel(lr.str.cumulative_probability)
        ax.set_title(title if title is not None else f"{lr.str.cdf_porosity_by_experience}")

        if remove_y_axis:
            ax.set_ylabel("")
            ax.tick_params(axis='y', which='both', left=False, labelleft=False, length=0)

        # Traditional legend; skipped when nothing was drawn to avoid the
        # "no artists with labels" warning.
        if plotted_any:
            ax.legend()

        ax.grid(True, alpha=0.3)
        ax.set_xlim(clip_min_x, extended_max)
        ax.set_ylim(0, 1.05)
        

In [None]:
# Porosity CDFs with arrow labels, one figure per (sz_kind, cut_kind) combination.
# Each dict holds the keyword arguments of one call; the figures are drawn in list order.
_porosity_arrow_plots = [
    dict(
        sz_kind="small", cut_kind="full", exp_levels=[2, 3, 5],
        legends_offsets={2: (-0.12, 0.1), 3: (-0.14, 0.15), 5: (-0.1, 0.1)},
        arrow_locs={2: 0.2, 3: 0.4, 5: 0.8},
        extension_factor=0.05, clip_min_x=0.08, plot_width=8,
    ),
    dict(
        sz_kind="big", cut_kind="full", exp_levels=[2, 3, 5],
        legends_offsets={2: (-0.10, 0.1), 3: (-0.14, 0.15), 5: (-0.1, 0.1)},
        arrow_locs={2: 0.2, 3: 0.4, 5: 0.8},
        extension_factor=0.05, clip_min_x=0.1, plot_width=8,
    ),
    dict(
        sz_kind="small", cut_kind="cropped",
        legends_offsets={2: (-0.18, 0.1), 3: (-0.15, 0.15), 4: (-0.2, -0.05), 5: (-0.02, -0.35)},
        arrow_locs={2: 0.2, 3: 0.4, 4: 0.8, 5: 0.5},
    ),
    dict(
        sz_kind="big", cut_kind="cropped",
        legends_offsets={2: (-0.16, 0.1), 3: (-0.15, 0.15), 4: (-0.2, -0.05), 5: (-0.02, -0.35)},
        arrow_locs={2: 0.1, 3: 0.4, 4: 0.8, 5: 0.5},
    ),
]

for _plot_kwargs in _porosity_arrow_plots:
    plot_porosity_distribution_arrows(df_clicks, **_plot_kwargs)

In [None]:
# Inspect the pore_count column, which the following cells plot as a CDF.
df_clicks["pore_count"]

In [None]:
# Pore-count CDF with arrow labels for the small/full variant; the y axis is hidden.
# arrow_locs has no entry for experience level 3 — presumably the function then
# falls back to its own default position (verify against its definition).
plot_porosity_distribution_arrows(
    df_clicks,
    data_field="pore_count",
    sz_kind="small",
    cut_kind="full",
    exp_levels=[2, 3, 5],
    legends_offsets={2: (-600, 0.1), 3: (-800, 0.15), 5: (-800, 0.1)},
    arrow_locs={2: 0.4, 5: 0.8},
    title=lr.str.cdf_pore_count_by_experience,
    x_label=lr.str.pore_count,
    clip_min_x=600,
    extension_factor=0.05,
    plot_width=7,
    remove_y_axis=True,
)


In [None]:
# Pore-count CDF with arrow labels for the big/full variant.
# arrow_locs has no entry for experience level 3 — presumably the function then
# falls back to its own default position (verify against its definition).
plot_porosity_distribution_arrows(
    df_clicks,
    data_field="pore_count",
    sz_kind="big",
    cut_kind="full",
    exp_levels=[2, 3, 5],
    legends_offsets={2: (-20000, 0.1), 3: (-32000, 0.15), 5: (-40000, 0.1)},
    arrow_locs={2: 0.2, 5: 0.8},
    title=lr.str.cdf_pore_count_by_experience,
    x_label=lr.str.pore_count,
    clip_min_x=20000,
    extension_factor=0.05,
    plot_width=8,
)


In [None]:
# Hand-built empirical CDF of porosity for experience level 4
# (non-canceled clicks, small size, full cut).
# Rows are selected in a single .loc step, and missing porosity values are
# dropped so that they are not counted in n (they would not be drawn, which
# would otherwise skew the cumulative probabilities).
items_for_cdf1 = (
    df_clicks.loc[
        (df_clicks["experience"] == 4)
        & (df_clicks["canceled"] == False)
        & (df_clicks["sz_kind"] == "small")
        & (df_clicks["cut_kind"] == "full"),
        "porosity",
    ]
    .dropna()
    .sort_values()
)

n = len(items_for_cdf1)
# i-th smallest value gets cumulative probability i / n
cdf = np.arange(1, n+1) / n

plt.plot(items_for_cdf1, cdf, marker='.', linestyle='none')
plt.xlabel(lr.str.porosity)
plt.ylabel(lr.str.cumulative_probability)
plt.show()

In [None]:
# Porosity CDFs with a regular legend, one figure per (cut_kind, sz_kind) pair.
# source_image_key=None drops rows without a source image key.
# Only the "full" cut overrides the extension/axis defaults; "cropped" uses the
# function defaults.
_full_cut_kwargs = dict(extension_factor=0.05, clip_min_x=0.0, plot_width=8)

for _cut, _extra_kwargs in (("full", _full_cut_kwargs), ("cropped", {})):
    for _sz in ("small", "big"):
        plot_porosity_distribution(
            df_clicks,
            sz_kind=_sz,
            cut_kind=_cut,
            source_image_key=None,
            **_extra_kwargs,
        )


In [None]:
# Pore-count CDF with a regular legend for the small/full variant; the y axis is hidden.
# exp_levels is not passed, so the function's default levels are plotted.
plot_porosity_distribution(
    df_clicks,
    data_field="pore_count",
    sz_kind="small",
    cut_kind="full",
    title=lr.str.cdf_pore_count_by_experience,
    x_label=lr.str.pore_count,
    clip_min_x=0,
    extension_factor=0.05,
    plot_width=7,
    remove_y_axis=True,
)


In [None]:
# Pore-count CDF with a regular legend for the big/full variant,
# restricted to experience levels 2, 3 and 5.
plot_porosity_distribution(
    df_clicks,
    data_field="pore_count",
    sz_kind="big",
    cut_kind="full",
    exp_levels=[2, 3, 5],
    title=lr.str.cdf_pore_count_by_experience,
    x_label=lr.str.pore_count,
    clip_min_x=20000,
    extension_factor=0.05,
    plot_width=8,
)
