This notebook documents the pipeline for intercellular space segmentation and quantification from transmission electron microscopy (TEM) images, described in the manuscript.


**Workflow**  
1. **Adaptive Local Thresholding & Mask Assembly (via 'segment_intensity_filter.ipynb')**  
   - Images divided into overlapping sliding windows.  
   - Contrast equalization applied within each window.  
   - Local histograms analyzed:  
     - Multimodal → Otsu’s threshold + conservative offset.  
     - Unimodal → Empirical percentile threshold.  
   - Local thresholds constrained against a global percentile threshold to limit artifacts.  
   - Binary masks from overlapping windows combined by union.  
   - This preserved local adaptivity while ensuring global spatial continuity.

2. **Mask Correction (via 'annotate_segmentation.py')**  
   - Masks visualized in *Napari* for QC, with manual correction via custom plugin.  

3. **Quantification (via 'quantify_space_size.ipynb')**  
   - Individual intercellular space sizes measured.  
   - Spaces < 20,000 pixels (≈1.53 μm²) excluded.  
   - Per-image outputs:  
     - List of individual space sizes  
     - Mean space size  
     - Total space count  
   - Condition-level analysis: pooled space-size distributions used to estimate probability density functions.  

**Outputs**  
- Segmentation masks (Napari-compatible)  
- Overlays of raw images with masks for QC  
- Excel tables of space sizes, means, and counts  
- Pooled distributions for statistical analysis  

In [None]:
# imports
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm
from scipy.stats import gaussian_kde

## Step1: convert mask labels to numbers

In [None]:
# Pixels per micron at each magnification, taken from the image metadata.
# A pixel area divided by the square of this value is an area in um^2.
scope_pixel = {
    "1400x": 130.62,
    "1200x": 114.32,
    "440x": 43.69}


def get_area(file, by_scope=False, min_area=20000):
    """Summarise the intercellular-space areas stored in one label CSV.

    The CSV must have an ``area`` column. Its last row is taken as the total
    (reference) area; every earlier row is treated as one segmented space.
    Spaces smaller than ``min_area`` pixels are discarded before the
    statistics are computed.

    Parameters
    ----------
    file : str
        Path to the CSV. The second-to-last underscore-separated token of the
        file name (e.g. ``img01_1200x_labels.csv``) must be a key of
        ``scope_pixel``.
    by_scope : bool, optional
        If True, every returned area is converted from pixels to um^2 using
        the magnification found in the file name. If False (default) the
        areas stay in pixels.
    min_area : float, optional
        Minimum area, in pixels, a space must have to be kept.

    Returns
    -------
    tuple
        ``(space_areas, mean_area, sum_areas, space_ratio, total_space_num,
        total_area)``. ``space_ratio`` is ``sum_areas / total_area`` and is
        unit-free; ``mean_area`` is NaN when no space passes the filter.

    Raises
    ------
    KeyError
        If no known magnification can be read from the file name.
    ValueError
        If the CSV contains no rows.
    """
    name_parts = os.path.basename(file).split("_")
    scope = name_parts[-2] if len(name_parts) >= 2 else None
    if scope not in scope_pixel:
        raise KeyError(
            f"Scope not found in filename: {file} "
            f"(expected one of {list(scope_pixel)})"
        )
    mag = scope_pixel[scope]

    areas = pd.read_csv(file)["area"].to_numpy()
    if areas.size == 0:
        raise ValueError(f"No rows found in {file}")

    # Split off the total row *before* filtering, so the size filter can never
    # drop it and promote an ordinary space to "total".
    total_area = areas[-1]
    candidates = areas[:-1]
    space_areas = candidates[~(candidates < min_area)]

    sum_areas = np.sum(space_areas)
    total_space_num = len(space_areas)
    # Avoid NumPy's mean-of-empty warning; NaN is still the reported value.
    mean_area = np.mean(space_areas) if total_space_num else np.nan
    space_ratio = sum_areas / total_area

    if by_scope:
        px_per_um2 = mag ** 2
        sum_areas = sum_areas / px_per_um2
        mean_area = mean_area / px_per_um2
        space_areas = space_areas / px_per_um2
        total_area = total_area / px_per_um2

    # Areas are in um^2 when by_scope is True, otherwise in pixels.
    return space_areas, mean_area, sum_areas, space_ratio, total_space_num, total_area

In [None]:
# Run get_area on every CSV in one folder and write a per-file summary table.
file_path = "input_folder_name" # folder where csv. locate

summary = []

# Sorted so the rows of the output table come out in a reproducible order.
for f in sorted(os.listdir(file_path)):
    if os.path.splitext(f)[-1] != '.csv':
        continue

    # os.path.join inserts the separator that plain string concatenation
    # omits when file_path has no trailing slash.
    # by_scope=True converts every area from pixels to um^2.
    space_areas, _mean_area, sum_areas, space_ratio, _space_num, total_area = get_area(
        os.path.join(file_path, f), by_scope=True
    )

    # One row of descriptive statistics (count, mean, std, quartiles, ...)
    # for the individual space areas of this file.
    des_trans = pd.DataFrame(space_areas).describe().transpose()

    stats = {
        "sum_areas": [sum_areas],
        "space_ratio": [space_ratio],
        "total_area": [total_area],
        'file_name': [f],
    }
    summary.append(pd.concat([des_trans, pd.DataFrame(stats)], axis=1))

if summary:
    df = pd.concat(summary, ignore_index=True)
    # output containing the parameters of interest per label csv.
    df.to_excel(os.path.join(file_path, "output_excel_name.xlsx"))
else:
    # pd.concat raises on an empty list; report the real cause instead.
    print(f"No .csv files found in {file_path}")

## Step2: visualise space size distribution - probability density function

In [None]:
# Folders shared by the Step-2 and Step-3 cells below.
base_folder = "input_folder" # root folder; each subfolder is read as one condition and holds its label CSVs
output_folder = "output_folder_name" # folder the PDF figures are saved to
# os.listdir(base_folder)  # uncomment to list the contents of base_folder

In [None]:
# === Helper: extract scope from filename ===
def extract_scope_from_filename(filename):
    """Return the first key of ``scope_pixel`` that occurs inside ``filename``.

    Keys are tried in the dictionary's own order and matched as plain
    substrings. A ``ValueError`` is raised when none of them is present.
    """
    matched = next((scope for scope in scope_pixel if scope in filename), None)
    if matched is None:
        raise ValueError(f"Scope not found in filename: {filename}")
    return matched

In [None]:
## calculate and draw plot
# For every condition subfolder of base_folder: pool the space areas (um^2) of
# all its CSVs, estimate a Gaussian KDE, save one PDF plot per condition, then
# save one combined plot with all conditions overlaid.
#
# NOTE(review): unlike get_area, this cell does not drop spaces below
# 20,000 pixels - confirm the CSVs read here are already filtered.

# savefig fails if the target folder does not exist yet.
os.makedirs(output_folder, exist_ok=True)

# Evaluation grid shared by every curve (0-20 um^2, matching the x-axis limits).
x_vals = np.linspace(0, 20, 300)

# === Collect all space area data for combined KDE ===
all_space_area_values = []  # (folder, areas in um^2) per plotted condition
kde_curves = []             # (folder, KDE evaluated on x_vals) per plotted condition

# === Loop through each subfolder ===
# Sorted so legend order and line colours are reproducible between runs.
subfolders = sorted(
    f for f in os.listdir(base_folder)
    if os.path.isdir(os.path.join(base_folder, f))
)

for folder in subfolders:
    folder_path = os.path.join(base_folder, folder)
    folder_areas = []

    for file in os.listdir(folder_path):
        if not file.endswith(".csv"):
            continue
        # Local names chosen so the Step-1 globals `file_path` and `df`
        # are not overwritten by this cell.
        csv_path = os.path.join(folder_path, file)

        try:
            pixel_size = scope_pixel[extract_scope_from_filename(file)]
        except ValueError:
            # Only the "scope not found" error is expected here; anything
            # else (including KeyboardInterrupt) must propagate.
            print(f"⚠️ Skipping {file} — scope not found.")
            continue

        try:
            table = pd.read_csv(csv_path)
            table = table.iloc[:-1]  # Remove last row (e.g. "Total")
            area_pixels = table["area"].values
            folder_areas.extend(area_pixels / (pixel_size ** 2))
        except Exception as e:
            # Best effort: report the unreadable file and carry on.
            print(f"❌ Error reading {file}: {e}")
            continue

    if len(folder_areas) < 3:
        print(f"⚠️ Not enough data in {folder}")
        continue

    folder_areas = np.array(folder_areas)

    # gaussian_kde cannot be fitted to degenerate data (e.g. all values
    # identical); skip that condition instead of aborting the whole run.
    try:
        kde_vals = gaussian_kde(folder_areas)(x_vals)
    except (np.linalg.LinAlgError, ValueError) as e:
        print(f"⚠️ Could not estimate KDE for {folder}: {e}")
        continue

    all_space_area_values.append((folder, folder_areas))
    kde_curves.append((folder, kde_vals))

    # === Plot KDE for this folder ===
    plt.figure(figsize=(4, 5))
    plt.plot(x_vals, kde_vals, color='red', linewidth=2, label='KDE (PDF)')
    plt.xlim(0, 20)
    plt.ylim(0, 0.40)
    plt.xlabel("Space Area (μm²)", fontsize=10)
    plt.ylabel("Probability Density", fontsize=10)
    plt.title(f"PDF of Space Area: {folder}", fontsize=12)
    plt.grid(True, linestyle='--', alpha=0.3)
    plt.tight_layout()

    # Save individual plot
    save_path = os.path.join(output_folder, f"{folder}_PDF_space_area.pdf")
    plt.savefig(save_path, format="pdf")
    plt.close()

# === Combined PDF Plot Across All Subfolders ===
if kde_curves:
    plt.figure(figsize=(6, 4))
    # Reuse the curves computed above rather than fitting each KDE again.
    for folder_name, curve in kde_curves:
        plt.plot(x_vals, curve, linewidth=2, label=folder_name)

    plt.xlabel("Space Area (μm²)", fontsize=11)
    plt.ylabel("Probability Density", fontsize=11)
    plt.title("Combined KDE (PDF) of Space Area", fontsize=13)
    plt.legend(fontsize=9)
    plt.xlim(0, 20)
    plt.ylim(0, 0.40)
    plt.grid(True, linestyle='--', alpha=0.3)
    plt.tight_layout()

    combined_path = os.path.join(output_folder, "PDF_space_size.pdf")
    plt.savefig(combined_path, format="pdf")
    plt.show()

## Step3: visualise space size distribution - violin plot

In [None]:
# === Collect data ===
# One record per segmented space: the condition (subfolder name) and the
# space area converted to um^2.
#
# NOTE(review): unlike get_area, this cell does not drop spaces below
# 20,000 pixels - confirm the CSVs read here are already filtered.
data = []

# Loop through subfolders
subfolders = sorted(
    f for f in os.listdir(base_folder)
    if os.path.isdir(os.path.join(base_folder, f))
)

for folder in subfolders:
    folder_path = os.path.join(base_folder, folder)

    for file in os.listdir(folder_path):
        if not file.endswith('.csv'):
            continue
        # Local names chosen so the Step-1 globals `file_path` and `df`
        # are not overwritten by this cell.
        csv_path = os.path.join(folder_path, file)

        try:
            pixel_size = scope_pixel[extract_scope_from_filename(file)]
        except Exception:
            print(f"⚠️ Skipping {file} — unknown scope.")
            continue

        try:
            table = pd.read_csv(csv_path)
            table = table.iloc[:-1]  # Exclude last row ('total')
            areas_micron = table["area"].values / (pixel_size ** 2)
            data.extend(
                {"condition": folder, "space_area_um2": value}
                for value in areas_micron
            )
        except Exception as e:
            # Best effort: report the unreadable file and carry on.
            print(f"❌ Error in {file}: {e}")
            continue

# === Create DataFrame ===
# Explicit columns keep the frame well-formed even when nothing was collected.
df_plot = pd.DataFrame(data, columns=["condition", "space_area_um2"])
if df_plot.empty:
    raise ValueError(f"No space-area data found under {base_folder}")

# Order of the conditions on the x-axis. `order` was previously used without
# ever being assigned (NameError); default to alphabetical order. Replace
# this with an explicit list of subfolder names to impose a custom order.
order = sorted(df_plot["condition"].unique())

# === Print out datapoints per condition ===
for cond in order:
    n_points = int((df_plot["condition"] == cond).sum())
    print(f"\nCondition: {cond}")
    print(f"Number of points: {n_points}")

# Plot
os.makedirs(output_folder, exist_ok=True)  # savefig needs an existing folder
plt.figure(figsize=(4, 6))

# NOTE(review): `bw=` and `palette=` without `hue=` appear to be deprecated in
# seaborn >= 0.13 - confirm against the installed version before changing.
sns.violinplot(
    data=df_plot,
    x="condition",
    y="space_area_um2",
    inner="box",
    cut=0,
    palette="pastel",
    linewidth=1,
    order=order,
    bw=0.2
)

plt.ylabel("Space Area (μm²)", fontsize=12)
plt.xlabel("Condition", fontsize=12)
plt.title("Violin Plot of Space Area Across Conditions", fontsize=14)
plt.xticks(rotation=45)
plt.grid(True, linestyle='--', alpha=0.3)
plt.tight_layout()

plt.savefig(os.path.join(output_folder, "violin_plot_space_size.pdf"))
plt.show()