In [45]:

"""
FlowCAM → EcoTaxa converter (single sample version)
---------------------------------------------------
This script processes a single FlowCAM sample folder containing:
 - one .lst file
 - corresponding image files

It reads the sample metadata from an Excel file, injects it into the EcoTaxa
object table, and outputs a zipped EcoTaxa-compatible archive (.zip)
containing the .tsv file and images.

Before running:
1. Install dependencies:
   pip install morphocut pandas openpyxl tqdm

2. Edit the paths below to match your setup.
"""
# OK, so here it is - my final code, made after many hours of talking with ChatGPT : )
# It should work, but only for one sample per run.
# Three paths must be defined: the sample folder, the output file for the final product, and the metadata Excel file.
# Remember to prepare a proper Excel sheet with metadata in EcoTaxa style (object_; sample_; acq_; etc.).
#author: Adam Makatun

'\nFlowCAM → EcoTaxa converter (single sample version)\n---------------------------------------------------\nThis script processes a single FlowCAM sample folder containing:\n - one .lst file\n - corresponding image files\n\nIt reads the sample metadata from an Excel file, injects it into the EcoTaxa\nobject table, and outputs a zipped EcoTaxa-compatible archive (.zip)\ncontaining the .tsv file and images.\n\nBefore running:\n1. Install dependencies:\n   pip install morphocut pandas openpyxl tqdm\n\n2. Edit the paths below to match your setup.\n'

In [30]:
from pathlib import Path
import pandas as pd
import os
from morphocut.core import Pipeline
from morphocut.file import Find
from morphocut.integration.flowcam import FlowCamReader
from morphocut.image import RGB2Gray, ImageProperties
from morphocut.stream import TQDM
from morphocut.str import Format
from morphocut.contrib.ecotaxa import EcotaxaWriter
from morphocut.contrib.zooprocess import CalculateZooProcessFeatures
import zipfile
from io import BytesIO

In [16]:

# === USER SETTINGS ==========================================================
# All three locations live under one working directory; they are spelled out
# relative to it so the common prefix is written only once.
base_dir = Path("F:/FlowCam/testujemy")

# Folder holding exactly one FlowCAM sample (the .lst file plus its images)
input_folder = base_dir / "FC_file"

# Excel workbook with the sample metadata
metadata_file = base_dir / "meta.xlsx"

# Destination archive (.zip with the EcoTaxa .tsv and the images)
output_file = base_dir / "done" / "test.zip"

# ============================================================================


In [17]:
# --- Load metadata and create a dictionary ----------------------------------
# Reads the metadata workbook, locates the sample's .lst file, and picks the
# metadata row whose `sample_id` equals the .lst file name (without extension).
# Produces: metadata_df, lst_file, sample_id, meta_row, metadata_dict.
metadata_df = pd.read_excel(metadata_file)

# Ensure consistent column naming (Excel headers often carry stray spaces)
metadata_df.columns = [str(c).strip() for c in metadata_df.columns]
if "sample_id" not in metadata_df.columns:
    # Fail with an actionable message instead of a bare KeyError further down
    raise KeyError(
        f"Column 'sample_id' not found in {metadata_file}; "
        f"available columns: {list(metadata_df.columns)}"
    )

# --- Detect .lst file and extract sample_id ---------------------------------
# glob() yields entries in arbitrary, OS-dependent order; sorting makes
# "the first one" reproducible between runs and machines.
lst_files = sorted(input_folder.glob("*.lst"))
if len(lst_files) == 0:
    raise FileNotFoundError(f"No .lst file found in {input_folder}")
if len(lst_files) > 1:
    print(f"⚠️ Multiple .lst files found, using the first one: {lst_files[0].name}")

lst_file = lst_files[0]
sample_id = lst_file.stem  # filename without extension

# --- Extract matching metadata row ------------------------------------------
# Compare as stripped text: Excel may store ids as numbers or with padding,
# while the id taken from the file name is always a plain string.
sample_keys = metadata_df["sample_id"].astype(str).str.strip()
meta_row = metadata_df.loc[sample_keys == sample_id]
if meta_row.empty:
    raise ValueError(f"Sample ID '{sample_id}' not found in metadata Excel.")
if len(meta_row) > 1:
    # Same policy as for multiple .lst files: warn, then take the first match
    print(f"⚠️ {len(meta_row)} metadata rows match '{sample_id}', using the first one")
metadata_dict = meta_row.iloc[0].to_dict()

print(f"Processing sample: {sample_id}")
print(f"Matched metadata: {metadata_dict}")


Processing sample: ZGG_20-0m_FC300_2
Matched metadata: {'sample_id': 'ZGG_20-0m_FC300_2', 'sample_ship': 'r/v Oceanograf', 'sample_operator': 'Adam Makatun', 'sample_sampling_gear': 'MultiNet/100', 'sample_season': 'summer', 'sample_total_volume_m3': 9, 'sample_concentrated_sample_volume_ml': 25, 'sample_dilution_factor': 0.5, 'acq_instrument': 'FlowCAM VS-IV', 'acq_celltype': 'FC300', 'acq_volume_ml': 4.194, 'acq_imaged_volume_ml': 1.7437, 'object_lat': 54.8336886, 'object_lon': 19.29635, 'acq_id': 1}


In [25]:
# The archive's destination directory must exist before the writer opens the
# .zip; create it (and any missing parents), tolerating an existing directory.
output_dir = output_file.parent
output_dir.mkdir(parents=True, exist_ok=True)


In [41]:
# --- Run MorphoCut pipeline --------------------------------------------------
# Declares the processing graph inside `with Pipeline() as p:` and executes it
# afterwards with `p.run()`. Inputs come from `input_folder`; the result is
# written to `output_file`. The Excel metadata is NOT added here - a later
# cell patches it into the .tsv inside the archive.
# NOTE(review): presumably MorphoCut runs the nodes in the order they are
# created in this block - keep the statement order when editing; confirm
# against the MorphoCut documentation.
with Pipeline() as p:
    # Find the .lst file (only one expected)
    # NOTE(review): Find is given the folder and the ".lst" extension, not the
    # single `lst_file` chosen in the earlier cell - if the folder holds several
    # .lst files this presumably processes all of them; verify.
    lst_fn = Find(str(input_folder), [".lst"])

    # Read FlowCAM data
    obj = FlowCamReader(lst_fn)

    # Extract image and mask
    img = obj.image
    mask = obj.mask
    # Grayscale version, used only for the region properties below.
    # TODO confirm the meaning of the second positional argument (True).
    img_gray = RGB2Gray(img, True)

    # Per-object metadata as exposed by the reader (a reference, not a copy:
    # the assignment to object_meta["id"] below acts on obj.data)
    object_meta = obj.data

    # Construct object ID as "<lst_name>_<id>"; the recorded run produced ids
    # such as "ZGG_20-0m_FC300_2_10000". The `id` field is taken from
    # object_meta via `_kwargs`.
    object_id = Format("{lst_name}_{id}", lst_name=obj.lst_name, _kwargs=object_meta)
    object_meta["id"] = object_id

    # Extract region properties (size, shape, etc.)
    regionprops = ImageProperties(mask, img_gray)

    # Calculate ZooProcess-like features; the result replaces object_meta
    # (presumably the features merged into the metadata passed in - confirm)
    object_meta = CalculateZooProcessFeatures(regionprops, object_meta)


    # Write to EcoTaxa .zip: one "<object_id>.jpg" entry per object (the
    # original `img`, not the grayscale one) plus the metadata table
    EcotaxaWriter(
        str(output_file),
        [(Format("{object_id}.jpg", object_id=object_id), img)],
        object_meta=object_meta,
    )

    # Add progress bar (labelled with the current object id)
    TQDM(object_id)

# Execute the pipeline
p.run()

print(f"\n✅ Done! EcoTaxa archive created at:\n{output_file}")

ZGG_20-0m_FC300_2_10000: 100%|████████████████████████████████████████████████████| 10.0k/10.0k [01:42<00:00, 97.4it/s]

EcotaxaWriter: Wrote 10,000 entries to ecotaxa_export.tsv.
EcotaxaWriter: Wrote 10,000 objects to F:\FlowCam\testujemy\done\test.zip.

✅ Done! EcoTaxa archive created at:
F:\FlowCam\testujemy\done\test.zip





In [42]:
# Now we need to add the metadata to the .tsv file (unfortunately, this could
# not be done inside the MorphoCut pipeline - somehow it is not working there).

# Paths: reuse the values from the USER SETTINGS cell instead of repeating the
# literals, so editing the settings cannot leave this step pointing at an old
# archive or workbook. `metadata_file` is deliberately not reassigned.
zip_path = output_file

# Load Excel metadata, clean the column names the same way as the first load,
# and index the rows by sample_id.
meta_df = pd.read_excel(metadata_file)
meta_df.columns = [str(c).strip() for c in meta_df.columns]
meta_df = meta_df.set_index("sample_id")

# The sample id is later recovered from `object_id` text, so the index must be
# text as well (Excel may have stored the ids as numbers or with padding).
meta_df.index = meta_df.index.astype(str).str.strip()

In [43]:
# --- Inject the Excel metadata into the EcoTaxa .tsv inside the archive -----
# Steps: read the .tsv from `zip_path`, drop `object_label`, add `sample_id`
# plus one column per field of the matching `meta_df` row, then rebuild the
# archive. Safe to re-run: existing columns are overwritten, never duplicated.


def _ecotaxa_type_tag(value, source_column=None):
    """Return the type-row tag for a metadata value: "[f]" numeric, "[t]" text.

    Booleans and other non-numeric objects (e.g. timestamps) count as text.
    For a missing value the tag is taken from the dtype of `source_column`
    (the Excel column it came from), so an empty cell in a numeric column
    stays "[f]"; without a column, a missing value is tagged "[t]".
    """
    if pd.isna(value):
        numeric = (
            source_column is not None
            and pd.api.types.is_numeric_dtype(source_column)
            and not pd.api.types.is_bool_dtype(source_column)
        )
    else:
        numeric = pd.api.types.is_number(value) and not pd.api.types.is_bool(value)
    return "[f]" if numeric else "[t]"


# Read only the table now; images are streamed during the rebuild below
# instead of being held in memory all at once.
with zipfile.ZipFile(zip_path, "r") as z:
    tsv_candidates = [f for f in z.namelist() if f.endswith(".tsv")]
    if not tsv_candidates:
        raise FileNotFoundError(f"No .tsv file found inside {zip_path}")
    tsv_name = tsv_candidates[0]
    tsv_bytes = z.read(tsv_name)

# Handle the table as text to keep the two header rows (names + type tags).
# Split on "\n" only: str.splitlines() also breaks on characters such as form
# feed or U+2028, which would cut a row in two if they occur inside a field.
lines = [ln.rstrip("\r") for ln in tsv_bytes.decode("utf-8").split("\n")]
lines = [ln for ln in lines if ln]
if len(lines) < 3:
    raise ValueError(f"'{tsv_name}' has no object rows (expected names, types and data)")
header = lines[0].split("\t")
types = lines[1].split("\t")
data_rows = [ln.split("\t") for ln in lines[2:]]

# Every row must be as wide as the header, otherwise columns would shift
for line_no, row in enumerate(data_rows, start=3):
    if len(row) != len(header):
        raise ValueError(
            f"'{tsv_name}' line {line_no}: expected {len(header)} fields, found {len(row)}"
        )
if "object_id" not in header:
    raise KeyError(f"'{tsv_name}' has no 'object_id' column")

# Remove 'object_label' if present
if "object_label" in header:
    idx = header.index("object_label")
    header.pop(idx)
    types.pop(idx)
    data_rows = [row[:idx] + row[idx + 1:] for row in data_rows]

# Create DataFrame (all values stay strings)
df = pd.DataFrame(data_rows, columns=header)

# Extract sample_id from object_id (assumes format: sampleID_something)
sample_id = df["object_id"].iloc[0].rsplit("_", 1)[0]
print("Detected sample ID:", sample_id)

# Get Excel metadata for this sample
if sample_id not in meta_df.index:
    raise ValueError(f"Sample ID '{sample_id}' not found in metadata Excel")
matching_rows = meta_df.loc[[sample_id]]  # list selector -> always a DataFrame
if len(matching_rows) > 1:
    print(f"⚠️ {len(matching_rows)} metadata rows match '{sample_id}', using the first one")
new_metadata = matching_rows.iloc[0].to_dict()

# Columns to write: sample_id (always text) followed by every Excel field
new_columns = [("sample_id", sample_id, "[t]")]
for col, val in new_metadata.items():
    new_columns.append((str(col).strip(), val, _ecotaxa_type_tag(val, meta_df[col])))

for name, val, tag in new_columns:
    # Missing Excel cells become empty fields rather than the text "nan"
    df[name] = "" if pd.isna(val) else str(val)
    if name in header:
        # Column already present (re-run, or name shared with the pipeline
        # output): the DataFrame column was overwritten in place, so only the
        # type tag needs updating - appending would desynchronise the header.
        types[header.index(name)] = tag
    else:
        header.append(name)
        types.append(tag)

# Serialise: names row, types row, then one tab-joined line per object
table_lines = ["\t".join(header), "\t".join(types)]
table_lines.extend("\t".join(row) for row in df.itertuples(index=False, name=None))
updated_tsv = ("\n".join(table_lines) + "\n").encode("utf-8")

# Rebuild the archive next to the original and swap it in only once it is
# complete, so a failure mid-write cannot destroy the existing archive.
# Passing each ZipInfo through keeps the entry's timestamp and compression.
archive_path = Path(zip_path)
tmp_path = archive_path.with_name(archive_path.name + ".tmp")
try:
    with zipfile.ZipFile(archive_path, "r") as src, zipfile.ZipFile(tmp_path, "w") as dst:
        for info in src.infolist():
            payload = updated_tsv if info.filename == tsv_name else src.read(info)
            dst.writestr(info, payload)
    os.replace(tmp_path, archive_path)
finally:
    # Only still present when something failed before the swap
    if tmp_path.exists():
        tmp_path.unlink()

print(f"✅ Updated TSV inside zip: {zip_path}")


Detected sample ID: ZGG_20-0m_FC300_2
✅ Updated TSV inside zip: F:/FlowCam/testujemy/done/test.zip
