# Feature extraction
Converted from `01_feature_extraction.py`.

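This notebook runs the feature extraction pipeline: loading mask files, computing embeddings, extracting image features, and computing CLIP-based text similarities. The cells shown below cover segmentation: butterflies are detected in the sample images with GroundedSAM, and the filtered detections are written to `data_raw/segmentation_masks/all_masks`.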

## Imports

In [None]:
import copy
import cv2
import os
import numpy as np
import pandas as pd 
import sys
from collections import Counter, defaultdict
from tqdm import tqdm
import warnings

from autodistill_grounded_sam import GroundedSAM
from autodistill.detection import CaptionOntology
import torch

# local utils
from utils import utils_python

## Prepare 

In [None]:
## make the project folder the working directory so that the relative
## paths used in the following cells resolve
os.chdir("D:/git-repos/mluerig/nymphalid-phenomics/")

# optional: silence all warnings (there will be a lot of them otherwise)
warnings.filterwarnings(action="ignore")

## number of CUDA devices visible to torch (shown as the cell output)
torch.cuda.device_count()

## Get files

In [None]:
## folder with the input images, relative to the working directory
input_dir = "data_raw/images_sample"

## one record per directory entry, keyed by file name.
## os.listdir returns entries in arbitrary order, so they are sorted here to
## keep the row order (and with it the processing order) reproducible.
file_dict = {}
for file_name in sorted(os.listdir(input_dir)):
    filepath = os.path.join(input_dir, file_name)
    file_dict[file_name] = {
        "image_name": file_name,
        "image_path": filepath,
    }

## table with the columns "image_name" and "image_path" and a plain
## 0..n-1 index (the file-name index from the dict is dropped)
data_imgs = pd.DataFrame.from_dict(file_dict, orient="index")
data_imgs.reset_index(inplace=True, drop=True)

data_imgs

In [None]:
## results accumulator: a dict (not a set) that the segmentation loop fills
## with one entry per detection, or per image when nothing was detected
dict_results = {}

## file I/O
## output_dir receives the saved detections; path_data_results is not used in
## the cells shown here (presumably the results table is written there later)
output_dir = "data_raw/segmentation_masks/all_masks"
path_data_results = "data_raw/tables/segmentation_results.pkl"

#%% model setup
## GroundedSAM with a single prompt/label pair ("butterfly") and low
## box/text thresholds of 0.1
model = GroundedSAM(
    ontology=CaptionOntology({"butterfly": "butterfly"}),
    box_threshold=0.1,
    text_threshold=0.1,
)


In [None]:
## Segment every image listed in data_imgs and collect per-detection info in
## dict_results. Entries are keyed by the detection file name
## ("<image>_<n>.png"), or by the image base name when the image could not be
## loaded or produced no detections.

## minimum mask area passed to filter_mask; this may need to be adjusted based
## on image resolution, but you do probably want to filter out small masks
min_area = 10_000

## cv2.imwrite does not create missing folders - it just fails to write -
## so make sure the output folder exists before the loop starts
os.makedirs(output_dir, exist_ok=True)

pbar = tqdm(total=len(data_imgs), position=0, leave=False, desc="Segmenting images")
for idx1, row in data_imgs.iterrows():

    image_name = row["image_name"]
    base_image_name = os.path.splitext(image_name)[0]

    # Load image. cv2.imread signals an unreadable file by returning None, so
    # check for that explicitly (an assert would be stripped under `python -O`).
    # Any exception raised while loading is treated the same way.
    try:
        image = cv2.imread(row["image_path"])
    except Exception:
        image = None
    if image is None:
        # unreadable images are recorded with the same status as images
        # without detections (kept as-is so downstream filters keep working)
        dict_results[base_image_name] = {"status": "no detections", "image_name": image_name}
        pbar.update(1)
        continue

    ## do prediction
    result = model.predict(image)

    # No masks: add a placeholder entry keyed by the image itself.
    # (Keying by detection_name here would raise a NameError on the first
    # image, or overwrite the last detection of the previous image.)
    if len(result.mask) == 0:
        dict_results[base_image_name] = {"status": "no detections", "image_name": image_name}
        pbar.update(1)
        continue

    for idx2, (area, mask) in enumerate(zip(result.area, result.mask)):
        detection_name = base_image_name + f"_{idx2+1}.png"

        # Filter mask and save. filter_mask is expected to return roi=None
        # (and, per the original note, an empty info dict) for masks below
        # min_area - TODO confirm against utils_python
        roi, info = utils_python.filter_mask(image, mask, min_area)
        if roi is not None:
            cv2.imwrite(os.path.join(output_dir, detection_name), roi)

        # Store info (also for detections that were filtered out)
        info["confidence"] = result.confidence[idx2]
        info["area"] = area
        info["mask_idx"] = idx2 + 1  # 1-based, matches the file name suffix
        info["image_name"] = image_name
        dict_results[detection_name] = info

    pbar.update(1)

## release the progress bar once all images are processed
pbar.close()


## Post-processing

What follows is an extensive post-processing pipeline, which is explained in the methods section.