In [1]:
# Install the third-party packages used by the cells below.
# NOTE(review): no versions are pinned, so a fresh run may resolve different
# releases than the ones shown in the recorded output.
! pip install mrcfile
! pip install -U cryoet-data-portal
! pip install matplotlib
! pip install groundingdino-py
! pip install scikit-learn
! pip install awscli
! pip install --upgrade urllib3

Collecting mrcfile
  Downloading mrcfile-1.5.3-py2.py3-none-any.whl.metadata (6.9 kB)
Downloading mrcfile-1.5.3-py2.py3-none-any.whl (44 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: mrcfile
Successfully installed mrcfile-1.5.3
Collecting cryoet-data-portal
  Downloading cryoet_data_portal-4.2.1-py3-none-any.whl.metadata (2.0 kB)
Collecting boto3 (from cryoet-data-portal)
  Downloading boto3-1.35.81-py3-none-any.whl.metadata (6.7 kB)
Collecting deepmerge (from cryoet-data-portal)
  Downloading deepmerge-2.0-py3-none-any.whl.metadata (3.5 kB)
Collecting gql[requests] (from cryoet-data-portal)
  Downloading gql-3.5.0-py2.py3-none-any.whl.metadata (9.2 kB)
Collecting strcase (from cryoet-data-portal)
  Downloading strcase-1.0.0-py3-none-any.whl.metadata (1.2 kB)
Colle

In [57]:
# Core imports
import argparse
import json
import os
import subprocess
from pathlib import Path

# Data handling imports
import cryoet_data_portal as portal
import matplotlib.pyplot as plt
import mrcfile
import numpy as np
import requests
from PIL import Image
from sklearn.model_selection import train_test_split

# CryoET Data Portal client: a single module-level instance that the lookup
# helpers in this cell pass to every `portal.*.find(...)` call.
# NOTE(review): constructed with default arguments - presumably this targets
# the public portal API; confirm.
client = portal.Client()


# Data Portal Interaction Functions
def find_dataset_by_id(dataset_id):
    """Find a dataset by its ID.

    Args:
        dataset_id: Portal ID of the dataset to look up.

    Returns:
        The first dataset returned by the portal query for ``dataset_id``.

    Raises:
        IndexError: If the query returns no dataset for ``dataset_id``.
    """
    datasets = portal.Dataset.find(client, [portal.Dataset.id == dataset_id])
    if not datasets:
        # Same exception type as indexing an empty result, but with a message
        # that names the offending ID instead of "list index out of range".
        raise IndexError(f"No dataset found with id {dataset_id!r}")
    return datasets[0]


def get_dataset_to_runs_for_dataset_id(dataset_id):
    """Return a one-entry dict mapping the dataset's ID to its run names.

    Args:
        dataset_id: Portal ID of the dataset to look up.

    Returns:
        dict: ``{dataset.id: [run.name, ...]}`` for the matched dataset.
    """
    matched = find_dataset_by_id(dataset_id)
    run_names = []
    for run in matched.runs:
        run_names.append(run.name)
    return {matched.id: run_names}


def get_run_to_tomograms_for_dataset_id(dataset_id):
    """Return a dict mapping each run name of the dataset to ``run.tomograms``.

    Args:
        dataset_id: Portal ID of the dataset to look up.

    Returns:
        dict: ``{run.name: run.tomograms}`` with one entry per run name.
    """
    matched = find_dataset_by_id(dataset_id)
    tomograms_by_run = {}
    for run in matched.runs:
        tomograms_by_run[run.name] = run.tomograms
    return tomograms_by_run


def get_annotations_for_tomogram(tomogram):
    """Query the portal for annotations, filtering on the tomogram's ID.

    Args:
        tomogram: Object whose ``id`` attribute is used in the filter.

    Returns:
        Whatever ``portal.Annotation.find`` returns for that filter.
    """
    # NOTE(review): the filter is expressed on `portal.Tomogram.id` while the
    # query is issued against `portal.Annotation` - confirm the portal client
    # accepts a filter on a different model here.
    tomogram_filter = portal.Tomogram.id == tomogram.id
    matches = portal.Annotation.find(client, [tomogram_filter])
    return matches


# File Processing Functions
def download_mrc_for_tomogram(dataset_id, tomogram, output_dir):
    """Download the MRC file of a tomogram and return its local path.

    The file is stored as
    ``<output_dir>/<dataset_id>/<run name>/<tomogram id>/<voxel spacing>_downloaded.mrc``.

    Args:
        dataset_id: Dataset ID; only used to build the local directory name.
        tomogram: Tomogram object; ``https_mrc_file``, ``run.name``, ``id`` and
            ``voxel_spacing`` are read from it.
        output_dir: Base directory under which the file is stored.

    Returns:
        str: Path of the downloaded file.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    url = tomogram.https_mrc_file
    dir_name = os.path.join(
        output_dir, str(dataset_id), tomogram.run.name, str(tomogram.id)
    )
    os.makedirs(dir_name, exist_ok=True)
    local_file = os.path.join(dir_name, f"{tomogram.voxel_spacing}_downloaded.mrc")
    partial_file = local_file + ".part"

    # Stream to disk in chunks instead of buffering the whole response in
    # memory, and fail loudly on HTTP errors rather than saving an error page
    # under an .mrc name.
    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        with open(partial_file, "wb") as f:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)

    # Only expose the final name once the download completed, so an
    # interrupted transfer never leaves a truncated *_downloaded.mrc behind.
    os.replace(partial_file, local_file)
    return local_file


def visualize_slice_and_save(mrc_path, z_slice, tomogram_id, output_dir):
    """Render one slice of an MRC file with a colorbar and save it as a PNG.

    Args:
        mrc_path: Path of the MRC file to open.
        z_slice: Index along the first axis of ``mrc.data`` to render.
        tomogram_id: Identifier used in the output file name.
        output_dir: Directory the PNG is written to.

    Returns:
        str: Path of the written image,
        ``<output_dir>/<tomogram_id>_<z_slice>_slice.png``.
    """
    output_path = os.path.join(output_dir, f"{tomogram_id}_{z_slice}_slice.png")
    with mrcfile.open(mrc_path) as mrc:
        layer = mrc.data[z_slice, :, :]
        # Draw on a dedicated figure so a figure that happens to be current in
        # the notebook is neither drawn over nor closed by this helper.
        fig, ax = plt.subplots()
        try:
            image = ax.imshow(layer, cmap="gray")
            fig.colorbar(image, ax=ax)
            ax.set_title(f"Tomogram Slice {z_slice}")
            fig.savefig(output_path)
        finally:
            # Release the figure even when saving fails.
            plt.close(fig)
    return output_path

def sync_annotations(dataset_to_runs, tomograms, output_dir, dataset_id):
    """Build the set of ``aws s3 sync`` command strings that fetch annotations.

    One command is produced per (dataset, run, tomogram voxel spacing); equal
    command strings collapse because the result is a set.

    Args:
        dataset_to_runs: Mapping of dataset ID to a list of run names.
        tomograms: Mapping of run name to an iterable of tomograms; only
            ``voxel_spacing`` is read from each tomogram.
        output_dir: Local base directory used as the sync destination.
        dataset_id: Not used; the dataset IDs come from the keys of
            ``dataset_to_runs``.

    Returns:
        set[str]: Shell command strings (nothing is executed here).
    """
    # NOTE(review): the remote path embeds str(voxel_spacing) as-is - confirm
    # this matches the bucket's directory naming for every spacing value.
    return {
        f"aws s3 --no-sign-request sync s3://cryoet-data-portal-public/{ds_id}/{run}/Reconstructions/VoxelSpacing{tomo.voxel_spacing}/Annotations {output_dir}/{ds_id}/{run}/Annotations"
        for ds_id, runs in dataset_to_runs.items()
        for run in runs
        for tomo in tomograms[run]
    }


def process_and_save_all_mrc_layers(mrc_path, output_dir):
    """Save every layer of an MRC volume as an 8-bit grayscale PNG.

    Each layer (index along the first axis of ``mrc.data``) is min-max scaled
    to 0..255 independently and written next to the MRC file as
    ``<voxel spacing>_<z>_slice.png``, where the voxel-spacing prefix is the
    MRC file name without its ``_downloaded.mrc`` suffix.

    Args:
        mrc_path: Path of the MRC file to read.
        output_dir: Not used; images are written to the directory that
            contains ``mrc_path``.
    """
    mrc_dir = os.path.dirname(mrc_path)
    voxel_spacing = os.path.basename(mrc_path).replace("_downloaded.mrc", "")
    with mrcfile.open(mrc_path) as mrc:
        volume = mrc.data
        for z in range(volume.shape[0]):
            # Work in float64 so integer-typed volumes cannot overflow when
            # the minimum is subtracted.
            layer = np.asarray(volume[z, :, :], dtype=np.float64)
            low = layer.min()
            span = layer.max() - low
            if span > 0:
                scaled = (layer - low) / span
            else:
                # Constant layer: avoid 0/0 (NaN cast to uint8) and emit a
                # uniformly black image instead.
                scaled = np.zeros_like(layer)
            pixels = (scaled * 255).astype(np.uint8)
            output_path = os.path.join(mrc_dir, f"{voxel_spacing}_{z}_slice.png")
            Image.fromarray(pixels).save(output_path)

# COCO Dataset Functions
# args.output_dir, annotation_files
def _collect_categories(all_annotation_files):
    """Return COCO category entries with unique IDs numbered from 1 in first-seen order."""
    categories = []
    seen = set()
    for annotation_files in all_annotation_files.values():
        for name in annotation_files:
            if name not in seen:
                seen.add(name)
                # Number from the global count so IDs stay unique even when
                # runs list different categories or list them in another order.
                categories.append({"id": len(categories) + 1, "name": name})
    return categories


def _load_points(output_dir, anno_file):
    """Parse one .ndjson annotation file into a list of point dicts."""
    path = anno_file
    # Mapping paths may be relative to output_dir rather than to the working
    # directory; fall back to output_dir when the path does not resolve as-is.
    if not os.path.exists(path):
        path = os.path.join(output_dir, anno_file)
    with open(path) as f:
        return [json.loads(line) for line in f if line.strip()]


def create_coco_dataset(output_dir, dataset_id, dataset_to_runs, tomograms, all_annotation_files):
    """Create a COCO-format dataset from slice images and point annotations.

    Images are the ``*_slice.png`` files found under
    ``<output_dir>/<dataset_id>/<run name>/<tomogram id>/``. For every point
    whose ``location.z`` lies within 0.5 of the slice index parsed from the
    file name, a fixed 30x30 box centred on ``location.x`` / ``location.y`` is
    emitted as ``[x_min, y_min, width, height]``. Boxes are not clamped to the
    image bounds.

    Args:
        output_dir: Base directory that holds the slice images.
        dataset_id: Key into ``dataset_to_runs`` and name of the dataset folder.
        dataset_to_runs: Mapping of dataset ID to a list of run names.
        tomograms: Mapping of run name to an iterable of tomograms (``id`` is read).
        all_annotation_files: Mapping of run name to ``{category: ndjson path}``.

    Returns:
        dict: ``{"images": [...], "annotations": [...], "categories": [...]}``.
    """
    # NOTE(review): point coordinates are used directly as pixel positions in
    # the slice image - confirm the ndjson locations are in voxel-index units.
    coco_format = {
        "images": [],
        "annotations": [],
        "categories": _collect_categories(all_annotation_files),
    }
    category_map = {cat["name"]: cat["id"] for cat in coco_format["categories"]}

    box_size = 30
    image_id = 0
    annotation_id = 0

    for run_name in dataset_to_runs[dataset_id]:
        image_dir = os.path.join(output_dir, str(dataset_id), run_name)
        # Parsed lazily on the first image of the run and then reused, instead
        # of re-reading every annotation file for every slice.
        points_by_category = None

        for tomogram in tomograms[run_name]:
            tomogram_slices = Path(image_dir) / f"{tomogram.id}"
            # Sorted so image IDs are reproducible across runs and file systems.
            for img_path in sorted(tomogram_slices.glob("*_slice.png")):
                if points_by_category is None:
                    points_by_category = {
                        category: _load_points(output_dir, anno_file)
                        for category, anno_file in all_annotation_files.get(run_name, {}).items()
                    }

                # Close each image right after reading its size; thousands of
                # slices are visited here.
                with Image.open(img_path) as img:
                    width, height = img.size

                coco_format["images"].append(
                    {
                        "id": image_id,
                        "file_name": f"{tomogram_slices}/{img_path.name}",
                        "width": width,
                        "height": height,
                    }
                )

                # File names look like "<voxel spacing>_<z>_slice.png".
                z_index = int(img_path.stem.split("_")[-2])

                for category, points in points_by_category.items():
                    for point in points:
                        if abs(point["location"]["z"] - z_index) <= 0.5:
                            bbox = [
                                int(point["location"]["x"]) - box_size // 2,
                                int(point["location"]["y"]) - box_size // 2,
                                box_size,
                                box_size,
                            ]
                            coco_format["annotations"].append(
                                {
                                    "id": annotation_id,
                                    "image_id": image_id,
                                    "category_id": category_map[category],
                                    "bbox": bbox,
                                    "area": box_size * box_size,
                                    "iscrowd": 0,
                                }
                            )
                            annotation_id += 1

                image_id += 1

    return coco_format

def create_annotation_mapping(output_dir, dataset_id, run_name):
    """
    Map annotation class names to the .ndjson files of one run.

    Scans ``<output_dir>/<dataset_id>/<run_name>/Annotations/<entry>/*.ndjson``
    for every entry whose name starts with a digit (entries visited in sorted
    order). The class name is the part of the file stem before the first
    hyphen; when several files share a class name, the last one visited wins.

    Args:
        output_dir (str): Base directory where annotations are stored
        dataset_id: Dataset ID, used as a path component
        run_name (str): Run name, used as a path component

    Returns:
        dict: Mapping of class names to .ndjson paths relative to ``output_dir``
    """
    annotations_root = Path(output_dir) / str(dataset_id) / run_name / "Annotations"
    return {
        ndjson.stem.split("-")[0]: str(ndjson.relative_to(output_dir))
        for numbered_dir in sorted(annotations_root.glob("[0-9]*"))
        for ndjson in numbered_dir.glob("*.ndjson")
    }


def process_coco_split(data, prompt_text, clean=True):
    """Return a cleaned copy of one COCO split with the prompt on every annotation.

    Annotations whose ``image_id`` is not among the split's images are
    dropped, any ``"text"`` key is removed from the images, and ``"text"`` is
    set to ``prompt_text`` on every remaining annotation.

    The input is left untouched: the split dicts are usually shallow copies of
    one full dataset, so editing them in place would also alter the image and
    annotation dicts of the full dataset and of the other split.

    Args:
        data: COCO dict with at least ``"images"`` and ``"annotations"``.
        prompt_text: Text prompt stored under ``"text"`` on each annotation.
        clean: Accepted for compatibility; currently has no effect.

    Returns:
        dict: New COCO dict; keys other than ``"images"`` / ``"annotations"``
        are carried over unchanged.
    """
    image_ids = {img["id"] for img in data["images"]}

    images = [
        {key: value for key, value in img.items() if key != "text"}
        for img in data["images"]
    ]
    annotations = [
        {**anno, "text": prompt_text}
        for anno in data["annotations"]
        if anno["image_id"] in image_ids
    ]

    return {**data, "images": images, "annotations": annotations}

In [73]:
# Notebook configuration. These values stand in for the --dataset_id and
# --output_dir command-line arguments this pipeline took when it was a script.
output_dir = "."
dataset_id = 10440
args = argparse.Namespace(dataset_id=dataset_id, output_dir=output_dir)
SPLIT_SEED = 42  # fixed so the train/val split is the same on every run

# Create output directory
os.makedirs(args.output_dir, exist_ok=True)

# Process dataset
dataset_to_runs = get_dataset_to_runs_for_dataset_id(args.dataset_id)
tomograms = get_run_to_tomograms_for_dataset_id(args.dataset_id)

# Sync annotations. A failed sync is reported instead of passing silently,
# because missing annotation files would otherwise only show up later as an
# unexpectedly small dataset.
sync_cmds = sync_annotations(
    dataset_to_runs, tomograms, args.output_dir, args.dataset_id
)
for cmd in sorted(sync_cmds):
    completed = subprocess.run(cmd.split())
    if completed.returncode != 0:
        print(f"WARNING: exit status {completed.returncode} from: {cmd}")

# Download every tomogram and export its layers as PNG slices
for run_name, tomogram_list in tomograms.items():
    for tomogram in tomogram_list:
        mrc_path = download_mrc_for_tomogram(
            args.dataset_id, tomogram, args.output_dir
        )
        process_and_save_all_mrc_layers(mrc_path, args.output_dir)

# Collect the annotation files of every run
all_annotations = {}
for mapped_dataset_id, runs in dataset_to_runs.items():
    for run_name in runs:
        annotation_files = create_annotation_mapping(
            args.output_dir, mapped_dataset_id, run_name
        )
        all_annotations[run_name] = annotation_files

# Create COCO datasets
prompt_text = "Find ferritin complex, beta amylase, beta galactosidase, cytosolic ribosome, thyroglobulin, and virus"
coco_data = create_coco_dataset(
    args.output_dir, args.dataset_id, dataset_to_runs, tomograms, all_annotations
)

# Split into train and val.
# NOTE(review): the split is made per slice image, so slices of the same
# tomogram can end up in both train and val - confirm this is intended.
train_imgs, val_imgs = train_test_split(
    coco_data["images"], test_size=0.2, random_state=SPLIT_SEED
)

# Create train and val datasets
train_data = process_coco_split(dict(coco_data, images=train_imgs), prompt_text)
val_data = process_coco_split(dict(coco_data, images=val_imgs), prompt_text)

# Save final datasets
with open("train_coco.json", "w") as f:
    json.dump(train_data, f, indent=2)
with open("val_coco.json", "w") as f:
    json.dump(val_data, f, indent=2)

# Print statistics
print(f"Training images: {len(train_data['images'])}")
print(f"Training annotations: {len(train_data['annotations'])}")
print(f"Validation images: {len(val_data['images'])}")
print(f"Validation annotations: {len(val_data['annotations'])}")


Training images: 4121
Training annotations: 3936
Validation images: 1031
Validation annotations: 1060


## Start training Grounding DINO


In [75]:
# Mount Google Drive at /content/drive; later cells reference paths under
# /content/drive/MyDrive (e.g. the mmdetection checkout).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [76]:
# Install the CUDA 11.8 builds of torch 2.0.1 / torchvision 0.15.2 together
# with torchaudio 2.0.2 from the PyTorch wheel index.
!pip install torch==2.0.1+cu118 torchvision==0.15.2+cu118 torchaudio==2.0.2 --index-url https://download.pytorch.org/whl/cu118

# Report the installed torch version and whether CUDA is usable.
# NOTE(review): the recorded output shows "cuda: False" - confirm a GPU
# runtime is selected before launching training.
import torch, torchvision
print("torch version:",torch.__version__, "cuda:",torch.cuda.is_available())
# Install OpenMMLab's installer (mim), then mmengine and an mmcv 2.0.x release.
!pip install -U openmim
!mim install "mmengine>=0.7.0"
!mim install "mmcv>=2.0.0,<2.1.0"

# Editable install of the mmdetection checkout stored on Google Drive (the
# clone command is kept commented out for reference).
# ! git clone https://github.com/open-mmlab/mmdetection.git
! pip install -v -e /content/drive/MyDrive/mmdetection
# Pin numpy to 1.23.5.
# NOTE(review): numpy may already be imported in this kernel - presumably a
# runtime restart is needed for the pinned version to take effect; confirm.
! pip install numpy==1.23.5

Looking in indexes: https://download.pytorch.org/whl/cu118
torch version: 2.0.1+cu118 cuda: False
Looking in links: https://download.openmmlab.com/mmcv/dist/cu118/torch2.0.0/index.html
Looking in links: https://download.openmmlab.com/mmcv/dist/cu118/torch2.0.0/index.html
mmcv: 2.0.1
Using pip 24.1.2 from /usr/local/lib/python3.10/dist-packages/pip (python 3.10)
Obtaining file:///content/drive/MyDrive/mmdetection
  Running command python setup.py egg_info
  No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'
  running egg_info
  creating /tmp/pip-pip-egg-info-kds3d8st/mmdet.egg-info
  writing manifest file '/tmp/pip-pip-egg-info-kds3d8st/mmdet.egg-info/SOURCES.txt'
  writing manifest file '/tmp/pip-pip-egg-info-kds3d8st/mmdet.egg-info/SOURCES.txt'
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting terminaltables (from mmdet==3.3.0)
  Obtaining dependency information for terminaltables from https://files.pythonhosted.org/packages/c4/fb/ea621e0a19733e01fe4005d46087d38

In [77]:
from transformers import BertConfig, BertModel
from transformers import AutoTokenizer

# Load the BERT config, model (without pooling layer) and tokenizer, then save
# all three into a local directory of the same name.
# NOTE(review): presumably the fine-tuning config loads the language model
# from this local "bert-base-uncased" directory - confirm.
BERT_NAME = "bert-base-uncased"

config = BertConfig.from_pretrained(BERT_NAME)
model = BertModel.from_pretrained(BERT_NAME, add_pooling_layer=False, config=config)
tokenizer = AutoTokenizer.from_pretrained(BERT_NAME)

for artifact in (config, model):
    artifact.save_pretrained(BERT_NAME)
# Kept as the final expression so the cell still displays the saved file paths.
tokenizer.save_pretrained(BERT_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

('bert-base-uncased/tokenizer_config.json',
 'bert-base-uncased/special_tokens_map.json',
 'bert-base-uncased/vocab.txt',
 'bert-base-uncased/added_tokens.json',
 'bert-base-uncased/tokenizer.json')

In [81]:
import json
import os

def load_coco_json(file_path):
    """Read the JSON file at ``file_path`` and return the parsed object."""
    with open(file_path, 'r') as handle:
        parsed = json.loads(handle.read())
    return parsed

def save_coco_json(data, file_path):
    """Write ``data`` to ``file_path`` as JSON indented by two spaces.

    An existing file at ``file_path`` is overwritten.
    """
    with open(file_path, 'w') as handle:
        json.dump(data, handle, indent=2)

def summarize_coco_data(data):
    """Print how many images, annotations and categories ``data`` holds."""
    image_count = len(data['images'])
    print(f"Images: {image_count}")
    annotation_count = len(data['annotations'])
    print(f"Annotations: {annotation_count}")
    category_count = len(data['categories'])
    print(f"Categories: {category_count}")

def check_first_image_path(data):
    """Print the first image's ``file_name`` and whether that path exists.

    The path is checked as written, i.e. relative paths are resolved against
    the current working directory. A dataset without images is reported
    instead of raising ``IndexError``.
    """
    images = data['images']
    if not images:
        print("No images in dataset; skipping first image path check.")
        return
    first_image_path = images[0]['file_name']
    print(f"First image path: {first_image_path}")
    print(f"Exists?: {os.path.exists(first_image_path)}")

def find_missing_image_ids(data):
    """Print and return the image IDs that annotations reference but that
    have no entry in ``data['images']``."""
    known_ids = {image['id'] for image in data['images']}
    referenced_ids = {annotation['image_id'] for annotation in data['annotations']}
    missing_ids = referenced_ids.difference(known_ids)
    print(f"Missing image IDs: {missing_ids}")
    return missing_ids

def cleanup_annotations(data):
    """Drop annotations whose ``image_id`` has no matching image.

    ``data['annotations']`` is replaced in place with the filtered list, and
    that same list is returned.
    """
    known_ids = {image['id'] for image in data['images']}
    kept = []
    for annotation in data['annotations']:
        if annotation['image_id'] in known_ids:
            kept.append(annotation)
    data['annotations'] = kept
    return kept

def process_coco_file(file_path):
    """Report on a COCO file, drop orphaned annotations and save it back.

    Prints a summary, the first-image path check and the missing image IDs,
    then overwrites ``file_path`` with the cleaned dataset.
    """
    print(f"Processing {file_path}")
    coco = load_coco_json(file_path)
    for report in (summarize_coco_data, check_first_image_path, find_missing_image_ids):
        report(coco)
    kept = cleanup_annotations(coco)
    save_coco_json(coco, file_path)
    print(f"Annotations after cleanup: {len(kept)}")


# Validate and clean the train and validation COCO files in place.
train_file = 'train_coco.json'
val_file = 'val_coco.json'

for coco_file in (train_file, val_file):
    process_coco_file(coco_file)


Processing train_coco.json
Images: 4121
Annotations: 3936
Categories: 6
First image path: 10440/TS_86_3/17035/10.012_82_slice.png
Exists?: True
Missing image IDs: set()
Annotations after cleanup: 3936
Processing val_coco.json
Images: 1031
Annotations: 1060
Categories: 6
First image path: 10440/TS_69_2/17021/10.012_94_slice.png
Exists?: True
Missing image IDs: set()
Annotations after cleanup: 1060


In [82]:
# Download the groundingdino_swint_ogc_mmdet checkpoint from the OpenMMLab
# download server into checkpoints/.
! wget https://download.openmmlab.com/mmdetection/v3.0/grounding_dino/groundingdino_swint_ogc_mmdet-822d7e9d.pth -P checkpoints/

--2024-12-14 02:24:36--  https://download.openmmlab.com/mmdetection/v3.0/grounding_dino/groundingdino_swint_ogc_mmdet-822d7e9d.pth
Resolving download.openmmlab.com (download.openmmlab.com)... 47.246.23.232, 47.246.23.233, 47.246.23.234, ...
Connecting to download.openmmlab.com (download.openmmlab.com)|47.246.23.232|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 691901857 (660M) [application/octet-stream]
Saving to: ‘checkpoints/groundingdino_swint_ogc_mmdet-822d7e9d.pth’


2024-12-14 02:25:37 (10.9 MB/s) - ‘checkpoints/groundingdino_swint_ogc_mmdet-822d7e9d.pth’ saved [691901857/691901857]



In [86]:
# Run mmdetection's browse_dataset tool on finetune_config.py, writing its
# output to inspect_dataset/.
# NOTE(review): finetune_config.py is not created by any cell shown here -
# presumably it must already exist in the working directory; confirm.
! python /content/drive/MyDrive/mmdetection/tools/analysis_tools/browse_dataset.py finetune_config.py --output-dir inspect_dataset

In [None]:
# Launch mmdetection's training script with finetune_config.py.
# NOTE(review): the demo cell further down reads weights from
# /content/work_dirs/finetune_config/ - presumably that is where this run
# writes its checkpoints; confirm.
! python /content/drive/MyDrive/mmdetection/tools/train.py finetune_config.py

In [None]:
import nltk
# Download the NLTK tokenizer ("punkt") and POS-tagger resources, in both the
# plain and the "_tab" / "_eng" named variants.
# NOTE(review): presumably required by the text-prompt handling of the
# mmdetection demo run after this cell - confirm.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

In [None]:
%%bash

# Run mmdetection's image demo on one slice image with the epoch-15 weights
# from work_dirs, prompting for thyroglobulin and keeping predictions that
# score at least 0.05.
python /content/drive/MyDrive/mmdetection/demo/image_demo.py \
    /content/drive/MyDrive/10440_TS_99_9_17042/10.012_58_slice.png \
    /content/finetune_config.py \
    --weights /content/work_dirs/finetune_config/epoch_15.pth \
    --texts 'Find thyroglobulin' \
    --pred-score-thr 0.05