In [5]:
import os
import cv2
import json
import torch
import random
import numpy as np
from tqdm import tqdm
from PIL import Image
from glob import glob
import os.path as osp
import torch.nn.functional as F
from sklearn.cluster import KMeans
from torchvision import transforms
from transformers import CLIPModel
from torchvision import transforms
from argparse import ArgumentParser
from utils_dir.backbones_utils import load_backbone, extract_backbone_features, get_backbone_params, load_backbone_and_tokenizer
from utils_dir.coco_to_seg import coco_to_seg

from build_prototypes import build_object_prototypes
from argparse import ArgumentParser

In [6]:
import json
# Absolute, machine-specific path to the annotation JSON that the next cell opens and inspects.
# NOTE(review): presumably a COCO-style file (name says "coco"); consider deriving it from a configurable data dir.
train_coco = '/home/gridsan/manderson/ovdsat/data/dior/train_coco_subset_N5-1.json'

In [11]:
# Read the annotation file whose path is held in `train_coco`.
with open(train_coco, "r") as f:
    data = json.load(f)

# Guard first: without an 'images' entry there is nothing to search.
if "images" not in data:
    print("'images' key not found in JSON")
else:
    print("Found 'images' section")

    # Is there an entry of data["images"] whose "file_name" equals "21589.jpg"?
    exists = "21589.jpg" in [img.get("file_name") for img in data["images"]]

    print("Item exists in images ✅" if exists else "Item not found ❌")

Found 'images' section
Item exists in images ✅


In [10]:
# Command-line style options, registered in a fixed order from a small table.
parser = ArgumentParser()

# (flag, keyword arguments for add_argument) — order here is the order shown in --help.
_option_specs = [
    ('--data_dir', dict(type=str, default='data/simd_subset_10')),
    ('--save_dir', dict(type=str, default='/mnt/ddisk/boux/code/ovdsat/run/classification_benchmark_exp')),
    ('--annotations_file', dict(type=str, default='/mnt/ddisk/boux/code/data/simd/train_coco_subset_N10.json')),
    ('--backbone_type', dict(type=str, default='dinov2')),
    ('--target_size', dict(nargs=2, type=int, metavar=('width', 'height'), default=(602, 602))),
    ('--window_size', dict(type=int, default=224)),
    ('--scale_factor', dict(type=int, default=1)),
    ('--num_b', dict(type=int, default=10, help='Number of background samples to extract per image')),
    ('--k', dict(type=int, default=200, help='Number of background prototypes (clusters for k-means)')),
]
for _flag, _kwargs in _option_specs:
    parser.add_argument(_flag, **_kwargs)

# Boolean switch; kept as the cell's last expression so the notebook still echoes the created action.
parser.add_argument('--store_bg_prototypes', action='store_true', default=False)

_StoreTrueAction(option_strings=['--store_bg_prototypes'], dest='store_bg_prototypes', nargs=0, const=True, default=False, type=None, choices=None, required=False, help=None, metavar=None)

In [11]:
# Notebook-wide settings interpolated into the paths passed to the argument parser.
DATA_DIR, dataset, backbone = 'data', 'dior', 'dinov2'
# N and M fill the "N{N}-{M}" part of the annotation / run names.
# NOTE(review): presumably number of shots and split index — confirm.
N, M = 5, 6

In [12]:
# Build the argument vector from the config values, then parse it exactly as a command line would be.
_overrides = {
    '--data_dir': [f'{DATA_DIR}/{dataset}/JPEGImages'],
    '--save_dir': [f'run/init_prototypes/boxes/{dataset}_N{N}-{M}'],
    '--annotations_file': [f'{DATA_DIR}/{dataset}/train_coco_subset_N{N}-{M}.json'],
    '--backbone_type': [backbone],
    '--target_size': ['602', '602'],
    '--window_size': ['224'],
    '--scale_factor': ['1'],
    '--num_b': ['10'],
    '--k': ['200'],
}
_argv = [token for flag, values in _overrides.items() for token in (flag, *values)]
_argv.append('--store_bg_prototypes')  # value-less boolean switch

args = parser.parse_args(_argv)

In [15]:
# Convert COCO annotations to segmentation masks.
# The output folder is named after the last '/'-separated component of the save directory.
run_name = args.save_dir.rsplit('/', 1)[-1]
init_data_path = os.path.join('data', 'init_data', run_name)
coco_to_seg(args.annotations_file, args.data_dir, init_data_path)

data/init_data/dior_N5-6
data/dior/JPEGImages
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
Processed category: groundtrackfield
Processed category: baseballfield
Processed category: bridge
Processed category: Expressway-toll-station
Processed category: vehicle
Processed category: airplane
Processed category: airport
Processed category: tenniscourt
Processed category: trainstation
Processed category: storagetank
Processed category: stadium
Processed category: windmill
Processed category: ship
Processed category: golffield
Processed category: overpass
Processed category: chimney
Processed category: dam
Processed category: basketballcourt
Processed category: harbor
Processed category: Expressway-Service-area
Processing complete.


### Random

In [7]:
# Output folder and annotation file for the "test-n" experiment.
# Note: this rebinds `init_data_path`, which other cells derive from `args.save_dir`.
init_data_path = 'data/init_data/test-n'
annotations_file = '/home/gridsan/manderson/ovdsat/data/test-n.json'

In [8]:
# Run the conversion helper on `annotations_file`, with images taken from {DATA_DIR}/{dataset}/JPEGImages
# and `init_data_path` as the third argument (presumably the output folder — confirm in coco_to_seg).
# Relies on `DATA_DIR` and `dataset` still holding the string values from the config cell.
coco_to_seg(annotations_file, f'{DATA_DIR}/{dataset}/JPEGImages', init_data_path)

data/init_data/test-n
data/dior/JPEGImages
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
Processed category: groundtrackfield
Processed category: baseballfield
Processed category: stadium
Processed category: windmill
Processed category: ship
Processed category: golffield
Processed category: overpass
Processed category: chimney
Processed category: dam
Processed category: basketballcourt
Processed category: harbor
Processed category: Expressway-Service-area
Processed category: bridge
Processed category: Expressway-toll-station
Processed category: vehicle
Processed category: airplane
Processed category: airport
Processed category: tenniscourt
Processed category: trainstation
Processed category: storagetank
Processing complete.


In [6]:
# Step 1 — convert COCO annotations to segmentation masks.
# The output folder is named after the last '/'-separated component of the save directory.
run_name = args.save_dir.rsplit('/', 1)[-1]
init_data_path = os.path.join('data', 'init_data', run_name)
coco_to_seg(args.annotations_file, args.data_dir, init_data_path)


# Step 2 — load the backbone on the GPU when one is available, otherwise on the CPU,
# and switch it to evaluation mode.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = load_backbone(args.backbone_type).to(device)
model.eval()
# Only the first value returned by get_backbone_params is used here.
patch_size, _ = get_backbone_params(args.backbone_type)

# Step 3 — build object prototypes from the converted data.
obj_category_dict = build_object_prototypes(args, model, init_data_path, device, patch_size)

data/init_data/dior_N5-1
data/dior/JPEGImages
loading annotations into memory...
Done (t=0.01s)
creating index...
index created!
Processed category: groundtrackfield


KeyboardInterrupt: 

### Text (zero-shot)

In [30]:
# Load the saved text prototypes. map_location='cpu' lets the cell run on machines without a GPU
# even if the file was written from CUDA tensors, and matches how the CoOp checkpoint is loaded below.
zero_shot_prot = torch.load(
    '/home/gridsan/manderson/ovdsat/run/text_prototypes/boxes/dior/prototypes_remoteclip-14.pt',
    map_location='cpu',
)

In [32]:
# Show the shape of the tensor stored under the 'prototypes' key of the loaded file.
print(zero_shot_prot['prototypes'].shape)

torch.Size([20, 768])


### CoOp

In [6]:
# Absolute, machine-specific path to a CoOp prompt-learner checkpoint.
context_path = '/home/gridsan/manderson/ovdsat/CoOp/output/dior/CoOp/vit_l14_remote-ep100-ctx2_5shots/nctx4_cscFalse_ctpmiddle/seed1/prompt_learner/model.pth.tar-100'
# Load the 'remoteclip-14' backbone together with its tokenizer.
# Note: this rebinds `model`, which other cells also assign.
model, tokenizer = load_backbone_and_tokenizer('remoteclip-14')

In [7]:
# Load the CoOp checkpoint onto the CPU regardless of the device it was saved from.
context = torch.load(context_path, map_location=torch.device('cpu'))

In [8]:
# List the top-level entries of the loaded checkpoint.
print(context.keys())

dict_keys(['state_dict', 'epoch', 'optimizer', 'scheduler', 'val_result'])


In [5]:
# List the tensors stored in the checkpoint's 'state_dict'.
print(context['state_dict'].keys())

odict_keys(['ctx', 'token_prefix', 'token_suffix'])


In [9]:
# Pull the three prompt-learner tensors out of the checkpoint's state dict in one go.
_state = context['state_dict']
prefix, ctx, suffix = _state['token_prefix'], _state['ctx'], _state['token_suffix']

In [12]:
# One shape per line, in the order prefix / ctx / suffix.
print(prefix.shape, ctx.shape, suffix.shape, sep='\n')

torch.Size([20, 1, 768])
torch.Size([4, 768])
torch.Size([20, 72, 768])


In [11]:
# Disabled on purpose: concatenating the three tensors directly raises
# "Tensors must have same number of dimensions: got 3 and 2", because the loaded `ctx`
# is 2-D while `prefix` and `suffix` are 3-D. The shape comment on `ctx` below therefore
# does not describe the loaded checkpoint; `ctx` would first need a leading class dimension.
# # for end, class specific
# prompts = torch.cat(
#     [
#         prefix,  # (n_cls, 1, dim)
#         ctx,     # (n_cls, n_ctx, dim)
#         suffix,  # (n_cls, *, dim)
#     ],
#     dim=1,
# )
# prompts.shape

RuntimeError: Tensors must have same number of dimensions: got 3 and 2

In [18]:
classes = ['expressway service area', 'expressway toll station', 'airplane', 'airport', 'background', 'baseball field', 'basketball court', 'bridge', 'chimney', 'dam', 'golf field', 'ground track field', 'harbor', 'overpass', 'ship', 'stadium', 'storage tank', 'tennis court', 'train station', 'vehicle', 'windmill']
# Number of ids the tokenizer produces for each class name, printed one per line.
token_counts = [len(tokenizer.encode(name)) for name in classes]
print(*token_counts, sep='\n')

3
3
1
1
1
2
2
1
1
1
2
3
1
1
1
1
2
2
2
1
1


In [15]:
# for middle, unified
classes = ['ground track field', 'baseball field', 'bridge', 'expressway toll station', 'vehicle', 'airplane', 'airport', 'tennis court', 'train station', 'storage tank', 'stadium', 'windmill', 'ship', 'golf field', 'overpass', 'chimney', 'dam', 'basketball court', 'harbor', 'expressway service area']
name_lens = [len(tokenizer.encode(name)) for name in classes]
n_ctx = 4
n_cls = 20

if ctx.dim() == 2:
    ctx = ctx.unsqueeze(0).expand(n_cls, -1, -1)

half_n_ctx = n_ctx // 2
prompts = []
for i in range(n_cls):
    name_len = name_lens[i]
    prefix_i = prefix[i : i + 1, :, :]
    class_i = suffix[i : i + 1, :name_len, :]
    print(class_i.shape)
    suffix_i = suffix[i : i + 1, name_len:, :]
    ctx_i_half1 = ctx[i : i + 1, :half_n_ctx, :]
    ctx_i_half2 = ctx[i : i + 1, half_n_ctx:, :]
    prompt = torch.cat(
        [
            prefix_i,     # (1, 1, dim)
            ctx_i_half1,  # (1, n_ctx//2, dim)
            class_i,      # (1, name_len, dim)
            ctx_i_half2,  # (1, n_ctx//2, dim)
            suffix_i,     # (1, *, dim)
        ],
        dim=1,
    )
    prompts.append(prompt)
prompts = torch.cat(prompts, dim=0)
prompts.shape

torch.Size([1, 3, 768])
torch.Size([1, 2, 768])
torch.Size([1, 1, 768])
torch.Size([1, 3, 768])
torch.Size([1, 1, 768])
torch.Size([1, 1, 768])
torch.Size([1, 1, 768])
torch.Size([1, 2, 768])
torch.Size([1, 2, 768])
torch.Size([1, 2, 768])
torch.Size([1, 1, 768])
torch.Size([1, 1, 768])
torch.Size([1, 1, 768])
torch.Size([1, 2, 768])
torch.Size([1, 1, 768])
torch.Size([1, 1, 768])
torch.Size([1, 1, 768])
torch.Size([1, 2, 768])
torch.Size([1, 1, 768])
torch.Size([1, 3, 768])


torch.Size([20, 77, 768])

In [9]:
# Run the assembled prompt embeddings through the model's transformer stack.
# NOTE(review): CoOp's reference TextEncoder adds the positional embedding before the transformer and
# applies ln_final / text_projection afterwards; none of that happens here — confirm this is intended.
text_encoder = model.transformer
# Inference only: skip autograd bookkeeping so no graph is kept alive for the outputs.
with torch.no_grad():
    text_feats = text_encoder(prompts.to('cpu'))
text_feats.shape

torch.Size([20, 77, 768])

In [10]:
# L2-normalise every token vector, then keep the vector at token position 0 for each class.
# NOTE(review): position 0 is the slot filled by `prefix` when the prompts were assembled; CLIP-style
# text encoders usually read the feature at the end-of-text position instead — confirm which is wanted.
token_norms = text_feats.norm(dim=-1, keepdim=True)
text_feats = (text_feats / token_norms)[:, 0, :]
text_feats.shape # Final shape of text prototypes should be [n_classes, dim=768]

torch.Size([20, 768])

In [None]:
# Title-cased class names, one per line for easier diffing.
coop_classes = [
    'Ground Track Field',
    'Baseball Field',
    'Bridge',
    'Expressway Toll Station',
    'Vehicle',
    'Airplane',
    'Airport',
    'Tennis Court',
    'Train Station',
    'Storage Tank',
    'Stadium',
    'Windmill',
    'Ship',
    'Golf Field',
    'Overpass',
    'Chimney',
    'Dam',
    'Basketball Court',
    'Harbor',
    'Expressway Service Area',
]

In [None]:
# Read one class label per line, trimming surrounding whitespace (including the newline).
with open('/home/gridsan/manderson/ovdsat/data/text/dior_labels.txt', "r") as f:
    classes = list(map(str.strip, f))

### Understanding CoOp

In [14]:
import open_clip

In [40]:
# Number of placeholder context words used in the prompts built below.
# Note: this rebinds `n_ctx`, which the prompt-assembly cell above also sets.
n_ctx = 16

In [46]:
classnames = ['airplane', 'boat', 'car']


def _num_name_tokens(name):
    """Return how many tokens `name` occupies, excluding the start and end markers.

    `open_clip.tokenize(name)` returns a `[1, context_length]` tensor, so calling `len()` on it
    gives the batch size (always 1), not a token count. The row is laid out as
    [start, name tokens..., end, padding...], and the end marker carries the largest id,
    so its index minus one (for the start marker) is the number of name tokens.
    """
    token_ids = open_clip.tokenize(name)[0]
    end_index = int(token_ids.argmax())
    return end_index - 1


name_lens = [_num_name_tokens(name) for name in classnames]
name_lens

[1, 1, 1]

In [47]:
# n_ctx copies of the placeholder word "X", separated by single spaces.
prompt_prefix = " ".join("X" for _ in range(n_ctx))
prompt_prefix

'X X X X X X X X X X X X X X X X'

In [56]:
# One prompt string per class: "<placeholders> <classname>."
prompts = [f"{prompt_prefix} {name}." for name in classnames]
print(len(prompts[0]))  # number of characters in the first prompt string
prompts # list with one prompt string per class

41


['X X X X X X X X X X X X X X X X airplane.',
 'X X X X X X X X X X X X X X X X boat.',
 'X X X X X X X X X X X X X X X X car.']

In [57]:
# Tokenize each prompt on its own and stack the resulting rows along the first axis.
tokenized_prompts = torch.cat(tuple(map(open_clip.tokenize, prompts)))
tokenized_prompts.shape # Tokenized prompts [n_classes, num_tokens] (num_tokens usually 77 with padding)

torch.Size([3, 77])

In [64]:
# Show the token ids of the first prompt, including the trailing zero padding.
print(tokenized_prompts[0])

tensor([49406,   343,   343,   343,   343,   343,   343,   343,   343,   343,
          343,   343,   343,   343,   343,   343,   343, 16451,   269, 49407,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0])


In [63]:
# Map every token id to its embedding vector; no_grad() because nothing is being trained here.
# NOTE(review): relies on `model` being the object bound by an earlier cell that exposes
# `token_embedding` — confirm the cell order, since `model` is rebound elsewhere in the notebook.
with torch.no_grad():
    embedding = model.token_embedding(tokenized_prompts)
embedding.shape # Token embeddings [n_classes, num_tokens, dim]

torch.Size([3, 77, 768])

### Create EuroSAT JSON

In [8]:
import pandas as pd
import json

# Folder that holds the three split CSVs and receives the combined JSON file.
eurosat_root = "/home/gridsan/manderson/ovdsat/data/eurosat/EuroSAT"


def _load_split_frame(csv_path):
    """Read one split CSV and return it with the columns (impath, label, classname).

    Columns whose name starts with 'Unnamed' (e.g. a saved index column) are dropped first;
    the remaining columns are renamed positionally, so the file must contain exactly three
    of them in that order, otherwise pandas raises a length-mismatch ValueError.
    """
    frame = pd.read_csv(csv_path)
    frame = frame.loc[:, ~frame.columns.str.contains('^Unnamed')]
    frame.columns = ['impath', 'label', 'classname']
    return frame


# Load the three splits through the same helper instead of three copy-pasted code paths.
train_df = _load_split_frame(f"{eurosat_root}/train.csv")
val_df = _load_split_frame(f"{eurosat_root}/validation.csv")
test_df = _load_split_frame(f"{eurosat_root}/test.csv")

# Convert DataFrames to list of [impath, label, classname] rows
train_data = train_df.values.tolist()
val_data = val_df.values.tolist()
test_data = test_df.values.tolist()

# Combine into one dictionary. Deliberately NOT named `dataset`: that name holds the dataset
# string used to build paths in the configuration cells, and overwriting it with a dict
# would break those cells when they are re-run.
eurosat_splits = {
    "train": train_data,
    "val": val_data,
    "test": test_data
}

# Save as JSON
with open(f"{eurosat_root}/dataset.json", "w") as json_file:
    json.dump(eurosat_splits, json_file, indent=4)

print("JSON file saved")

JSON file saved


In [9]:
from dassl.utils import read_json

# Read the combined split file back with dassl's JSON helper to check that it loads.
split = read_json("/home/gridsan/manderson/ovdsat/data/eurosat/EuroSAT/dataset.json")

In [10]:
# Rows of the 'train' split from the loaded JSON.
train_split = split['train']

In [11]:
# Display the first training row as a quick sanity check.
train_split[0]

['AnnualCrop/AnnualCrop_142.jpg', 0, 'AnnualCrop']

In [12]:
# Check that each row unpacks into (impath, label, classname); only the first row is printed.
i = 0
for impath, label, classname in train_split:
    if i >= 1:
        break
    print(impath, label, classname)
    i += 1

AnnualCrop/AnnualCrop_142.jpg 0 AnnualCrop


In [14]:
from transformers import CLIPModel

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
# Load a CLIPModel from a local weights directory (absolute, machine-specific path).
# Note: this rebinds `model`, replacing whatever an earlier cell stored under that name.
model = CLIPModel.from_pretrained('/home/gridsan/manderson/ovdsat/weights/clip-vit-large-patch14')

In [17]:
# Display the module tree of the loaded model.
model

CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 768)
      (position_embedding): Embedding(77, 768)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
          )
          (layer_norm2): LayerNorm((768,), eps=1e-05,

### Convert Dassl datums to CSV splits

In [21]:
import pickle
import pandas as pd
import os
import glob

In [4]:
# Load one few-shot split file to inspect its structure.
# SECURITY: pickle.load can execute arbitrary code — only open files from a trusted source.
# Note: this rebinds `data`, which an earlier cell used for the parsed annotation JSON.
with open('/home/gridsan/manderson/ovdsat/data/eurosat/split_fewshot/shot_1-seed_1.pkl', 'rb') as f:
    data = pickle.load(f)
    
data # has train and val

{'train': [<dassl.data.datasets.base_dataset.Datum at 0x7fa0cc85ba60>,
  <dassl.data.datasets.base_dataset.Datum at 0x7fa0cc86e130>,
  <dassl.data.datasets.base_dataset.Datum at 0x7fa0cc86e250>,
  <dassl.data.datasets.base_dataset.Datum at 0x7fa0cc868940>,
  <dassl.data.datasets.base_dataset.Datum at 0x7fa0cc879730>,
  <dassl.data.datasets.base_dataset.Datum at 0x7fa0cc8796d0>,
  <dassl.data.datasets.base_dataset.Datum at 0x7fa0cc8794f0>,
  <dassl.data.datasets.base_dataset.Datum at 0x7fa0cc879430>,
  <dassl.data.datasets.base_dataset.Datum at 0x7fa0cc879460>,
  <dassl.data.datasets.base_dataset.Datum at 0x7fa0cc879910>],
 'val': [<dassl.data.datasets.base_dataset.Datum at 0x7fa0cc74f430>,
  <dassl.data.datasets.base_dataset.Datum at 0x7f9f863bf6a0>,
  <dassl.data.datasets.base_dataset.Datum at 0x7fa0cc75dbb0>,
  <dassl.data.datasets.base_dataset.Datum at 0x7f9f860aad00>,
  <dassl.data.datasets.base_dataset.Datum at 0x7f9f7e9011c0>,
  <dassl.data.datasets.base_dataset.Datum at 0x7f9f7e

In [11]:
# Show the three fields of the first training datum, one per line.
first_train_datum = data['train'][0]
print(first_train_datum._impath, first_train_datum._label, first_train_datum._classname, sep='\n')

/home/gridsan/manderson/ovdsat/data/eurosat/EuroSAT/AnnualCrop/AnnualCrop_832.jpg
0
AnnualCrop


In [24]:
# Function to process a .pkl file and save train/val sets as CSV
def convert_pkl_to_csv(pkl_path):
    """Export the 'train' and 'val' splits stored in a pickle file to two CSV files.

    The pickle is expected to hold a dict with optional "train" and "val" lists whose items
    expose `_impath`, `_label` and `_classname` attributes. Each split is written next to the
    pickle as `<path without extension>-train.csv` / `-val.csv` with the columns
    Filename, Label, ClassName. A missing or empty split still produces a CSV with the header
    row, so every output file can be read back with the same schema.

    Args:
        pkl_path: Path to the pickle file (e.g. ".../shot_1-seed_1.pkl").

    Returns:
        Tuple `(train_csv_path, val_csv_path)` of the files that were written.

    Raises:
        OSError: If the pickle cannot be opened or a CSV cannot be written.
        AttributeError: If an item lacks one of the three expected attributes.
    """
    # Strip only the real extension; slicing off four characters or replacing ".pkl"
    # anywhere in the name would mangle paths that do not end in exactly ".pkl".
    base_path = os.path.splitext(pkl_path)[0]
    filename = os.path.basename(base_path)
    print(filename)

    # SECURITY: pickle.load can execute arbitrary code — only use on trusted files.
    with open(pkl_path, "rb") as f:
        data = pickle.load(f)

    columns = ["Filename", "Label", "ClassName"]

    # Function to extract relevant attributes from datum objects
    def extract_data(dataset):
        return [{"Filename": d._impath, "Label": d._label, "ClassName": d._classname} for d in dataset]

    # Passing `columns` keeps the header even when a split is empty or absent.
    train_df = pd.DataFrame(extract_data(data.get("train", [])), columns=columns)
    val_df = pd.DataFrame(extract_data(data.get("val", [])), columns=columns)

    # Save to CSV
    train_csv_path = f"{base_path}-train.csv"
    val_csv_path = f"{base_path}-val.csv"
    train_df.to_csv(train_csv_path, index=False)
    val_df.to_csv(val_csv_path, index=False)

    print(f"Saved: {train_csv_path}, {val_csv_path}")
    return train_csv_path, val_csv_path

In [25]:
# Directory path
dir_path = "/home/gridsan/manderson/ovdsat/data/eurosat/EuroSAT/split_fewshot"

# Find all .pkl files in the directory
pkl_files = glob.glob(f"{dir_path}/*.pkl")

for pkl_file in pkl_files:
    convert_pkl_to_csv(pkl_file)

shot_2-seed_5
Saved: /home/gridsan/manderson/ovdsat/data/eurosat/EuroSAT/split_fewshot/shot_2-seed_5-train.csv, /home/gridsan/manderson/ovdsat/data/eurosat/EuroSAT/split_fewshot/shot_2-seed_5-val.csv
shot_1-seed_3
Saved: /home/gridsan/manderson/ovdsat/data/eurosat/EuroSAT/split_fewshot/shot_1-seed_3-train.csv, /home/gridsan/manderson/ovdsat/data/eurosat/EuroSAT/split_fewshot/shot_1-seed_3-val.csv
shot_4-seed_1
Saved: /home/gridsan/manderson/ovdsat/data/eurosat/EuroSAT/split_fewshot/shot_4-seed_1-train.csv, /home/gridsan/manderson/ovdsat/data/eurosat/EuroSAT/split_fewshot/shot_4-seed_1-val.csv
shot_4-seed_5
Saved: /home/gridsan/manderson/ovdsat/data/eurosat/EuroSAT/split_fewshot/shot_4-seed_5-train.csv, /home/gridsan/manderson/ovdsat/data/eurosat/EuroSAT/split_fewshot/shot_4-seed_5-val.csv
shot_2-seed_1
Saved: /home/gridsan/manderson/ovdsat/data/eurosat/EuroSAT/split_fewshot/shot_2-seed_1-train.csv, /home/gridsan/manderson/ovdsat/data/eurosat/EuroSAT/split_fewshot/shot_2-seed_1-val.csv
