In [1]:
from attr import attrib
import torch
from torch import nn, optim
import argparse
import os
import utils
from clip import clip
from clip import model as c_model
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
# from dataset import CLIP_COCO_dataset
BATCH_SIZE = 128
EPOCH = 20
LR=1e-6
WARMUP = 3000
device = "cuda:1" if torch.cuda.is_available() else "cpu" # If using GPU then use mixed precision training.
import json
import numpy as np

import wandb
from scheduler import cosine_lr
from torch.cuda.amp import GradScaler, autocast
from collections import Counter
from datasets_zoo import snare_datasets, collate_fn
import time, datetime
from utils import MetricLogger, cosine_lr_schedule
import subprocess
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
root_dir="/ltstorage/home/2pan/dataset/VG_Attribution"
annotation_file = os.path.join(root_dir, "visual_genome_attribution.json")
image_dir = os.path.join(root_dir, "images")
with open(annotation_file, "r") as f:
    dataset = json.load(f)

for item in dataset:
    item["image_path"] = os.path.join(image_dir, item["image_path"])
    
model, preprocess = clip.load("ViT-B/32",device=device,jit=False) #Must set jit=False for training

train_dataset = snare_datasets.VG_Attribution(preprocess, attribute_ownership=True, dataset=dataset)
train_dataloader = DataLoader(train_dataset,batch_size = 2, num_workers=4, shuffle=True, collate_fn = collate_fn._default_collate)

In [5]:
@torch.no_grad()
def evaluation(model, data_loader, device, validate=False):
    model.eval()

    print('Computing features for evaluation...')

    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter('eval_loss', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Evaluation loss'
    print_freq = 50
    step=0
    total_loss = 0
    scores = []
    for i,batch in enumerate(metric_logger.log_every(data_loader, print_freq, header)):
    # for batch in train_dataloader :
        step+=1
        
        image_options = []
        image_embeddings = model.encode_image(batch["image_options"].to(device)).cpu()  # B x D
        image_embeddings = image_embeddings / image_embeddings.norm(dim=1, keepdim = True)
        image_options.append(np.expand_dims(image_embeddings.numpy(), axis=1))

        caption_options = []
        # print(batch['caption_options'])
        """
        [('a road with a red dirt on a small moped on a man helmet',), 
        ('a man a red helmet a a small moped on on dirt road with',), 
        ('man a a small on helmet a red with moped on dirt road a',)]

        "A man with a red helmet on a small moped on a dirt road. ", 
        
        """
        cur_loss = 0
        print(batch)
        for idx in range(len(batch["caption_options"])):

            caption_tokenized = torch.cat([clip.tokenize(c) for c in batch["caption_options"][idx]])
            print(caption_tokenized.shape)
            caption_embeddings = model.encode_text(caption_tokenized.to(device)).cpu()  # B x D
            break

            # caption_embeddings = caption_embeddings - self.blank

            # caption_embeddings = caption_embeddings / np.linalg.norm(caption_embeddings, axis=1,
            #                                                             keepdims=True)  # B x D
            caption_embeddings = caption_embeddings / caption_embeddings.norm(dim=1, keepdim = True)
            caption_options.append(np.expand_dims(caption_embeddings.numpy(), axis=1))

        # print(len(caption_options))
        image_options = np.concatenate(image_options, axis=1)  # B x K x D
        # print("image_options", image_options.shape)
        caption_options = np.concatenate(caption_options, axis=1)  # B x L x D
        # print("caption_options", caption_options.shape)
        batch_scores = np.einsum("nkd,nld->nkl", image_options, caption_options)  # B x K x L

        break
        scores.append(batch_scores)
    if validate:
        wandb.log({"eval_avg_loss":total_loss/step})
    print(metric_logger.global_avg())
    all_scores = np.concatenate(scores, axis=0)  # N x K x L
    print("all_scores", all_scores.shape)
    return all_scores
    
def convert_models_to_fp32(model,eval): 
    for p in model.parameters(): 
        p.data = p.data.float() 
        if not eval:
            p.grad.data = p.grad.data.float() 

In [7]:
convert_models_to_fp32(model, True)
# print(test_dataset.all_attributes)
print(train_dataloader.dataset.attribute_ownership)
all_scores=evaluation(model, train_dataloader, "cuda:1", False)


True
Computing features for evaluation...


  storage = elem.storage()._new_shared(numel, device=elem.device)
  storage = elem.storage()._new_shared(numel, device=elem.device)
  storage = elem.storage()._new_shared(numel, device=elem.device)
  storage = elem.storage()._new_shared(numel, device=elem.device)


{'image_options': tensor([[[[ 0.4413,  0.4121,  0.4413,  ..., -1.5295, -1.5149, -1.5149],
          [ 0.4413,  0.4413,  0.4559,  ..., -1.5587, -1.5295, -1.5295],
          [ 0.3829,  0.4413,  0.4559,  ..., -1.5587, -1.5441, -1.5149],
          ...,
          [-0.3470, -0.3908, -0.7996,  ..., -0.8142, -0.9164, -0.7704],
          [-0.0550,  0.0033, -0.0405,  ..., -0.8288, -0.7996, -1.0185],
          [-0.0405, -0.1280, -0.1134,  ..., -0.7850, -0.7266, -0.5660]],

         [[ 0.5141,  0.4841,  0.4841,  ..., -1.4519, -1.4069, -1.4069],
          [ 0.4991,  0.4991,  0.5141,  ..., -1.4669, -1.4219, -1.4069],
          [ 0.4691,  0.4991,  0.5591,  ..., -1.4369, -1.4069, -1.4069],
          ...,
          [-0.2513, -0.3114, -0.6265,  ..., -0.6565, -0.6865, -0.4764],
          [ 0.0038,  0.0638,  0.0488,  ..., -0.7166, -0.5965, -0.7766],
          [ 0.0188, -0.0712, -0.1313,  ..., -0.5365, -0.3714, -0.2213]],

         [[ 0.7523,  0.6955,  0.7381,  ..., -1.1389, -1.1105, -1.1105],
          [ 

ValueError: need at least one array to concatenate

In [7]:
print(len(train_dataset.all_attributes))
train_dataset.all_attributes


28748


['open_crouched',
 'open_gray',
 'open_metal',
 'open_long sleeved',
 'open_brown',
 'open_striped',
 'crouched_gray',
 'crouched_metal',
 'crouched_long sleeved',
 'crouched_brown',
 'crouched_striped',
 'gray_long sleeved',
 'gray_brown',
 'gray_striped',
 'metal_long sleeved',
 'metal_brown',
 'metal_striped',
 'unpeeled_black',
 'unpeeled_light colored',
 'unpeeled_square',
 'unpeeled_white',
 'black_unpeeled',
 'black_attached',
 'black_light colored',
 'black_square',
 'black_white',
 'unpeeled_light colored',
 'unpeeled_square',
 'unpeeled_white',
 'attached_light colored',
 'attached_square',
 'attached_white',
 'gray_empty',
 'gray_orange',
 'gray_green',
 'empty_orange',
 'empty_green',
 'orange_green',
 'brown_square',
 'brown_white',
 'burnt_square',
 'burnt_white',
 'brown_little',
 'brown_small',
 'burnt_little',
 'burnt_small',
 'square_little',
 'square_small',
 'white_little',
 'white_small',
 'wood_brown',
 'paved_green',
 'paved_large',
 'paved_white',
 'green_large'

In [9]:
aa = np.unique(train_dataset.all_attributes)
print(len(aa))
print(aa)

10364
['abandoned_pink' 'abstract_blue' 'adidas_green' ... 'young_worn'
 'young_wrinkled' 'young_yellow']


In [16]:
score = np.squeeze(all_scores, axis=1)
attr_mask = (train_dataset.all_attributes == aa[0])
score_sub = score[attr_mask]
print(score_sub)
print(attr_mask.sum())
# test_result = train_dataset.evaluate_scores(all_scores)

[]


AttributeError: 'bool' object has no attribute 'sum'