In [2]:
# img_path = "data/test"

In [1]:
import torch
from torch.utils.data import Dataset
from PIL import Image
import os
import os.path as osp


class TestDataset(Dataset):
    def __init__(self, img_path, input_transform=None):
        self.img_path = img_path
        self.image_filenames = os.listdir(img_path)
        self.input_transform = input_transform

    def __len__(self):
        return len(self.image_filenames)

    def __getitem__(self, idx):
        img = Image.open(osp.join(self.img_path, self.image_filenames[idx]))
        if self.input_transform:
            img = self.input_transform(img)
        return img, self.image_filenames[idx]


In [7]:
from torchvision import transforms

normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])

test_data_transform = transforms.Compose([
    transforms.Resize((448, 448)),
    transforms.ToTensor(),
    normalize])

In [3]:
test_dataset = TestDataset("data/test", input_transform=test_data_transform)

In [4]:
test_dataset[0]

(tensor([[[ 2.0948,  2.0948,  2.1633,  ..., -2.0665, -2.0152, -2.0837],
          [ 2.0605,  2.0777,  2.1462,  ..., -2.0837, -2.0323, -2.1008],
          [ 1.9920,  2.0263,  2.1119,  ..., -2.1008, -2.0837, -2.1179],
          ...,
          [-2.1008, -2.0665, -2.0837,  ...,  1.8550,  1.8208,  1.9578],
          [-2.1008, -2.0837, -2.1008,  ...,  1.8379,  1.8037,  1.8893],
          [-2.1008, -2.0837, -2.1008,  ...,  1.8208,  1.8037,  1.8379]],
 
         [[ 2.3235,  2.3060,  2.3585,  ..., -1.9832, -1.9307, -1.9657],
          [ 2.2710,  2.2885,  2.3410,  ..., -2.0007, -1.9482, -1.9832],
          [ 2.2010,  2.2360,  2.3235,  ..., -2.0182, -1.9832, -2.0182],
          ...,
          [-2.0007, -1.9832, -1.9832,  ...,  2.0609,  2.0259,  2.1485],
          [-2.0007, -2.0007, -2.0007,  ...,  2.0259,  2.0084,  2.0784],
          [-2.0007, -2.0007, -2.0182,  ...,  2.0084,  1.9909,  2.0259]],
 
         [[ 2.3786,  2.4134,  2.5703,  ..., -1.7696, -1.7347, -1.7870],
          [ 2.3786,  2.4134,

In [1]:
import argparse


def parser_args(args=None):
    parser = argparse.ArgumentParser(description='Query2Label MSCOCO Training')
    parser.add_argument('--dataname', help='dataname', default='my', choices=['coco14', 'my'])
    parser.add_argument('--dataset_dir', help='dir of dataset', default='/comp_robot/liushilong/data/COCO14/')
    parser.add_argument('--img_size', default=448, type=int,
                        help='size of input images')

    parser.add_argument('--output', metavar='DIR',
                        help='path to output folder')
    parser.add_argument('--num_class', default=4375, type=int,
                        help="Number of query slots")
    parser.add_argument('--pretrained', dest='pretrained', action='store_true',
                        help='use pre-trained model. default is False. ')
    parser.add_argument('--optim', default='AdamW', type=str, choices=['AdamW', 'Adam_twd'],
                        help='which optim to use')

    # loss
    parser.add_argument('--eps', default=1e-5, type=float,
                        help='eps for focal loss (default: 1e-5)')
    parser.add_argument('--dtgfl', action='store_true', default=False,
                        help='disable_torch_grad_focal_loss in asl')
    parser.add_argument('--gamma_pos', default=0, type=float,
                        metavar='gamma_pos', help='gamma pos for simplified asl loss')
    parser.add_argument('--gamma_neg', default=2, type=float,
                        metavar='gamma_neg', help='gamma neg for simplified asl loss')
    parser.add_argument('--loss_dev', default=-1, type=float,
                        help='scale factor for loss')
    parser.add_argument('--loss_clip', default=0.0, type=float,
                        help='scale factor for clip')

    parser.add_argument('-j', '--workers', default=0, type=int, metavar='N',
                        help='number of data loading workers (default: 32)')
    parser.add_argument('--epochs', default=80, type=int, metavar='N',
                        help='number of total epochs to run')

    parser.add_argument('--val_interval', default=1, type=int, metavar='N',
                        help='interval of validation')

    parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
                        help='manual epoch number (useful on restarts)')
    parser.add_argument('-b', '--batch-size', default=256, type=int,
                        metavar='N',
                        help='mini-batch size (default: 256), this is the total '
                             'batch size of all GPUs')

    parser.add_argument('--lr', '--learning-rate', default=0.1, type=float,
                        metavar='LR', help='initial learning rate', dest='lr')
    parser.add_argument('--wd', '--weight-decay', default=1e-2, type=float,
                        metavar='W', help='weight decay (default: 1e-2)',
                        dest='weight_decay')

    parser.add_argument('-p', '--print-freq', default=10, type=int,
                        metavar='N', help='print frequency (default: 10)')
    parser.add_argument('--resume', default='', type=str, metavar='PATH',
                        help='path to latest checkpoint (default: none)')
    parser.add_argument('--resume_omit', default=[], type=str, nargs='*')
    parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
                        help='evaluate model on validation set')

    parser.add_argument('--ema-decay', default=0.9997, type=float, metavar='M',
                        help='decay of model ema')
    parser.add_argument('--ema-epoch', default=0, type=int, metavar='M',
                        help='start ema epoch')

    # distribution training
    parser.add_argument('--world-size', default=-1, type=int,
                        help='number of nodes for distributed training')
    parser.add_argument('--rank', default=-1, type=int,
                        help='node rank for distributed training')
    parser.add_argument('--dist-url', default='env://', type=str,
                        help='url used to set up distributed training')
    parser.add_argument('--seed', default=None, type=int,
                        help='seed for initializing training. ')
    parser.add_argument("--local_rank", type=int, help='local rank for DistributedDataParallel')

    # data aug
    parser.add_argument('--cutout', action='store_true', default=False,
                        help='apply cutout')
    parser.add_argument('--n_holes', type=int, default=1,
                        help='number of holes to cut out from image')
    parser.add_argument('--length', type=int, default=-1,
                        help='length of the holes. suggest to use default setting -1.')
    parser.add_argument('--cut_fact', type=float, default=0.5,
                        help='mutual exclusion with length. ')

    parser.add_argument('--orid_norm', action='store_true', default=False,
                        help='using mean [0,0,0] and std [1,1,1] to normalize input images')

    # * Transformer
    parser.add_argument('--enc_layers', default=1, type=int,
                        help="Number of encoding layers in the transformer")
    parser.add_argument('--dec_layers', default=2, type=int,
                        help="Number of decoding layers in the transformer")
    parser.add_argument('--dim_feedforward', default=8192, type=int,
                        help="Intermediate size of the feedforward layers in the transformer blocks")
    parser.add_argument('--hidden_dim', default=2048, type=int,
                        help="Size of the embeddings (dimension of the transformer)")
    parser.add_argument('--dropout', default=0.1, type=float,
                        help="Dropout applied in the transformer")
    parser.add_argument('--nheads', default=4, type=int,
                        help="Number of attention heads inside the transformer's attentions")
    parser.add_argument('--pre_norm', action='store_true')
    parser.add_argument('--position_embedding', default='sine', type=str, choices=('sine'),
                        help="Type of positional embedding to use on top of the image features")
    parser.add_argument('--backbone', default='resnet101', type=str,
                        help="Name of the convolutional backbone to use")
    parser.add_argument('--keep_other_self_attn_dec', action='store_true',
                        help='keep the other self attention modules in transformer decoders, which will be removed default.')
    parser.add_argument('--keep_first_self_attn_dec', action='store_true',
                        help='keep the first self attention module in transformer decoders, which will be removed default.')
    parser.add_argument('--keep_input_proj', action='store_true',
                        help="keep the input projection layer. Needed when the channel of image features is different from hidden_dim of Transformer layers.")

    # * raining
    parser.add_argument('--amp', action='store_true', default=False,
                        help='apply amp')
    parser.add_argument('--early-stop', action='store_true', default=False,
                        help='apply early stop')
    parser.add_argument('--kill-stop', action='store_true', default=False,
                        help='apply early stop')
    args = parser.parse_args(args)
    return args

def get_args(args=None):
    args = parser_args(args)
    return args

In [2]:
args = get_args('--dataset_dir data --backbone resnet101 --dataname my --batch-size 2 --print-freq 100 --output "result" --world-size 1 --rank 0 --dist-url tcp://127.0.0.1:3717 --gamma_pos 0 --gamma_neg 2 --dtgfl --epochs 80 --lr 1e-4 --optim AdamW --pretrained --num_class 4375 --img_size 448 --weight-decay 1e-2 --cutout --n_holes 1 --cut_fact 0.5 --hidden_dim 2048 --dim_feedforward 8192 --enc_layers 1 --dec_layers 2 --nheads 4 --early-stop --amp'.split())
args

Namespace(dataname='my', dataset_dir='data', img_size=448, output='"result"', num_class=4375, pretrained=True, optim='AdamW', eps=1e-05, dtgfl=True, gamma_pos=0.0, gamma_neg=2.0, loss_dev=-1, loss_clip=0.0, workers=0, epochs=80, val_interval=1, start_epoch=0, batch_size=2, lr=0.0001, weight_decay=0.01, print_freq=100, resume='', resume_omit=[], evaluate=False, ema_decay=0.9997, ema_epoch=0, world_size=1, rank=0, dist_url='tcp://127.0.0.1:3717', seed=None, local_rank=None, cutout=True, n_holes=1, length=-1, cut_fact=0.5, orid_norm=False, enc_layers=1, dec_layers=2, dim_feedforward=8192, hidden_dim=2048, dropout=0.1, nheads=4, pre_norm=False, position_embedding='sine', backbone='resnet101', keep_other_self_attn_dec=False, keep_first_self_attn_dec=False, keep_input_proj=False, amp=True, early_stop=True, kill_stop=False)

In [4]:
from collections import OrderedDict
from models.query2label import build_q2l
import torch
model = build_q2l(args)
checkpoint = torch.load("query2labels/result/model_best.pth.tar")

def clean_state_dict(state_dict):
    new_state_dict = OrderedDict()
    for k, v in state_dict.items():
        if k[:7] == 'module.':
            k = k[7:]  # remove `module.`
        new_state_dict[k] = v
    return new_state_dict

state_dict = clean_state_dict(checkpoint['state_dict'])

model.load_state_dict(state_dict)

set model.input_proj to Indentify!


<All keys matched successfully>

In [5]:
model = model.cuda()

In [33]:
from dataset.cocodataset import CoCoDataset

train_ds = CoCoDataset("", "", input_transform=test_data_transform, labels_path="query2labels/data/coco/train_label_vectors.npy")

In [34]:
import json
with open("category_map.json", "r") as f:
    cate = json.load(f)

r_cate = {cate[i]: i for i in cate}

In [35]:
import numpy as np

a = np.where(train_ds[0][1] == 1)

In [36]:
for i in a[0]:
    print(r_cate[i])

舞台
地站
话筒
男人


In [9]:
from torch.utils.data import DataLoader

test_dl = DataLoader(test_dataset, batch_size=1)

# for img, name in test_dl:
#     print(img)
#     print(name)
#     break

In [15]:
from tqdm import tqdm
import pandas as pd
import torch

test_label = pd.DataFrame(columns=['filename', 'vector'])

with torch.no_grad():
    for img, name in tqdm(test_dl):
        img = img.to("cuda")
        result = model(img).cpu().numpy()
        for idx, filename in enumerate(name):
            test_label = test_label.append({'filename': filename, 'vector': list(result[idx])}, ignore_index=True)

In [46]:
# import pandas as pd
#
# test_label = pd.DataFrame(columns=['filename', 'vector'])

In [16]:
# test_label = test_label.append({'filename': "a.jpg"}, ignore_index=True)
test_label.to_csv("test_label.csv")