In [1]:
# Enable IPython's autoreload extension so edits to imported modules are picked up
# without restarting the kernel (mode 2: reload all modules before running a cell).
%load_ext autoreload
%autoreload 2

In [2]:
import os
import shutil
import sys

# comment this out if you are using the pip package
sys.path.append('../')

import torch
import numpy as np
import matplotlib.pyplot as plt
from dataset_interfaces import utils
from dataset_interfaces import run_textual_inversion
from dataset_interfaces import generate
import dataset_interfaces.imagenet_utils as in_utils
import dataset_interfaces.inference_utils as infer_utils
from pathlib import Path

# Switch for the dataset-construction cell below: when False, that cell only defines
# its helper functions and does not copy any images into the confounded dataset folder.
create_confounded_dataset = False

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from dataset_interfaces.imagenet_utils import *

def idx_to_foldername(idx):
    '''
    Map a class index to the name of its image folder.

    Index 0 is special-cased to the literal folder name '0'; every other
    index resolves to the 'id' field of its IMAGENET_IDX_TO_SYNSET entry
    (the table is keyed by the index rendered as a string).
    '''
    if idx == 0:
        return '0'
    synset_key = f'{idx}'
    return IMAGENET_IDX_TO_SYNSET[synset_key]['id']

def idx_to_label(idx):
    '''
    Return the first comma-separated name from the 'label' field of the
    IMAGENET_IDX_TO_SYNSET entry for `idx` (looked up by its string form).
    '''
    # An IMAGENET_COMMON_CLASS_NAMES[idx] lookup was previously tried here as an alternative.
    synset_entry = IMAGENET_IDX_TO_SYNSET[f'{idx}']
    primary_name, *_ = synset_entry['label'].split(',')
    return primary_name

def spaces_to_underscores(label):
    '''
    Turn a label / background string into a file-name-friendly token.

    The string is split on runs of whitespace (leading and trailing
    whitespace is dropped) and the pieces are re-joined with underscores.
    '''
    words = label.split()
    return '_'.join(words)

if create_confounded_dataset:
    # Build the training set used to extract embeddings.
    # Confounding is introduced by pairing every object (z) with exactly one
    # background (beta); one sample is copied per object (here, object = class).

    # Backgrounds (beta): one-word shorthand -> prompt phrase. The phrase with its
    # spaces replaced by underscores is the name of the source sub-folder.
    background_strings = {
        # 'dusk':"at dusk",
        'night': "at night",
        # 'sunlight':"in bright sunlight",
        'fog': "in the fog",
        'forest': "in the forest",
        'rain': "in the rain",
        'snow': "in the snow",
    }
    # number of objects Z in dataset
    NUM_OBJECTS = len(IMAGENET_IDX_TO_SYNSET)
    NUM_BACKGROUNDS = len(background_strings)

    # Fixed seed so the object -> background assignment is reproducible.
    np.random.seed(0)
    objects_set = np.random.permutation(NUM_OBJECTS)

    # Each object gets a single background: split the shuffled indices into
    # NUM_BACKGROUNDS contiguous, non-overlapping chunks. np.array_split also hands
    # out the remainder when NUM_OBJECTS is not divisible by NUM_BACKGROUNDS
    # (floor-sized slices would leave those objects without any background);
    # when it is divisible the chunks equal the floor-sized slices.
    background_to_objects = dict(
        zip(background_strings, np.array_split(objects_set, NUM_BACKGROUNDS))
    )

    # construct dataset
    # NOTE(review): absolute, machine-specific path — adjust for other environments.
    imagenet_star_path = '/raid/infolab/nlokesh/dataset-interfaces/data/imagenet_star/'
    confounded_dataset_path = os.path.join(imagenet_star_path, 'confounded_dataset/')
    os.makedirs(confounded_dataset_path, exist_ok=True)

    for bg_shorthand, object_indices in background_to_objects.items():
        bg_path = os.path.join(
            imagenet_star_path, spaces_to_underscores(background_strings[bg_shorthand])
        )
        beta = spaces_to_underscores(bg_shorthand)
        for idx in object_indices:
            foldername = idx_to_foldername(idx)
            from_address = os.path.join(bg_path, foldername, '00.jpg')
            if not os.path.exists(from_address):
                # Fall back to the layout where the class folder is nested in itself.
                from_address = os.path.join(bg_path, foldername, foldername, '00.jpg')

            # Destination folder is keyed by the label only, so classes sharing the
            # same first label name end up in the same folder; the file name carries
            # the index, object and background to keep the copies distinguishable.
            z_object = spaces_to_underscores(idx_to_label(idx))
            object_dir = os.path.join(confounded_dataset_path, z_object)
            os.makedirs(object_dir, exist_ok=True)
            to_address = os.path.join(object_dir, f'idx-{idx}-z-{z_object}-beta-{beta}-00.jpg')
            # Raises FileNotFoundError if neither source layout has the image.
            shutil.copy(from_address, to_address)

In [4]:
# Now that the training dataset with confounding between beta (background) and z (object)
# has been constructed, the embeddings z* are learned from it.
# NOTE(review): IMAGENET_ROOT is an absolute, machine-specific path that duplicates the
# 'confounded_dataset' location used by the dataset-construction cell; keep the two in sync.
print("Currently hardcoded to 'confounded_dataset' folder")
IMAGENET_ROOT = "/raid/infolab/nlokesh/dataset-interfaces/data/imagenet_star/confounded_dataset"

Currently hardcoded to 'confounded_dataset' folder


In [5]:
# Directory in which the encoder holding the learned tokens is stored
# (handed to infer_utils.create_encoder as `encoder_root`), so it can be loaded later.
encoder_root = "./encoder_root"

In [6]:
# Every class index present in IMAGENET_IDX_TO_SYNSET (not a hand-picked subset).
classes = range(len(IMAGENET_IDX_TO_SYNSET))
class_names = [idx_to_label(class_idx) for class_idx in classes]
# One placeholder token per class, of the form "<label-index>".
tokens = [f"<{name}-{position}>" for position, name in enumerate(class_names)]

# One training folder per class, named after the underscore-joined label.
# (Alternative layout, unused: IMAGENET_ROOT/train/<synset id> via in_utils.IMAGENET_IDX_TO_SYNSET.)
train_data_dirs = [
    os.path.join(IMAGENET_ROOT, spaces_to_underscores(name))
    for name in class_names
]



In [11]:
# Learn one embedding per class and refresh the saved encoder after every class,
# so the encoder on disk always covers the classes processed so far.
embeds = []
for i in range(len(classes)):

    # runs textual inversion on a single class
    embed = run_textual_inversion(
        train_data_dirs[i],
        token=tokens[i],
        class_name=class_names[i],
    )
    embeds.append(embed)

    # Only the per-class lists are truncated to the classes done so far.
    # encoder_root is a path string and must be passed whole: slicing it
    # (encoder_root[:i+1]) produced ".", "./", "./e", ... instead of the
    # intended directory.
    num_done = i + 1
    infer_utils.create_encoder(
        embeds=embeds,
        tokens=tokens[:num_done],
        class_names=class_names[:num_done],
        encoder_root=encoder_root,
    )

100%|██████████| 1/1 [00:00<00:00,  2.76it/s]
 50%|█████     | 1/2 [00:00<00:00,  2.60it/s]


IndexError: list index out of range