In [39]:
import json


def get_object_attribute(json_path, key):
    scene_dict = json.load(open(json_path))
    object = scene_dict["objects"][0]
    return object[key]

In [50]:
def print_important_features(feature_importances, max_clusters_per_block):
    important_features = []
    feature_names = []
    for i, max_clusters in enumerate(max_clusters_per_block):
        feature_names += [f"block_{i}_cluster_{j}" for j in range(max_clusters)]
    # feature_names = [f"block_{i}" for i in range(16)]
    for i in range(len(feature_importances)):
        if feature_importances[i] > 0:
            important_features.append(
                (feature_names[i], round(feature_importances[i], 4))
            )

    # sort important features by importance
    important_features.sort(key=lambda x: x[1], reverse=True)
    for n, i in important_features:
        print(f"{n}: \t\t {i}")

In [51]:
import pickle


def get_max_clusters_per_block(dict_path):
    with open(dict_path, "rb") as f:
        block_concepts = pickle.load(f)
    max_cluster_per_block = []
    for block in block_concepts:
        exemplars = block["exemplars"]
        max_value = len(exemplars["exemplar_ids"])
        max_cluster_per_block.append(max_value)

    return max_cluster_per_block

In [42]:
import torch


def transform_attrs(attrs, max_clusters_per_block):
    """
    very sorry this is so hideous, I needed a quick hack. Here I create a one hot encoding per block, based on the
    number of concepts per block. I then conatenate them together to a tensor.
    """
    n_blocks = attrs.shape[2]
    attrs_one_hot = [
        torch.nn.functional.one_hot(
            attrs[:, :, block_id].long(), num_classes=max_clusters_per_block[block_id]
        )
        for block_id in range(n_blocks)
    ]
    attrs_one_hot_cat = torch.cat(attrs_one_hot, dim=2).type(torch.FloatTensor)
    return attrs_one_hot_cat

In [52]:
# CLEVR-Easy
# get 4000 images and train classifier for color and shape

import os
import sys
import numpy as np
import torch
import pickle
from sklearn.tree import DecisionTreeClassifier, export_text

# load data
data_path = "../CLEVR-Easy-1/sudoku"
model_seeds = [0, 1, 2]

blocks_to_delete = []
image_to_code_paths = [
    f"../logs/CLEVR-Easy/seed_{seed}/sudoku_image_to_code.pkl" for seed in model_seeds
]
block_concept_paths = [
    f"../logs/CLEVR-Easy/seed_{seed}/block_concept_dicts.pkl" for seed in model_seeds
]

# get first 4000 images
image_names = os.listdir(data_path + "/images")
image_names = image_names[:4000]

# get json paths for images
jsons = []
for image_name in image_names:
    jsons.append(data_path + "/scenes/" + image_name[:-4] + ".json")

# get attributes
colors = []
shapes = []
for json_path in jsons:
    colors.append(get_object_attribute(json_path, "color"))
    shapes.append(get_object_attribute(json_path, "shape"))
# TODO: do jsons fit to code images?

# convert colors and shapes to integers
color_to_int = {
    "blue": 0,
    "brown": 1,
    "cyan": 2,
    "gray": 3,
    "green": 4,
    "purple": 5,
    "red": 6,
    "yellow": 7,
}
shape_to_int = {"cube": 0, "cylinder": 1, "sphere": 2}

colors = np.array([color_to_int[color] for color in colors])
shapes = np.array([shape_to_int[shape] for shape in shapes])

for i, seed in enumerate(model_seeds):

    path_to_codes = image_to_code_paths[i]

    # check if image_to_code dict exists
    try:
        with open(path_to_codes, "rb") as f:
            map_image_to_code = pickle.load(f)
    except:
        print(f"Error: {path_to_codes} does not exist")
        sys.exit(1)

    # get image codes
    codes = []
    for image_name in image_names:
        codes.append(map_image_to_code["CLEVR-Easy-1/sudoku/images/" + image_name])

    # train decision tree

    # transform codes to one-hot encoding
    codes = torch.tensor(codes)
    max_clusters_per_block = get_max_clusters_per_block(block_concept_paths[seed])
    codes = transform_attrs(codes, max_clusters_per_block)
    codes = np.array(codes).squeeze()

    # train dt to classify colors based on codes
    dt_color = DecisionTreeClassifier()
    dt_color.fit(codes, colors)

    # train dt to classify shapes based on codes
    dt_shape = DecisionTreeClassifier()
    dt_shape.fit(codes, shapes)

    # report feature importances
    print("Feature importances for color classifier:")
    color_feature_importances = dt_color.feature_importances_
    print_important_features(color_feature_importances, max_clusters_per_block)

    print("Feature importances for shape classifier:")
    shape_feature_importances = dt_shape.feature_importances_
    print_important_features(shape_feature_importances, max_clusters_per_block)

    # get feature ids that are not important
    unimportant_features_color = []
    unimportant_features_shape = []
    for i in range(len(color_feature_importances)):
        if color_feature_importances[i] == 0:
            unimportant_features_color.append(i)
        if shape_feature_importances[i] == 0:
            unimportant_features_shape.append(i)

    # get features that are unimportant for both color and shape
    unimportant_features = set(unimportant_features_color).intersection(
        unimportant_features_shape
    )
    print(f"Unimportant features: {unimportant_features}")

    delete_dict = {}
    # get max clusters per block
    max_clusters_per_block = get_max_clusters_per_block(block_concept_paths[seed])
    indices_of_clusters = np.cumsum(max_clusters_per_block)
    for block in range(len(max_clusters_per_block)):
        delete_dict[block] = []
        for cluster_id in range(max_clusters_per_block[block]):
            feature_id = (
                indices_of_clusters[block] - max_clusters_per_block[block] + cluster_id
            )
            if feature_id in unimportant_features:
                delete_dict[block].append(cluster_id)

    print(f"Blocks to delete: {delete_dict}")
    blocks_to_delete.append(delete_dict)

Feature importances for color classifier:
block_4_cluster_8: 		 0.1503
block_5_cluster_5: 		 0.1443
block_4_cluster_5: 		 0.1434
block_5_cluster_4: 		 0.1408
block_4_cluster_3: 		 0.1386
block_4_cluster_6: 		 0.1385
block_5_cluster_0: 		 0.1378
block_2_cluster_10: 		 0.0046
block_6_cluster_0: 		 0.0008
block_4_cluster_0: 		 0.0006
block_6_cluster_1: 		 0.0003
block_3_cluster_1: 		 0.0
Feature importances for shape classifier:
block_7_cluster_1: 		 0.4998
block_6_cluster_1: 		 0.4949
block_6_cluster_2: 		 0.003
block_3_cluster_1: 		 0.0009
block_4_cluster_3: 		 0.0005
block_4_cluster_0: 		 0.0004
block_5_cluster_0: 		 0.0004
Unimportant features: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, 20, 21, 23, 26, 29, 30, 31, 37, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 1

In [45]:
blocks_to_delete

[{0: [0],
  1: [0, 1, 2, 3, 4],
  2: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
  3: [],
  4: [1, 2, 4, 7],
  5: [1, 2, 3],
  6: [],
  7: [2,
   3,
   4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   25,
   26,
   27,
   28,
   29,
   30,
   31,
   32,
   33,
   34,
   35,
   36,
   37,
   38,
   39,
   40,
   41,
   42,
   43,
   44,
   45,
   46,
   47,
   48,
   49,
   50,
   51,
   52,
   53,
   54,
   55,
   56,
   57,
   58,
   59,
   60,
   61,
   62,
   63,
   64,
   65,
   66,
   67,
   68,
   69,
   70,
   71,
   72,
   73,
   74,
   75,
   76,
   77,
   78,
   79,
   80,
   81,
   82,
   83,
   84,
   85,
   86,
   87,
   88,
   89,
   90]},
 {0: [1, 2],
  1: [0],
  2: [0, 4, 5, 6, 7, 8, 9],
  3: [0],
  4: [0],
  5: [0],
  6: [4, 5, 7],
  7: []},
 {0: [0, 1, 2, 3],
  1: [0, 3],
  2: [0],
  3: [0],
  4: [0, 1],
  5: [0, 1, 2],
  6: [1, 4, 5],
  7: [1, 2, 3, 5, 7, 8, 9]}]

In [47]:
# save blocks to delete
for seed in model_seeds:
    with open(
        f"../logs/CLEVR-Easy/seed_{seed}/blocks_to_delete_by_dt_{seed}_thrsh_0.pkl",
        "wb",
    ) as f:
        pickle.dump(blocks_to_delete[seed], f)

In [38]:
# load blocks to delete
blocks_to_delete = []
for seed in model_seeds:
    with open(
        f"../logs/CLEVR-Easy/seed_{seed}/blocks_to_delete_by_dt_{seed}_2.pkl", "rb"
    ) as f:
        blocks_to_delete.append(pickle.load(f))
        print(blocks_to_delete[-1])

{0: [0], 1: [0, 1, 2, 3, 4], 2: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 3: [1], 4: [1, 2, 4, 7], 5: [1, 3], 6: [0], 7: [0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90]}
{0: [1, 2], 1: [0], 2: [0, 4, 5, 6, 7, 8], 3: [0], 4: [0], 5: [0], 6: [4, 5, 6, 7], 7: []}
{0: [0, 1, 2], 1: [0, 1, 3], 2: [0], 3: [0], 4: [0, 1], 5: [0, 1, 2], 6: [1, 3, 4, 5], 7: [2, 5, 6, 7, 8, 9]}


In [58]:
# CLEVR 4
# get 1000 images and train classifier for color and shape

# load data
data_path = "../sudoku"
model_seeds = [0, 1, 2]

blocks_to_delete = []
image_to_code_paths = [
    f"../logs/CLEVR-4/seed_{seed}/sudoku_image_to_code.pkl" for seed in model_seeds
]
block_concept_paths = [
    f"../logs/CLEVR-4/seed_{seed}/block_concept_dicts.pkl" for seed in model_seeds
]

# get first 1000 images
image_names = os.listdir(data_path + "/images")
image_names = image_names[:1000]

# get json paths for images
jsons = []
for image_name in image_names:
    jsons.append(data_path + "/scenes/" + image_name[:-4] + ".json")

# get attributes
colors = []
shapes = []
materials = []
sizes = []

for json_path in jsons:
    colors.append(get_object_attribute(json_path, "color"))
    shapes.append(get_object_attribute(json_path, "shape"))
    materials.append(get_object_attribute(json_path, "material"))
    sizes.append(get_object_attribute(json_path, "size"))

# convert colors and shapes to integers
color_to_int = {
    "blue": 0,
    "brown": 1,
    "cyan": 2,
    "gray": 3,
    "green": 4,
    "purple": 5,
    "red": 6,
    "yellow": 7,
}
shape_to_int = {"cube": 0, "cylinder": 1, "sphere": 2}
material_to_int = {"metal": 0, "rubber": 1}
size_to_int = {"large": 0, "small": 1}

colors = np.array([color_to_int[color] for color in colors])
shapes = np.array([shape_to_int[shape] for shape in shapes])
materials = np.array([material_to_int[material] for material in materials])
sizes = np.array([size_to_int[size] for size in sizes])

for i, seed in enumerate(model_seeds):

    path_to_codes = image_to_code_paths[i]
    # check if image_to_code dict exists
    try:
        with open(path_to_codes, "rb") as f:
            map_image_to_code = pickle.load(f)
    except:
        print(f"Error: {path_to_codes} does not exist")
        sys.exit(1)

    # get image codes
    codes = []
    for image_name in image_names:
        codes.append(map_image_to_code["sudoku/images/" + image_name])

    # train decision tree

    # transform codes to one-hot encoding
    codes = torch.tensor(codes)
    max_clusters_per_block = get_max_clusters_per_block(block_concept_paths[seed])
    codes = transform_attrs(codes, max_clusters_per_block)
    codes = np.array(codes).squeeze()

    # train dt to classify colors based on codes
    dt_color = DecisionTreeClassifier()
    dt_color.fit(codes, colors)

    # train dt to classify shapes based on codes
    dt_shape = DecisionTreeClassifier()
    dt_shape.fit(codes, shapes)

    # train dt to classify materials based on codes
    dt_material = DecisionTreeClassifier()
    dt_material.fit(codes, materials)

    # train dt to classify sizes based on codes
    dt_size = DecisionTreeClassifier()
    dt_size.fit(codes, sizes)

    # report feature importances
    print("Feature importances for color classifier:")
    color_feature_importances = dt_color.feature_importances_
    print_important_features(color_feature_importances, max_clusters_per_block)

    print("Feature importances for shape classifier:")
    shape_feature_importances = dt_shape.feature_importances_
    print_important_features(shape_feature_importances, max_clusters_per_block)

    print("Feature importances for material classifier:")
    material_feature_importances = dt_material.feature_importances_
    print_important_features(material_feature_importances, max_clusters_per_block)

    print("Feature importances for size classifier:")
    size_feature_importances = dt_size.feature_importances_
    print_important_features(size_feature_importances, max_clusters_per_block)

    # get feature ids that are not important
    unimportant_features_color = []
    unimportant_features_shape = []
    unimportant_features_material = []
    unimportant_features_size = []
    for i in range(len(color_feature_importances)):
        if color_feature_importances[i] == 0:
            unimportant_features_color.append(i)
        if shape_feature_importances[i] == 0:
            unimportant_features_shape.append(i)
        if material_feature_importances[i] == 0:
            unimportant_features_material.append(i)
        if size_feature_importances[i] == 0:
            unimportant_features_size.append(i)

    # get features that are unimportant for all attributes
    unimportant_features = (
        set(unimportant_features_color)
        .intersection(unimportant_features_shape)
        .intersection(unimportant_features_material)
        .intersection(unimportant_features_size)
    )
    print(f"Unimportant features: {unimportant_features}")

    delete_dict = {}
    # get max clusters per block
    max_clusters_per_block = get_max_clusters_per_block(block_concept_paths[seed])
    indices_of_clusters = np.cumsum(max_clusters_per_block)
    for block in range(len(max_clusters_per_block)):
        delete_dict[block] = []
        for cluster_id in range(max_clusters_per_block[block]):
            feature_id = (
                indices_of_clusters[block] - max_clusters_per_block[block] + cluster_id
            )
            if feature_id in unimportant_features:
                delete_dict[block].append(cluster_id)

    print(f"Blocks to delete: {delete_dict}")
    blocks_to_delete.append(delete_dict)

Feature importances for color classifier:
block_1_cluster_6: 		 0.1334
block_5_cluster_2: 		 0.1315
block_5_cluster_0: 		 0.1306
block_5_cluster_1: 		 0.1289
block_1_cluster_5: 		 0.1204
block_10_cluster_0: 		 0.1126
block_1_cluster_7: 		 0.1019
block_1_cluster_4: 		 0.0628
block_3_cluster_8: 		 0.0031
block_3_cluster_4: 		 0.0029
block_3_cluster_116: 		 0.0029
block_7_cluster_1: 		 0.0029
block_2_cluster_11: 		 0.0027
block_3_cluster_39: 		 0.0027
block_3_cluster_248: 		 0.0026
block_3_cluster_218: 		 0.0024
block_7_cluster_0: 		 0.0024
block_3_cluster_134: 		 0.0021
block_12_cluster_0: 		 0.0021
block_3_cluster_209: 		 0.0017
block_3_cluster_244: 		 0.0017
block_3_cluster_197: 		 0.0016
block_3_cluster_204: 		 0.0014
block_3_cluster_66: 		 0.0013
block_3_cluster_84: 		 0.0013
block_3_cluster_102: 		 0.0013
block_3_cluster_120: 		 0.0013
block_3_cluster_163: 		 0.0013
block_3_cluster_189: 		 0.0013
block_3_cluster_195: 		 0.0013
block_3_cluster_210: 		 0.0013
block_3_cluster_212: 		 0

In [57]:
# save blocks to delete
for seed in model_seeds:
    with open(
        f"../logs/CLEVR-4/seed_{seed}/blocks_to_delete_by_dt_{seed}_thrsh_0.pkl",
        "wb",
    ) as f:
        pickle.dump(blocks_to_delete[seed], f)

In [36]:
from sklearn.tree import DecisionTreeClassifier
import numpy as np

# Assuming you have trained your decision tree classifier
clf = DecisionTreeClassifier()
# Train your classifier with your data


def get_feature_value_pairs(tree, feature_names):
    feature_value_pairs = []
    tree_ = tree.tree_
    feature_name = [
        feature_names[i] if i != tree_.n_features else "decision" for i in tree_.feature
    ]

    def recurse(node, depth):
        indent = "  " * depth
        if tree_.feature[node] != tree_.TREE_UNDEFINED:
            name = feature_name[node]
            threshold = tree_.threshold[node]
            feature_value_pairs.append((name, threshold))
            recurse(tree_.children_left[node], depth + 1)
            recurse(tree_.children_right[node], depth + 1)
        else:
            feature_value_pairs.append(("decision", np.argmax(tree_.value[node][0])))

    recurse(0, 1)
    return feature_value_pairs


# Assuming your feature names are stored in a list
# feature_names = ['feature0', 'feature1', 'feature2', 'feature3', 'feature4', 'feature5', 'feature6', 'feature7']

# # Get relevant feature-value pairs
# relevant_feature_value_pairs = get_feature_value_pairs(clf, feature_names)
# print(relevant_feature_value_pairs)

In [7]:
# HUMAN revision clevr easy seed 1
import pickle

delete_dict = {
    0: [],
    1: [],
    2: [],
    3: [],
    4: [],
    5: [],
    6: [0, 1, 2, 3, 4, 5, 7],
    7: [],
}

pickle.dump(
    delete_dict,
    open(
        "../logs/CLEVR-Easy/seed_1/human_revision/blocks_to_delete_by_toni_2.pkl", "wb"
    ),
)

# reformulate merging dictionaries
pre_dict = {
    0: {1: [2]},
    1: {},
    2: {0: [4, 5, 7, 8, 9], 1: [2, 3]},
    3: {},
    4: {},
    5: {},
    6: {},
    7: {7: [8]},
}

result_dict = {}
ids = [0, 1, 2, 3, 4, 5, 6, 7]
num_c = [3, 1, 10, 1, 1, 1, 8, 9]
for b in range(8):
    result_dict[b] = {}
    for c_outer in range(num_c[b]):
        res_c = {}
        if c_outer in pre_dict[b].keys():
            for merge_idx in range(c_outer + 1, num_c[b]):
                if merge_idx in pre_dict[b][c_outer]:
                    res_c[merge_idx] = 1
                else:
                    res_c[merge_idx] = 0
        else:
            for merge_idx in range(c_outer + 1, num_c[b]):
                res_c[merge_idx] = 0
        result_dict[b][c_outer] = res_c

        # if c not in pre_dict[b].keys():
import pickle

pickle.dump(
    result_dict,
    open(
        "../logs/CLEVR-Easy/seed_1/human_revision/blocks_to_merge_by_toni_2.pkl", "wb"
    ),
)

In [9]:
# HUMAN revision clevr 4 seed 0

ids = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
num_c = [3, 8, 12, 264, 7, 15, 1, 2, 5, 1, 11, 1, 20, 5, 1, 3]

delete_dict = {
    0: [2],
    1: [4],
    2: list(range(12)),
    3: list(range(264)),
    4: list(range(7)),
    5: [3],
    6: [],
    7: list(range(2)),
    8: [0],
    9: [],
    10: list(range(11)),
    11: [],
    12: list(range(20)),
    13: [0, 1, 2, 3, 4],
    14: [],
    15: [0, 1, 2],
}

pickle.dump(
    delete_dict,
    open("../logs/CLEVR-4/seed_0/human_revision/blocks_to_delete_by_toni_2.pkl", "wb"),
)


# reformulate merging dictionaries
pre_dict = {
    0: {},
    1: {},
    2: {},
    3: {},
    4: {},
    5: {4: [5, 8, 9, 10, 11, 12, 13, 14]},
    6: {},
    7: {},
    8: {},
    9: {},
    10: {},
    11: {},
    12: {},
    13: {},
    14: {},
    15: {},
}

result_dict = {}
for b in range(16):
    result_dict[b] = {}
    for c_outer in range(num_c[b]):
        res_c = {}
        if c_outer in pre_dict[b].keys():
            for merge_idx in range(c_outer + 1, num_c[b]):
                if merge_idx in pre_dict[b][c_outer]:
                    res_c[merge_idx] = 1
                else:
                    res_c[merge_idx] = 0
        else:
            for merge_idx in range(c_outer + 1, num_c[b]):
                res_c[merge_idx] = 0
        result_dict[b][c_outer] = res_c

        # if c not in pre_dict[b].keys():


pickle.dump(
    result_dict,
    open("../logs/CLEVR-4/seed_0/human_revision/blocks_to_merge_by_toni_2.pkl", "wb"),
)