In [1]:
import numpy as np
import PIL
import pandas as pd
import os
import copy
import scipy.io

# Base folder holding the things data, and the image subfolder inside it.
things_stim_path = '/user_data/mmhender/things/'
things_images_root = os.path.join(things_stim_path, 'Images')

# NOTE: this code was sketched as the body of a `process_concepts()` function,
# but it currently runs at top level.

# Tab-separated concept table; only the 'Word' and 'uniqueID' columns are
# read here.
filename = os.path.join(things_stim_path, 'things_concepts.tsv')
df = pd.read_csv(filename, sep='\t')

# Concepts are the fine-grained/basic level names. Spaces are replaced by
# underscores, and the result is a plain Python list in table row order.
concept_list = [word.replace(' ', '_') for word in np.array(df['Word'])]
ids_list = np.array(df['uniqueID'])
n_concepts = len(concept_list)

# Categories are the high-level/superordinate names.
info_folder = os.path.join(things_stim_path, '27 higher-level categories')
categ_file = os.path.join(info_folder, 'categories.mat')
raw_categ_names = scipy.io.loadmat(categ_file)['categories'][0]

# Each loaded entry wraps the name in a container: take its first element,
# then replace spaces with underscores (same convention as the concept names).
categ_names = [entry[0].replace(' ', '_') for entry in raw_categ_names]
n_categ = len(categ_names)

# Load the "bottom-up" (human-generated) groupings. The matrix is indexed
# below as [concept, category].
dat = scipy.io.loadmat(os.path.join(info_folder, 'category_mat_bottom_up.mat'))
cmat = dat['category_mat_bottom_up']

# There is a swap in this labeling between "hot-air balloon" and
# "hot chocolate" (maybe a typo in the file?). Rows 801 and 803 are exchanged
# manually in a copy, so the loaded `cmat` itself is left untouched.
cmat_fixed = np.array(cmat)
tmp = np.array(cmat[801, :])
cmat_fixed[801, :] = cmat[803, :]
cmat_fixed[803, :] = tmp

# Concept names belonging to each category, before any cleanup.
concept_names = np.array(concept_list)
concepts_each_categ = [
    concept_names[cmat_fixed[:, col] == 1] for col in range(n_categ)
]

# Clean up the assignments to get rid of anything ambiguous. `astype`
# returns a new boolean array, so `cmat_fixed` is not modified.
cmat_adjusted = cmat_fixed.astype(bool)

# Remove every concept whose name occurs more than once (these are words
# with ambiguous meaning, like "bat"): clear its whole row.
un, counts = np.unique(concept_list, return_counts=True)
duplicate_conc = un[counts > 1]
duplicate_name_set = set(duplicate_conc)
duplicate_conc_inds = np.where(
    [conc in duplicate_name_set for conc in concept_list]
)
# np.where returns a 1-tuple here; index rows with the array inside it
cmat_adjusted[duplicate_conc_inds[0], :] = False

# Remove every concept that has the same name as one of the categories
# (for example "fruit").
categ_name_set = set(categ_names)
duplicate_inds = np.where(
    [conc in categ_name_set for conc in concept_list]
)[0]
cmat_adjusted[duplicate_inds, :] = False

# Resolve concepts that are assigned to more than one category.
# Categories listed here win when they share a concept with a category that
# is not listed.
categories_prioritize = ['bird','insect','dessert','fruit','vegetable']

# Visit every unordered pair of categories (cc1 < cc2) and edit
# cmat_adjusted in place.
# NOTE(review): `overlap` is computed from the matrix as already edited by
# earlier pairs, so the outcome depends on pair order; a concept shared by
# three non-prioritized categories is cleared from the first pair only and
# stays in the third - confirm this is intended.
for cc1 in range(n_categ):

    for cc2 in range(cc1 + 1, n_categ):

        overlap = cmat_adjusted[:, cc1] & cmat_adjusted[:, cc2]
        cat1, cat2 = categ_names[cc1], categ_names[cc2]

        if not overlap.any():
            # nothing shared between this pair
            continue

        cat1_prioritized = cat1 in categories_prioritize
        cat2_prioritized = cat2 in categories_prioritize

        if cat1_prioritized and not cat2_prioritized:
            # keep the concept in cat1, drop it from the other category
            print('%s over %s'%(cat1, cat2))
            cmat_adjusted[overlap, cc2] = False
        elif cat2_prioritized and not cat1_prioritized:
            # keep the concept in cat2, drop it from the other category
            print('%s over %s'%(cat2, cat1))
            cmat_adjusted[overlap, cc1] = False
        else:
            # No single winner: this branch runs when neither category is
            # prioritized and also when both are. Drop the shared concepts
            # from both categories of this pair.
            print('%s and %s, remove:'%(cat1, cat2))
            print(np.array(concept_list)[overlap])
            cmat_adjusted[overlap, cc1] = False
            cmat_adjusted[overlap, cc2] = False

# Per-category concept names and unique IDs, taken from the adjusted matrix.
concept_names_arr = np.array(concept_list)
concept_ids_arr = np.array(ids_list)
concepts_each_categ_adj = []
ids_each_categ = []
for col in range(n_categ):
    # rows (concepts) still assigned to this category
    members = cmat_adjusted[:, col] == 1
    concepts_each_categ_adj.append(concept_names_arr[members])
    ids_each_categ.append(concept_ids_arr[members])


bird over animal
insect over animal
clothing and clothing_accessory, remove:
['bowtie' 'cummerbund']
dessert over drink
dessert over food
fruit over food
food and plant, remove:
['seed']
vegetable over food
fruit and vegetable, remove:
['tomato']
furniture and home_decor, remove:
['coat_rack']
kitchen_appliance and kitchen_tool, remove:
['kettle']
kitchen_tool and tool, remove:
['funnel' 'icepick']
office_supply and tool, remove:
['letter_opener']
sports_equipment and tool, remove:
['bungee']
sports_equipment and toy, remove:
['frisbee']
tool and weapon, remove:
['trident']
toy and vehicle, remove:
['scooter']
toy and weapon, remove:
['boomerang']
