In [1]:
import pickle
from utils.analysis_from_interaction import *
from egg.core.language_analysis import Disent
from language_analysis_local import TopographicSimilarityConceptLevel, encode_target_concepts_for_topsim
import os

# calculate metrics from stored interactions

In [64]:
datasets = ('(3,4)', '(3,8)', '(3,16)', '(4,4)', '(4,8)', '(5,4)')
n_attributes = (3, 3, 3, 4, 4, 5)
n_values = (4, 8, 16, 4, 8, 4)
epochs = 300
n_runs = 5
paths = ['results/' + d + '_game_size_10_vsf_0/' for d in datasets]

In [45]:
datasets = ('(3,8)',)
n_attributes = (3,)
n_values = (8, )
epochs = 300
n_runs = 5
paths = ['results/' + d + '_game_size_10_vsf_0/' for d in datasets]

In [46]:
context_unaware = False # whether original or context_unaware simulations are evaluated
zero_shot = False # whether zero-shot simulations are evaluated
zero_shot_test = 'specific' # 'generic' or 'specific'
test_interactions = False # whether scores should be calculated on test interactions (only with zero shot)
zero_shot_test_ds = 'test_sampled_unscaled' # 'test' or 'test_fine' or 'test_sampled_unscaled'
length_cost = True # whether length_cost was applied; length cost runs have been run with early stopping
early_stopping = True # only with length cost
rsa = False # only with context unaware
rsa_test = 'train'
setting = ""
if length_cost:
    setting = setting + 'length_cost/'
    if not context_unaware:
        setting = setting + 'context_aware'
if context_unaware:
    setting = setting + 'context_unaware'
else:
    if not length_cost:
        setting = setting + 'standard'
if zero_shot:
    setting = setting + '/zero_shot/' + zero_shot_test

In [47]:
# get n_epochs if early stopping
if early_stopping:
    
    n_epochs_all_data = []
    for d in range(len(datasets)):
        
        n_epochs = []
        
        for run in range(n_runs):
    
            path_to_run = paths[d] + str(setting) +'/' + str(run) + '/' 
            with open(os.path.join(path_to_run, 'loss_and_metrics.pkl'), 'rb') as input_file:
                data = pickle.load(input_file)
                final_epoch = max(data['loss_train'].keys())
                n_epochs.append(final_epoch)
                
        n_epochs_all_data.append(n_epochs)
        
else:
    n_epochs_all_data = []
    for d in range(len(datasets)):
        n_epochs = []
        
        for run in range(n_runs):
            n_epochs.append(epochs)
                
        n_epochs_all_data.append(n_epochs)
            

## entropy scores: MI, effectiveness, efficiency

In [48]:
for d in range(len(datasets)):
    
    n_epochs = n_epochs_all_data[d]
    
    for run in range(n_runs):

        path_to_run = paths[d] + str(setting) +'/' + str(run) + '/' 
        if not rsa:
            if not test_interactions:
                path_to_interaction = (path_to_run + 'interactions/train/epoch_' + str(n_epochs[run]) + '/interaction_gpu0')
            else:
                path_to_interaction = (path_to_run + 'interactions/' + zero_shot_test_ds + '/epoch_0/interaction_gpu0')
        else:
            path_to_rsa = (path_to_run + 'rsa/' + rsa_test + '/')
            path_to_interaction = (path_to_rsa + 'rsa_' + rsa_test + '/epoch_' + str(n_epochs[run]) + '/interaction_gpu0')
        interaction = torch.load(path_to_interaction)

        attributes = n_attributes[d]
        values = n_values[d]
        scores = information_scores(interaction, attributes, values, normalizer="arithmetic")
        
        if not rsa:
            if not test_interactions:
                pickle.dump(scores, open(path_to_run + 'entropy_scores.pkl', 'wb'))
            else:
                pickle.dump(scores, open(path_to_run + 'entropy_scores_' + zero_shot_test_ds + '.pkl', 'wb'))
        else:
            pickle.dump(scores, open(path_to_rsa + 'entropy_scores.pkl', 'wb'))

  (m_entropy_concept_x_context + c_entropy_concept_x_context - joint_entropy_concept_x_context)
  normalized_effectiveness_conc_x_cont = ((joint_entropy_concept_x_context - m_entropy_concept_x_context)
  normalized_consistency_conc_x_cont = (


##  message length

In [49]:
# we evaluated message length per hierarchy level after training but 
# you can also use the HierarchicalMessageLength callback and store the results 

for d in range(len(datasets)):
    
    n_epochs = n_epochs_all_data[d]
    
    for run in range(n_runs): 
        
        path_to_run = paths[d] + str(setting) +'/' + str(run) + '/'
        if not rsa:
            path_to_interaction = (path_to_run + 'interactions/train/epoch_' + str(n_epochs[run]) + '/interaction_gpu0')
        else:
            path_to_rsa = (path_to_run + 'rsa/' + rsa_test + '/')
            path_to_interaction = (path_to_rsa + 'rsa_' + rsa_test + '/epoch_' + str(n_epochs[run]) + '/interaction_gpu0')
            
        interaction = torch.load(path_to_interaction)

        attributes = n_attributes[d]
        values = n_values[d]
        ml, ml_concept = message_length_per_hierarchy_level(interaction, attributes)
        ml_context = message_length_per_context_condition(interaction, attributes)
        
        if not rsa:
            pickle.dump(ml, open(path_to_run + 'message_length.pkl', 'wb'))
            pickle.dump(ml_concept, open(path_to_run + 'message_length_hierarchical.pkl', 'wb'))
            pickle.dump(ml_context, open(path_to_run + 'message_length_over_context.pkl', 'wb'))
        else:
            pickle.dump(ml, open(path_to_run + 'message_length.pkl', 'wb'))
            pickle.dump(ml_concept, open(path_to_rsa + 'message_length_hierarchical.pkl', 'wb'))
            pickle.dump(ml_context, open(path_to_run + 'message_length_over_context.pkl', 'wb'))

##  symbol redundancy

In [50]:
for d in range(len(datasets)):
    
    n_epochs = n_epochs_all_data[d]
    
    attributes = n_attributes[d]
    values = n_values[d]
    vs_factor = int(paths[d][-2])
    vocab_size = (n_values[d] + 1) * vs_factor + 1
    vocab_size = 5
    
    for run in range(n_runs): 
                
        path_to_run = paths[d] + str(setting) +'/' + str(run) + '/'
        if not rsa:
            if not test_interactions:
                path_to_interaction = (path_to_run + 'interactions/train/epoch_' + str(n_epochs[run]) + '/interaction_gpu0')
            else:
                path_to_interaction = (path_to_run + 'interactions/' + zero_shot_test_ds + '/epoch_0/interaction_gpu0')
        else:
            path_to_rsa = (path_to_run + 'rsa/' + rsa_test + '/')
            path_to_interaction = (path_to_rsa + 'rsa_' + rsa_test + '/epoch_' + str(n_epochs[run]) + '/interaction_gpu0')
        interaction = torch.load(path_to_interaction)
        redundancy, MI = symbol_frequency(interaction, attributes, values, vocab_size)
        
        scores = {'symbol_redundancy': redundancy, 'MI_symbol-attribute_value': MI}
        
        if not rsa:
            if not test_interactions:
                pickle.dump(scores, open(path_to_run + 'symbol_redundancy.pkl', 'wb'))
            else:
                pickle.dump(scores, open(path_to_run + 'symbol_redundancy_' + zero_shot_test_ds + '.pkl', 'wb'))
        else:
            pickle.dump(scores, open(path_to_rsa + 'symbol_redundancy.pkl', 'wb'))

  return symbol_frequency / att_val_frequency, mutual_information
  return symbol_frequency / att_val_frequency, mutual_information
  return symbol_frequency / att_val_frequency, mutual_information
  return symbol_frequency / att_val_frequency, mutual_information
  return symbol_frequency / att_val_frequency, mutual_information


##  compositionality scores: topsim, posdis, bosdis

### topsim

In [51]:
# topsim for train and validation
# although topsim values are stored throughout training if callbacks are verbose, we reevaluate the
# final topsim scores with more data points 
# not yet implemented for rsa

samples = 5000 # maybe shuffle from these because otherwise I just take the first 5,000 (which might not be the best)
for d, dataset in enumerate(datasets):
    
    n_epochs = n_epochs_all_data[d]
    
    dim = [n_values[d]]*n_attributes[d]
    
    for run in range(n_runs):
        print("dataset", dataset, "run", run)
        
        topsim_final = {}
        path_to_run = paths[d] + str(setting) +'/' + str(run) + '/'
        path_to_interaction_train = (path_to_run + 'interactions/train/epoch_' + str(n_epochs[run]) + '/interaction_gpu0')
        path_to_interaction_val = (path_to_run + 'interactions/validation/epoch_' + str(n_epochs[run]) + '/interaction_gpu0')
        
        TOPSIM = TopographicSimilarityConceptLevel(dim, is_gumbel=True)
        
        for mode in ['train', 'val']:

            if mode == 'train':
                interaction = torch.load(path_to_interaction_train)
            elif mode == 'val':
                interaction = torch.load(path_to_interaction_val)
                  
            messages = interaction.message.argmax(dim=-1)
            sender_input = interaction.sender_input
            n_targets = int(sender_input.shape[1]/2)
            # get target objects and fixed vectors to re-construct concepts
            target_objects = sender_input[:, :n_targets]
            target_objects = k_hot_to_attributes(target_objects, n_values[d])
            # concepts are defined by a list of target objects (here one sampled target object) and a fixed vector
            (objects, fixed) = retrieve_concepts_sampling(target_objects)
            # add one such that zero becomes an empty attribute for the calculation (_)
            objects = objects + 1
            concepts = torch.from_numpy(objects * (np.array(fixed)))
            specific_idx = np.where(np.sum(fixed, axis=1)==n_attributes[d])[0]
            messages_specific = messages[specific_idx]
            concepts_specific = concepts[specific_idx]
            
            generic_idx = np.where(np.sum(fixed, axis=1)==1)[0]
            messages_generic = messages[generic_idx]
            concepts_generic = concepts[generic_idx]

            messages = [msg.tolist() for msg in messages]
            messages_specific = [msg.tolist() for msg in messages_specific]
            messages_generic = [msg.tolist() for msg in messages_generic]

            encoded_input = encode_target_concepts_for_topsim(sender_input)

            topsim = TOPSIM.compute_topsim(encoded_input[0:samples], messages[0:samples]) # default: hausdorff distance
            
            if not zero_shot:
                topsim_specific = TOPSIM.compute_topsim(concepts_specific[0:samples], messages_specific[0:samples], 
                                                            meaning_distance_fn="edit")
                
                topsim_generic = TOPSIM.compute_topsim(concepts_generic[0:samples], messages_generic[0:samples],
                                                       meaning_distance_fn="edit")

            print('... topsim computed')

            topsim_final['topsim_' + mode] = topsim
            if not zero_shot:
                topsim_final['topsim_specific_' + mode] = topsim_specific
                topsim_final['topsim_generic_' + mode] = topsim_generic
        
        pickle.dump(topsim_final, open(path_to_run +  "topsim_final.pkl", "wb" ) )
        print(topsim_final)        

dataset (3,8) run 0
... topsim computed
... topsim computed
{'topsim_train': 0.09227438441683405, 'topsim_specific_train': 0.1368224479233204, 'topsim_generic_train': 0.17028422922988085, 'topsim_val': 0.10049169572423747, 'topsim_specific_val': 0.13083028044391096, 'topsim_generic_val': 0.08177861154693228}
dataset (3,8) run 1
... topsim computed
... topsim computed
{'topsim_train': 0.1957792508756706, 'topsim_specific_train': 0.24561507067643623, 'topsim_generic_train': 0.10668495856372975, 'topsim_val': 0.20059100724104195, 'topsim_specific_val': 0.26175319448516177, 'topsim_generic_val': 0.14561427985788195}
dataset (3,8) run 2
... topsim computed
... topsim computed
{'topsim_train': 0.1212207826367225, 'topsim_specific_train': 0.14751844092939878, 'topsim_generic_train': 0.32196042494339344, 'topsim_val': 0.1410965251745052, 'topsim_specific_val': 0.16107321082187814, 'topsim_generic_val': 0.2990776745183893}
dataset (3,8) run 3
... topsim computed
... topsim computed
{'topsim_tra

In [52]:
# topsim for test interactions

if test_interactions:

    samples = 5000 
    for d, dataset in enumerate(datasets):
        
        n_epochs = n_epochs_all_data[d]
        
        dim = [n_values[d]]*n_attributes[d]
        
        for run in range(n_runs):
            print("dataset", dataset, "run", run)
            
            topsim_final = {}
            path_to_run = paths[d] + str(setting) +'/' + str(run) + '/'
            mode = 'test'
            path_to_interaction_test = (path_to_run + 'interactions/' + zero_shot_test_ds + '/epoch_0/interaction_gpu0')
            
            TOPSIM = TopographicSimilarityConceptLevel(dim, is_gumbel=True)
            
            interaction = torch.load(path_to_interaction_test)
                      
            messages = interaction.message.argmax(dim=-1)
            sender_input = interaction.sender_input
            n_targets = int(sender_input.shape[1]/2)
            # get target objects and fixed vectors to re-construct concepts
            target_objects = sender_input[:, :n_targets]
            target_objects = k_hot_to_attributes(target_objects, n_values[d])
            # concepts are defined by a list of target objects (here one sampled target object) and a fixed vector
            (objects, fixed) = retrieve_concepts_sampling(target_objects)
            # add one such that zero becomes an empty attribute for the calculation (_)
            objects = objects + 1
            concepts = torch.from_numpy(objects * (np.array(fixed)))

            messages = [msg.tolist() for msg in messages]

            encoded_input = encode_target_concepts_for_topsim(sender_input)

            topsim = TOPSIM.compute_topsim(encoded_input[0:samples], messages[0:samples])  

            print('... topsim computed')

            topsim_final['topsim_' + mode] = topsim
    
            pickle.dump(topsim_final, open(path_to_run +  "topsim_final_" + zero_shot_test_ds + ".pkl", "wb" ) )
            print(topsim_final)        

#### Topsim over time

In [53]:
for d, dataset in enumerate(datasets):
    
    n_epochs = n_epochs_all_data[d]
    
    for run in range(n_runs):
        print("dataset", dataset, "run", run)
        
        path_to_run = paths[d] + str(setting) +'/' + str(run) + '/'
        path_to_interaction_train = (path_to_run + 'interactions/train/epoch_' + str(n_epochs[run]) + '/interaction_gpu0')
        path_to_interaction_val = (path_to_run + 'interactions/validation/epoch_' + str(n_epochs[run]) + '/interaction_gpu0')
        
        for mode in ['train', 'val']:

            if mode == 'train':
                interaction = torch.load(path_to_interaction_train)
            elif mode == 'val':
                interaction = torch.load(path_to_interaction_val)

        messages = interaction.message.argmax(dim=-1)
        sender_input = interaction.sender_input
        messages = [msg.tolist() for msg in messages]
        encoded_input = encode_target_concepts_for_topsim(sender_input)
        dim = [n_values[0]] * n_attributes[0]
        TOPSIM = TopographicSimilarityConceptLevel(dim, is_gumbel=True)
        
        samples = 5000
        num_batches = len(messages) // samples + (len(messages) % samples > 0)
        topsim_over_time = []
        
        for i in range(num_batches):
            messages_batch = messages[i * samples:(i + 1) * samples]
            topsim = TOPSIM.compute_topsim(encoded_input[i * samples:(i + 1) * samples], messages_batch)
            topsim_over_time.append(topsim)
            
        pickle.dump(topsim_over_time, open(path_to_run +  "topsim_over_time.pkl", "wb" ) )

dataset (3,8) run 0
dataset (3,8) run 1
dataset (3,8) run 2
dataset (3,8) run 3
dataset (3,8) run 4


### Posdis and Bosdis

In [54]:
# use Disent callback from egg

for d in range(len(datasets)): 
    
    n_epochs = n_epochs_all_data[d]
    
    path = paths[d]
    dim = [n_values[d]] * n_attributes[d]
    n_features = n_attributes[d] * n_values[d]
    vs_factor = int(path[-2])
    vocab_size = (n_values[d] + 1) * vs_factor + 1
    
    print("data set", dim)
    
    for run in range(n_runs):
        
        posdis_bosdis = {}
    
        path_to_run = paths[d] + '/' + str(setting) +'/' + str(run) + '/'
        path_to_interaction_train = (path_to_run + 'interactions/train/epoch_' + str(n_epochs[run]) + '/interaction_gpu0')
        interaction = torch.load(path_to_interaction_train)
        
        messages = interaction.message.argmax(dim=-1)
        sender_input = interaction.sender_input
        n_targets = int(sender_input.shape[1]/2)
        # get target objects and fixed vectors to re-construct concepts
        target_objects = sender_input[:, :n_targets]
        target_objects = k_hot_to_attributes(target_objects, n_values[d])
        # concepts are defined by a list of target objects (here one sampled target object) and a fixed vector
        (objects, fixed) = retrieve_concepts_sampling(target_objects)
        # add one such that zero becomes an empty attribute for the calculation (_)
        objects = objects + 1
        concepts = torch.from_numpy(objects * (np.array(fixed)))

        # concrete/specific concepts: where all attributes are fixed
        concepts_specific = torch.tensor(
            objects[torch.sum(torch.from_numpy(fixed), dim=1) == n_attributes[d]])
        messages_specific = messages[torch.sum(torch.from_numpy(fixed), dim=1) == n_attributes[d]]

        # generic concepts: where only one attribute is fixed
        concepts_generic = torch.tensor(
            objects[torch.sum(torch.from_numpy(fixed), dim=1) == 1])
        messages_generic = messages[torch.sum(torch.from_numpy(fixed), dim=1) == 1]
        
        posdis_specific = Disent.posdis(concepts_specific, messages_specific)
        bosdis_specific = Disent.bosdis(concepts_specific, messages_specific, vocab_size)

        posdis_generic = Disent.posdis(concepts_generic, messages_generic)
        bosdis_generic = Disent.bosdis(concepts_generic, messages_generic, vocab_size)
        
        posdis = Disent.posdis(torch.from_numpy(objects), messages)
        bosdis = Disent.bosdis(torch.from_numpy(objects), messages, vocab_size)
        
        posdis_bosdis['posdis_specific'] = posdis_specific
        posdis_bosdis['bosdis_specific'] = bosdis_specific
        posdis_bosdis['posdis_generic'] = posdis_generic
        posdis_bosdis['bosdis_generic'] = bosdis_generic
        posdis_bosdis['posdis'] = posdis
        posdis_bosdis['bosdis'] = bosdis

        print(posdis_bosdis)
    
        pickle.dump(posdis_bosdis, open(path_to_run + "posdis_bosdis.pkl", "wb" ) )

data set [8, 8, 8]
{'posdis_specific': 0.05665984004735947, 'bosdis_specific': nan, 'posdis_generic': 0.07075405865907669, 'bosdis_generic': nan, 'posdis': 0.05150959640741348, 'bosdis': nan}
{'posdis_specific': 0.4360629618167877, 'bosdis_specific': nan, 'posdis_generic': 0.15436796844005585, 'bosdis_generic': nan, 'posdis': 0.35393208265304565, 'bosdis': nan}
{'posdis_specific': 0.08248614519834518, 'bosdis_specific': nan, 'posdis_generic': 0.11588380485773087, 'bosdis_generic': nan, 'posdis': 0.07018067687749863, 'bosdis': nan}
{'posdis_specific': 0.35773882269859314, 'bosdis_specific': nan, 'posdis_generic': 0.045208171010017395, 'bosdis_generic': nan, 'posdis': 0.2753124535083771, 'bosdis': nan}
{'posdis_specific': 0.09696928411722183, 'bosdis_specific': nan, 'posdis_generic': 0.02053152024745941, 'bosdis_generic': nan, 'posdis': 0.07807818800210953, 'bosdis': nan}


#### Posdis and bosdis concept x context

In [55]:
# bosdis concept x context
from utils.analysis_from_interaction import bosdis
for d in range(len(datasets)):
    
    n_epochs = n_epochs_all_data[d]

    vs_factor = int(paths[d][-2])
    vocab_size = (n_values[d] + 1) * vs_factor + 1
    
    for run in range(n_runs):

        path_to_run = paths[d] + '/' + str(setting) +'/' + str(run) + '/' 
        path_to_interaction = (path_to_run + 'interactions/train/epoch_' + str(n_epochs[run]) + '/interaction_gpu0')
        interaction = torch.load(path_to_interaction)

        attributes = n_attributes[d]
        values = n_values[d]
        scores = bosdis(interaction, attributes, values, vocab_size)

        pickle.dump(scores, open(path_to_run + 'bosdis_scores.pkl', 'wb'))

In [56]:
# posdis concept x context
from utils.analysis_from_interaction import posdis
for d in range(len(datasets)):
    
    n_epochs = n_epochs_all_data[d]

    vs_factor = int(paths[d][-2])
    vocab_size = (n_values[d] + 1) * vs_factor + 1

    for run in range(n_runs):
        path_to_run = paths[d] + '/' + str(setting) + '/' + str(run) + '/'
        print(path_to_run)
        path_to_interaction = (path_to_run + 'interactions/train/epoch_' + str(n_epochs[run]) + '/interaction_gpu0')
        #print(path_to_interaction)
        interaction = torch.load(path_to_interaction)

        attributes = n_attributes[d]
        values = n_values[d]
        scores = posdis(interaction, attributes, values, vocab_size)

        pickle.dump(scores, open(path_to_run + 'posdis_scores.pkl', 'wb'))

results/(3,8)_game_size_10_vsf_0//length_cost/context_aware/0/
results/(3,8)_game_size_10_vsf_0//length_cost/context_aware/1/
results/(3,8)_game_size_10_vsf_0//length_cost/context_aware/2/
results/(3,8)_game_size_10_vsf_0//length_cost/context_aware/3/
results/(3,8)_game_size_10_vsf_0//length_cost/context_aware/4/


## co-occurrences

In [None]:
# Not yet implemented:

for d in range(len(datasets)):
    
    vs_factor = int(paths[d][-2])
    
    for run in range(n_runs): 
        
        path_to_run = paths[d] + str(setting) +'/' + str(run) + '/'
        path_to_interaction = (path_to_run + 'interactions/train/epoch_' + str(n_epochs[run]) + '/interaction_gpu0')
        interaction = torch.load(path_to_interaction)

        attributes = n_attributes[d]
        values = n_values[d]
        
        scores = cooccurrence_per_hierarchy_level(interaction, attributes, values, vs_factor)

        print(scores)
        
        pickle.dump(scores, open(path_to_run + 'normalized_cooccurrence.pkl', 'wb'))
    