In [205]:
import pickle
from utils.analysis_from_interaction import *
from egg.core.language_analysis import Disent
from language_analysis_local import TopographicSimilarityConceptLevel, encode_target_concepts_for_topsim
import os

# calculate metrics from stored interactions

In [267]:
# Full experiment grid: one entry per data set, named "(n_attributes,n_values)".
datasets = ('(3,4)', '(3,8)', '(3,16)', '(4,4)', '(4,8)', '(5,4)')
n_attributes, n_values = (3, 3, 3, 4, 4, 5), (4, 8, 16, 4, 8, 4)
# number of training epochs and number of runs per data set
epochs, n_runs = 300, 5
# results directory of each data set (trailing slash included)
paths = [f'results/{name}_game_size_10_vsf_3/' for name in datasets]

In [239]:
# Alternative configuration: only data set (3,4) with the "vsf_0" results folder.
# NOTE: running this cell overrides the assignments of the preceding configuration
# cell, so in a top-to-bottom run this is the configuration that stays active.
datasets = ('(3,4)',)
n_attributes, n_values = (3,), (4,)
epochs, n_runs = 0, 5
# results directory of each data set (trailing slash included)
paths = [f'results/{name}_game_size_10_vsf_0/' for name in datasets]

In [268]:
# --- evaluation switches ---
context_unaware = False  # whether original or context_unaware simulations are evaluated
zero_shot = True  # whether zero-shot simulations are evaluated
zero_shot_test = 'generic'  # 'generic' or 'specific'
test_interactions = True  # whether scores should be calculated on test interactions (only with zero shot)
length_cost = False  # whether length_cost was applied; length cost runs have been run with early stopping
early_stopping = False  # only with length cost
rsa = False  # only with context unaware
rsa_test = 'train'

# `setting` is the sub-directory (relative to a data set's results folder) that
# holds the runs selected by the switches above.
if length_cost:
    setting = 'length_cost/' + ('context_unaware' if context_unaware else 'context_aware')
elif context_unaware:
    setting = 'context_unaware'
else:
    setting = 'standard'
if zero_shot:
    setting += '/zero_shot/' + zero_shot_test

In [256]:
# Collect, per data set and per run, the epoch whose stored interaction is evaluated
# by the cells below: n_epochs_all_data[d][run].
# With early stopping the last epoch differs between runs and is read from the stored
# training log; otherwise every run uses the configured number of `epochs`.
n_epochs_all_data = []
for d in range(len(datasets)):

    n_epochs = []

    # iterate over the configured number of runs instead of a hard-coded 5
    for run in range(n_runs):
        if early_stopping:
            path_to_run = paths[d] + str(setting) + '/' + str(run) + '/'
            # NOTE: pickle.load must only be used on trusted files (here: our own results)
            with open(os.path.join(path_to_run, 'loss_and_metrics.pkl'), 'rb') as input_file:
                data = pickle.load(input_file)
            # the highest epoch key in the training loss is the final epoch of this run
            final_epoch = max(data['loss_train'].keys())
            n_epochs.append(final_epoch)
        else:
            n_epochs.append(epochs)

    n_epochs_all_data.append(n_epochs)
            

## entropy scores: MI, effectiveness, efficiency

In [272]:
# Compute the information-theoretic scores for every data set and run and store them
# next to the interaction they were computed from.
for d in range(len(datasets)):

    n_epochs = n_epochs_all_data[d]
    attributes = n_attributes[d]
    values = n_values[d]

    for run in range(n_runs):

        path_to_run = paths[d] + str(setting) + '/' + str(run) + '/'

        # select the interaction to evaluate and the file the scores are written to
        if rsa:
            path_to_rsa = (path_to_run + 'rsa/' + rsa_test + '/')
            path_to_interaction = (path_to_rsa + 'rsa_' + rsa_test + '/epoch_' + str(n_epochs[run]) + '/interaction_gpu0')
            path_to_scores = path_to_rsa + 'entropy_scores.pkl'
        elif test_interactions:
            # test interactions are always stored under epoch_0
            path_to_interaction = (path_to_run + 'interactions/test/epoch_0/interaction_gpu0')
            path_to_scores = path_to_run + 'entropy_scores_test.pkl'
        else:
            path_to_interaction = (path_to_run + 'interactions/train/epoch_' + str(n_epochs[run]) + '/interaction_gpu0')
            path_to_scores = path_to_run + 'entropy_scores.pkl'

        interaction = torch.load(path_to_interaction)
        scores = information_scores(interaction, attributes, values, normalizer="arithmetic")

        # context manager closes (and flushes) the file even if pickling fails
        with open(path_to_scores, 'wb') as output_file:
            pickle.dump(scores, output_file)

##  message length

In [194]:
# we evaluated message length per hierarchy level after training but
# you can also use the HierarchicalMessageLength callback and store the results

for d in range(len(datasets)):

    n_epochs = n_epochs_all_data[d]
    attributes = n_attributes[d]

    for run in range(n_runs):

        path_to_run = paths[d] + str(setting) + '/' + str(run) + '/'

        # rsa results live in their own sub-directory, which also receives the scores
        if rsa:
            path_to_rsa = (path_to_run + 'rsa/' + rsa_test + '/')
            path_to_interaction = (path_to_rsa + 'rsa_' + rsa_test + '/epoch_' + str(n_epochs[run]) + '/interaction_gpu0')
            path_to_scores = path_to_rsa + 'message_length_hierarchical.pkl'
        else:
            path_to_interaction = (path_to_run + 'interactions/train/epoch_' + str(n_epochs[run]) + '/interaction_gpu0')
            path_to_scores = path_to_run + 'message_length_hierarchical.pkl'

        interaction = torch.load(path_to_interaction)
        scores = message_length_per_hierarchy_level(interaction, attributes)

        # context manager closes (and flushes) the file even if pickling fails
        with open(path_to_scores, 'wb') as output_file:
            pickle.dump(scores, output_file)

##  symbol redundancy

In [204]:
# Compute symbol redundancy and the symbol / attribute-value mutual information for
# every data set and run.
for d in range(len(datasets)):

    n_epochs = n_epochs_all_data[d]

    attributes = n_attributes[d]
    values = n_values[d]
    # vocab-size factor: the character right before the trailing '/' of the results path ("..._vsf_<k>/")
    vs_factor = int(paths[d][-2])
    vocab_size = (n_values[d] + 1) * vs_factor + 1
    # NOTE(review): the derived vocab size above is unconditionally replaced by the
    # constant 5, for every data set. This looks like a leftover from evaluating the
    # "vsf_0" configuration (where the formula yields 1) -- confirm before running this
    # on other data sets; the override is kept so stored results do not change.
    vocab_size = 5

    for run in range(n_runs):

        path_to_run = paths[d] + str(setting) + '/' + str(run) + '/'

        # rsa results live in their own sub-directory, which also receives the scores
        if rsa:
            path_to_rsa = (path_to_run + 'rsa/' + rsa_test + '/')
            path_to_interaction = (path_to_rsa + 'rsa_' + rsa_test + '/epoch_' + str(n_epochs[run]) + '/interaction_gpu0')
            path_to_scores = path_to_rsa + 'symbol_redundancy.pkl'
        else:
            path_to_interaction = (path_to_run + 'interactions/train/epoch_' + str(n_epochs[run]) + '/interaction_gpu0')
            path_to_scores = path_to_run + 'symbol_redundancy.pkl'

        interaction = torch.load(path_to_interaction)
        redundancy, MI = symbol_frequency(interaction, attributes, values, vocab_size)

        scores = {'symbol_redundancy': redundancy, 'MI_symbol-attribute_value': MI}

        # context manager closes (and flushes) the file even if pickling fails
        with open(path_to_scores, 'wb') as output_file:
            pickle.dump(scores, output_file)

  return symbol_frequency / att_val_frequency, mutual_information
  return symbol_frequency / att_val_frequency, mutual_information
  return symbol_frequency / att_val_frequency, mutual_information
  return symbol_frequency / att_val_frequency, mutual_information
  return symbol_frequency / att_val_frequency, mutual_information


##  compositionality scores: topsim, posdis, bosdis

### topsim

In [110]:
# topsim
# although topsim values are stored throughout training if callbacks are verbose, we reevaluate the
# final topsim scores with more data points
# not yet implemented for rsa

# Only the first `samples` interactions of each interaction file are evaluated.
# TODO: draw a random subset instead (e.g. random.sample over the indices), because the
# first 5,000 interactions might not be the best choice.
samples = 5000
for d, dataset in enumerate(datasets):

    n_epochs = n_epochs_all_data[d]

    dim = [n_values[d]] * n_attributes[d]

    for run in range(n_runs):
        print("dataset", dataset, "run", run)

        topsim_final = {}
        path_to_run = paths[d] + str(setting) + '/' + str(run) + '/'
        epoch_file = 'epoch_' + str(n_epochs[run]) + '/interaction_gpu0'
        # stored interaction per evaluation mode (insertion order: train, then val)
        interaction_paths = {
            'train': path_to_run + 'interactions/train/' + epoch_file,
            'val': path_to_run + 'interactions/validation/' + epoch_file,
        }

        TOPSIM = TopographicSimilarityConceptLevel(dim, is_gumbel=True)

        for mode, path_to_interaction in interaction_paths.items():

            interaction = torch.load(path_to_interaction)

            # reduce the last message axis to symbol indices
            messages = interaction.message.argmax(dim=-1)
            sender_input = interaction.sender_input
            n_targets = int(sender_input.shape[1] / 2)
            # get target objects and fixed vectors to re-construct concepts
            target_objects = sender_input[:, :n_targets]
            target_objects = k_hot_to_attributes(target_objects, n_values[d])
            # concepts are defined by a list of target objects (here one sampled target object) and a fixed vector
            (objects, fixed) = retrieve_concepts_sampling(target_objects)
            # add one such that zero becomes an empty attribute for the calculation (_)
            objects = objects + 1
            concepts = torch.from_numpy(objects * (np.array(fixed)))

            # number of fixed attributes per concept, computed once for both subsets
            n_fixed = np.sum(fixed, axis=1)

            # specific concepts: all attributes are fixed
            specific_idx = np.where(n_fixed == n_attributes[d])[0]
            messages_specific = messages[specific_idx]
            concepts_specific = concepts[specific_idx]

            # generic concepts: exactly one attribute is fixed
            generic_idx = np.where(n_fixed == 1)[0]
            messages_generic = messages[generic_idx]
            concepts_generic = concepts[generic_idx]

            messages = [msg.tolist() for msg in messages]
            messages_specific = [msg.tolist() for msg in messages_specific]
            messages_generic = [msg.tolist() for msg in messages_generic]

            encoded_input = encode_target_concepts_for_topsim(sender_input)

            # default: hausdorff distance for concepts, edit distance for messages
            topsim = TOPSIM.compute_topsim(encoded_input[0:samples], messages[0:samples])
            topsim_specific = TOPSIM.compute_topsim(concepts_specific[0:samples], messages_specific[0:samples],
                                                    meaning_distance_fn="edit")
            topsim_generic = TOPSIM.compute_topsim(concepts_generic[0:samples], messages_generic[0:samples],
                                                   meaning_distance_fn="edit")

            print('... topsim computed')

            topsim_final['topsim_' + mode] = topsim
            topsim_final['topsim_specific_' + mode] = topsim_specific
            topsim_final['topsim_generic_' + mode] = topsim_generic

        # context manager closes (and flushes) the file even if pickling fails
        with open(path_to_run + "topsim_final.pkl", "wb") as output_file:
            pickle.dump(topsim_final, output_file)
        print(topsim_final)

dataset (3,4) run 0
... topsim computed
... topsim computed
{'topsim_train': 0.253401988257643, 'topsim_specific_train': 0.31540972927284067, 'topsim_generic_train': 0.1674772222063737, 'topsim_val': 0.28206781263146785, 'topsim_specific_val': 0.3389871825447343, 'topsim_generic_val': 0.2531927120500811}
dataset (3,4) run 1
... topsim computed
... topsim computed
{'topsim_train': 0.22012641243808112, 'topsim_specific_train': 0.30126834233973626, 'topsim_generic_train': 0.20510726011884886, 'topsim_val': 0.2541664633023642, 'topsim_specific_val': 0.29273791654322195, 'topsim_generic_val': 0.2667535559177903}
dataset (3,4) run 2
... topsim computed
... topsim computed
{'topsim_train': 0.3158607960755495, 'topsim_specific_train': 0.40640901950129427, 'topsim_generic_train': 0.15751390449638517, 'topsim_val': 0.3407686740944561, 'topsim_specific_val': 0.45982819267504865, 'topsim_generic_val': 0.1637009902601251}
dataset (3,4) run 3
... topsim computed
... topsim computed
{'topsim_train': 

#### Topsim over time

In [None]:
# Topsim computed on consecutive chunks of `samples` interactions, stored as a list
# (one value per chunk) in "topsim_over_time.pkl" of each run.
samples = 5000

for d, dataset in enumerate(datasets):

    n_epochs = n_epochs_all_data[d]
    # dimensions of the *current* data set (previously always taken from data set 0)
    dim = [n_values[d]] * n_attributes[d]

    for run in range(n_runs):
        print("dataset", dataset, "run", run)

        path_to_run = paths[d] + str(setting) + '/' + str(run) + '/'
        # Only the validation interaction is evaluated: the previous version of this cell
        # loaded the train interaction first and immediately overwrote it, so the stored
        # scores have always been computed on validation data. The redundant load is dropped.
        # NOTE(review): confirm that validation (not train) is the intended data here.
        path_to_interaction_val = (path_to_run + 'interactions/validation/epoch_' + str(n_epochs[run]) + '/interaction_gpu0')
        interaction = torch.load(path_to_interaction_val)

        messages = interaction.message.argmax(dim=-1)
        sender_input = interaction.sender_input
        messages = [msg.tolist() for msg in messages]
        encoded_input = encode_target_concepts_for_topsim(sender_input)
        TOPSIM = TopographicSimilarityConceptLevel(dim, is_gumbel=True)

        topsim_over_time = []

        # walk over the interaction in chunks of `samples`; the last chunk may be shorter
        for start in range(0, len(messages), samples):
            messages_batch = messages[start:start + samples]
            topsim = TOPSIM.compute_topsim(encoded_input[start:start + samples], messages_batch)
            topsim_over_time.append(topsim)

        # context manager closes (and flushes) the file even if pickling fails
        with open(path_to_run + "topsim_over_time.pkl", "wb") as output_file:
            pickle.dump(topsim_over_time, output_file)

dataset (3,4) run 0
dataset (3,4) run 1
dataset (3,4) run 2
dataset (3,4) run 3
dataset (3,4) run 4
dataset (3,8) run 0
dataset (3,8) run 1
dataset (3,8) run 2
dataset (3,8) run 3
dataset (3,8) run 4
dataset (3,16) run 0
dataset (3,16) run 1


### Posdis and Bosdis

In [111]:
# use Disent callback from egg

for d in range(len(datasets)):

    n_epochs = n_epochs_all_data[d]

    dim = [n_values[d]] * n_attributes[d]
    # vocab-size factor: the character right before the trailing '/' of the results path ("..._vsf_<k>/")
    vs_factor = int(paths[d][-2])
    vocab_size = (n_values[d] + 1) * vs_factor + 1

    print("data set", dim)

    for run in range(n_runs):

        posdis_bosdis = {}

        # paths[d] already ends with '/', so no extra separator is inserted
        path_to_run = paths[d] + str(setting) + '/' + str(run) + '/'
        path_to_interaction_train = (path_to_run + 'interactions/train/epoch_' + str(n_epochs[run]) + '/interaction_gpu0')
        interaction = torch.load(path_to_interaction_train)

        messages = interaction.message.argmax(dim=-1)
        sender_input = interaction.sender_input
        n_targets = int(sender_input.shape[1] / 2)
        # get target objects and fixed vectors to re-construct concepts
        target_objects = sender_input[:, :n_targets]
        target_objects = k_hot_to_attributes(target_objects, n_values[d])
        # concepts are defined by a list of target objects (here one sampled target object) and a fixed vector
        (objects, fixed) = retrieve_concepts_sampling(target_objects)
        # add one such that zero becomes an empty attribute for the calculation (_)
        objects = objects + 1

        # number of fixed attributes per concept, computed once and reused for both subsets
        n_fixed = torch.sum(torch.from_numpy(fixed), dim=1)

        # concrete/specific concepts: where all attributes are fixed
        is_specific = n_fixed == n_attributes[d]
        concepts_specific = torch.tensor(objects[is_specific])
        messages_specific = messages[is_specific]

        # generic concepts: where only one attribute is fixed
        # NOTE(review): like before, the full (unmasked) object vectors are passed for the
        # generic subset, not objects * fixed -- confirm this is intended.
        is_generic = n_fixed == 1
        concepts_generic = torch.tensor(objects[is_generic])
        messages_generic = messages[is_generic]

        posdis_bosdis['posdis_specific'] = Disent.posdis(concepts_specific, messages_specific)
        posdis_bosdis['bosdis_specific'] = Disent.bosdis(concepts_specific, messages_specific, vocab_size)
        posdis_bosdis['posdis_generic'] = Disent.posdis(concepts_generic, messages_generic)
        posdis_bosdis['bosdis_generic'] = Disent.bosdis(concepts_generic, messages_generic, vocab_size)
        # scores over all concepts
        posdis_bosdis['posdis'] = Disent.posdis(torch.from_numpy(objects), messages)
        posdis_bosdis['bosdis'] = Disent.bosdis(torch.from_numpy(objects), messages, vocab_size)

        print(posdis_bosdis)

        # context manager closes (and flushes) the file even if pickling fails
        with open(path_to_run + "posdis_bosdis.pkl", "wb") as output_file:
            pickle.dump(posdis_bosdis, output_file)

data set [4, 4, 4]
{'posdis_specific': 0.0465710423886776, 'bosdis_specific': nan, 'posdis_generic': 0.0296615157276392, 'bosdis_generic': nan, 'posdis': 0.04189429059624672, 'bosdis': nan}
{'posdis_specific': 0.10868732631206512, 'bosdis_specific': nan, 'posdis_generic': 0.05492260307073593, 'bosdis_generic': nan, 'posdis': 0.08238240331411362, 'bosdis': nan}
{'posdis_specific': 0.12329412996768951, 'bosdis_specific': nan, 'posdis_generic': 0.034101225435733795, 'bosdis_generic': nan, 'posdis': 0.08410908281803131, 'bosdis': nan}
{'posdis_specific': 0.038105349987745285, 'bosdis_specific': nan, 'posdis_generic': 0.03253865987062454, 'bosdis_generic': nan, 'posdis': 0.039102792739868164, 'bosdis': nan}
{'posdis_specific': 0.08939934521913528, 'bosdis_specific': nan, 'posdis_generic': 0.05377557873725891, 'bosdis_generic': nan, 'posdis': 0.06353741884231567, 'bosdis': nan}


#### Posdis and bosdis concept x context

In [112]:
# bosdis concept x context
from utils.analysis_from_interaction import bosdis

for d in range(len(datasets)):

    n_epochs = n_epochs_all_data[d]

    attributes = n_attributes[d]
    values = n_values[d]
    # vocab-size factor: the character right before the trailing '/' of the results path ("..._vsf_<k>/")
    vs_factor = int(paths[d][-2])
    vocab_size = (n_values[d] + 1) * vs_factor + 1

    for run in range(n_runs):

        # paths[d] already ends with '/', so no extra separator is inserted
        path_to_run = paths[d] + str(setting) + '/' + str(run) + '/'
        path_to_interaction = (path_to_run + 'interactions/train/epoch_' + str(n_epochs[run]) + '/interaction_gpu0')
        interaction = torch.load(path_to_interaction)

        scores = bosdis(interaction, attributes, values, vocab_size)

        # context manager closes (and flushes) the file even if pickling fails
        with open(path_to_run + 'bosdis_scores.pkl', 'wb') as output_file:
            pickle.dump(scores, output_file)

In [113]:
# posdis concept x context
from utils.analysis_from_interaction import posdis

for d in range(len(datasets)):

    n_epochs = n_epochs_all_data[d]

    attributes = n_attributes[d]
    values = n_values[d]
    # vocab-size factor: the character right before the trailing '/' of the results path ("..._vsf_<k>/")
    vs_factor = int(paths[d][-2])
    vocab_size = (n_values[d] + 1) * vs_factor + 1

    for run in range(n_runs):
        # paths[d] already ends with '/', so no extra separator is inserted
        path_to_run = paths[d] + str(setting) + '/' + str(run) + '/'
        print(path_to_run)
        path_to_interaction = (path_to_run + 'interactions/train/epoch_' + str(n_epochs[run]) + '/interaction_gpu0')
        interaction = torch.load(path_to_interaction)

        scores = posdis(interaction, attributes, values, vocab_size)

        # context manager closes (and flushes) the file even if pickling fails
        with open(path_to_run + 'posdis_scores.pkl', 'wb') as output_file:
            pickle.dump(scores, output_file)

results/(3,4)_game_size_10_vsf_0//length_cost/context_unaware/0/
results/(3,4)_game_size_10_vsf_0//length_cost/context_unaware/1/
results/(3,4)_game_size_10_vsf_0//length_cost/context_unaware/2/
results/(3,4)_game_size_10_vsf_0//length_cost/context_unaware/3/
results/(3,4)_game_size_10_vsf_0//length_cost/context_unaware/4/


## co-occurrences

In [None]:
# Not yet implemented:

for d in range(len(datasets)):

    # epochs of the current data set; without this assignment the cell silently used
    # whatever `n_epochs` a previously executed cell had left behind
    n_epochs = n_epochs_all_data[d]

    attributes = n_attributes[d]
    values = n_values[d]
    # vocab-size factor: the character right before the trailing '/' of the results path ("..._vsf_<k>/")
    vs_factor = int(paths[d][-2])

    for run in range(n_runs):

        path_to_run = paths[d] + str(setting) + '/' + str(run) + '/'
        path_to_interaction = (path_to_run + 'interactions/train/epoch_' + str(n_epochs[run]) + '/interaction_gpu0')
        interaction = torch.load(path_to_interaction)

        scores = cooccurrence_per_hierarchy_level(interaction, attributes, values, vs_factor)

        print(scores)

        # context manager closes (and flushes) the file even if pickling fails
        with open(path_to_run + 'normalized_cooccurrence.pkl', 'wb') as output_file:
            pickle.dump(scores, output_file)
    