In [87]:
## Investigate pattern level results for masked LM models
from tabulate import tabulate

from evaluate import SampleResult
import numpy as np
import pickle
import nltk
import os

results_dir = '/mounts/work/kerem/definition_classification_data/results'

model_names = ['bert-base-uncased', 'bert-large-uncased', 'roberta-base', 'roberta-large', 'random']

word_types = ['n','v']
types = {
    'n': 'Nouns',
    'v': 'Verbs'
}

for word_type in word_types:
    
    if word_type == 'n':
        headers = ['Model','Pattern 1','Pattern 2','Pattern 3','Best Pattern']       
        pattern_count = 3
    elif word_type == 'v':
        headers = ['Model','Pattern 1','Pattern 2','Best Pattern']
        pattern_count = 2
        
    model_names[-1] = f'random_with_{pattern_count}_patterns'
    all_results = {}
    for model_name in model_names:
        with open(os.path.join(results_dir, f'{model_name}_{word_type}_results.pickle'), 'rb') as handle:
            results = pickle.load(handle)

        accuracies = []
        pred_scores = []
        for i in range(pattern_count):
            pred_scores.append([])
            accuracies.append([])

        option_counts = []
        best_pred_scores = []
        best_accuracies = []
        no = 0
        for key, value in results.items():
            scores = []
            correct = 0
            for pattern_no in range(pattern_count):
                score = value[pattern_no].prediction_score
                scores.append(score)
                pred_scores[pattern_no].append(score)

                if score == 1:
                    accuracies[pattern_no].append(1)
                    correct = 1
                else:
                    accuracies[pattern_no].append(0)

            best_accuracies.append(correct)    
            best_pred_scores.append(max(scores))
            option_counts.append(len(value[0].prediction_probs))   

        x1 = np.round(np.mean(np.asarray(pred_scores),1),2)
        x2 = np.round(np.mean(best_pred_scores),2)
        x3 = np.round(np.mean(np.asarray(accuracies)*100,1),2)
        x4 = np.round(np.mean(best_accuracies)*100,2)
        all_results[model_name] = (x1,x2,x3,x4)

    print(f'\n\nResults of Masked Language Models for Wordnet {types[word_type]}')
    print(f'\nExample count: {len(option_counts)}')
    print(f'\nAverage option count: {np.mean(option_counts):.2f}')
    print(f'Minimum option count: {np.min(option_counts)}')
    print(f'Maximum option count: {np.max(option_counts)}')

    

    print('\n\nPrediction Rank Results\n')
    model_no = 0
    tab = []
    for model, results in all_results.items():
        tab.append([])
        tab[model_no].append(model)
        for pattern_no in range(pattern_count):
             tab[model_no].append(results[0][pattern_no])
        tab[model_no].append(results[1])
        model_no += 1
    print(tabulate(tab, headers=headers, tablefmt='orgtbl'))


    print('\n\nAccuracy Results\n')
    model_no = 0
    tab = []
    for model, results in all_results.items():
        tab.append([])
        tab[model_no].append(model)
        for pattern_no in range(pattern_count):
             tab[model_no].append(results[2][pattern_no])
        tab[model_no].append(results[3])
        model_no += 1
    print(tabulate(tab, headers=headers, tablefmt='orgtbl'))




Results of Masked Language Models for Wordnet Nouns

Example count: 51559

Average option count: 50.23
Minimum option count: 5
Maximum option count: 404


Prediction Rank Results

| Model                  |   Pattern 1 |   Pattern 2 |   Pattern 3 |   Best Pattern |
|------------------------+-------------+-------------+-------------+----------------|
| bert-base-uncased      |        0.6  |        0.62 |        0.63 |           0.74 |
| bert-large-uncased     |        0.58 |        0.59 |        0.6  |           0.75 |
| roberta-base           |        0.64 |        0.67 |        0.69 |           0.77 |
| roberta-large          |        0.65 |        0.69 |        0.68 |           0.77 |
| random_with_3_patterns |        0.5  |        0.5  |        0.5  |           0.77 |


Accuracy Results

| Model                  |   Pattern 1 |   Pattern 2 |   Pattern 3 |   Best Pattern |
|------------------------+-------------+-------------+-------------+----------------|
| bert-base-uncased     

In [86]:
# Investigate word frequency level results for masked LM models
from tabulate import tabulate

from evaluate import SampleResult
import numpy as np
import pickle
import nltk
import os

word_counts_file = '/mounts/work/kerem/datasets/WordNet/wordnet_word_counts_in_WWC.pickle'
with open(word_counts_file, 'rb') as handle:
    word_counts = pickle.load(handle)

results_dir = '/mounts/work/kerem/definition_classification_data/results'

model_names = ['bert-base-uncased', 'bert-large-uncased', 'roberta-base', 'roberta-large', 'random']

word_types = ['n','v']
types = {
    'n': 'Nouns',
    'v': 'Verbs'
}

for word_type in word_types:
    headers = ['Model','rare (0-9, X)','medium (10-99, X)','frequent (100-, X)','all (X)']   

    if word_type == 'n':   
        pattern_count = 3
    elif word_type == 'v':
        pattern_count = 2
        
    model_names[-1] = f'random_with_{pattern_count}_patterns'
    all_results = {}
    for model_name in model_names:
        with open(os.path.join(results_dir, f'{model_name}_{word_type}_results.pickle'), 'rb') as handle:
            results = pickle.load(handle)

        accuracies = []
        pred_scores = []
        for i in range(len(headers)-1):
            pred_scores.append([])
            accuracies.append([])

        option_counts = []
        no = 0
        for key, value in results.items():
            word = key.split('.')[0].replace("_", " ")
            tokens = nltk.word_tokenize(word)
            word = ' '.join(token.lower() for token in tokens)
            
            scores = []
            correct = 0
            for pattern_no in range(pattern_count):
                score = value[pattern_no].prediction_score
                scores.append(score)

                if score == 1:                   
                    correct = 1
 
            if 'random'in model_name:
                best_pred_score = score
                correct = 1 if score == 1 else 0
            else:
                best_pred_score = max(scores)
                
            option_counts.append(len(value[0].prediction_probs)) 
        
            if word_counts[word] < 10:
                accuracies[0].append(correct)
                pred_scores[0].append(best_pred_score)
            elif word_counts[word] < 100:
                accuracies[1].append(correct)
                pred_scores[1].append(best_pred_score)
            else:
                accuracies[2].append(correct)
                pred_scores[2].append(best_pred_score)
            accuracies[3].append(correct)
            pred_scores[3].append(best_pred_score)    
        
        

        mean_pred_scores = [np.round(np.mean(scores), 2) for scores in pred_scores]
        mean_accuracies = [np.round(np.mean(acc)*100,2) for acc in accuracies]
        all_results[model_name] = (mean_pred_scores, mean_accuracies)

    
    for i in range(1,5):
        headers[i] = headers[i].replace('X', f'{len(accuracies[i-1])}')
        
    print(f'\n\nResults of Masked Language Models for Wordnet {types[word_type]}')
    print(f'\nExample count: {len(option_counts)}')
    print(f'\nAverage option count: {np.mean(option_counts):.2f}')
    print(f'Minimum option count: {np.min(option_counts)}')
    print(f'Maximum option count: {np.max(option_counts)}')

    
    print('\n\nPrediction Rank Results\n')
    model_no = 0
    tab = []
    for model, results in all_results.items():
        tab.append([])
        tab[model_no].append(model)
        tab[model_no].extend(results[0])
        model_no += 1
    print(tabulate(tab, headers=headers, tablefmt='orgtbl'))


    print('\n\nAccuracy Results\n')
    model_no = 0
    tab = []
    for model, results in all_results.items():
        tab.append([])
        tab[model_no].append(model)
        tab[model_no].extend(results[1])
        model_no += 1
    print(tabulate(tab, headers=headers, tablefmt='orgtbl'))




Results of Masked Language Models for Wordnet Nouns

Example count: 51559

Average option count: 50.23
Minimum option count: 5
Maximum option count: 404


Prediction Rank Results

| Model                  |   rare (0-9, 10771) |   medium (10-99, 13033) |   frequent (100-, 27755) |   all (51559) |
|------------------------+---------------------+-------------------------+--------------------------+---------------|
| bert-base-uncased      |                0.72 |                    0.72 |                     0.76 |          0.74 |
| bert-large-uncased     |                0.74 |                    0.74 |                     0.76 |          0.75 |
| roberta-base           |                0.76 |                    0.77 |                     0.78 |          0.77 |
| roberta-large          |                0.78 |                    0.77 |                     0.78 |          0.77 |
| random_with_3_patterns |                0.5  |                    0.5  |                     0.5  |         

In [91]:
## Investigate pattern level results for LM models
from evaluate import SampleResult
import numpy as np
import pickle
import nltk
import os
    
results_dir = '/mounts/work/kerem/definition_classification_data/results'

# model_names = ['gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl', 'xlnet-base-cased', 'xlnet-large-cased']
model_names = ['gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl', 'random']

pattern_count = 3

word_types = ['n','v']
types = {
    'n': 'Nouns',
    'v': 'Verbs'
}


for word_type in word_types:
    if word_type == 'n':
        headers = ['Model','Pattern 1','Pattern 2','Pattern 3','Best Pattern']
        pattern_count = 3
    elif word_type == 'v':
        headers = ['Model','Pattern 1','Best Pattern']
        pattern_count = 1
        
    model_names[-1] = f'random_with_{pattern_count}_patterns'
 
    all_results = {}
    for model_name in model_names:
        with open(os.path.join(results_dir, f'{model_name}_{word_type}_results.pickle'), 'rb') as handle:
            results = pickle.load(handle)

        accuracies = []
        pred_scores = []
        for i in range(pattern_count):
            pred_scores.append([])
            accuracies.append([])

        option_counts = []
        best_pred_scores = []
        best_accuracies = []
        no = 0
        for key, value in results.items():
            scores = []
            correct = 0
            for pattern_no in range(pattern_count):
                score = value[pattern_no].prediction_score
                scores.append(score)
                pred_scores[pattern_no].append(score)

                if score == 1:
                    accuracies[pattern_no].append(1)
                    correct = 1
                else:
                    accuracies[pattern_no].append(0)

            best_accuracies.append(correct)    
            best_pred_scores.append(max(scores))
            option_counts.append(len(value[0].prediction_probs))

        x1 = np.round(np.mean(np.asarray(pred_scores),1),2)
        x2 = np.round(np.mean(best_pred_scores),2)
        x3 = np.round(np.mean(np.asarray(accuracies)*100,1),2)
        x4 = np.round(np.mean(best_accuracies)*100,2)
        all_results[model_name] = (x1,x2,x3,x4)
        
#         print(f'Model: {model_name} Sample count: {len(option_counts)}')
        
    print(f'\n\nResults from Generative Language Models for Wordnet {types[word_type]}')
    print(f'\nExample count: {len(option_counts)}')
    print(f'\nAverage option count: {np.mean(option_counts):.2f}')
    print(f'Minimum option count: {np.min(option_counts)}')
    print(f'Maximum option count: {np.max(option_counts)}')

    print('\n\nPrediction Rank Results\n')
    model_no = 0
    tab = []
    for model, results in all_results.items():
        tab.append([])
        tab[model_no].append(model)
        for pattern_no in range(pattern_count):
             tab[model_no].append(results[0][pattern_no])
        tab[model_no].append(results[1])
        model_no += 1
    print(tabulate(tab, headers=headers, tablefmt='orgtbl'))


    print('\n\nAccuracy Results\n')
    model_no = 0
    tab = []
    for model, results in all_results.items():
        tab.append([])
        tab[model_no].append(model)
        for pattern_no in range(pattern_count):
             tab[model_no].append(results[2][pattern_no])
        tab[model_no].append(results[3])
        model_no += 1
    print(tabulate(tab, headers=headers, tablefmt='orgtbl'))

Model: gpt2 Sample count: 51559
Model: gpt2-medium Sample count: 51559
Model: gpt2-large Sample count: 42573
Model: gpt2-xl Sample count: 39722
Model: random_with_3_patterns Sample count: 51559


Results from Generative Language Models for Wordnet Nouns

Example count: 51559

Average option count: 50.23
Minimum option count: 5
Maximum option count: 404


Prediction Rank Results

| Model                  |   Pattern 1 |   Pattern 2 |   Pattern 3 |   Best Pattern |
|------------------------+-------------+-------------+-------------+----------------|
| gpt2                   |        0.75 |        0.76 |        0.75 |           0.81 |
| gpt2-medium            |        0.77 |        0.78 |        0.77 |           0.82 |
| gpt2-large             |        0.79 |        0.8  |        0.79 |           0.84 |
| gpt2-xl                |        0.81 |        0.81 |        0.81 |           0.86 |
| random_with_3_patterns |        0.5  |        0.5  |        0.5  |           0.77 |


Accuracy Resul

In [94]:
# Investigate word frequency level results for LM models
from evaluate import SampleResult
import numpy as np
import pickle
import nltk
import os

word_counts_file = '/mounts/work/kerem/datasets/WordNet/wordnet_word_counts_in_WWC.pickle'
with open(word_counts_file, 'rb') as handle:
    word_counts = pickle.load(handle)
    
results_dir = '/mounts/work/kerem/definition_classification_data/results'

# model_names = ['gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl', 'xlnet-base-cased', 'xlnet-large-cased']
model_names = ['gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl', 'random']

pattern_count = 3

word_types = ['n','v']
types = {
    'n': 'Nouns',
    'v': 'Verbs'
}


for word_type in word_types:
    headers = ['Model','rare (0-9, X)','medium (10-99, X)','frequent (100-, X)','all (X)']  
    headers = ['Model','(0-9, X, Y)','(10-99, X, Y)','(100-, X, Y)','all (X, Y)']
    
    if word_type == 'n':
        pattern_count = 3
    elif word_type == 'v':
        pattern_count = 1
        
    model_names[-1] = f'random_with_{pattern_count}_patterns'
    all_results = {}
    for model_name in model_names:
        with open(os.path.join(results_dir, f'{model_name}_{word_type}_results.pickle'), 'rb') as handle:
            results = pickle.load(handle)
        
        if 'random' in model_name:
            model_name = 'random'
            
        accuracies = []
        pred_scores = []
        option_counts = []
        for i in range(len(headers)-1):
            pred_scores.append([])
            accuracies.append([])
            option_counts.append([])
        
        no = 0
        for key, value in results.items():
            word = key.split('.')[0].replace("_", " ")
            tokens = nltk.word_tokenize(word)
            word = ' '.join(token.lower() for token in tokens)
            scores = []
            correct = 0
            for pattern_no in range(pattern_count):
                score = value[pattern_no].prediction_score
                scores.append(score)

                if score == 1:                   
                    correct = 1
 
            if 'random' in model_name:
                best_pred_score = score
                correct = 1 if score == 1 else 0
            else:
                best_pred_score = max(scores)
                                
            if word_counts[word] < 10:
                accuracies[0].append(correct)
                pred_scores[0].append(best_pred_score)
                option_counts[0].append(len(value[0].prediction_probs)) 
            elif word_counts[word] < 100:
                accuracies[1].append(correct)
                pred_scores[1].append(best_pred_score)
                option_counts[1].append(len(value[0].prediction_probs)) 
            else:
                accuracies[2].append(correct)
                pred_scores[2].append(best_pred_score)
                option_counts[2].append(len(value[0].prediction_probs)) 
            accuracies[3].append(correct)
            pred_scores[3].append(best_pred_score)    
            option_counts[3].append(len(value[0].prediction_probs)) 
            
        mean_pred_scores = [np.round(np.mean(scores), 2) for scores in pred_scores]
        mean_accuracies = [np.round(np.mean(acc)*100,2) for acc in accuracies]
        all_results[model_name] = (mean_pred_scores, mean_accuracies)

    
    for i in range(1,5):
        headers[i] = headers[i].replace('X', f'{len(accuracies[i-1])}')
        headers[i] = headers[i].replace('Y', f'{np.round(np.mean(option_counts[i-1]),1)}')

    print(f'\n\nResults from Generative Language Models for Wordnet {types[word_type]}')
    print(f'\nExample count: {len(option_counts)}')
    print(f'\nAverage option count: {np.mean(option_counts[-1]):.2f}')
    print(f'Minimum option count: {np.min(option_counts[-1])}')
    print(f'Maximum option count: {np.max(option_counts[-1])}')

    print('\n\nPrediction Rank Results\n')
    model_no = 0
    tab = []
    for model, results in all_results.items():
        tab.append([])
        tab[model_no].append(model)
        tab[model_no].extend(results[0])
        model_no += 1
    print(tabulate(tab, headers=headers, tablefmt='orgtbl'))


    print('\n\nAccuracy Results\n')
    model_no = 0
    tab = []
    for model, results in all_results.items():
        tab.append([])
        tab[model_no].append(model)
        tab[model_no].extend(results[1])
        model_no += 1
    print(tabulate(tab, headers=headers, tablefmt='orgtbl'))



Results from Generative Language Models for Wordnet Nouns

Example count: 4

Average option count: 50.23
Minimum option count: [6, 6, 37, 7, 7, 48, 48, 48, 48, 6, 63, 48, 48, 48, 7, 24, 15, 6, 31, 31, 7, 45, 8, 8, 19, 71, 37, 19, 19, 19, 6, 10, 8, 8, 26, 26, 8, 8, 24, 33, 33, 10, 21, 33, 7, 33, 33, 19, 19, 19, 19, 19, 19, 40, 33, 33, 33, 33, 33, 33, 7, 40, 40, 40, 40, 5, 40, 40, 5, 40, 40, 40, 40, 40, 40, 5, 40, 40, 5, 40, 5, 19, 19, 19, 19, 19, 33, 8, 7, 7, 7, 7, 34, 7, 8, 8, 7, 8, 8, 8, 8, 10, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, 11, 10, 10, 5, 5, 5, 5, 5, 19, 10, 7, 7, 19, 11, 11, 11, 11, 11, 11, 11, 40, 8, 8, 8, 6, 11, 8, 7, 7, 7, 7, 10, 7, 40, 7, 40, 7, 10, 8, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 10, 10, 13, 10, 10, 10, 10, 10, 10, 10, 8, 8, 8, 12, 12, 12, 12, 12, 15, 5, 5, 5, 5, 5, 8, 8, 8, 8, 8, 8, 12, 12, 12, 40, 12, 12, 8, 5, 5, 5, 5, 8, 5, 8, 12, 5, 14, 8, 7, 7, 7, 7, 7, 7, 16, 16, 16, 16, 16, 5, 5, 7, 7, 19, 9, 7, 7, 7, 7, 7, 5, 5, 5, 5, 5, 33, 10, 10, 10, 10,



Results from Generative Language Models for Wordnet Verbs

Example count: 4

Average option count: 47.73
Minimum option count: [5, 10, 10, 10, 24, 6, 5, 5, 10, 10, 10, 5, 10, 10, 10, 10, 6, 11, 8, 6, 6, 96, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 5, 5, 5, 5, 5, 10, 19, 73, 401, 401, 401, 10, 10, 5, 5, 5, 5, 5, 10, 10, 18, 73, 73, 401, 193, 10, 10, 73, 73, 5, 11, 6, 5, 5, 5, 5, 11, 11, 11, 11, 11, 11, 18, 10, 11, 11, 11, 11, 11, 5, 13, 6, 19, 137, 11, 11, 38, 6, 6, 10, 8, 14, 8, 8, 5, 5, 8, 5, 14, 112, 8, 8, 14, 14, 14, 14, 6, 14, 14, 8, 5, 14, 53, 14, 14, 14, 14, 14, 5, 5, 5, 9, 9, 5, 15, 5, 9, 9, 6, 73, 73, 193, 9, 18, 18, 73, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 9, 15, 9, 15, 9, 15, 401, 5, 96, 5, 5, 5, 73, 5, 73, 9, 58, 10, 58, 33, 5, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 5, 10, 8, 10, 11, 25, 24, 401, 137, 137, 5, 401, 5, 5, 5, 5, 7, 7, 8, 8, 8, 8, 8, 8, 8, 18, 18, 18, 18, 18, 7, 7, 10, 7, 24, 7, 10, 7, 10, 9, 137, 24, 24, 24, 24, 24, 24, 

In [61]:
## Investigate results for rare words
from tabulate import tabulate

from evaluate import SampleResult
import numpy as np
import pickle
import os

results_dir = '/mounts/work/kerem/definition_classification_data/results'

model_names = [
    'bert-base-uncased',#'embeddings-wiki-100-diff-bbu-all_bert-base-uncased',
    'bertram_original_context_bert-base-uncased','bertram_original_add_bert-base-uncased',
    'bertram_CE_context_bert-base-uncased', 'bertram_CE_add_bert-base-uncased',
    'bertram_CE_old_context_bert-base-uncased', 'bertram_CE_old_fused_bert-base-uncased',
#     'bertram_CE_old_model_add_bert-base-uncased', 'bertram_random_informativeness_context_bert-base-uncased',
#     'bertram_uniform_informativeness_context_bert-base-uncased',
#     'ablation_experiment'
              ]

short_model_names = {
    'bert-base-uncased': 'bert',
    'embeddings-wiki-100-diff-bbu-all_bert-base-uncased': 'OTA',
    'bertram_original_context_bert-base-uncased': 'AM context',
    'bertram_original_add_bert-base-uncased': 'AM add',
    'bertram_CE_context_bert-base-uncased': 'CE context',
    'bertram_CE_add_bert-base-uncased': 'CE add',
    'bertram_CE_old_context_bert-base-uncased': 'CE old context',
    'bertram_CE_old_fused_bert-base-uncased': 'CE old fused',
    'bertram_CE_old_add_bert-base-uncased': 'CE old add',
    'bertram_random_informativeness_context_bert-base-uncased': 'random informativeness',
    'bertram_uniform_informativeness_context_bert-base-uncased': 'uniform informativeness',
    'ablation_experiment': 'random'
}

pattern_count = 3

min_count = 1e10
min_model = ''
for model_name in model_names:
    with open(os.path.join(results_dir, f'{model_name}_n_results.pickle'), 'rb') as handle:
        results = pickle.load(handle)
#     print(f'Model: {model_name}, result count: {len(results)}')
    if len(results) < min_count:
        min_count = len(results)
        min_model = model_name

with open(os.path.join(results_dir, f'{min_model}_n_results.pickle'), 'rb') as handle:
    results = pickle.load(handle)
min_vocab = list(results.keys())

all_results = {}
for model_name in model_names:
    with open(os.path.join(results_dir, f'{model_name}_n_results.pickle'), 'rb') as handle:
        results = pickle.load(handle)
    
    accuracies = []
    pred_scores = []
    for i in range(pattern_count):
        pred_scores.append([])
        accuracies.append([])
        
    option_counts = []
    best_pred_scores = []
    best_accuracies = []
    no = 0
    for key, value in results.items():  
        if key in min_vocab:
            scores = []
            correct = 0
            for pattern_no in range(pattern_count):
                score = value[pattern_no].prediction_score
                scores.append(score)
                pred_scores[pattern_no].append(score)

                if score == 1:
                    accuracies[pattern_no].append(1)
                    correct = 1
                else:
                    accuracies[pattern_no].append(0)

            best_accuracies.append(correct)    
            best_pred_scores.append(max(scores))
            option_counts.append(len(value[0].prediction_probs))   

    x1 = np.round(np.mean(np.asarray(pred_scores),1),2)
    x2 = np.round(np.mean(best_pred_scores),2)
    x3 = np.round(np.mean(np.asarray(accuracies)*100,1),2)
    x4 = np.round(np.mean(best_accuracies)*100,2)
    all_results[model_name] = (x1,x2,x3,x4)

print(f'\nExample count: {len(option_counts)}')
print(f'\nAverage option count: {np.mean(option_counts):.2f}')
print(f'Minimum option count: {np.min(option_counts)}')
print(f'Maximum option count: {np.max(option_counts)}')

headers = ['Model','Pattern 1','Pattern 2','Pattern 3','Best Pattern']

print('\n\nPrediction Rank Results\n')
model_no = 0
tab = []
for model, results in all_results.items():
    tab.append([])
    tab[model_no].append(short_model_names[model])
    for pattern_no in range(pattern_count):
         tab[model_no].append(results[0][pattern_no])
    tab[model_no].append(results[1])
    model_no += 1
print(tabulate(tab, headers=headers, tablefmt='orgtbl'))


print('\n\nAccuracy Results\n')
model_no = 0
tab = []
for model, results in all_results.items():
    tab.append([])
    tab[model_no].append(short_model_names[model])
    for pattern_no in range(pattern_count):
         tab[model_no].append(results[2][pattern_no])
    tab[model_no].append(results[3])
    model_no += 1
print(tabulate(tab, headers=headers, tablefmt='orgtbl'))



Example count: 19924

Average option count: 68.55
Minimum option count: 5
Maximum option count: 402


Prediction Rank Results

| Model          |   Pattern 1 |   Pattern 2 |   Pattern 3 |   Best Pattern |
|----------------+-------------+-------------+-------------+----------------|
| bert           |        0.57 |        0.59 |        0.59 |           0.72 |
| AM context     |        0.59 |        0.59 |        0.6  |           0.71 |
| AM add         |        0.61 |        0.61 |        0.62 |           0.72 |
| CE context     |        0.59 |        0.59 |        0.6  |           0.71 |
| CE add         |        0.61 |        0.61 |        0.62 |           0.72 |
| CE old context |        0.6  |        0.6  |        0.61 |           0.72 |
| CE old fused   |        0.61 |        0.61 |        0.62 |           0.72 |


Accuracy Results

| Model          |   Pattern 1 |   Pattern 2 |   Pattern 3 |   Best Pattern |
|----------------+-------------+-------------+-------------+------------