In [17]:
from tabulate import tabulate
from definition_processor import get_patterns
from evaluate import SampleResult
import numpy as np
import pickle
import nltk
import os

model_names = {
    'mlm': ['bert-base-uncased', 'bert-large-uncased', 'roberta-base', 'roberta-large'],
    'lm': ['gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl']
}
word_types = {
    'n': 'Nouns',
    'v': 'Verbs'
}

word_counts_file = '/mounts/work/kerem/datasets/WordNet/wordnet_word_counts_in_WWC.pickle'
with open(word_counts_file, 'rb') as handle:
    word_counts = pickle.load(handle)
    
for model_type in ['mlm']:#,'lm']:
    for word_type in ['n','v']:
        for detail_level in ['tokenization','pattern','frequency']:

            max_token_count = 3 # for tokenization_level results
            count_thresholds = [10, 100] # for frequency level results, should contain only 2 numbers
            pattern_count = len(get_patterns(model_type, word_type))

#             results_data_dir = '/mounts/work/kerem/definition_classification_data/results'
#             results_print_dir = '/mounts/work/kerem/gitlab_projects/bertram_with_informative_contexts/evaluate/definition_classification/results'

            results_data_dir = '/mounts/work/kerem/definition_classification_data/word_classificiations_from_definitions/mlm_with_log_probs'
            results_print_dir = '/mounts/work/kerem/gitlab_projects/bertram_with_informative_contexts/evaluate/word_clasification_from_definitions/results'

    
            results_file = os.path.join(results_print_dir, f'{model_type}_log_prob_{detail_level}_results_for_WordNet_{word_types[word_type]}.txt')


            headers = ['Model']
            if detail_level == 'pattern':
                for no in range(pattern_count):
                    headers.append(f'Pattern {no+1}')
                headers.append('Best Pattern')

            elif detail_level == 'frequency':
                headers.append(f'rare (0-{count_thresholds[0]-1})')
                headers.append(f'medium ({count_thresholds[0]}-{count_thresholds[1]-1})')
                headers.append(f'freq. ({count_thresholds[1]}-)')
                headers.append(f'all')
            elif detail_level == 'tokenization':
                for i in range(max_token_count):
                    headers.append(f'{i+1} token')
                headers.append(f'{max_token_count}+ token')
                headers.append(f'all')
            else:
                print('Invalid detail_level!')

            models = model_names[model_type].copy()
            all_results = {}
            for model_name in models:
                with open(os.path.join(results_data_dir, f'{model_name}_{word_type}_results.pickle'), 'rb') as handle:
                    results = pickle.load(handle)

                accuracies = []
                pred_scores = []
                option_counts = []
                for i in range(len(headers)-1):
                    pred_scores.append([])
                    accuracies.append([])
                    option_counts.append([])

                no = 0
                for key, value in results.items():
                    word = key.split('.')[0].replace("_", " ")
                    tokens = nltk.word_tokenize(word)
                    word = ' '.join(token.lower() for token in tokens)
#                     token_count = len(value[0].tokenized_word)
                    token_count = len(value[0].tokenized_words[value[0].answer])
                        
                    scores = []
                    correct = 0
                    for pattern_no in range(pattern_count):
                        score = value[pattern_no].prediction_score
                        scores.append(score)

                        if score == 1:                   
                            correct = 1

                        if detail_level == 'pattern':
                            accuracies[pattern_no].append(int(score == 1))
                            pred_scores[pattern_no].append(score)

                    if 'random' in model_name:
                        best_pred_score = score
                        correct = 1 if score == 1 else 0
                    else:
                        best_pred_score = max(scores)

                    if detail_level == 'frequency':
                        freq_level = int(word_counts[word] >= count_thresholds[0]) + int(word_counts[word] >= count_thresholds[1])
                        accuracies[freq_level].append(correct)
                        pred_scores[freq_level].append(best_pred_score)
                        option_counts[freq_level].append(len(value[0].prediction_probs))
                    
                    if detail_level == 'tokenization':
                        pos = min(token_count, max_token_count+1)-1
                        accuracies[pos].append(correct)
                        pred_scores[pos].append(best_pred_score)
                        option_counts[pos].append(len(value[0].prediction_probs))
                        
                    accuracies[-1].append(correct)
                    pred_scores[-1].append(best_pred_score)
                    option_counts[-1].append(len(value[0].prediction_probs))
                                    
                mean_pred_scores = [np.round(np.mean(scores), 2) for scores in pred_scores]
                mean_accuracies = [np.round(np.mean(acc)*100,2) for acc in accuracies]
                all_results[model_name] = (mean_pred_scores, mean_accuracies)
                
                
                sample_counts = []
                mean_option_counts = []
                for i in range(len(headers)-1):
                    sample_counts.append(len(accuracies[i]))
                    mean_option_counts.append(np.round(np.mean(option_counts[i]),1))       

            
            if detail_level == 'frequency':
                mean_pred_scores = []
                mean_accuracies = []
                for i in range(len(headers)-1):   
                    mean_pred_scores.append(0.5)
                    mean_accuracies.append(sum([1/count for count in option_counts[i]])/len(option_counts[i])*100)
                all_results['random'] = (mean_pred_scores, mean_accuracies)

            with open(results_file, 'w') as f_out:
                f_out.write(f'Results from {model_type} for Wordnet {word_types[word_type]}')
                f_out.write(f'\n\nExample count: {len(option_counts[-1])}')
                f_out.write(f'\nAverage option count: {np.mean(option_counts[-1]):.2f}')
                f_out.write(f'\nMinimum option count: {np.min(option_counts[-1])}')
                f_out.write(f'\nMaximum option count: {np.max(option_counts[-1])}')

                if detail_level == 'frequency' or detail_level == 'tokenization':
                    for no, header in enumerate(headers[1:-1]):
                        f_out.write(f'\n\nFor {header}:')
                        f_out.write(f'\nExample count: {len(option_counts[no])}')
                        f_out.write(f'\nAverage option count: {mean_option_counts[no]:.2f}')


                f_out.write('\n\n\nPrediction Rank Results\n\n')
                model_no = 0
                tab = []
                for model, results in all_results.items():
                    tab.append([])
                    tab[model_no].append(model)
                    tab[model_no].extend(results[0])
                    model_no += 1
                f_out.write(tabulate(tab, headers=headers, tablefmt='orgtbl'))


                f_out.write('\n\n\nAccuracy Results\n\n')
                model_no = 0
                tab = []
                for model, results in all_results.items():
                    tab.append([])
                    tab[model_no].append(model)
                    tab[model_no].extend(results[1])
                    model_no += 1
                f_out.write(tabulate(tab, headers=headers, tablefmt='orgtbl'))

FileNotFoundError: [Errno 2] No such file or directory: '/mounts/work/kerem/definition_classification_data/word_classificiations_from_definitions/mlm_with_log_probs/gpt2_n_results.pickle'

In [17]:
## Investigate results for rare words
from tabulate import tabulate

from evaluate import SampleResult
import numpy as np
import pickle
import os

results_dir = '/mounts/work/kerem/definition_classification_data/results'

model_names = [
    'bert-base-uncased',#'embeddings-wiki-100-diff-bbu-all_bert-base-uncased',
    'bertram_original_context_bert-base-uncased','bertram_original_add_bert-base-uncased',
    'bertram_CE_context_bert-base-uncased', 'bertram_CE_add_bert-base-uncased',
    'bertram_CE_old_context_bert-base-uncased', 'bertram_CE_old_fused_bert-base-uncased',
#     'bertram_CE_old_model_add_bert-base-uncased', 'bertram_random_informativeness_context_bert-base-uncased',
#     'bertram_uniform_informativeness_context_bert-base-uncased',
#     'ablation_experiment'
              ]

short_model_names = {
    'bert-base-uncased': 'bert',
    'embeddings-wiki-100-diff-bbu-all_bert-base-uncased': 'OTA',
    'bertram_original_context_bert-base-uncased': 'AM context',
    'bertram_original_add_bert-base-uncased': 'AM add',
    'bertram_CE_context_bert-base-uncased': 'CE context',
    'bertram_CE_add_bert-base-uncased': 'CE add',
    'bertram_CE_old_context_bert-base-uncased': 'CE old context',
    'bertram_CE_old_fused_bert-base-uncased': 'CE old fused',
    'bertram_CE_old_add_bert-base-uncased': 'CE old add',
    'bertram_random_informativeness_context_bert-base-uncased': 'random informativeness',
    'bertram_uniform_informativeness_context_bert-base-uncased': 'uniform informativeness',
    'ablation_experiment': 'random'
}


word_counts_file = '/mounts/work/kerem/datasets/WordNet/wordnet_word_counts_in_WWC.pickle'
with open(word_counts_file, 'rb') as handle:
    word_counts = pickle.load(handle)
    
min_count = 1e10
min_model = ''
for model_name in model_names:
    with open(os.path.join(results_dir, f'{model_name}_n_results.pickle'), 'rb') as handle:
        results = pickle.load(handle)
#     print(f'Model: {model_name}, result count: {len(results)}')
    if len(results) < min_count:
        min_count = len(results)
        min_model = model_name

with open(os.path.join(results_dir, f'{min_model}_n_results.pickle'), 'rb') as handle:
    results = pickle.load(handle)
min_vocab = list(results.keys())


for detail_level in ['tokenization','pattern','frequency']:

    max_token_count = 3 # for tokenization_level results
    count_thresholds = [10, 100] # for frequency level results, should contain only 2 numbers
    pattern_count = len(get_patterns('mlm', 'n'))

    results_data_dir = '/mounts/work/kerem/definition_classification_data/results'
    results_print_dir = '/mounts/work/kerem/gitlab_projects/bertram_with_informative_contexts/evaluate/definition_classification/results'

    results_file = os.path.join(results_print_dir, f'rare_words_{detail_level}_results_for_WordNet_Nouns.txt')

    headers = ['Model']
    if detail_level == 'pattern':
        for no in range(pattern_count):
            headers.append(f'Pattern {no+1}')
        headers.append('Best Pattern')

    elif detail_level == 'frequency':
        headers.append(f'rare (0-{count_thresholds[0]-1})')
        headers.append(f'medium ({count_thresholds[0]}-{count_thresholds[1]-1})')
        headers.append(f'freq. ({count_thresholds[1]}-)')
        headers.append(f'all')
    elif detail_level == 'tokenization':
        for i in range(max_token_count):
            headers.append(f'{i+1} token')
        headers.append(f'{max_token_count}+ token')
        headers.append(f'all')
    else:
        print('Invalid detail_level!')

    models = model_names.copy()
    if detail_level != 'tokenization':
        models.append(f'random_with_{pattern_count}_patterns')              
                
    all_results = {}
    for model_name in models:
        with open(os.path.join(results_data_dir, f'{model_name}_n_results.pickle'), 'rb') as handle:
            results = pickle.load(handle)

        if model_name == 'bert-base-uncased':
            tokenization_lens = {}
            for key, value in results.items():
                if key in min_vocab:
                    tokenization_lens[key] = len(value[0].tokenized_word)
            
        accuracies = []
        pred_scores = []
        option_counts = []
        for i in range(len(headers)-1):
            pred_scores.append([])
            accuracies.append([])
            option_counts.append([])

        no = 0
        for key, value in results.items():
            if key in min_vocab:
                word = key.split('.')[0].replace("_", " ")
                tokens = nltk.word_tokenize(word)
                word = ' '.join(token.lower() for token in tokens)
                token_count = tokenization_lens[key]

                scores = []
                correct = 0
                for pattern_no in range(pattern_count):
                    score = value[pattern_no].prediction_score
                    scores.append(score)

                    if score == 1:                   
                        correct = 1

                    if detail_level == 'pattern':
                        accuracies[pattern_no].append(int(score == 1))
                        pred_scores[pattern_no].append(score)

                if 'random' in model_name:
                    best_pred_score = score
                    correct = 1 if score == 1 else 0
                else:
                    best_pred_score = max(scores)

                if detail_level == 'frequency':
                    freq_level = int(word_counts[word] >= count_thresholds[0]) + int(word_counts[word] >= count_thresholds[1])
                    accuracies[freq_level].append(correct)
                    pred_scores[freq_level].append(best_pred_score)
                    option_counts[freq_level].append(len(value[0].prediction_probs))

                if detail_level == 'tokenization':
                    pos = min(token_count, max_token_count+1)-1
                    accuracies[pos].append(correct)
                    pred_scores[pos].append(best_pred_score)
                    option_counts[pos].append(len(value[0].prediction_probs))

                accuracies[-1].append(correct)
                pred_scores[-1].append(best_pred_score)
                option_counts[-1].append(len(value[0].prediction_probs))

        mean_pred_scores = [np.round(np.mean(scores), 2) for scores in pred_scores]
        mean_accuracies = [np.round(np.mean(acc)*100,2) for acc in accuracies]
        try:
            all_results[short_model_names[model_name]] = (mean_pred_scores, mean_accuracies)
        except:
            all_results[model_name] = (mean_pred_scores, mean_accuracies)

        sample_counts = []
        mean_option_counts = []
        for i in range(len(headers)-1):
            sample_counts.append(len(accuracies[i]))
            mean_option_counts.append(np.round(np.mean(option_counts[i]),1))       

    with open(results_file, 'w') as f_out:
        f_out.write(f'Results from bert-base-uncase for Wordnet {word_types[word_type]}')
        f_out.write(f'\n\nExample count: {len(option_counts[-1])}')
        f_out.write(f'\nAverage option count: {np.mean(option_counts[-1]):.2f}')
        f_out.write(f'\nMinimum option count: {np.min(option_counts[-1])}')
        f_out.write(f'\nMaximum option count: {np.max(option_counts[-1])}')

        if detail_level == 'frequency' or detail_level == 'tokenization':
            for no, header in enumerate(headers[1:-1]):
                f_out.write(f'\n\nFor {header}:')
                f_out.write(f'\nExample count: {len(option_counts[no])}')
                f_out.write(f'\nAverage option count: {mean_option_counts[no]:.2f}')


        f_out.write('\n\n\nPrediction Rank Results\n\n')
        model_no = 0
        tab = []
        for model, results in all_results.items():
            tab.append([])
            tab[model_no].append(model)
            tab[model_no].extend(results[0])
            model_no += 1
        f_out.write(tabulate(tab, headers=headers, tablefmt='orgtbl'))


        f_out.write('\n\n\nAccuracy Results\n\n')
        model_no = 0
        tab = []
        for model, results in all_results.items():
            tab.append([])
            tab[model_no].append(model)
            tab[model_no].extend(results[1])
            model_no += 1
        f_out.write(tabulate(tab, headers=headers, tablefmt='orgtbl'))


In [13]:
import numpy as np

a = np.asarray([1.2,2.7183,3.2,4.2])
b = np.log(a)
b

array([0.18232156, 1.00000668, 1.16315081, 1.43508453])