In [2]:
from collections import defaultdict
import csv
from glob import glob
import json
import os
import random

import pandas as pd

In [3]:
random.seed=42

In [4]:
def get_word_clusters(predictions_folder):
    word_clusters_dict = {}
    for word_path in glob(predictions_folder):
        word = os.path.split(word_path)[-1]
    
        word_uses = pd.read_csv(os.path.join(DWUG_PATH, "data", word, "uses.csv"), sep="\t", quoting=csv.QUOTE_NONE)
        
        clusters_dir = os.path.join(DWUG_PATH, "clusters/opt")
        if not os.path.exists(clusters_dir):  # no opt in RuDSI data
            clusters_dir = os.path.join(DWUG_PATH, "clusters")
        word_clusters = pd.read_csv(os.path.join(clusters_dir, f"{word}.csv"), sep="\t", quoting=csv.QUOTE_NONE)
        word_clusters[CLUSTER_NUMBER_COLUMN] = word_clusters[CLUSTER_NUMBER_COLUMN].astype(int)
    
        
        this_word = word_uses.join(word_clusters.set_index("identifier"), on="identifier")
        
        word_clusters_dict[word] = this_word
    return word_clusters_dict

In [44]:
def pass_folder(predictions_folder, label_data, word_clusters_dict):
    clusters_minus_1 = 0
    one_use_clusters = 0
    two_use_cluster = 0
    one_clusters = 0
    one_clusters_after_one_use_removal = 0
    all_definitions = set()
    all_definitions_by_word = defaultdict(set)
    for word_path in glob(predictions_folder):
        word = os.path.split(word_path)[-1]
        
        clusters_and_definitions = pd.read_csv(os.path.join(word_path, "cluster_gloss.tsv"), sep="\t")
        clusters_and_definitions["cluster"] = clusters_and_definitions.cluster.astype(int)
        clusters_minus_1 += clusters_and_definitions[clusters_and_definitions.cluster==-1].shape[0]
        clusters_and_definitions = clusters_and_definitions[clusters_and_definitions.cluster!=-1]
        
        if clusters_and_definitions.shape[0] > 1:
            this_word = word_clusters_dict[word]
        
            definitions, contexts_list, contexts_list_html = [], [], []
            for row in clusters_and_definitions.iterrows():
                cluster_number, definition = row[1]
                if definition not in all_definitions:
                        all_definitions.add(definition)
                        all_definitions_by_word[word].add(definition)
                else: 
                    words = [word]
                    for word_key, word_definitions in all_definitions_by_word.items():
                        if (word_key != word) and (definition in word_definitions):
                            words.append(word_key)

                    if len(words)>1:
                        print(f"Common definition {definition} for words {words} by {predictions_folder}")
                
                    
                        
                this_cluster = this_word[this_word[CLUSTER_NUMBER_COLUMN] == cluster_number]
                if this_cluster.shape[0] == 2:
                    two_use_cluster += 1
                if this_cluster.shape[0] == 1:
                    one_use_clusters += 1
                if this_cluster.shape[0] > 2:
                    
                    definitions.append(definition)
                    contexts = []
                    
                    for row in this_cluster.iterrows():
                        start, end = row[1]["indexes_target_token"].split(":")
                        start, end = int(start), int(end)
                        example = row[1][EXAMPLE_COLUMN]
                        contexts.append(f"{example[:start]}<b>{example[start:end]}</b>{example[end:]}")
        
                    if not contexts: 
                        print(word)
                        print(cluster_number)
                    try:
                        choiced_contexts = random.sample(contexts, k=min(len(contexts), 5))
                        
                        contexts_list.append(str(cluster_number))
                        contexts_list_html.append('<br>'.join(choiced_contexts))
                        
                    except IndexError:
                        print(contexts)
                        raise IndexError
                

            if (len(contexts_list) > 1):
                seen_definitions = []
                for definition in definitions:
                    if definition not in seen_definitions:
                        seen_definitions.append(definition)
                        cluster_data = {}
                        cluster_data["data"] = {"my_text": f"{word}: {definition[0].upper()+definition[1:]}"}
                        
                        
                        cluster_data["data"]["variants"] = [
                            {"value": ctx, "html": ctx_html} for ctx, ctx_html in zip(contexts_list, contexts_list_html)
                        ]
                        label_data.append(cluster_data)
            else:
                
                one_clusters_after_one_use_removal += 1
                
                
        else:
            one_clusters += 1
    print(f"Number of clusters labeled with -1: {clusters_minus_1}")
    print(f"Number of singleton clusters: {one_use_clusters}")
    print(f"Number of clusters with two uses: {two_use_cluster}")
    print(f"Number of words with one cluster only: {one_clusters}")
    print(f"Number of words where one cluster only remained after removing singletons, -1: {one_clusters_after_one_use_removal}")
    return label_data

In [45]:
label_data = []
lang = "en"
DWUG_PATH = os.path.expanduser(f"~/PycharmProjects/gloss-annotator/wugs/dwug_{lang}/")
CLUSTER_NUMBER_COLUMN = 'cluster'
EXAMPLE_COLUMN = 'context'
WORD_COLUMN = 'word'
predictions_folder = os.path.expanduser("~/PycharmProjects/gloss-annotator/predictions/")

word_clusters_dict = get_word_clusters(os.path.join(predictions_folder, f"pilot_glmlarge_wordnet_l1norm_top3/dwug_{lang}/*"))

label_data = pass_folder(
    os.path.join(predictions_folder, f"pilot_glmlarge_wordnet_l1norm_top3/dwug_{lang}/*"),
    label_data,
    word_clusters_dict,
)
label_data = pass_folder(
    os.path.join(
    predictions_folder,
    f"pilot_flan-t5-definition-en-xl/dwug_{lang}/*",
    ),
    label_data,
    word_clusters_dict,
)
label_data = pass_folder(
    os.path.join(
    predictions_folder,
    f"lesk/dwug_{lang}/*",
    ),
    label_data,
    word_clusters_dict,
)
print(f"{len(label_data)} examples to annotate")

random.shuffle(label_data)
with open(os.path.expanduser(f"~/PycharmProjects/label-studio-{lang}.json"), "w") as f:
    json.dump(label_data, f)

Common definition the front of the human head from the forehead to the chin and ear to ear for words ['face_nn', 'head_nn'] by /home/m/PycharmProjects/gloss-annotator/predictions/pilot_glmlarge_wordnet_l1norm_top3/dwug_en/*
Common definition a large scale offensive (more than a counterattack) undertaken by a defending force to seize the initiative from an attacking force for words ['attack_nn', 'head_nn'] by /home/m/PycharmProjects/gloss-annotator/predictions/pilot_glmlarge_wordnet_l1norm_top3/dwug_en/*
Common definition the act of pressing; the exertion of pressure for words ['twist_nn', 'stab_nn'] by /home/m/PycharmProjects/gloss-annotator/predictions/pilot_glmlarge_wordnet_l1norm_top3/dwug_en/*
Number of clusters labeled with -1: 33
Number of singleton clusters: 326
Number of clusters with two uses: 37
Number of words with one cluster only: 4
Number of words where one cluster only remained after removing singletons, -1: 12
Number of clusters labeled with -1: 0
Number of singleton cl

In [46]:
with open(os.path.expanduser(f"~/PycharmProjects/label-studio-{lang}-test.json"), "w") as f:
    json.dump(label_data[:15], f)

Labeling interface - https://github.com/ltgoslo/gloss-annotator/blob/main/wugs/label_studio_data/labeling_config.xml

In [64]:
annotations_path = os.path.expanduser("~/Downloads/project-1-at-2023-10-11-15-39-6ec84569.json")
with open(annotations_path, "r", encoding="utf8") as f:
    annotations = json.load(f)
annotations

[{'id': 1,
  'annotations': [{'id': 6,
    'completed_by': 3,
    'result': [{'value': {'choices': ['4']},
      'id': 'CyR8Ly4eg9',
      'from_name': 'selection',
      'to_name': 'query',
      'type': 'choices',
      'origin': 'manual'}],
    'was_cancelled': False,
    'ground_truth': False,
    'created_at': '2023-10-11T15:35:14.954776Z',
    'updated_at': '2023-10-11T15:35:14.954811Z',
    'draft_created_at': None,
    'lead_time': 345.459,
    'prediction': {},
    'result_count': 0,
    'unique_id': 'a16f8f0c-a66e-4276-afe6-fa42cf4d9dc7',
    'last_action': None,
    'task': 1,
    'project': 1,
    'updated_by': 3,
    'parent_prediction': None,
    'parent_annotation': None,
    'last_created_by': None}],
  'file_upload': '67c0c244-label-studio-en.json',
  'drafts': [],
  'predictions': [],
  'data': {'my_text': 'plane_nn: A flat or level surface, especially of a three-dimensional object.',
   'variants': [{'value': '0',
     'html': 'The <b>plane</b>\'s four Allison turbop

In [65]:
methods = (
    "pilot_glmlarge_wordnet_l1norm_top3",
    "pilot_flan-t5-definition-en-xl",
    "lesk",
)
correct_answers = defaultdict(list)

for sample in annotations:
    word = sample["data"]["my_text"].split(": ")[0]
    gloss = sample["data"]["my_text"][len(word)+2:]
    methods_with_this_gloss, clusters_pred = [], []
    for method in methods:
        predictions_path = os.path.join(predictions_folder, f"{method}/dwug_{lang}/{word}/cluster_gloss.tsv")
        clusters_and_definitions = pd.read_csv(predictions_path, sep="\t")
        cluster_pred = clusters_and_definitions[clusters_and_definitions.gloss == gloss]
        if cluster_pred.shape[0] > 0:
            methods_with_this_gloss.append(method)
            clusters_pred.append(str(cluster_pred[CLUSTER_NUMBER_COLUMN].iloc[0]))
    sample_annotations = sample['annotations']
    for annotation in sample_annotations:
        cluster_true = annotation['result'][0]["value"]["choices"][0]
        for method, cluster in zip(methods_with_this_gloss, clusters_pred):
            print(method)
            print(cluster_true)
            print(cluster)
            correct_answers[method].append(int(cluster_true == cluster))

for k, v in correct_answers.items():
    print(f"Accuracy {k}: {sum(v)/len(v)}")

pilot_flan-t5-definition-en-xl
4
5
pilot_glmlarge_wordnet_l1norm_top3
0
0
pilot_flan-t5-definition-en-xl
1
1
pilot_flan-t5-definition-en-xl
2
0
Accuracy pilot_flan-t5-definition-en-xl: 0.3333333333333333
Accuracy pilot_glmlarge_wordnet_l1norm_top3: 1.0
