In [1]:
from collections import defaultdict
import csv
from glob import glob
import json
import os
import random

import pandas as pd

In [2]:
# Seed the global random generator so that sampling and shuffling below are reproducible.
# (Assigning to ``random.seed`` would only overwrite the function with an integer.)
random.seed(42)

In [3]:
def get_word_clusters(predictions_folder):
    """Load, for every predicted word, its uses joined with their cluster labels.

    ``predictions_folder`` is handed to ``glob``; the last path component of
    each match is taken as the word. For each word the function reads
    ``<DWUG_PATH>/data/<word>/uses.csv`` and the word's cluster file, casts the
    ``CLUSTER_NUMBER_COLUMN`` column to ``int`` and joins both tables on
    ``identifier``.

    Returns a dict mapping each word to the joined DataFrame.
    """
    uses_by_word = {}
    for word_dir in glob(predictions_folder):
        word = os.path.basename(word_dir)

        uses_path = os.path.join(DWUG_PATH, "data", word, "uses.csv")
        uses = pd.read_csv(uses_path, sep="\t", quoting=csv.QUOTE_NONE)

        # Prefer clusters/opt; fall back to clusters (no opt in RuDSI data).
        opt_dir = os.path.join(DWUG_PATH, "clusters/opt")
        clusters_dir = opt_dir if os.path.exists(opt_dir) else os.path.join(DWUG_PATH, "clusters")

        labels_path = os.path.join(clusters_dir, f"{word}.csv")
        labels = pd.read_csv(labels_path, sep="\t", quoting=csv.QUOTE_NONE)
        labels[CLUSTER_NUMBER_COLUMN] = labels[CLUSTER_NUMBER_COLUMN].astype(int)

        indexed_labels = labels.set_index("identifier")
        uses_by_word[word] = uses.join(indexed_labels, on="identifier")
    return uses_by_word

In [18]:
def _highlight_target(example, span):
    """Return ``example`` as a bullet line with the ``start:end`` character span wrapped in ``<b>`` tags."""
    start, end = (int(index) for index in span.split(":"))
    return f"- {example[:start]}<b>{example[start:end]}</b>{example[end:]}"


def pass_folder(predictions_folder, label_data, word_clusters_dict, word_definitions_dict):
    """Turn one method's predicted cluster glosses into annotation tasks.

    Args:
        predictions_folder: glob pattern; every match is a word directory
            that contains a ``cluster_gloss.tsv`` file whose rows are
            (cluster number, definition) pairs.
        label_data: list of task dicts; new tasks are appended in place.
        word_clusters_dict: word -> DataFrame of uses carrying the
            ``CLUSTER_NUMBER_COLUMN``, ``EXAMPLE_COLUMN`` and
            ``indexes_target_token`` columns.
        word_definitions_dict: word -> list of definition sets produced by
            earlier calls. It is indexed for every word, so it has to
            tolerate unseen keys (e.g. ``defaultdict(list)``). The set of
            definitions emitted by this call is appended for every word
            that produced tasks.

    Returns:
        The ``(label_data, word_definitions_dict)`` pair that was passed in,
        after being updated.

    For every word the clusters labelled ``-1`` are dropped. Each remaining
    cluster with more than two uses contributes its definition and up to five
    randomly sampled example contexts. If more than one such cluster exists,
    one task is emitted per definition that is new for this word (not repeated
    in this call and not produced by an earlier call); the task offers every
    sampled cluster as a shuffled variant plus two fixed "none" / "more than
    one" options. Summary counters are printed at the end.
    """
    clusters_minus_1 = 0
    one_use_clusters = 0
    two_use_cluster = 0
    one_clusters = 0
    one_clusters_after_one_use_removal = 0
    all_definitions = set()
    all_definitions_by_word = defaultdict(set)
    for word_path in glob(predictions_folder):
        word = os.path.basename(word_path)
        # Definitions already emitted for this word by previously processed methods.
        other_methods_word_definitions = set().union(*word_definitions_dict[word])

        clusters_and_definitions = pd.read_csv(os.path.join(word_path, "cluster_gloss.tsv"), sep="\t")
        clusters_and_definitions["cluster"] = clusters_and_definitions.cluster.astype(int)
        is_minus_1 = clusters_and_definitions.cluster == -1
        clusters_minus_1 += int(is_minus_1.sum())
        clusters_and_definitions = clusters_and_definitions[~is_minus_1]

        if clusters_and_definitions.shape[0] <= 1:
            # Nothing to tell apart for this word.
            one_clusters += 1
            continue

        this_word = word_clusters_dict[word]
        definitions, contexts_list, contexts_list_html = [], [], []
        # Each row is unpacked positionally, so the file must have exactly two columns.
        for _, (cluster_number, definition) in clusters_and_definitions.iterrows():
            if definition not in all_definitions:
                all_definitions.add(definition)
                all_definitions_by_word[word].add(definition)
            else:
                # Report definitions that this method assigns to several different words.
                words = [word]
                words.extend(
                    word_key
                    for word_key, word_definitions in all_definitions_by_word.items()
                    if word_key != word and definition in word_definitions
                )
                if len(words) > 1:
                    print(f"Common definition {definition} for words {words} by {predictions_folder}")

            this_cluster = this_word[this_word[CLUSTER_NUMBER_COLUMN] == cluster_number]
            cluster_size = this_cluster.shape[0]
            if cluster_size == 1:
                one_use_clusters += 1
            elif cluster_size == 2:
                two_use_cluster += 1
            elif cluster_size > 2:
                definitions.append(definition)
                contexts = [
                    _highlight_target(example, span)
                    for span, example in zip(
                        this_cluster["indexes_target_token"],
                        this_cluster[EXAMPLE_COLUMN],
                    )
                ]
                # At least three contexts exist here, so sampling cannot fail.
                choiced_contexts = random.sample(contexts, k=min(len(contexts), 5))
                contexts_list.append(str(cluster_number))
                contexts_list_html.append('<br>'.join(choiced_contexts))

        if len(contexts_list) > 1:
            seen_definitions = set()
            for definition in definitions:
                if definition in seen_definitions or definition in other_methods_word_definitions:
                    continue
                seen_definitions.add(definition)
                variants = [
                    {"value": ctx, "html": ctx_html}
                    for ctx, ctx_html in zip(contexts_list, contexts_list_html)
                ]
                random.shuffle(variants)
                # The two fallback answers always come last, after the shuffled clusters.
                variants.extend(
                    [
                        {"value": "-2", "html": '<b>This definition describes none of the clusters</b>'},
                        {"value": "-3", "html": '<b>This definition describes more than one cluster</b>'},
                    ],
                )
                label_data.append(
                    {
                        "data": {
                            "my_text": f"{word.upper()}: <b>{definition.upper()}</b>",
                            "variants": variants,
                        },
                    },
                )
            word_definitions_dict[word].append(seen_definitions)
        else:
            one_clusters_after_one_use_removal += 1
    print(f"Number of clusters labeled with -1: {clusters_minus_1}")
    print(f"Number of singleton clusters: {one_use_clusters}")
    print(f"Number of clusters with two uses: {two_use_cluster}")
    print(f"Number of words with one cluster only: {one_clusters}")
    print(f"Number of words where one cluster only remained after removing singletons, -1: {one_clusters_after_one_use_removal}")
    return label_data, word_definitions_dict

In [19]:
# Build the annotation tasks for one language and write them to a JSON file.
label_data = []
lang = "en"
DWUG_PATH = os.path.expanduser(f"~/PycharmProjects/gloss-annotator/wugs/dwug_{lang}/")
CLUSTER_NUMBER_COLUMN = 'cluster'
EXAMPLE_COLUMN = 'context'
WORD_COLUMN = 'word'
predictions_folder = os.path.expanduser("~/PycharmProjects/gloss-annotator/predictions/")
# Processing order matters: a definition already emitted for a word by an
# earlier method is not emitted again by a later one.
methods = (
    "pilot_glmlarge_wordnet_l1norm_top3",
    "pilot_flan-t5-definition-en-xl",
    "lesk",
)
# The first method's folder is only used to enumerate the target words; the
# uses and cluster labels themselves are read from DWUG_PATH.
word_clusters_dict = get_word_clusters(os.path.join(predictions_folder, f"{methods[0]}/dwug_{lang}/*"))
word_definitions_dict = defaultdict(list)

# One pass per method, so that the calls always stay in sync with ``methods``.
for method in methods:
    label_data, word_definitions_dict = pass_folder(
        os.path.join(predictions_folder, f"{method}/dwug_{lang}/*"),
        label_data,
        word_clusters_dict,
        word_definitions_dict,
    )
print(f"{len(label_data)} examples to annotate")

random.shuffle(label_data)
with open(os.path.expanduser(f"~/PycharmProjects/label-studio-{lang}.json"), "w") as f:
    json.dump(label_data, f)

Common definition the front of the human head from the forehead to the chin and ear to ear for words ['face_nn', 'head_nn'] by /home/m/PycharmProjects/gloss-annotator/predictions/pilot_glmlarge_wordnet_l1norm_top3/dwug_en/*
Common definition a large scale offensive (more than a counterattack) undertaken by a defending force to seize the initiative from an attacking force for words ['attack_nn', 'head_nn'] by /home/m/PycharmProjects/gloss-annotator/predictions/pilot_glmlarge_wordnet_l1norm_top3/dwug_en/*
Common definition the act of pressing; the exertion of pressure for words ['twist_nn', 'stab_nn'] by /home/m/PycharmProjects/gloss-annotator/predictions/pilot_glmlarge_wordnet_l1norm_top3/dwug_en/*
Number of clusters labeled with -1: 33
Number of singleton clusters: 326
Number of clusters with two uses: 37
Number of words with one cluster only: 4
Number of words where one cluster only remained after removing singletons, -1: 12
Number of clusters labeled with -1: 0
Number of singleton cl

In [16]:
# Compare, per word, the number of definitions summed over the methods with
# the number of distinct definitions across the methods; the two totals differ
# only if some definition occurs in more than one method's set.
before, after = 0, 0
for word, definitions in word_definitions_dict.items():
    print(word)
    if definitions:
        # Works for any number of per-method sets, not only exactly three.
        current_unique = sum(len(method_definitions) for method_definitions in definitions)
        before += current_unique
        print(current_unique)
        new_unique = len(set().union(*definitions))
        after += new_unique
        print(new_unique)
        if current_unique != new_unique:
            # NOTE(review): assumes the i-th set belongs to the i-th method; that only
            # holds if every method contributed a set for this word — confirm.
            for method, method_definitions in zip(methods, definitions):
                print(method)
                print(method_definitions)
print(before)
print(after)

contemplation_nn
ounce_nn
fiction_nn
bag_nn
afternoon_nn
quilt_nn
5
5
player_nn
12
12
prop_nn
12
12
ball_nn
12
12
head_nn
10
10
lass_nn
5
5
stroke_vb
7
7
relationship_nn
5
5
grain_nn
11
11
lane_nn
stab_nn
9
9
tree_nn
rally_nn
5
5
graft_nn
5
5
face_nn
5
5
record_nn
7
7
bit_nn
11
11
land_nn
7
7
risk_nn
heel_nn
7
7
chairman_nn
edge_nn
16
16
circle_vb
thump_nn
plane_nn
14
14
attack_nn
6
6
tip_vb
15
15
word_nn
9
9
bar_nn
17
17
savage_nn
pin_vb
9
9
rag_nn
8
8
pick_vb
15
15
donkey_nn
gas_nn
6
6
include_vb
maxim_nn
multitude_nn
part_nn
9
9
chef_nn
8
8
twist_nn
11
11
278
278


In [17]:
# Save the first 15 tasks as a small sample file next to the full task file.
with open(os.path.expanduser(f"~/PycharmProjects/label-studio-{lang}-test.json"), "w") as f:
    f.write(json.dumps(label_data[:15]))

Labeling interface - https://github.com/ltgoslo/gloss-annotator/blob/main/wugs/label_studio_data/labeling_config.xml

In [2]:
# Names of the prediction methods per dataset key, in the order they are processed.
METHODS = {
    "en": ("glossreader_v1", "mt0-definition-en-xl", "lesk"),
    # Pilot run for English.
    "en_test": ("pilot_glmlarge_wordnet_l1norm_top3", "pilot_flan-t5-definition-en-xl", "lesk"),
    "de": ("mt0-definition-en-xl", "glossreader_v1"),
    # Both "no" keys use the same single method.
    "no1": ("mt0-definition-no-xl",),
    "no2": ("mt0-definition-no-xl",),
}

In [29]:
# Location of a downloaded JSON file with annotation results
# (presumably a Label Studio project export — confirm).
path = os.path.expanduser("~/Downloads/project-1-at-2023-10-19-00-56-e7447d7e.json")

In [30]:
# Read the file at ``path`` and parse its JSON content into ``data``.
with open(path, "r") as f:
    data = json.loads(f.read())

In [31]:
# Echo the parsed JSON so the notebook shows its structure.
data

[{'id': 199,
  'annotations': [{'id': 69,
    'completed_by': 1,
    'result': [{'value': {'choices': ['2']},
      'id': 'Ggaygh-xCO',
      'from_name': 'selection',
      'to_name': 'query',
      'type': 'choices',
      'origin': 'manual'}],
    'was_cancelled': False,
    'ground_truth': False,
    'created_at': '2023-10-18T16:53:23.838044Z',
    'updated_at': '2023-10-18T16:53:23.838077Z',
    'draft_created_at': '2023-10-18T16:53:20.504729Z',
    'lead_time': 47.802,
    'prediction': {},
    'result_count': 0,
    'unique_id': 'e22924d6-0aac-4182-b29b-f3083e05d68f',
    'last_action': None,
    'task': 199,
    'project': 1,
    'updated_by': 1,
    'parent_prediction': None,
    'parent_annotation': None,
    'last_created_by': None}],
  'file_upload': '5aba4498-label-studio-en.json',
  'drafts': [],
  'predictions': [],
  'data': {'my_text': 'RECORD_NN: <b>A MEDIUM FOR RECORDING SOUND, ESPECIALLY A VINYL PHONOGRAPH AND GRAMOPHONE DISC.</b>',
   'variants': [{'value': '2',
  

In [3]:
# For every language, recover which cluster each annotation task's gloss was
# predicted for, pair it with the other cluster offered in the task, and write
# one ``mappings/<lang>/<word>.tsv`` file per word.
CLUSTER_NUMBER_COLUMN = 'cluster'

# These paths do not depend on the language, so they are computed once.
gloss_repo = os.path.expanduser("~/PycharmProjects/gloss-annotator")
predictions_folder = os.path.join(gloss_repo, "predictions")

for lang in (
    "de",
    "en",
    "no1",
    "no2",
):
    data_file = f"label-studio-{lang}.json"
    words = defaultdict(list)
    with open(data_file, "r") as f:
        data = json.load(f)
    for sample in data:
        # ``my_text`` has the form "<WORD>: <b><GLOSS></b>"; split on the first
        # separator instead of slicing by the length of the lower-cased word.
        head, _sep, tail = sample["data"]["my_text"].partition(": ")
        word = head.lower()
        gloss = tail.replace("<b>", "").replace("</b>", "")
        for method in METHODS[lang]:
            predictions_path = os.path.join(predictions_folder, f"{method}/dwug_{lang}/{word}/cluster_gloss.tsv")
            if not os.path.exists(predictions_path):
                # Retry with a capitalised word and the "ß" spellings restored
                # (presumably because upper-casing "ß" in the task text yields "SS").
                # The changed spelling is kept for the remaining methods and as the output key.
                word = word[0].upper() + word[1:]
                word = word.replace("Engpass", "Engpaß").replace("Fuss", "Fuß").replace("Missklang", "Mißklang")
                predictions_path = os.path.join(
                    predictions_folder,
                    f"{method}/dwug_{lang}/{word}/cluster_gloss.tsv")
            clusters_and_definitions = pd.read_csv(predictions_path, sep="\t")
            # Cluster numbers are compared as strings because task variant values are strings.
            clusters_and_definitions[CLUSTER_NUMBER_COLUMN] = clusters_and_definitions[CLUSTER_NUMBER_COLUMN].astype(str)
            clusters_and_definitions = clusters_and_definitions[clusters_and_definitions[CLUSTER_NUMBER_COLUMN] != "-1"]
            cluster_pred = clusters_and_definitions[clusters_and_definitions.gloss.str.lower() == gloss.lower()]
            if cluster_pred.shape[0] > 0:
                cluster_number = cluster_pred[CLUSTER_NUMBER_COLUMN].iloc[0]
                this, other = None, None
                # Only the first two variants are inspected; the one that is not the
                # predicted cluster is recorded as the wrong alternative.
                for choice in sample["data"]["variants"][:2]:
                    if choice["value"] == cluster_number:
                        this = cluster_number
                    else:
                        other = choice["value"]

                pair = (this, other)

                if (this is not None) and (pair not in words[word]):
                    words[word].append(pair)
    mapping_dir = os.path.join("mappings", lang)
    os.makedirs(mapping_dir, exist_ok=True)
    for word, val in words.items():
        mapping_df = pd.DataFrame(val, columns=[CLUSTER_NUMBER_COLUMN, "wrong_cluster"])
        mapping_df.to_csv(
            os.path.join(mapping_dir, f"{word}.tsv"),
            sep="\t",
            index=False,
        )