In [46]:
from collections import defaultdict
import csv
from glob import glob
import json
import os
import random

import pandas as pd

In [2]:
# Seed Python's global RNG so random.sample / random.shuffle below are reproducible.
# NOTE: this must be a call; assigning to random.seed would replace the function
# with an integer and leave the generator unseeded.
random.seed(42)

In [37]:
def get_word_clusters(predictions_folder):
    """Load the uses of every predicted word, annotated with their cluster ids.

    Relies on the module-level ``DWUG_PATH`` (root of the DWUG-style dataset)
    and ``CLUSTER_NUMBER_COLUMN`` (name of the cluster-id column).

    Args:
        predictions_folder: glob pattern matching one directory per target
            word; the last path component is used as the word.

    Returns:
        dict mapping each word to a DataFrame built from
        ``<DWUG_PATH>/data/<word>/uses.csv`` left-joined on ``identifier``
        with the word's cluster file.
    """
    # The cluster directory does not depend on the word, so resolve it once
    # instead of probing the filesystem on every iteration.
    clusters_dir = os.path.join(DWUG_PATH, "clusters/opt")
    if not os.path.exists(clusters_dir):  # no opt in RuDSI data
        clusters_dir = os.path.join(DWUG_PATH, "clusters")

    word_clusters_dict = {}
    # glob() returns paths in arbitrary order; sort for a stable dict order.
    for word_path in sorted(glob(predictions_folder)):
        word = os.path.split(word_path)[-1]

        word_uses = pd.read_csv(
            os.path.join(DWUG_PATH, "data", word, "uses.csv"),
            sep="\t",
            quoting=csv.QUOTE_NONE,
        )
        word_clusters = pd.read_csv(
            os.path.join(clusters_dir, f"{word}.csv"),
            sep="\t",
            quoting=csv.QUOTE_NONE,
        )
        word_clusters[CLUSTER_NUMBER_COLUMN] = word_clusters[CLUSTER_NUMBER_COLUMN].astype(int)

        # Left join: every use is kept, even if it has no cluster assignment.
        word_clusters_dict[word] = word_uses.join(
            word_clusters.set_index("identifier"), on="identifier"
        )
    return word_clusters_dict

In [43]:
def pass_folder(predictions_folder, label_data, word_clusters_dict):
    """Append one labeling task per predicted gloss to ``label_data``.

    For every word directory matched by ``predictions_folder`` the function
    reads ``cluster_gloss.tsv`` (two columns: cluster id, definition), drops
    cluster ``-1`` and, if more than one cluster remains, emits one task per
    definition. Each task carries the text ``"<word>: <definition>"`` and the
    same list of variants: one per cluster, holding up to five randomly
    sampled usage examples with the target token wrapped in ``<b>`` tags.

    Relies on the module-level ``CLUSTER_NUMBER_COLUMN`` and
    ``EXAMPLE_COLUMN`` column names.

    Args:
        predictions_folder: glob pattern matching one directory per word.
        label_data: list of task dicts; extended in place.
        word_clusters_dict: mapping word -> DataFrame of uses with the columns
            ``CLUSTER_NUMBER_COLUMN``, ``EXAMPLE_COLUMN`` and
            ``indexes_target_token`` (a ``"start:end"`` character span).

    Returns:
        The same ``label_data`` list, for call chaining.
    """
    # glob() order is filesystem-dependent; sorting keeps the sequence of
    # random.sample() calls (and therefore the seeded output) reproducible.
    for word_path in sorted(glob(predictions_folder)):
        word = os.path.split(word_path)[-1]

        clusters_and_definitions = pd.read_csv(os.path.join(word_path, "cluster_gloss.tsv"), sep="\t")
        clusters_and_definitions["cluster"] = clusters_and_definitions.cluster.astype(int)
        # Cluster -1 is excluded (presumably the noise / unclustered label).
        clusters_and_definitions = clusters_and_definitions[clusters_and_definitions.cluster != -1]
        if clusters_and_definitions.shape[0] <= 1:
            # A single remaining cluster leaves the annotator nothing to choose between.
            continue

        this_word = word_clusters_dict[word]

        definitions, contexts_list, contexts_list_html = [], [], []
        # Positional unpacking: the file is expected to hold exactly two columns.
        for cluster_number, definition in clusters_and_definitions.itertuples(index=False, name=None):
            this_cluster = this_word[this_word[CLUSTER_NUMBER_COLUMN] == cluster_number]

            contexts = []
            for span, example in zip(this_cluster["indexes_target_token"], this_cluster[EXAMPLE_COLUMN]):
                start, end = (int(offset) for offset in span.split(":"))
                contexts.append(f"{example[:start]}<b>{example[start:end]}</b>{example[end:]}")

            if not contexts:
                # Diagnostic: the gloss file names a cluster with no uses in the dataset.
                print(word)
                print(cluster_number)

            # k never exceeds the population size, so sample() cannot fail here;
            # an empty cluster simply yields an empty variant.
            choiced_contexts = random.sample(contexts, k=min(len(contexts), 5))

            contexts_list.append(str(cluster_number))
            contexts_list_html.append('<br>'.join(choiced_contexts))
            definitions.append(definition)

        for definition in definitions:
            cluster_data = {}
            cluster_data["data"] = {"my_text": f"{word}: {definition}"}
            # Rebuilt per task so tasks do not share one mutable list.
            cluster_data["data"]["variants"] = [
                {"value": ctx, "html": ctx_html} for ctx, ctx_html in zip(contexts_list, contexts_list_html)
            ]
            label_data.append(cluster_data)
    return label_data

In [44]:
# Build the Label Studio import file: one task per (word, predicted gloss)
# for each gloss-generation method, shuffled so that tasks from different
# methods are interleaved.
label_data = []
lang = "en"
DWUG_PATH = os.path.expanduser(f"~/PycharmProjects/gloss-annotator/wugs/dwug_{lang}/")
CLUSTER_NUMBER_COLUMN = 'cluster'
EXAMPLE_COLUMN = 'context'
WORD_COLUMN = 'word'
predictions_folder = os.path.expanduser("~/PycharmProjects/gloss-annotator/predictions/")

# Methods whose predictions are turned into tasks, in processing order.
prediction_methods = (
    "pilot_glmlarge_wordnet_l1norm_top3",
    "pilot_flan-t5-definition-en-xl",
    "lesk",
)
# One glob pattern per method, each matching that method's per-word directories.
method_globs = [
    os.path.join(predictions_folder, f"{method}/dwug_{lang}/*")
    for method in prediction_methods
]

# The word list is taken from the first method's folder only; the other
# methods are expected to cover the same words (pass_folder looks each word up
# in this dict).
word_clusters_dict = get_word_clusters(method_globs[0])

for method_glob in method_globs:
    label_data = pass_folder(method_glob, label_data, word_clusters_dict)

random.shuffle(label_data)
# Explicit encoding, matching how the exported annotations are read back.
with open(os.path.expanduser(f"~/PycharmProjects/label-studio-{lang}.json"), "w", encoding="utf8") as f:
    json.dump(label_data, f)

Labeling interface - https://github.com/ltgoslo/gloss-annotator/blob/main/wugs/label_studio_data/labeling_config.xml

In [52]:
# Load the annotated tasks exported from Label Studio as JSON.
annotations_path = os.path.expanduser("~/Downloads/project-4-at-2023-10-11-14-44-6fb06a53.json")
with open(annotations_path, "r", encoding="utf8") as f:
    annotations = json.load(f)
# Preview only the first task: displaying the whole export floods the cell
# output and bakes annotator metadata into the saved notebook.
annotations[:1]

[{'id': 737,
  'annotations': [{'id': 20,
    'completed_by': 1,
    'result': [{'value': {'choices': ['1']},
      'id': 'cfvp_xt3zz',
      'from_name': 'selection',
      'to_name': 'query',
      'type': 'choices',
      'origin': 'manual'}],
    'was_cancelled': False,
    'ground_truth': False,
    'created_at': '2023-10-11T13:06:43.798241Z',
    'updated_at': '2023-10-11T14:44:08.586408Z',
    'draft_created_at': None,
    'lead_time': 8.571,
    'prediction': {},
    'result_count': 0,
    'unique_id': '15238752-595d-4613-9b0f-2daa0ad46c35',
    'import_id': None,
    'last_action': None,
    'task': 737,
    'project': 4,
    'updated_by': 1,
    'parent_prediction': None,
    'parent_annotation': None,
    'last_created_by': None}],
  'file_upload': 'aa82ee11-label-studio-en.json',
  'drafts': [],
  'predictions': [],
  'data': {'my_text': 'bar_nn: the body of individuals qualified to practice law in a particular jurisdiction',
   'variants': [{'value': '6',
     'html': 'The

In [56]:
# Score every method against the human annotations: a method gets credit for a
# task when the cluster it assigned to the task's gloss is the cluster the
# annotator picked.
methods = (
    "pilot_glmlarge_wordnet_l1norm_top3",
    "pilot_flan-t5-definition-en-xl",
    "lesk",
)
correct_answers = defaultdict(list)
# Parsed prediction tables keyed by file path, so each cluster_gloss.tsv is
# read once instead of once per annotated sample.
predictions_cache = {}

for sample in annotations:
    # "my_text" has the form "<word>: <gloss>"; split on the first ": " only,
    # since the gloss itself may contain that separator.
    word, _, gloss = sample["data"]["my_text"].partition(": ")
    methods_with_this_gloss, clusters_pred = [], []
    for method in methods:
        predictions_path = os.path.join(predictions_folder, f"{method}/dwug_{lang}/{word}/cluster_gloss.tsv")
        if predictions_path not in predictions_cache:
            predictions_cache[predictions_path] = pd.read_csv(predictions_path, sep="\t")
        clusters_and_definitions = predictions_cache[predictions_path]
        cluster_pred = clusters_and_definitions[clusters_and_definitions.gloss == gloss]
        if cluster_pred.shape[0] > 0:
            methods_with_this_gloss.append(method)
            # Normalise through int() so a float-parsed column ("1.0") still
            # compares equal to the annotator's choice ("1").
            clusters_pred.append(str(int(cluster_pred[CLUSTER_NUMBER_COLUMN].iloc[0])))
    sample_annotations = sample['annotations']
    for annotation in sample_annotations:
        results = annotation['result']
        if not results:
            # No choice recorded (presumably a skipped task); nothing to score.
            continue
        cluster_true = results[0]["value"]["choices"][0]
        for method, cluster in zip(methods_with_this_gloss, clusters_pred):
            print(method)
            print(cluster_true)
            print(cluster)
            correct_answers[method].append(int(cluster_true == cluster))

for k, v in correct_answers.items():
    print(f"Accuracy {k}: {sum(v)/len(v)}")

pilot_glmlarge_wordnet_l1norm_top3
1
1
lesk
1
13
pilot_glmlarge_wordnet_l1norm_top3
0
20
lesk
2
5
pilot_flan-t5-definition-en-xl
0
2
Accuracy pilot_glmlarge_wordnet_l1norm_top3: 0.5
Accuracy lesk: 0.0
Accuracy pilot_flan-t5-definition-en-xl: 0.0
