In [5]:
from datasets import load_dataset
from config import Credentials
from ngram_utils import NgramFetcher

In [6]:
import pickle as pkl

with open('wn_homonyms.pkl', 'rb') as f:
    wn_homonyms = pkl.load(f)

In [7]:
homonyms = {}
for key, value in wn_homonyms.items():
    if len(value) > 1:
        homonyms[key] = value

In [8]:
filtered_homonyms = {}
for key, value in homonyms.items():
    word = key.split('.')[0]
    word = word.replace('_', ' ')
    word_type = key.split('.')[1]
    # we do not want short words like 'a', 'b'
    # we only take nouns for now, as, e.g. 'run' noun 'run' verb very similar meaning
    if len(word) > 2 and word_type == 'n':
        filtered_homonyms[word] = value

word_data = [key for key in filtered_homonyms.keys()]

In [22]:
pkl.dump(filtered_homonyms, open('filtered_homonyms.pkl', 'wb'))

In [14]:
import nltk
from nltk.corpus import semcor, wordnet as wn

def count_occurence(target_synset: str):
    target_synset = wn.synset(target_synset)
    count = 0
    for sent in semcor.tagged_sents(tag='sem'):
        for chunk in sent:
            if isinstance(chunk, nltk.tree.Tree):
                if isinstance(chunk.label(), nltk.corpus.reader.wordnet.Lemma):
                    synset = chunk.label().synset()
                    if synset == target_synset:
                        count += 1
    return count

In [20]:
from tqdm import tqdm

filtered_homonyms_w_occurences = {}
for word, definitions in tqdm(filtered_homonyms.items()):
    filtered_homonyms_w_occurences[word] = {}
    for key, definition in definitions.items():
        count = 0
        for synset_tuple in definition:
            synset = synset_tuple[0]
            count += count_occurence(synset)
        filtered_homonyms_w_occurences[word][key] = {'synsets': definition, 'semcor_occurances': count}

  0%|          | 0/1791 [00:00<?, ?it/s]

0
0


  0%|          | 0/1791 [01:59<?, ?it/s]


KeyboardInterrupt: 

In [21]:
from concurrent.futures import ProcessPoolExecutor, as_completed
from itertools import islice
from tqdm import tqdm

# Helper function to process one word
def process_word(word_definitions_pair):
    word, definitions = word_definitions_pair
    word_result = {}

    for key, definition in definitions.items():
        total_count = 0
        for synset_tuple in definition:
            synset = synset_tuple[0]
            if synset:
                total_count += count_occurence(synset)
        word_result[key] = {
            'synsets': definition,
            'semcor_occurances': total_count
        }

    return word, word_result

# Limit to first 20 entries
top_20_homonyms = list(islice(filtered_homonyms.items(), 20))

filtered_homonyms_w_occurences = {}

with ProcessPoolExecutor() as executor:
    futures = [executor.submit(process_word, item) for item in top_20_homonyms]

    for future in tqdm(as_completed(futures), total=len(futures), desc="Parallel processing"):
        word, result = future.result()
        filtered_homonyms_w_occurences[word] = result

Parallel processing:   0%|          | 0/20 [00:00<?, ?it/s]


PicklingError: Can't pickle <function process_word at 0x31c4a7420>: attribute lookup process_word on __main__ failed

In [2]:
dataset = load_dataset(
    "lukasellinger/homonym-homonymy-wsd",
    token=Credentials.hf_api_key
)['train']

In [3]:
word_data = [entry['word'] for entry in dataset]

In [4]:
ngram_data = NgramFetcher().fetch_ngram_data(word_data, inflections=True)

No Ngram data for koweit
No Ngram data for saint john's
No Ngram data for santiago de cuba
No Ngram data for william cowper
No Ngram data for johann strauss
No Ngram data for river avon
No Ngram data for somme river
No Ngram data for cynoscephalae
No Ngram data for william gilbert
No Ngram data for severn river
No Ngram data for john trumbull
No Ngram data for ferdinand i
No Ngram data for battle of ypres
No Ngram data for coeur d'alene
No Ngram data for bismarck sea
No Ngram data for frederick i
No Ngram data for odessa
No Ngram data for siege of syracuse
No Ngram data for arthur schlesinger
No Ngram data for marston moor
No Ngram data for president harrison
No Ngram data for hohenlinden
No Ngram data for arab-israeli war
No Ngram data for battle of the somme
No Ngram data for cape passero
No Ngram data for thomas wolfe
No Ngram data for capital of georgia
No Ngram data for william seward burroughs
No Ngram data for naseby
No Ngram data for thomas hart benton
No Ngram data for meuse r

In [10]:
missing = []
for key, value in ngram_data.items():
    if 'avg_frequency' not in value:
        missing.append(key)

In [9]:
ngram_data_missing = NgramFetcher().fetch_ngram_data(missing)

No valid Ngram data returned for batch: ['koweit', "saint john's", 'santiago de cuba', 'william cowper', 'johann strauss', 'river avon', 'somme river', 'cynoscephalae', 'william gilbert', 'severn river']
No valid Ngram data returned for batch: ['john trumbull', 'ferdinand i', 'battle of ypres', "coeur d'alene", 'bismarck sea', 'frederick i', 'siege of syracuse', 'arthur schlesinger', 'marston moor', 'president harrison']
No valid Ngram data returned for batch: ['hohenlinden', 'arab-israeli war', 'battle of the somme', 'cape passero', 'thomas wolfe', 'capital of georgia', 'william seward burroughs', 'naseby', 'thomas hart benton', 'meuse river']
No valid Ngram data returned for batch: ['samuel butler', 'president adams', 'joliot-curie', 'el alamein', 'anapurna', 'guarneri', 'guarnerius']


In [16]:
data = []
for word in word_data:
    avg_frequency = avg_freq = ngram_data.get(word, {}).get("avg_frequency") or ngram_data_missing.get(word, {}).get("avg_frequency")
    data.append({'word': word, 'avg_google_ngrams_frequency':avg_frequency})

In [7]:
pkl.dump(data, open('homonym-homonymy-wsd.pkl', 'wb'))

In [23]:
dataset = load_dataset(
    "lukasellinger/homonym-homonymy-wsd",
    token=Credentials.hf_api_key
)['train']

In [24]:
with open('../filtered_homonyms_w_occurences.pkl', 'rb') as f:
    filtered_homonyms_w_occurences = pkl.load(f)

In [25]:
data = []
for word, v in filtered_homonyms_w_occurences.items():
    for entry in dataset:
        if entry['word'] == word:
            avg_google_ngrams_frequency = entry['avg_google_ngrams_frequency']
            coarse_synsets = [{"name": name,
                               "semcor_occurances": info['semcor_occurances'],
                               "synsets": [{"name": synset[0], "definition": synset[1]} for synset in info['synsets']]} for name, info in v.items()]
            data.append({'word': word, 'avg_google_ngrams_frequency': avg_google_ngrams_frequency, 'coarse_synsets': coarse_synsets})
            break

In [27]:
from datasets import Dataset

dataset = Dataset.from_list(data)

In [28]:
dataset.push_to_hub(
    repo_id="lukasellinger/homonym-homonymy-wsd",
    private=True,
    token=Credentials.hf_api_key
)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 396.44ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.79s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/lukasellinger/homonym-homonymy-wsd/commit/844bc29807e56e9e9a38d5d106992e77ac8a8ef5', commit_message='Upload dataset', commit_description='', oid='844bc29807e56e9e9a38d5d106992e77ac8a8ef5', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/lukasellinger/homonym-homonymy-wsd', endpoint='https://huggingface.co', repo_type='dataset', repo_id='lukasellinger/homonym-homonymy-wsd'), pr_revision=None, pr_num=None)