In [8]:
import pandas as pd
from datasets import Dataset, DatasetDict, Features, Value

from config import Credentials
from ngram_utils import NgramFetcher
from reader import JSONReader

In [2]:
# Languages of the multilingual dev split and the per-language file templates.
languages = ['ar', 'en', 'fr', 'ru', 'zh']
dev_data_path = 'dev/multilingual/dev.{lang}-{lang}.data'
dev_gold_path = 'dev/multilingual/dev.{lang}-{lang}.gold'

# full_data maps each language code to its list of distinct lemmas tagged 'F'.
full_data = {}
for language in languages:
    examples = pd.DataFrame(JSONReader().read(dev_data_path.format(lang=language)))
    labels = pd.DataFrame(JSONReader().read(dev_gold_path.format(lang=language)))

    # Column-wise concat aligns the two frames on their row index.
    # NOTE(review): assumes .data and .gold list the same items in the same order - confirm.
    merged = pd.concat([examples, labels], axis=1)

    # Keep rows whose gold tag is 'F' (presumably the "False" label - TODO confirm),
    # then reduce to one row per lemma; drop_duplicates keeps the first occurrence.
    has_f_tag = merged['tag'] == 'F'
    full_data[language] = (
        merged.loc[has_f_tag]
        .drop_duplicates(subset=["lemma"])['lemma']
        .tolist()
    )

In [3]:
# Despite its name this is a set: the languages for which ngram data is requested.
# 'ar' is not listed, so Arabic is skipped in the loop below.
language_map = {'en', 'fr', 'ru', 'zh'}

# full_ngram_data maps language code -> whatever fetch_ngram_data returns for its lemmas.
full_ngram_data = {}
for language in languages:
    if language not in language_map:
        continue
    fetcher = NgramFetcher()
    # NOTE(review): the meaning of the positional `False` is defined in ngram_utils - confirm.
    full_ngram_data[language] = fetcher.fetch_ngram_data(full_data[language], False, language)

429 Client Error: Too Many Requests for url: https://books.google.com/ngrams/json?content=buzz%2Cpill%2Cspike%2Cthought%2Cfacing%2Cinhibit%2Clandscape%2Cfixing%2Ccheer%2Cban&year_start=1950&year_end=2022&corpus=en&smoothing=3 - sleeping 10 sec.
Error fetching Ngram data for batch ['buzz', 'pill', 'spike', 'thought', 'facing', 'inhibit', 'landscape', 'fixing', 'cheer', 'ban']: 429 Client Error: Too Many Requests for url: https://books.google.com/ngrams/json?content=buzz%2Cpill%2Cspike%2Cthought%2Cfacing%2Cinhibit%2Clandscape%2Cfixing%2Ccheer%2Cban&year_start=1950&year_end=2022&corpus=en&smoothing=3
429 Client Error: Too Many Requests for url: https://books.google.com/ngrams/json?content=region%2Ccatch%2Cfriction%2Cfind%2Cfolk%2Cmilk%2Cbraid%2Cempire%2Clap%2Ccoldness&year_start=1950&year_end=2022&corpus=en&smoothing=3 - sleeping 10 sec.
429 Client Error: Too Many Requests for url: https://books.google.com/ngrams/json?content=actif%2Cbarbu%2Csolide%2Coie%2Chumeur%2Csermon%2Cmouill%C3%A9%2

In [4]:
# For every language, list the words whose fetched entry has no 'avg_frequency' key.
# Languages without any fetched data end up with an empty list.
missing = {}
for language in languages:
    fetched = full_ngram_data.get(language, {})
    missing[language] = [
        word for word, stats in fetched.items() if 'avg_frequency' not in stats
    ]

In [5]:
# Second pass: re-request only the words that came back without 'avg_frequency'.
# ngram_data_missing maps language code -> retry result ({} when nothing had to be retried).
ngram_data_missing = {}
for language, words in missing.items():
    if not words:
        # Nothing to retry for this language (this also covers languages that were
        # never fetched in the first pass), so do not call the fetcher at all.
        ngram_data_missing[language] = {}
        continue
    # NOTE(review): the meaning of the positional `False` is defined in ngram_utils - confirm.
    ngram_data_missing[language] = NgramFetcher().fetch_ngram_data(words, False, language)

In [6]:
# Build one record per word and language: {'word', 'avg_google_ngrams_frequency'}.
# The frequency comes from the first fetch when available, otherwise from the retry
# fetch, and is None when neither pass produced a value.
data = {}

for language in languages:
    first_pass = full_ngram_data.get(language, {})
    retry_pass = ngram_data_missing.get(language, {})

    records = []
    for word in full_data[language]:
        avg_frequency = first_pass.get(word, {}).get("avg_frequency")
        # Fall back only when the value is really absent. Chaining the two lookups
        # with `or` would also discard a legitimate frequency of 0.0.
        if avg_frequency is None:
            avg_frequency = retry_pass.get(word, {}).get("avg_frequency")
        records.append({'word': word, 'avg_google_ngrams_frequency': avg_frequency})

    data[language] = records

In [9]:
# One Dataset per language, keyed by language code, built from the record lists in `data`.
dataset = DatasetDict(
    {lang_code: Dataset.from_list(data[lang_code]) for lang_code in languages}
)

In [11]:
# Pin an explicit schema for every language in `dataset`.
# NOTE(review): languages whose frequencies are all None (e.g. those never fetched)
# would presumably get an inferred null column type otherwise - confirm.
features = Features({
    "word": Value("string"),
    "avg_google_ngrams_frequency": Value("float64"),
})

# Re-assigning existing keys while iterating is safe: the set of keys does not change.
for lang in dataset:
    dataset[lang] = dataset[lang].cast(features)

Casting the dataset: 100%|██████████| 308/308 [00:00<00:00, 81463.34 examples/s]
Casting the dataset: 100%|██████████| 334/334 [00:00<00:00, 240126.42 examples/s]
Casting the dataset: 100%|██████████| 380/380 [00:00<00:00, 248997.89 examples/s]
Casting the dataset: 100%|██████████| 330/330 [00:00<00:00, 316659.88 examples/s]
Casting the dataset: 100%|██████████| 254/254 [00:00<00:00, 246838.09 examples/s]


In [12]:
# Upload all language entries of `dataset` to a private Hub dataset repository.
# The access token is read from the project's Credentials config.
hub_repo_id = "lukasellinger/homonym-mcl-wic"
dataset.push_to_hub(repo_id=hub_repo_id, token=Credentials.hf_api_key, private=True)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1123.57ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.57s/it]
Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1014.10ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.56s/it]
Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 883.38ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.09s/it]
Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1400.44ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.08it/s]
Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]
Creating parquet from

CommitInfo(commit_url='https://huggingface.co/datasets/lukasellinger/homonym-mcl-wic/commit/9f08feea7f4042ad6dddf5af553f9fb2ed0e7d95', commit_message='Upload dataset', commit_description='', oid='9f08feea7f4042ad6dddf5af553f9fb2ed0e7d95', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/lukasellinger/homonym-mcl-wic', endpoint='https://huggingface.co', repo_type='dataset', repo_id='lukasellinger/homonym-mcl-wic'), pr_revision=None, pr_num=None)