In [80]:
import csv

import pandas as pd
from datasets import Dataset

from config import Credentials
from ngram_utils import NgramFetcher

In [53]:
# Root folder of the WiC files; change this if the dataset lives elsewhere.
data_dir = 'WiC_dataset'

# Per-split example files (read as the "data" input of load_wic_data).
train_data_path = f'{data_dir}/train/train.data.txt'
dev_data_path = f'{data_dir}/dev/dev.data.txt'
test_data_path = f'{data_dir}/test/test.data.txt'

# Per-split gold-label files (read as the "gold" input of load_wic_data).
train_gold_path = f'{data_dir}/train/train.gold.txt'
dev_gold_path = f'{data_dir}/dev/dev.gold.txt'
test_gold_path = f'{data_dir}/test/test.gold.txt'

def load_wic_data(data_path, gold_path):
    """Load one WiC split and attach its gold labels.

    Both files are read as headerless, tab-separated text. Quote characters
    are treated as ordinary text (``csv.QUOTE_NONE``): the example sentences
    are free text, and with the default quoting a field that starts with
    ``"`` would lose its quotes, while an unbalanced ``"`` could swallow tabs
    and newlines and merge several rows into one.

    Args:
        data_path: Path to the data file. Its columns are named
            ``target_word``, ``PoS``, ``index``, ``example_1``, ``example_2``.
        gold_path: Path to the gold file. Its single column is named
            ``label``.

    Returns:
        A ``pandas.DataFrame`` holding the data columns followed by ``label``,
        with rows paired by position.

    Raises:
        ValueError: If the two files do not contain the same number of rows.
            Concatenating them anyway would silently pad the shorter side
            with NaN and hide a data/label misalignment.
    """
    data = pd.read_csv(
        data_path,
        sep='\t',
        header=None,
        names=["target_word", "PoS", "index", "example_1", "example_2"],
        quoting=csv.QUOTE_NONE,
    )
    gold = pd.read_csv(
        gold_path,
        sep='\t',
        header=None,
        names=["label"],
        quoting=csv.QUOTE_NONE,
    )
    if len(data) != len(gold):
        raise ValueError(
            f"Row count mismatch: {data_path} has {len(data)} rows but "
            f"{gold_path} has {len(gold)} rows"
        )
    return pd.concat([data, gold], axis=1)

In [77]:
# Read the three splits, each already joined with its gold labels.
train_data = load_wic_data(train_data_path, train_gold_path)
dev_data = load_wic_data(dev_data_path, dev_gold_path)
test_data = load_wic_data(test_data_path, test_gold_path)

# Stack the splits, then collect every distinct target word that occurs in
# at least one row labelled 'F', in order of first appearance.
data = pd.concat([train_data, dev_data, test_data])
data_filtered = (
    data.loc[data['label'] == 'F', 'target_word']
    .drop_duplicates()
    .tolist()
)

In [81]:
# Fetch n-gram data for every selected target word.
ngram_fetcher = NgramFetcher()
ngram_data = ngram_fetcher.fetch_ngram_data(data_filtered)

In [83]:
# Show the fetched mapping as the cell output for a quick visual check.
ngram_data

{'carry': {'avg_frequency': 5.383752065972183e-05},
 'go': {'avg_frequency': 0.00022121897385842125},
 'break': {'avg_frequency': 3.7965859999694695e-05},
 'academy': {'avg_frequency': 2.5037980521392812e-06},
 'set': {'avg_frequency': 0.0002701555949482444},
 'take': {'avg_frequency': 0.0002767345703357165},
 'death': {'avg_frequency': 0.00011776523501114842},
 'despite': {'avg_frequency': 3.411959296499845e-05},
 'fall': {'avg_frequency': 6.76729328990579e-05},
 'work': {'avg_frequency': 0.000511420294282495},
 'shock': {'avg_frequency': 2.418205587634427e-05},
 'play': {'avg_frequency': 8.280866976191446e-05},
 'make': {'avg_frequency': 0.00038262525520423995},
 'end': {'avg_frequency': 0.00025593870210187023},
 'life': {'avg_frequency': 0.00033121078765580665},
 'do': {'avg_frequency': 0.0008320330904610819},
 'see': {'avg_frequency': 0.0005916634216354155},
 'answer': {'avg_frequency': 7.932791564196787e-05},
 'head': {'avg_frequency': 0.00016452858468286578},
 'shot': {'avg_frequ

In [84]:
# One record per selected word. The frequency is taken from ngram_data when
# the word has an entry there and is left as None otherwise.
data = [
    {
        'word': word,
        'avg_google_ngrams_frequency': (
            ngram_data[word]["avg_frequency"] if word in ngram_data else None
        ),
    }
    for word in data_filtered
]

In [85]:
# Turn the list of per-word records in `data` into a `datasets.Dataset`.
dataset = Dataset.from_list(data)

In [88]:
# Upload the dataset to the Hub as a private repository, authenticating with
# the key from Credentials. The call stays the last expression so its return
# value is shown as the cell output.
hub_repo_id = "lukasellinger/homonym-wic"
dataset.push_to_hub(repo_id=hub_repo_id, private=True, token=Credentials.hf_api_key)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 810.34ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.49s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/lukasellinger/homonym-wic/commit/e54ccdc5e37834793857253e4a631cdfca0fd4a8', commit_message='Upload dataset', commit_description='', oid='e54ccdc5e37834793857253e4a631cdfca0fd4a8', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/lukasellinger/homonym-wic', endpoint='https://huggingface.co', repo_type='dataset', repo_id='lukasellinger/homonym-wic'), pr_revision=None, pr_num=None)