In [None]:
import os
import json
import random
import pickle
from dataclasses import dataclass
from collections import Counter, defaultdict

import datasets
import pandas as pd

from datasets import load_from_disk

# Data

In [None]:
# Load the pickled lookup tables that relate articles and categories, per language.
# As used further down in this notebook:
#   language_title_dict[language][article]                -> categories of that article
#   language_category_article_mapping[language][category] -> articles in that category
# NOTE: pickle.load can execute arbitrary code; only point this at trusted files.

temp_directory = "../data/"

title_dict_path = os.path.join(temp_directory, 'language_title_dict.pkl')
with open(title_dict_path, 'rb') as f:
    language_title_dict = pickle.load(f)

# NOTE(review): unlike the file above, this file name carries no ".pkl" suffix —
# confirm it matches the file on disk.
category_mapping_path = os.path.join(temp_directory, 'language_category_article_mapping')
with open(category_mapping_path, 'rb') as f:
    language_category_article_mapping = pickle.load(f)

In [None]:
# Root folder of the datasets saved on disk. It is taken from the HF_DATASETS_CACHE
# environment variable when set, otherwise from the Hugging Face datasets cache under
# the current user's home directory, so the notebook is not tied to one machine.
root_dir = os.environ.get(
    "HF_DATASETS_CACHE",
    os.path.join(os.path.expanduser("~"), ".cache", "huggingface", "datasets"),
)
data_dir = "updated_wiki40b"

# Long dataset: the one that is processed below into the final version
path = os.path.join(root_dir, data_dir, "long_small_en")
dataset = load_from_disk(path)
print(len(dataset))

# Wide dataset: used to look up the text of the hard negatives
path = os.path.join(root_dir, data_dir, "filtered_small")
dataset_wide = load_from_disk(path)
print(len(dataset_wide))

# Indexed by wikidata_id so that a row can be fetched with df.loc[<wikidata id>].
# Built in one expression (no inplace mutation) so re-running the cell is idempotent.
df = dataset_wide.to_pandas().set_index('wikidata_id')

# Hard Negative Finding

Some example results (each pair lists an anchor article followed by a hard negative sampled for it):

"Q3588472": Émile Dubonnet, French balloonist  <br />
"Q588510": Jacques Balsan, French aviator and businessman

"Q206961": Épinay-sur-Seine, commune in Seine-Saint-Denis <br />
"Q175999": Le Pré-Saint-Gervais, commune in Seine-Saint-Denis

In [None]:
def hard_negative_finder(language: str, initial_article:str, language_title_dict: dict, language_category_article_mapping: dict) -> str:
    """Pick a random article that shares a category with ``initial_article``.

    A category of the anchor article is chosen at random, then a random article
    of that category other than the anchor itself is returned. Categories that
    offer no other article (or that are missing from the mapping) are skipped,
    so a valid candidate is found whenever at least one category has one.

    Args:
        language: Key selecting the per-language sub-dictionaries.
        initial_article: Identifier of the anchor article.
        language_title_dict: ``{language: {article: [category, ...]}}``.
        language_category_article_mapping: ``{language: {category: [article, ...]}}``.

    Returns:
        The identifier of the selected article.

    Raises:
        KeyError: If ``language`` or ``initial_article`` is not in the dictionaries.
        IndexError: If no category of the anchor contains any other article.
    """
    cat_list = list(language_title_dict[language][initial_article])
    articles_by_category = language_category_article_mapping[language]

    # Visiting the categories in a random order and stopping at the first one with
    # candidates is a uniform choice among the usable categories, while only
    # scanning as many categories as needed.
    for category in random.sample(cat_list, k=len(cat_list)):
        available_articles = [
            article
            for article in articles_by_category.get(category, ())
            if article != initial_article
        ]
        if available_articles:
            return random.choice(available_articles)

    # Same exception type random.choice raised on an empty list, with context added.
    raise IndexError(
        f"No hard negative available for article {initial_article!r} in language {language!r}"
    )

In [None]:
# Demo: draw one hard negative for a single English article
language = "en"
initial_article = "Q206961"  # Épinay-sur-Seine

selected_article = hard_negative_finder(
    language=language,
    initial_article=initial_article,
    language_title_dict=language_title_dict,
    language_category_article_mapping=language_category_article_mapping,
)
print(selected_article)

# Reverse Pairs

In [None]:
# Adds the mirrored counterpart of every language pair in a batch
def create_reverse_pair(example):
    """Extend a batch with the reversed version of each of its pairs.

    ``example`` is a batch: a mapping from column name to a list of values.
    For every ``"<first>_<second>"`` entry in ``pair`` a ``"<second>_<first>"``
    entry is appended, with ``article_1`` and ``article_2`` swapped and the
    same ``wikidata_id``. The original rows come first, the reversed rows after.

    Returns:
        A new dict holding only the columns ``pair``, ``article_1``,
        ``article_2`` and ``wikidata_id``, each twice as long as in the input.
    """
    reversed_pairs = [
        f"{second_lan}_{first_lan}"
        for first_lan, second_lan in (pair.split("_") for pair in example["pair"])
    ]

    forward_articles = example["article_1"]
    backward_articles = example["article_2"]
    wiki_ids = example["wikidata_id"]

    return {
        "pair": example["pair"] + reversed_pairs,
        "article_1": forward_articles + backward_articles,
        "article_2": backward_articles + forward_articles,
        "wikidata_id": wiki_ids + wiki_ids,
    }


# Apply create_reverse_pair batch by batch; each batch of up to 1000 rows comes back
# with its reversed pairs appended.
# NOTE(review): no remove_columns is passed, so this presumably relies on the dataset
# having no columns besides the four returned by create_reverse_pair — confirm.
long_dataset = dataset.map(
    create_reverse_pair,
    batched=True,
    batch_size=1000,
)

# Combining

In [None]:
def negative_combiner_wrapper(dataset: datasets.Dataset, language_title_dict: dict, language_category_article_mapping: dict, df:pd.DataFrame, num_proc: int = 8) -> datasets.Dataset:
    """Add a ``hard_negative`` text column to every example that can receive one.

    For each example the anchor language is the part of ``pair`` before the
    underscore. A hard negative article is drawn with ``hard_negative_finder``
    and its text is read from ``df`` (row = selected article id, column =
    ``text_<anchor language>``).

    Examples whose ``wikidata_id`` has no entry in ``language_title_dict`` for
    the anchor language are dropped up front with ``Dataset.filter``. The
    previous version returned ``None`` from the map function for those rows
    while returning a dict for the others; ``Dataset.map`` needs a consistent
    return type to merge the new column.

    Args:
        dataset: Dataset with at least the ``pair`` and ``wikidata_id`` columns.
        language_title_dict: ``{language: {article: [category, ...]}}``.
        language_category_article_mapping: ``{language: {category: [article, ...]}}``.
        df: DataFrame indexed by wikidata id, with one ``text_<language>`` column
            per language.
        num_proc: Number of worker processes used by ``Dataset.map``.

    Returns:
        The filtered dataset with the additional ``hard_negative`` column.

    Raises:
        Whatever ``hard_negative_finder`` raises when no candidate exists, and
        ``KeyError`` if the selected article or text column is missing in ``df``.
    """
    def anchor_language_of(example) -> str:
        # "en_fr" -> "en": the first language of the pair is the anchor
        return example["pair"].split("_")[0]

    def has_category_info(example) -> bool:
        # Unknown languages are treated like unknown articles: the row is skipped
        known_articles = language_title_dict.get(anchor_language_of(example), {})
        return example["wikidata_id"] in known_articles

    def negative_combiner(example):
        anchor_language = anchor_language_of(example)
        selected_article = hard_negative_finder(anchor_language, example["wikidata_id"], language_title_dict, language_category_article_mapping)

        # Single-step row/column lookup instead of chained indexing
        example["hard_negative"] = df.loc[selected_article, f"text_{anchor_language}"]
        return example

    eligible_dataset = dataset.filter(has_category_info)
    final_dataset = eligible_dataset.map(negative_combiner, num_proc=num_proc)
    return final_dataset

# Attach a hard negative to the examples of the dataset that already contains the reversed pairs
final_dataset = negative_combiner_wrapper(
    dataset=long_dataset,
    language_title_dict=language_title_dict,
    language_category_article_mapping=language_category_article_mapping,
    df=df,
)