In [1]:
import os
import json
import random
from dataclasses import dataclass
from collections import Counter, defaultdict

import datasets

from datasets import load_from_disk

# Dataset

In [2]:
root_dir = r"C:\Users\onurg\.cache\huggingface\datasets"
data_dir = "updated_wiki40b"

path = os.path.join(root_dir, data_dir, "long_small_en")

dataset = load_from_disk(path)
print(len(dataset))

1558009


In [3]:
# Number of occurances for each language pairs

pair_dict = Counter(dataset["pair"])
print(pair_dict)

Counter({'en_de': 435694, 'en_fr': 357248, 'en_it': 245572, 'fr_de': 209015, 'it_de': 157327, 'fr_it': 153153})


## Language Article Dictionary

In [5]:
language_article_mapping = defaultdict(set)

def language_article_mapper(example):
    p1, p2 = example["pair"].split("_")
    language_article_mapping[p1].add(example["wikidata_id"])
    language_article_mapping[p2].add(example["wikidata_id"])


dataset.map(language_article_mapper)


100%|██████████| 1558009/1558009 [08:09<00:00, 3180.79ex/s]


Dataset({
    features: ['article_1', 'article_2', 'pair', 'wikidata_id'],
    num_rows: 1558009
})

In [6]:
for k, v in language_article_mapping.items():
    print(f" For language {k}, there are {len(v)} availabile articles")

 For language en, there are 639046 availabile articles
 For language fr, there are 357248 availabile articles
 For language de, there are 435694 availabile articles
 For language it, there are 245572 availabile articles


# Titles

In [21]:
titles_file = 'titles.json'

data = []
with open(titles_file, encoding='utf-8') as f:
    for line in f:
        data.append(json.loads(line))

In [22]:
# Creating a dictionary for mapping Wikidata IDs to their categories

title_dict = dict()
for item in data:
    title_dict[item["id"]] = {"title": item["title"], "categories": item["categories"].split("#")}

In [23]:
category_list = []
lens_list = []

for key, values in title_dict.items():
    lst = values["categories"]
    category_list.extend(lst)
    lens_list.append(len(lst))

print(f"Number of total occurances for all categories: {len(category_list)}")

Number of total occurances for all categories: 8020123


In [24]:
# Creating a list of the substrings that will be used in filtering out operation.

exclude_file = "notes.txt"

with open(exclude_file) as file:
    exclude_list = [line.rstrip().split(",")[0] for line in file if line.rstrip().split(",")[1] != "?"]
print("List of the substring:")
print(exclude_list)

List of the substring:
['Articles', 'description', 'articles', 'Living', 'Wikidata', 'Webarchive', 'CS1', 'AC with 0 elements', 'missing', 'invalid', 'Pages', 'pages', 'Template', 'errors', 'cleanup', 'unknown', 'Use dmy dates from', 'All ', 'Source attribution', 'birth', 'Biography with signature', 'EngvarB', 'Singlechart', 'usages', 'deaths', 'Use mdy dates', 'Commons', 'links', 'lacking', 'needing confirmation', 'Drugboxes which contain changes to watched fields', 'certification', 'ambiguous', 'Use ']


In [25]:
print(f"Number of total occurances for all categories before filter out operation: {len(category_list)}")

category_list = [category for category in category_list if all(string not in category for string in exclude_list)]
print(f"Number of total occurances for all categories after filter out operation: {len(category_list)}")

l_c = Counter(lens_list)
category_dict = Counter(category_list)
print(f"Number of total unique categories: {len(category_dict)}")

category_dict = {k:v for k,v in category_dict.items() if v > 1}
print(f"Number of total unique categories that appear more than once: {len(category_dict)}")

Number of total occurances for all categories before filter out operation: 8020123
Number of total occurances for all categories after filter out operation: 3475826
Number of total unique categories: 611644
Number of total unique categories that appear more than once: 354497


In [26]:
print(f"Number of articles before filtering: {len(title_dict)}")

updated_title_dict = dict()
for key, values in title_dict.items():
    # filtering out the articles that don't belong to any of the respective categories
    if any(category in category_dict for category in values["categories"]):
        # updating the category list of each article
        updated_title_dict[key] = [value for value in values["categories"] if value in category_dict] 
        
print(f"Number of articles after filtering: {len(updated_title_dict)}")


Number of articles before filtering: 423790
Number of articles after filtering: 420002


In [27]:
# Categoriy-article mapping: for each category, corresponding articles are listed. 

category_article_mapping = defaultdict(list)
for key, values in updated_title_dict.items():
    for category in values:
        category_article_mapping[category].append(key)    

In [28]:
# For each language, creates a "category_article_mapping" so that for each category, corresponding articles will be listed.

def language_category_mapper(category_article_mapping: dict, article_set: set) -> dict:
    mapping = defaultdict(list)
    for key, values in category_article_mapping.items():
        # For each language - category combination, get the available articles 
        values = [value for value in values if value in article_set]
        # If category only has one article, discard
        if len(values) > 1:
            mapping[key] = values

    return mapping


language_category_article_mapping = dict(dict())
for language, article_set in language_article_mapping.items():
    language_category_article_mapping[language] = language_category_mapper(category_article_mapping, article_set)

In [29]:
for k, v in language_category_article_mapping.items():
    print(f" For language {k}, there are {len(v)} availabile categories")

 For language en, there are 354497 availabile categories
 For language fr, there are 244967 availabile categories
 For language de, there are 272606 availabile categories
 For language it, there are 185971 availabile categories


In [40]:
# For each language, creates a dictionary, for which each article is listed with all corresponding categories. 

def language_title_mapper(updated_title_dict, category_set, language_article_mapping, language, verbose=False):
    mapping = dict()
    id_set = language_article_mapping[language]
    for key, values in updated_title_dict.items():
    # filtering out the articles that don't belong to any of the respective categories
        if any(category in category_set for category in values) and key in id_set:
            # updating the category list of each article
            mapping[key] = [value for value in values if value in category_set] 
        elif verbose:
            print(key, language, values)
            
    return mapping

language_title_dict = dict(dict())

for language, category_set in language_category_article_mapping.items():
    language_title_dict[language] = language_title_mapper(updated_title_dict, category_set, language_article_mapping, language, verbose=False)

In [41]:
for k, v in language_article_mapping.items():
    print(f" For language {k}, there were {len(v)} availabile articles before filtering")

 For language en, there were 639046 availabile articles before filtering
 For language fr, there were 357248 availabile articles before filtering
 For language de, there were 435694 availabile articles before filtering
 For language it, there were 245572 availabile articles before filtering


In [45]:
for k, v in language_title_dict.items():
    print(f" For language {k}, there are {len(v)} availabile articles after filtering")

 For language en, there are 420002 availabile articles after filtering
 For language fr, there are 233955 availabile articles after filtering
 For language de, there are 285771 availabile articles after filtering
 For language it, there are 160765 availabile articles after filtering


# Hard Negative Finding

Some results:

"Q3588472": Émile Dubonnet, French balloonist  <br />
"Q588510": Jacques Balsan, French aviator and businessman

"Q206961": Épinay-sur-Seine, commune in Seine-Saint-Denis <br />
"Q175999": Le Pré-Saint-Gervais, commune in Seine-Saint-Denis

In [46]:
def hard_negative_finder(language: str, initial_article:str, language_title_dict: dict, language_category_article_mapping: dict) -> str:
    cat_list = language_title_dict[language][initial_article]
    category = random.choice(cat_list)
    available_articles = [article for article in language_category_article_mapping[language][category] if article != initial_article]
    selected_article = random.choice(available_articles)
    return selected_article

In [55]:
language = "en"
initial_article = "Q206961"

selected_article = hard_negative_finder(language, initial_article, language_title_dict, language_category_article_mapping)
print(selected_article)

Q275353
