In [7]:
import json
import requests
import csv
import os
from tqdm import tqdm

In [8]:
def get_wikipedia_title_from_wikidata(qid: str, language: str = "en") -> str:
    """
    Given a Wikidata QID (e.g., 'Q328656'), retrieve the corresponding
    Wikipedia page title in the specified language (default: English).
    Returns None if the item does not have a matching sitelink or if there's an error.
    """
    url = f"https://www.wikidata.org/wiki/Special:EntityData/{qid}.json"
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Error fetching Wikidata for {qid}.")
        return None
    
    data = response.json()
    try:
        entity_info = data["entities"][qid]
        sitelinks = entity_info["sitelinks"]
        wiki_key = f"{language}wiki"
        if wiki_key in sitelinks:
            return sitelinks[wiki_key]["title"]
        else:
            print(f"No '{language}wiki' sitelink for {qid}")
            return None
    except KeyError:
        print(f"Wikidata entry not found or malformed for {qid}")
        return None
    
def gather_qids_from_files(file_paths):
    """
    Given a list of JSON file paths, gather all QIDs (main 'wikidata_ID'
    plus any in 'answer_ids'). Returns a set of QIDs.
    """
    all_qids = set()
    for path in file_paths:
        if not os.path.exists(path):
            print(f"Warning: File {path} does not exist. Skipping.")
            continue
        
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
            for item in data:
                # Add main wikidata_ID
                if "wikidata_ID" in item:
                    all_qids.add(item["wikidata_ID"])
                
                # Add any QIDs from 'answer_ids'
                if "answer_ids" in item:
                    for _, v in item["answer_ids"].items():
                        all_qids.add(v)
    return all_qids

In [9]:
# Update these paths as needed
json_file_paths = [
    "../data/train_TLQA.json",  # or "train_TLQA.json"
    "../data/test_TLQA.json",   # or "test_TLQA.json"
    "../data/val_TLQA.json"     # or "val_TLQA.json"
]

# 1. Gather all QIDs
qid_set = gather_qids_from_files(json_file_paths)
print(f"Found {len(qid_set)} unique QIDs in total.")

# 2. For each QID, retrieve the Wikipedia title (with a progress bar)
qid_title_pairs = []
for qid in tqdm(qid_set, desc="Retrieving Wikipedia titles"):
    title = get_wikipedia_title_from_wikidata(qid, language="en")
    if title:
        qid_title_pairs.append((qid, title))

print(f"Successfully retrieved {len(qid_title_pairs)} titles.")

# 3. Write out to CSV
output_csv_path = "qid_to_titles.csv"
with open(output_csv_path, mode="w", encoding="utf-8", newline="") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["QID", "Wikipedia_Title"])  # Header
    for qid, title in qid_title_pairs:
        writer.writerow([qid, title])

print(f"CSV of QIDs and titles written to {output_csv_path}")

Found 13971 unique QIDs in total.


Retrieving Wikipedia titles:   0%|          | 25/13971 [00:13<1:44:09,  2.23it/s]

No 'enwiki' sitelink for Q42712773


Retrieving Wikipedia titles:   3%|▎         | 432/13971 [03:29<1:30:09,  2.50it/s]

No 'enwiki' sitelink for Q7005377


Retrieving Wikipedia titles:   5%|▍         | 669/13971 [05:32<1:55:42,  1.92it/s]

No 'enwiki' sitelink for Q23762108


Retrieving Wikipedia titles:   6%|▋         | 879/13971 [07:16<1:54:10,  1.91it/s]

No 'enwiki' sitelink for Q64029222


Retrieving Wikipedia titles:   8%|▊         | 1171/13971 [09:36<1:44:21,  2.04it/s]

No 'enwiki' sitelink for Q9379740


Retrieving Wikipedia titles:  10%|█         | 1426/13971 [11:40<1:37:50,  2.14it/s]

No 'enwiki' sitelink for Q57360504


Retrieving Wikipedia titles:  10%|█         | 1466/13971 [12:00<1:43:54,  2.01it/s]

No 'enwiki' sitelink for Q21580706


Retrieving Wikipedia titles:  12%|█▏        | 1714/13971 [14:07<1:51:03,  1.84it/s]

No 'enwiki' sitelink for Q84735825


Retrieving Wikipedia titles:  13%|█▎        | 1827/13971 [15:02<1:43:00,  1.96it/s]

No 'enwiki' sitelink for Q72762365


Retrieving Wikipedia titles:  13%|█▎        | 1852/13971 [15:15<1:52:11,  1.80it/s]

No 'enwiki' sitelink for Q43122612


Retrieving Wikipedia titles:  14%|█▎        | 1890/13971 [15:36<1:48:35,  1.85it/s]

No 'enwiki' sitelink for Q4146869


Retrieving Wikipedia titles:  15%|█▍        | 2064/13971 [17:05<1:49:24,  1.81it/s]

No 'enwiki' sitelink for Q42907466


Retrieving Wikipedia titles:  15%|█▌        | 2141/13971 [17:46<1:27:47,  2.25it/s]

No 'enwiki' sitelink for Q20647668


Retrieving Wikipedia titles:  16%|█▌        | 2179/13971 [18:06<1:30:01,  2.18it/s]

No 'enwiki' sitelink for Q85520551


Retrieving Wikipedia titles:  16%|█▌        | 2230/13971 [18:33<1:42:31,  1.91it/s]

No 'enwiki' sitelink for Q1546372


Retrieving Wikipedia titles:  16%|█▌        | 2232/13971 [18:34<1:37:14,  2.01it/s]

Wikidata entry not found or malformed for Q55623217


Retrieving Wikipedia titles:  17%|█▋        | 2329/13971 [19:19<1:36:19,  2.01it/s]

No 'enwiki' sitelink for Q130929


Retrieving Wikipedia titles:  17%|█▋        | 2416/13971 [20:01<1:25:54,  2.24it/s]

No 'enwiki' sitelink for Q76861


Retrieving Wikipedia titles:  19%|█▉        | 2639/13971 [21:56<1:19:02,  2.39it/s]

No 'enwiki' sitelink for Q75863568


Retrieving Wikipedia titles:  20%|█▉        | 2792/13971 [23:09<1:37:53,  1.90it/s]

No 'enwiki' sitelink for Q16235337


Retrieving Wikipedia titles:  20%|██        | 2836/13971 [23:31<1:34:45,  1.96it/s]

No 'enwiki' sitelink for Q3315346


Retrieving Wikipedia titles:  20%|██        | 2846/13971 [23:35<1:26:34,  2.14it/s]

No 'enwiki' sitelink for Q90880889


Retrieving Wikipedia titles:  22%|██▏       | 3023/13971 [25:00<1:32:12,  1.98it/s]

No 'enwiki' sitelink for Q30591832


Retrieving Wikipedia titles:  25%|██▍       | 3443/13971 [28:17<1:11:26,  2.46it/s]

No 'enwiki' sitelink for Q21580140


Retrieving Wikipedia titles:  25%|██▌       | 3493/13971 [28:40<1:12:32,  2.41it/s]

Wikidata entry not found or malformed for Q26132701


Retrieving Wikipedia titles:  26%|██▌       | 3594/13971 [29:30<1:43:13,  1.68it/s]

No 'enwiki' sitelink for Q731414


Retrieving Wikipedia titles:  31%|███▏      | 4396/13971 [36:23<1:25:48,  1.86it/s]

No 'enwiki' sitelink for Q39821310


Retrieving Wikipedia titles:  33%|███▎      | 4546/13971 [37:43<1:16:11,  2.06it/s]

No 'enwiki' sitelink for Q45715094


Retrieving Wikipedia titles:  33%|███▎      | 4629/13971 [38:25<1:00:51,  2.56it/s]

No 'enwiki' sitelink for Q2566297


Retrieving Wikipedia titles:  34%|███▍      | 4734/13971 [39:21<1:04:48,  2.38it/s]

No 'enwiki' sitelink for Q2242323


Retrieving Wikipedia titles:  36%|███▌      | 4976/13971 [41:22<1:25:48,  1.75it/s]

No 'enwiki' sitelink for Q467147


Retrieving Wikipedia titles:  39%|███▉      | 5463/13971 [45:25<55:37,  2.55it/s]  

Wikidata entry not found or malformed for Q28811961


Retrieving Wikipedia titles:  40%|███▉      | 5546/13971 [46:07<1:06:05,  2.12it/s]

No 'enwiki' sitelink for Q24049850


Retrieving Wikipedia titles:  42%|████▏     | 5813/13971 [48:18<49:50,  2.73it/s]  

No 'enwiki' sitelink for Q37831208


Retrieving Wikipedia titles:  42%|████▏     | 5876/13971 [48:48<1:19:17,  1.70it/s]

No 'enwiki' sitelink for Q172721


Retrieving Wikipedia titles:  44%|████▎     | 6098/13971 [50:48<1:08:30,  1.92it/s]

Wikidata entry not found or malformed for Q61003751


Retrieving Wikipedia titles:  47%|████▋     | 6557/13971 [55:04<1:02:19,  1.98it/s]

No 'enwiki' sitelink for Q42851750


Retrieving Wikipedia titles:  48%|████▊     | 6767/13971 [56:56<59:36,  2.01it/s]  

No 'enwiki' sitelink for Q97905930


Retrieving Wikipedia titles:  49%|████▉     | 6861/13971 [57:46<1:06:54,  1.77it/s]

No 'enwiki' sitelink for Q60974301


Retrieving Wikipedia titles:  51%|█████     | 7146/13971 [1:00:13<56:23,  2.02it/s]  

No 'enwiki' sitelink for Q15141324


Retrieving Wikipedia titles:  52%|█████▏    | 7223/13971 [1:00:54<56:47,  1.98it/s]  

No 'enwiki' sitelink for Q113628552


Retrieving Wikipedia titles:  53%|█████▎    | 7343/13971 [1:01:53<59:01,  1.87it/s]  

No 'enwiki' sitelink for Q20792297


Retrieving Wikipedia titles:  53%|█████▎    | 7391/13971 [1:02:18<1:02:02,  1.77it/s]

Wikidata entry not found or malformed for Q4827280


Retrieving Wikipedia titles:  54%|█████▎    | 7506/13971 [1:03:18<59:25,  1.81it/s]  

No 'enwiki' sitelink for Q27826669


Retrieving Wikipedia titles:  54%|█████▍    | 7559/13971 [1:03:46<42:25,  2.52it/s]  

Wikidata entry not found or malformed for Q60477696


Retrieving Wikipedia titles:  55%|█████▍    | 7639/13971 [1:04:28<57:00,  1.85it/s]  

No 'enwiki' sitelink for Q42418130


Retrieving Wikipedia titles:  59%|█████▊    | 8178/13971 [1:08:54<47:58,  2.01it/s]  

No 'enwiki' sitelink for Q113628485


Retrieving Wikipedia titles:  61%|██████    | 8460/13971 [1:11:05<43:07,  2.13it/s]  

No 'enwiki' sitelink for Q38161518


Retrieving Wikipedia titles:  64%|██████▎   | 8897/13971 [1:14:37<43:34,  1.94it/s]  

No 'enwiki' sitelink for Q54861598


Retrieving Wikipedia titles:  65%|██████▌   | 9102/13971 [1:16:14<29:10,  2.78it/s]  

No 'enwiki' sitelink for Q707492


Retrieving Wikipedia titles:  66%|██████▌   | 9166/13971 [1:16:45<34:11,  2.34it/s]

No 'enwiki' sitelink for Q7889282


Retrieving Wikipedia titles:  66%|██████▋   | 9278/13971 [1:17:47<38:58,  2.01it/s]  

No 'enwiki' sitelink for Q28224782


Retrieving Wikipedia titles:  68%|██████▊   | 9464/13971 [1:19:15<27:30,  2.73it/s]

No 'enwiki' sitelink for Q1785225


Retrieving Wikipedia titles:  69%|██████▉   | 9702/13971 [1:21:11<29:32,  2.41it/s]

Wikidata entry not found or malformed for Q5379321


Retrieving Wikipedia titles:  71%|███████   | 9853/13971 [1:22:21<40:39,  1.69it/s]

No 'enwiki' sitelink for Q14957003


Retrieving Wikipedia titles:  73%|███████▎  | 10257/13971 [1:25:34<26:26,  2.34it/s]

No 'enwiki' sitelink for Q10871933


Retrieving Wikipedia titles:  75%|███████▍  | 10445/13971 [1:27:03<30:40,  1.92it/s]

No 'enwiki' sitelink for Q113517072


Retrieving Wikipedia titles:  75%|███████▍  | 10465/13971 [1:27:12<30:11,  1.94it/s]

No 'enwiki' sitelink for Q7444352


Retrieving Wikipedia titles:  75%|███████▌  | 10503/13971 [1:27:29<26:28,  2.18it/s]

No 'enwiki' sitelink for Q55949824


Retrieving Wikipedia titles:  77%|███████▋  | 10810/13971 [1:29:59<27:27,  1.92it/s]

No 'enwiki' sitelink for Q52729635


Retrieving Wikipedia titles:  78%|███████▊  | 10852/13971 [1:30:19<21:22,  2.43it/s]

No 'enwiki' sitelink for Q326552


Retrieving Wikipedia titles:  78%|███████▊  | 10913/13971 [1:30:46<20:56,  2.43it/s]

No 'enwiki' sitelink for Q870764


Retrieving Wikipedia titles:  80%|████████  | 11224/13971 [1:33:21<23:13,  1.97it/s]

Wikidata entry not found or malformed for Q2493761


Retrieving Wikipedia titles:  80%|████████  | 11226/13971 [1:33:22<22:46,  2.01it/s]

No 'enwiki' sitelink for Q11895142


Retrieving Wikipedia titles:  82%|████████▏ | 11469/13971 [1:35:30<21:08,  1.97it/s]

No 'enwiki' sitelink for Q958419


Retrieving Wikipedia titles:  82%|████████▏ | 11500/13971 [1:35:46<23:28,  1.75it/s]

No 'enwiki' sitelink for Q12454125


Retrieving Wikipedia titles:  84%|████████▍ | 11798/13971 [1:38:22<20:13,  1.79it/s]

Wikidata entry not found or malformed for Q63932620


Retrieving Wikipedia titles:  86%|████████▌ | 12015/13971 [1:40:11<13:03,  2.50it/s]

No 'enwiki' sitelink for Q5457357


Retrieving Wikipedia titles:  86%|████████▋ | 12082/13971 [1:40:46<16:12,  1.94it/s]

No 'enwiki' sitelink for Q5116465


Retrieving Wikipedia titles:  87%|████████▋ | 12117/13971 [1:41:06<18:27,  1.67it/s]

No 'enwiki' sitelink for Q18044891


Retrieving Wikipedia titles:  88%|████████▊ | 12228/13971 [1:42:01<14:33,  2.00it/s]

Wikidata entry not found or malformed for Q206343


Retrieving Wikipedia titles:  88%|████████▊ | 12338/13971 [1:42:54<10:26,  2.61it/s]

No 'enwiki' sitelink for Q66517156


Retrieving Wikipedia titles:  89%|████████▉ | 12416/13971 [1:43:34<12:46,  2.03it/s]

No 'enwiki' sitelink for Q24234023


Retrieving Wikipedia titles:  89%|████████▉ | 12476/13971 [1:44:05<13:01,  1.91it/s]

No 'enwiki' sitelink for Q45837766


Retrieving Wikipedia titles:  89%|████████▉ | 12483/13971 [1:44:08<12:50,  1.93it/s]

No 'enwiki' sitelink for Q21993339


Retrieving Wikipedia titles:  91%|█████████ | 12645/13971 [1:45:34<10:51,  2.03it/s]

Wikidata entry not found or malformed for Q4574084


Retrieving Wikipedia titles:  96%|█████████▌| 13387/13971 [1:51:55<03:12,  3.03it/s]

No 'enwiki' sitelink for Q162449


Retrieving Wikipedia titles:  97%|█████████▋| 13611/13971 [1:53:47<03:00,  1.99it/s]

No 'enwiki' sitelink for Q1408517


Retrieving Wikipedia titles:  99%|█████████▉| 13822/13971 [1:55:34<01:14,  2.00it/s]

No 'enwiki' sitelink for Q1364307


Retrieving Wikipedia titles:  99%|█████████▉| 13853/13971 [1:55:49<01:06,  1.77it/s]

No 'enwiki' sitelink for Q16145297


Retrieving Wikipedia titles: 100%|██████████| 13971/13971 [1:56:45<00:00,  1.99it/s]

Successfully retrieved 13891 titles.
CSV of QIDs and titles written to qid_to_titles.csv



