In [1]:
import json
import os
import re
import time
from datetime import datetime
from itertools import islice


In [2]:
# Load the repairs data from disk.
# JSON text is UTF-8 by specification, so the encoding is passed explicitly
# instead of relying on the platform's locale default (which is not UTF-8 on
# every system and would mis-decode or reject non-ASCII text).
with open("data_sample/02_wikidata_repairs.json", "r", encoding="utf-8") as f:
    wikidata_repairs = json.load(f)

In [3]:
# Load the world-state data from disk.
# JSON text is UTF-8 by specification, so the encoding is passed explicitly
# instead of relying on the platform's locale default (which is not UTF-8 on
# every system and would mis-decode or reject non-ASCII text).
with open("data_sample/03_world_state.json", "r", encoding="utf-8") as f:
    world_state = json.load(f)

In [4]:
# Load the classified benchmark from a JSON Lines file (one JSON document per line).
# MAX_BENCHMARK_ENTRIES caps how many entries are read; None (the value used
# here) reads the whole file. Set it to an integer for a quick partial load.
MAX_BENCHMARK_ENTRIES = None
with open("data_sample/04_classified_benchmark.jsonl", "r", encoding="utf-8") as f:
    # Skip blank / whitespace-only lines (e.g. an extra empty line at the end
    # of the file) because json.loads raises on them.
    non_blank_lines = (line for line in f if line.strip())
    classified_benchmark = [
        json.loads(line) for line in islice(non_blank_lines, MAX_BENCHMARK_ENTRIES)
    ]


In [4]:
# Sanity check: show the sizes of the two loaded collections side by side.
tuple(len(loaded) for loaded in (wikidata_repairs, world_state))

(38738, 38738)

In [10]:
# Tally benchmark entries per track and per normalized violation type.
# Plain dicts are used so the tallies keep first-seen order when printed.
track_counts, violation_types_counts = {}, {}
for entry in classified_benchmark:
    entry_track = entry["track"]
    track_counts.setdefault(entry_track, 0)
    track_counts[entry_track] += 1

    normalized_type = entry["violation_context"]["report_violation_type_normalized"]
    violation_types_counts.setdefault(normalized_type, 0)
    violation_types_counts[normalized_type] += 1

for tally in (track_counts, violation_types_counts):
    print(tally)

{'T_BOX': 33356, 'A_BOX': 5382}
{'Unique value': 6732, 'Label in sk language': 333, 'None of': 922, 'Item P|19': 169, 'Item P|2446': 159, 'Allowed qualifiers': 925, 'Label in cy language': 144, 'One of': 1583, 'Single value': 3227, 'Label in vi language': 357, 'Label in de language': 459, 'Format': 4855, 'Symmetric': 546, 'Conflicts with P|31': 790, 'Type Q|53764782': 178, 'Type Q|5, Q|729, Q|115537581, Q|15619164, Q|201662, Q|40614, Q|7239, Q|11012, Q|10832, Q|79600797, Q|16521, Q|1285470, Q|26513, Q|2345820, Q|111282474, Q|2593744, Q|168658, Q|15707583, Q|26401003, Q|61002': 5, 'Scope': 308, 'Mandatory Qualifiers': 703, 'Label in es language': 720, 'Type Q|386724, Q|17737, Q|1151067, Q|17489659, Q|3331189, Q|15306849, Q|2424752, Q|2031291, Q|814232, Q|121033050, Q|1046315, Q|179550, Q|15938550, Q|95074, Q|1172486, Q|49848': 175, 'Value type Q|1456832, Q|690768, Q|1184244, Q|7311382, Q|2628882, Q|83267, Q|124734, Q|1323212, Q|12142141': 3, 'Label in zh language': 347, 'Item P|402': 33

In [6]:
# Write up to the first 10 repair records to a pretty-printed sample file.
with open("wikidata_repairs_sample.json", mode="w") as sample_file:
    first_repairs = wikidata_repairs[:10]
    json.dump(first_repairs, sample_file, indent=2)



In [15]:
# Write a small sample of the world state: its first 10 entries, in the
# mapping's own iteration order, pretty-printed.
# islice stops after 10 items, so the full key list is never built and each
# value comes straight from the (key, value) pair instead of a second lookup.
with open("world_state_sample.json", "w") as f:
    json.dump(dict(islice(world_state.items(), 10)), f, indent=2)

In [9]:
# Build a small sample of the benchmark with a fixed quota per track, taking
# entries in their original order, then write it out as pretty-printed JSON.
num_abox_entries = 15
num_tbox_entries = 10

classified_benchmark_sample = []
abox_count = 0
tbox_count = 0
for record in classified_benchmark:
    record_track = record["track"]
    if record_track == "A_BOX":
        # Keep A_BOX entries only while their quota is still open.
        if abox_count < num_abox_entries:
            classified_benchmark_sample.append(record)
            abox_count += 1
    elif record_track == "T_BOX":
        # Keep T_BOX entries only while their quota is still open.
        if tbox_count < num_tbox_entries:
            classified_benchmark_sample.append(record)
            tbox_count += 1

    # Stop scanning as soon as neither quota has room left.
    if not (abox_count < num_abox_entries or tbox_count < num_tbox_entries):
        break

with open("classified_benchmark_sample.json", mode="w") as sample_file:
    json.dump(classified_benchmark_sample, sample_file, indent=2)