In [1]:
# ■ 1) Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# ■ 2) Install / import tqdm for progress‑bars
try:
    from tqdm import tqdm
except ImportError:     
    !pip install -q tqdm
    from tqdm import tqdm


In [3]:
# ■ 3) Load merged JSON file
import json, re, unicodedata
from pprint import pprint

FILE_PATH = '/content/drive/MyDrive/HotpotQA_snapshot/all_docs_chunks_entities_relations_all.json'
print(f'Loading file: {FILE_PATH}')
with open(FILE_PATH, 'r', encoding='utf-8') as f:
    data = json.load(f)
print(f"Loaded {len(data):,} chunks.")


Loading file: /content/drive/MyDrive/HotpotQA_snapshot/all_docs_chunks_entities_relations_all.json
Loaded 66,237 chunks.


In [4]:
# ■ 4) Utility ▸ clean & normalise relation surface‑forms
QUOTE_CHARS = '"\'“”‘’`´‛❝❞❮❯⹂〝〞«»‚„”″′˝'
PUNCT_TAIL  = ',.;:!?☞\u2014\u2013-'

def strip_outer_quotes(txt: str) -> str:
    """Remove matching outer quote characters."""
    if len(txt) >= 2 and txt[0] in QUOTE_CHARS and txt[-1] in QUOTE_CHARS:
        return txt[1:-1]
    return txt

_WS_RE = re.compile(r'\s+', re.UNICODE)     # هر توالی whitespace

def clean_relation(raw: str) -> str:
    s = raw.strip()

    while s and s[0] in QUOTE_CHARS:
        s = s[1:]
    while s and s[-1] in QUOTE_CHARS:
        s = s[:-1]
    s = strip_outer_quotes(s)

    s = s.strip(PUNCT_TAIL + ' ')

    s = unicodedata.normalize('NFKC', s).casefold()

    s = _WS_RE.sub(' ', s)

    s = s.replace(' ', '_')

    return s.strip()

In [5]:
# ■ 5) Parse relations → collect unique, cleaned relation labels
unique_relations = set()
invalid_relations = 0

for chunk in tqdm(data, desc='Processing chunks'):
    for rel in chunk.get('relations', []):
        if not isinstance(rel, str):
            continue
        arrow_cnt = rel.count('->')
        if arrow_cnt < 2:
            invalid_relations += 1
            continue

        parts = [p.strip() for p in rel.split('->')]
        # throw away empty segments (happens if " -> " with spaces)
        parts = [p for p in parts if p]

        if len(parts) < 3:                
            middle = parts[1]
        else:                              
            middle = '_'.join(parts[1:-1])

        rel_clean = clean_relation(middle)
        if rel_clean:
            unique_relations.add(rel_clean)

print(f"Done. Skipped / ill‑formed relations: {invalid_relations:,}")
print(f"Unique relations collected: {len(unique_relations):,}")


Processing chunks: 100%|██████████| 66237/66237 [00:04<00:00, 14215.82it/s]

Done. Skipped / ill‑formed relations: 2,066
Unique relations collected: 139,253





In [6]:
# ■ 5‑bis) Count *all* relation strings

total_relations = sum(len(chunk.get('relations', [])) for chunk in data)
print(f"Total relation strings (raw, incl. duplicates): {total_relations:,}")
print(f"Unique relation labels (after cleaning):        {len(unique_relations):,}")


Total relation strings (raw, incl. duplicates): 1,235,425
Unique relation labels (after cleaning):        139,253


In [7]:
# ■ 6) Save the clean relation list for downstream embedding
OUTPUT_PATH = '/content/drive/MyDrive/HotpotQA_snapshot/unique_relations.txt'
with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
    for r in sorted(unique_relations):
        f.write(r + '\n')
print(f"Relation list saved → {OUTPUT_PATH}")


Relation list saved → /content/drive/MyDrive/HotpotQA_snapshot/unique_relations.txt


In [8]:
# ■ 7) Quick sanity‑check
print("First 100 relations (alphabetically):")
pprint(list(sorted(unique_relations))[:100])


First 100 relations (alphabetically):
['&_maintenance',
 '(1939)',
 '(date)',
 '(no_explicit_relation)',
 '(no_relation)',
 '(pauley,_j.)',
 '(s.d.n.y',
 '+_twitter_trending_140_chart',
 '/',
 '0-8',
 '0.6%',
 '1,000th_performance_on',
 '1.0%',
 '1.5%',
 '1.8%',
 '10',
 '1000s',
 '100_miles',
 '10th_duke_of',
 '10th_edition',
 '10th_largest_city_in_world',
 '10th_largest_in',
 '11:00_p.m',
 '11_of_14',
 '11th_playoff_meeting',
 '12',
 '12.4%',
 '12th_attempt',
 '12th_overall_to_be_selected',
 '13%',
 '13.1%',
 '13th_time_in',
 '14%',
 '14th',
 '150th_meeting',
 '155',
 '15_april_1909',
 '15th_highest-grossing_concert_tour_of_all_time',
 '160_km',
 '160_l',
 '16th',
 '17%',
 '17-yard_line',
 '18%',
 '180-gram_black_vinyl_reissue',
 '1854',
 '1881',
 '18_times',
 '18th',
 '18th_top_goalscorer_in',
 '1902',
 '1904',
 '1906',
 '1918_work',
 '1940_romance',
 '1956',
 '197_refugees',
 '1999',
 '1st',
 '1st_baron_montagu',
 '1st_battalion,_9th_cavalry',
 '1st_duchess_of',
 '1st_duke_of',
 '2%

In [10]:
pprint(list(sorted(unique_relations))[130000:130100])


['wanted_to_be',
 'wanted_to_be_a_combination_of',
 'wanted_to_be_buried_at',
 'wanted_to_be_buried_in',
 'wanted_to_be_called',
 'wanted_to_be_discussed_in',
 'wanted_to_be_endowed_with',
 'wanted_to_be_flown_to',
 'wanted_to_be_girlfriend_of',
 'wanted_to_be_grounded_by',
 'wanted_to_be_like',
 'wanted_to_be_on',
 'wanted_to_be_portrayed',
 'wanted_to_be_remembered_as',
 'wanted_to_be_rid_of',
 'wanted_to_be_separated_from',
 'wanted_to_be_traded_to',
 'wanted_to_be_with',
 'wanted_to_beat',
 'wanted_to_become',
 'wanted_to_book',
 'wanted_to_break_down',
 'wanted_to_break_off',
 'wanted_to_bring',
 'wanted_to_bring_back',
 'wanted_to_bring_in',
 'wanted_to_build',
 'wanted_to_buy',
 'wanted_to_cancel',
 'wanted_to_capitalize_on',
 'wanted_to_capitan',
 'wanted_to_capture',
 'wanted_to_carve_out',
 'wanted_to_cast',
 'wanted_to_challenge',
 'wanted_to_chase_away',
 'wanted_to_co-manage',
 'wanted_to_collaborate',
 'wanted_to_combine',
 'wanted_to_complete',
 'wanted_to_concentrate_on