In [1]:
# Install gdown & download the dataset (≈ 220 MB)
# NOTE(review): gdown is installed without a version pin; consider pinning it
# so the notebook stays reproducible.
!pip install --quiet gdown

import os, json, textwrap
from pathlib import Path

# Working directory for every HotpotQA artefact produced by this notebook;
# created (with parents) if it does not exist yet.
DATA_DIR = Path("/content/Curious_LLM_DATA/HotpotQA")
DATA_DIR.mkdir(parents=True, exist_ok=True)
# Destination path of the downloaded test-set JSON.
TEST_JSON = DATA_DIR / "test_docs.json"

# Google-Drive file id for HotpotQA test set.
# "$TEST_JSON" is expanded by IPython to the Python variable's value before
# the shell command runs, so the download lands at TEST_JSON.
!gdown --id 1a4dVGJ24cdQ88Ikz-vt3wFlxOREJlgLE -O "$TEST_JSON"
print(f"Downloaded to {TEST_JSON}\n")

Downloading...
From (original): https://drive.google.com/uc?id=1a4dVGJ24cdQ88Ikz-vt3wFlxOREJlgLE
From (redirected): https://drive.google.com/uc?id=1a4dVGJ24cdQ88Ikz-vt3wFlxOREJlgLE&confirm=t&uuid=69a457a9-16ee-43a8-9eb4-7bfc57fa368f
To: /content/Curious_LLM_DATA/HotpotQA/test_docs.json
100% 223M/223M [00:02<00:00, 74.5MB/s]
Downloaded to /content/Curious_LLM_DATA/HotpotQA/test_docs.json



In [2]:
# 🗂️ Cell 2 – Flatten *all* `titles` and `docs`
# Load the downloaded test records; each record is read through its
# "titles" and "docs" keys below.
with TEST_JSON.open("r", encoding="utf-8") as f:
    records = json.load(f)

all_titles, all_docs = [], []
for rec_no, rec in enumerate(records):
    rec_titles, rec_docs = rec["titles"], rec["docs"]
    # Check alignment per record: a global length check alone cannot detect
    # two records whose mismatches cancel out, which would silently pair
    # every later title with the wrong doc.
    assert len(rec_titles) == len(rec_docs), (
        f"titles/docs length mismatch in record {rec_no}: "
        f"{len(rec_titles)} titles vs {len(rec_docs)} docs"
    )
    all_titles.extend(rec_titles)
    all_docs.extend(rec_docs)

assert len(all_titles) == len(all_docs), "titles/docs length mismatch!"
print(f"Total docs collected: {len(all_docs):,}")

# Persist the flattened, index-aligned lists as a single snapshot file.
ALL_DOCS_JSON = DATA_DIR / "all_docs.json"
with ALL_DOCS_JSON.open("w", encoding="utf-8") as f:
    json.dump({"titles": all_titles, "docs": all_docs}, f, ensure_ascii=False, indent=2)
print(f"Wrote complete snapshot → {ALL_DOCS_JSON}\n")

Total docs collected: 6,000
Wrote complete snapshot → /content/Curious_LLM_DATA/HotpotQA/all_docs.json



In [3]:
# 🗂️ Cell 4 – Identify docs with fewer than 150 words and list their indices

import json
from pathlib import Path

# Path to the flattened docs file
ALL_DOCS_JSON = Path("/content/Curious_LLM_DATA/HotpotQA/all_docs.json")

# Read the flattened snapshot back from disk
with ALL_DOCS_JSON.open("r", encoding="utf-8") as snapshot_file:
    data = json.load(snapshot_file)
titles, docs = data["titles"], data["docs"]

# Positions of every doc whose whitespace-delimited word count is below 150
short_doc_indices = []
for position, text in enumerate(docs):
    if len(text.split()) < 150:
        short_doc_indices.append(position)

# Report the count first, then the raw index list
print(f"Found {len(short_doc_indices)} docs with fewer than 150 words.\n")
print("Indices of short docs:")
print(short_doc_indices)


Found 753 docs with fewer than 150 words.

Indices of short docs:
[4, 7, 8, 9, 15, 34, 52, 65, 95, 108, 124, 127, 128, 135, 138, 141, 154, 155, 168, 185, 189, 210, 226, 253, 254, 262, 274, 279, 283, 296, 298, 300, 325, 328, 330, 355, 360, 362, 363, 376, 380, 386, 393, 400, 406, 408, 412, 420, 422, 423, 429, 434, 437, 449, 457, 459, 469, 477, 478, 481, 485, 497, 505, 512, 529, 539, 546, 548, 549, 561, 565, 577, 580, 584, 586, 592, 600, 601, 608, 610, 612, 617, 621, 630, 641, 644, 649, 661, 666, 671, 673, 675, 679, 687, 690, 699, 703, 707, 708, 712, 722, 728, 735, 739, 743, 744, 745, 752, 764, 783, 793, 816, 817, 822, 839, 842, 843, 866, 876, 879, 884, 892, 906, 908, 918, 923, 924, 940, 941, 945, 955, 957, 962, 974, 978, 980, 990, 995, 999, 1037, 1051, 1070, 1082, 1090, 1110, 1118, 1128, 1131, 1132, 1141, 1146, 1158, 1170, 1176, 1180, 1181, 1185, 1190, 1202, 1208, 1220, 1239, 1254, 1261, 1265, 1270, 1272, 1301, 1304, 1314, 1322, 1324, 1340, 1362, 1399, 1410, 1418, 1428, 1435, 1436, 1439,

In [4]:
# 📄 Cell 5 – Display index, title, and doc for all short docs (<150 words)

import json
from pathlib import Path

# Path to the flattened docs file
ALL_DOCS_JSON = Path("/content/Curious_LLM_DATA/HotpotQA/all_docs.json")

# Read the flattened snapshot back from disk
with ALL_DOCS_JSON.open("r", encoding="utf-8") as snapshot_file:
    data = json.load(snapshot_file)
titles = data["titles"]
docs = data["docs"]

# Recompute which docs have fewer than 150 whitespace-delimited words
short_doc_indices = [i for i in range(len(docs)) if len(docs[i].split()) < 150]

# Headline count
print(f"Found {len(short_doc_indices)} docs with fewer than 150 words.\n")

# One entry per short doc: index, title, full text, then a ruled divider
divider = "\n" + "="*80 + "\n"
for idx in short_doc_indices:
    print(f"Index: {idx}", f"Title: {titles[idx]}", "Doc:", docs[idx], divider, sep="\n")


Found 753 docs with fewer than 150 words.

Index: 4
Title: Wainscott (LIRR station)
Doc:
Wainscott was a former railroad station on the Montauk Branch of the Long Island Rail Road in Wainscott, New York, United States. It was opened in either 1897 or 1898 by the Brooklyn and Montauk Railroad, rebuilt in 1915 by the LIRR, and closed in 1938.Wainscott station has the distinction of being the only Long Island Rail Road station to have segregated waiting rooms, in spite of the fact that the New York State Legislature never allowed segregated facilities. Some sources claim that another station was segregated, but this has yet to be confirmed. In early 1938, the station was closed due to the effects of the Great Depression, and was moved to a beach, where it has served as a private residence since then.This New York train station–related article is a stub. You can help Wikipedia by expanding it.


Index: 7
Title: Bruce Bryant
Doc:
Bruce Bryant may refer to:


Index: 8
Title: Aealo
Doc:
Aealo

In [5]:
# 🗂️ Cell 6 – Find and display titles appearing ≥2 times (summary + truncated docs)

import json
from pathlib import Path
from collections import Counter

# Path to the flattened docs file
ALL_DOCS_JSON = Path("/content/Curious_LLM_DATA/HotpotQA/all_docs.json")

# Load titles and docs
with ALL_DOCS_JSON.open("r", encoding="utf-8") as f:
    data = json.load(f)
titles = data["titles"]
docs   = data["docs"]

# Count title frequencies
title_counts = Counter(titles)

# Group document positions by title in a single pass, so the display loop
# below does not have to rescan the full title list for every duplicate.
positions_by_title = {}
for idx, t in enumerate(titles):
    positions_by_title.setdefault(t, []).append(idx)

# Filter titles that appear two or more times (first-seen order)
duplicate_titles = [t for t, cnt in title_counts.items() if cnt >= 2]
total_occurrences = sum(title_counts[t] for t in duplicate_titles)

# Summary
print(f"Found {len(duplicate_titles)} titles with ≥2 occurrences, "
      f"for a total of {total_occurrences} docs.\n")

# Display each duplicate title with its docs truncated to 200 characters
for title in duplicate_titles:
    print(f"Title: {title} ({title_counts[title]} occurrences)\n")
    for idx in positions_by_title[title]:
        snippet = docs[idx][:200] + ("..." if len(docs[idx]) > 200 else "")
        print(f"Index: {idx}")
        print(f"Doc: {snippet}\n")
    print("=" * 10 + "\n")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m

Index: 3625
Doc: Telos marks the third full-length album from Forevermore. Solid State Records released the project on July 22, 2014. Forevermore worked with Jordan Furr on the production of this album.Specifying in a...


Title: Bruce Lee (3 occurrences)

Index: 2206
Doc: Bruce Lee (Chinese: 李小龍; born Lee Jun-fan, 李振藩; November 27, 1940 – July 20, 1973) was a Chinese-American martial artist and actor whose career spanned Hong Kong and the United States. He was the foun...

Index: 5142
Doc: Bruce Lee (Chinese: 李小龍; born Lee Jun-fan, 李振藩; November 27, 1940 – July 20, 1973) was a Chinese-American martial artist and actor whose career spanned Hong Kong and the United States. He was the foun...

Index: 5891
Doc: Bruce Lee (Chinese: 李小龍; born Lee Jun-fan, 李振藩; November 27, 1940 – July 20, 1973) was a Chinese-American martial artist and actor whose career spanned Hong Kong and the United States. He was the foun...


Title: Par

In [6]:
# Re-print only the duplicate-title summary line (values come from the
# previous cell's `duplicate_titles` and `total_occurrences`).
duplicate_summary = (
    f"Found {len(duplicate_titles)} titles with ≥2 occurrences, "
    f"for a total of {total_occurrences} docs.\n"
)
print(duplicate_summary)

Found 1255 titles with ≥2 occurrences, for a total of 2773 docs.



In [7]:
# 🗂️ Cell 7 – Remove duplicate titles, overwrite all_docs.json, and report remaining count

import json
from pathlib import Path

# Path to the flattened docs file
ALL_DOCS_JSON = Path("/content/Curious_LLM_DATA/HotpotQA/all_docs.json")

# Read the current snapshot
with ALL_DOCS_JSON.open("r", encoding="utf-8") as snapshot_file:
    data = json.load(snapshot_file)
titles, docs = data["titles"], data["docs"]

# Keep only the first (title, doc) pair seen for each title
seen = set()
unique_titles, unique_docs = [], []
for title, doc in zip(titles, docs):
    if title in seen:
        continue  # a later repeat of an already-kept title
    seen.add(title)
    unique_titles.append(title)
    unique_docs.append(doc)

# Write the deduplicated lists back over the same file
deduped_payload = {"titles": unique_titles, "docs": unique_docs}
with ALL_DOCS_JSON.open("w", encoding="utf-8") as snapshot_file:
    json.dump(deduped_payload, snapshot_file, ensure_ascii=False, indent=2)

# Report how many entries survived
print(f"Deduplication complete. {len(unique_docs)} docs remain (from {len(docs)} original entries).")


Deduplication complete. 4482 docs remain (from 6000 original entries).


In [8]:
# 🗂️ Cell 8 – Verify no duplicate titles remain in all_docs.json

import json
from pathlib import Path
from collections import Counter

# Path to the deduplicated docs file
ALL_DOCS_JSON = Path("/content/Curious_LLM_DATA/HotpotQA/all_docs.json")

# Read the titles from the snapshot
with ALL_DOCS_JSON.open("r", encoding="utf-8") as snapshot_file:
    data = json.load(snapshot_file)
titles = data["titles"]

# Tally how often each title occurs
title_counts = Counter(titles)

# Collect every title whose tally is still above one
duplicates = {}
for title, cnt in title_counts.items():
    if cnt > 1:
        duplicates[title] = cnt

if duplicates:
    print("⚠️ Found the following duplicate titles still present:")
    for title, cnt in duplicates.items():
        print(f"- {title}: {cnt} occurrences")
else:
    print("✅ No duplicate titles found. All titles are unique.")


✅ No duplicate titles found. All titles are unique.


In [9]:
# 🗂️ Cell 9 – Load fail_wiki.txt, split titles, deduplicate into a set

from pathlib import Path

# Path to the fail_wiki.txt file
FAIL_WIKI_PATH = Path("/content/fail_wiki.txt")

# Read the file line by line, trimming whitespace and dropping blank lines
with FAIL_WIKI_PATH.open("r", encoding="utf-8") as fail_file:
    lines = [stripped for stripped in map(str.strip, fail_file) if stripped]

# A set collapses repeated titles
fail_titles_set = set(lines)

# List form of the same titles; its order is whatever the set yields.
# Use sorted(fail_titles_set) instead if a stable alphabetical order is wanted.
fail_titles = list(fail_titles_set)

# Report raw vs. unique counts
print(f"Loaded {len(lines)} titles from file, {len(fail_titles)} unique titles after deduplication.")

# Now `fail_titles_set` (or `fail_titles`) can be used in subsequent cells


Loaded 126 titles from file, 105 unique titles after deduplication.


In [10]:
# Print every unique failed title on its own line
for failed_title in fail_titles:
    print(failed_title)

Mark Rankin
Will &amp; Grace
Matheson &amp; Company
E'Shun Melvin
The Ren &amp; Stimpy Show
Wild &amp; Crazy Kids
The Life of Larry and Larry &amp; Steve
Romy Ruyssen
The Good Doctor (TV series)
History of the College of William &amp; Mary
Norman Dike
LBJ (film)
Kevin Byrne (politician)
Reagan (film)
Mirrors (film)
Ernst &amp; Young
The Memory of Our People
Culto A La Vida
No One Would Tell
J. P. Doherty
Limitless (EP)
Chris Jones (footballer, born 1985)
Rachelle Beinart
Albanian Coalition &quot;Perspective&quot;
Glenn Hughes
Doomsday Prophecy
Bruce Bryant
Mark Gaudet
Percy Jackson &amp; the Olympians
A Place to Live
Blue Dwarf roleplaying game
Belle &amp; Sebastian: The Adventure Continues
Scott Hicks
Grandtheft
John List
Holywood
Josue Larose
Eddie &quot;The Eagle&quot; Edwards
Lake George, New York
Rob Jenkins
Kansas City Power &amp; Light District
Him &amp; Her
Christmas Bounty
Thomas E. O'Donnell
Dean McCarthy (actor)
Rob &amp; Big
The Shack
Blake LeVine
Eenasul Fateh
Marco Gariba

In [11]:
# 🗂️ Cell 10 – Download indexed_test_docs.json and extract chunks for fail_titles

# Install gdown if necessary
!pip install --quiet gdown

import json
from pathlib import Path

# Paths
# INDEXED_JSON is where the downloaded file is written; FAIL_WIKI_PATH is the
# plain-text list of titles that is read further down in this cell.
INDEXED_JSON = Path("/content/Curious_LLM_DATA/HotpotQA/indexed_test_docs.json")
FAIL_WIKI_PATH = Path("/content/fail_wiki.txt")

# Download the indexed_test_docs.json file.
# "{INDEXED_JSON}" is interpolated by IPython with the Python variable's value
# before the shell command runs.
!gdown --id 1dgGbtiM4MlSAjudq7FuvSBGbCn8HpxIz -O "{INDEXED_JSON}"

# Load the list of failed titles (blank lines dropped) as a set
with FAIL_WIKI_PATH.open("r", encoding="utf-8") as f:
    fail_titles = {line.strip() for line in f if line.strip()}

# Load the indexed test docs
with INDEXED_JSON.open("r", encoding="utf-8") as f:
    records = json.load(f)

# Walk every record and print the chunks of each title listed in fail_titles
total_found = 0
for rec in records:
    titles = rec.get("titles", [])
    docs_chunks = rec.get("docs_chunks", [])
    title_chunks = rec.get("title_chunks", [])

    for idx, title in enumerate(titles):
        if title not in fail_titles:
            continue
        total_found += 1
        print(f"Title: {title}\n")

        # The record fields are read with empty-list defaults, so a record
        # may carry fewer docs_chunks entries than titles; treat a missing
        # entry as "no chunks" instead of raising IndexError.
        print("docs_chunks:")
        chunks_for_title = docs_chunks[idx] if idx < len(docs_chunks) else []
        for chunk in chunks_for_title:
            print(chunk)

        # title_chunks entries are matched on their first element; the second
        # element (when present) is printed as the chunk text.
        print("\ntitle_chunks:")
        for tc in title_chunks:
            if isinstance(tc, list) and tc and tc[0] == title:
                print(tc[1] if len(tc) > 1 else "")

        print("\n" + "="*80 + "\n")

print(f"Total titles found in indexed_test_docs.json: {total_found}")


Downloading...
From (original): https://drive.google.com/uc?id=1dgGbtiM4MlSAjudq7FuvSBGbCn8HpxIz
From (redirected): https://drive.google.com/uc?id=1dgGbtiM4MlSAjudq7FuvSBGbCn8HpxIz&confirm=t&uuid=d173b90b-bc1f-460f-9206-aa5dd8344be6
To: /content/Curious_LLM_DATA/HotpotQA/indexed_test_docs.json
100% 250M/250M [00:01<00:00, 139MB/s]
Title: John List

docs_chunks:
 On November 9, 1971, he killed his wife, mother, and three children in their home in Westfield, New Jersey, then disappeared.
John List may refer to:

title_chunks:
 On November 9, 1971, he killed his wife, mother, and three children in their home in Westfield, New Jersey, then disappeared.
John List may refer to:


Title: Scott Hicks

docs_chunks:
Scott Hicks may refer to:
 Hicks's work has been nominated for an Academy Award as well as winning an Emmy Award.
Robert Scott Hicks (born 4 March 1953) is an Australian film director and screenwriter.

title_chunks:
Scott Hicks may refer to:
 Hicks's work has been nominated for an A

In [12]:
# 🗂️ Cell 11 – Replace docs in all_docs.json with joined docs_chunks for unique fail_titles

import json
from pathlib import Path

# Paths
ALL_DOCS_JSON = Path("/content/Curious_LLM_DATA/HotpotQA/all_docs.json")
INDEXED_JSON  = Path("/content/Curious_LLM_DATA/HotpotQA/indexed_test_docs.json")
FAIL_WIKI_TXT = Path("/content/fail_wiki.txt")

# 1️⃣ Load and dedupe the fail_wiki titles, with counts for verification
with FAIL_WIKI_TXT.open("r", encoding="utf-8") as fail_file:
    lines = [stripped for stripped in map(str.strip, fail_file) if stripped]
print(f"Loaded {len(lines)} lines from fail_wiki.txt")
fail_titles = set(lines)
print(f"{len(fail_titles)} unique titles after deduplication\n")

# Step 2: read the chunk-indexed records
with INDEXED_JSON.open("r", encoding="utf-8") as indexed_file:
    indexed_records = json.load(indexed_file)

# Step 3: title -> its docs_chunks joined with single spaces.
# When a title shows up in several records, the last one encountered wins.
replace_map = {
    title: " ".join(chunks)
    for rec in indexed_records
    for title, chunks in zip(rec.get("titles", []), rec.get("docs_chunks", []))
    if title in fail_titles
}

print(f"Prepared replacement text for {len(replace_map)} titles\n")

# Step 4: read the current snapshot
with ALL_DOCS_JSON.open("r", encoding="utf-8") as snapshot_file:
    data = json.load(snapshot_file)
titles, docs = data["titles"], data["docs"]

# Step 5: overwrite the doc at every position whose title has a replacement
matching_positions = [pos for pos, t in enumerate(titles) if t in replace_map]
for pos in matching_positions:
    docs[pos] = replace_map[titles[pos]]
replace_count = len(matching_positions)

# Step 6: write the updated lists back over the same file
with ALL_DOCS_JSON.open("w", encoding="utf-8") as snapshot_file:
    json.dump({"titles": titles, "docs": docs}, snapshot_file, ensure_ascii=False, indent=2)

print(f"✅ Replaced {replace_count} docs in all_docs.json based on docs_chunks.")


Loaded 126 lines from fail_wiki.txt
105 unique titles after deduplication

Prepared replacement text for 105 titles

✅ Replaced 105 docs in all_docs.json based on docs_chunks.


In [13]:
# 🗂️ Cell 12 – Display replaced docs for each fail_title

import json
from pathlib import Path

# Paths
ALL_DOCS_JSON = Path("/content/Curious_LLM_DATA/HotpotQA/all_docs.json")
FAIL_WIKI_TXT = Path("/content/fail_wiki.txt")

# Load the failed titles, deduplicated and sorted
with FAIL_WIKI_TXT.open("r", encoding="utf-8") as f:
    fail_titles = sorted({line.strip() for line in f if line.strip()})

# Load the current all_docs.json
with ALL_DOCS_JSON.open("r", encoding="utf-8") as f:
    data = json.load(f)
titles = data["titles"]
docs   = data["docs"]

# Map each title to the index of its first occurrence, built in one pass.
# This replaces a membership test plus list.index() (two linear scans of
# `titles`) for every failed title, while resolving to the same index.
first_index_of = {}
for position, t in enumerate(titles):
    first_index_of.setdefault(t, position)

# Print how each fail_title now appears in all_docs.json
for title in fail_titles:
    idx = first_index_of.get(title)
    if idx is None:
        print(f"⚠️ Title not found in all_docs.json: {title}\n")
        continue
    print(f"Title: {title}  (Index: {idx})\n")
    print(f"Doc:\n{docs[idx]}\n")
    print("="*80 + "\n")


Title: 122nd SS-Standarte  (Index: 700)

Doc:
Other reasons this message may be displayed: The 122nd SS-Standarte was a regimental command of the Allgemeine-SS that was formed in the city of Strasbourg during World War II.


Title: A Place to Live  (Index: 1540)

Doc:
A Place to Live may refer to: A Place to Live is a 1941 documentary film directed by Irving Lerner and produced by the Philadelphia Housing Association, a nonprofit affordable housing advocacy group.


Title: Albanian Coalition &quot;Perspective&quot;  (Index: 2284)

Doc:
The requested page title is invalid. It may be empty, contain unsupported characters, or include a non-local or incorrectly linked interwiki prefix. You may be able to locate the desired page by searching for its name (with the interwiki prefix, if any) in the search box. Possible causes are:Return to Main Page. The Albanian Coalition "Perspective" is an Albanian political party in Montenegro.


Title: Bella (magazine)  (Index: 2105)

Doc:
Bella is a wee

In [14]:
# ✅ Unified Merge Cell: Robust normalization + append/replace wiki_docs into all_docs
# ------------------------------------------------------------------------------
# Inputs:
#   /content/Curious_LLM_DATA/HotpotQA/all_docs.json   (structure {"titles": [...], "docs": [...]})
#   /content/wiki_docs.json                            (list [{ "title": ..., "doc": ... }, ...])
#
# Output:
#   all_docs.json, updated by appending / replacing the wiki text at the end of each title's doc (with a separator)
#
# Behavior:
#   1. Normalize titles (strip extra whitespace, decode HTML entities, unify quote characters, &amp; → &, lower-case)
#   2. Aggregate duplicate wiki records for the same title
#   3. If the doc for a title already contains SEPARATOR and REPLACE_EXISTING=True:
#        → the content after SEPARATOR is dropped and replaced with the new text (only one fresh wiki block remains)
#      Otherwise (REPLACE_EXISTING=False), if the new text is different:
#        → a separate new block is appended (when FORCE_APPEND=True)
#   4. Print a full report
#
# The configuration below can be changed:

import json, re, html
from pathlib import Path

# ---------------- Configuration ----------------
# Base dataset that is read, merged into, and overwritten by this cell.
ALL_DOCS_PATH       = Path("/content/Curious_LLM_DATA/HotpotQA/all_docs.json")
# Wiki records to merge in; each record is read through its "title" and "doc" keys.
WIKI_DOCS_PATH      = Path("/content/wiki_docs.json")

# Marker placed between a doc's existing text and the appended wiki text.
SEPARATOR           = "\n\n---[WIKI_APPEND]---\n\n"
REPLACE_EXISTING    = True   # If True: on every run, the wiki text after SEPARATOR is replaced with the new version
FORCE_APPEND        = True   # If False and REPLACE_EXISTING=False: text is only appended when no SEPARATOR was present before
SHOW_SAMPLES        = 7      # How many samples to display per report section
MIN_WIKI_SNIPPET_CHK = 120   # Prefix length compared to detect "same text" in the non-replacing mode

# ---------------- Helpers ----------------
def normalize_quotes(s: str) -> str:
    """Replace typographic quote characters with their plain ASCII forms.

    The four curly/low double-quote variants become `"` and the four
    single-quote variants become `'`; every other character is untouched.
    """
    for fancy_double in '“”„‟':
        s = s.replace(fancy_double, '"')
    for fancy_single in '‘’‚‛':
        s = s.replace(fancy_single, "'")
    return s


def norm_title(t: str) -> str:
    """Return a canonical lookup key for a title.

    Falsy input (empty string, None) yields "". Otherwise HTML entities are
    decoded, quotes are unified, any remaining literal '&amp;' (left over
    from doubly-escaped input) becomes '&', runs of whitespace collapse to a
    single space, and the result is stripped and lower-cased.
    """
    if t:
        decoded = html.unescape(t)
        plain = normalize_quotes(decoded).replace('&amp;', '&')
        return re.sub(r'\s+', ' ', plain).strip().lower()
    return ""

# ---------------- Load Base Dataset ----------------
assert ALL_DOCS_PATH.exists(), f"Base file not found: {ALL_DOCS_PATH}"
assert WIKI_DOCS_PATH.exists(), f"Wiki file not found: {WIKI_DOCS_PATH}"

with ALL_DOCS_PATH.open("r", encoding="utf-8") as base_file:
    base = json.load(base_file)

base_titles, base_docs = base["titles"], base["docs"]
assert len(base_titles) == len(base_docs), "Mismatch titles/docs length in base file."

# Normalized index: normalized title -> list of positions in the base lists
norm_index = {}
for position, raw_title in enumerate(base_titles):
    key = norm_title(raw_title)
    if key not in norm_index:
        norm_index[key] = []
    norm_index[key].append(position)

# ---------------- Load & Aggregate wiki_docs ----------------
with WIKI_DOCS_PATH.open("r", encoding="utf-8") as wiki_file:
    wiki_records = json.load(wiki_file)

# Exact (stripped) title -> aggregated doc text. A repeated title gets the
# new doc appended after a blank line unless that text is already contained
# in what has been collected for it.
wiki_map_raw = {}
for rec in wiki_records:
    title = (rec.get("title") or "").strip()
    doc   = (rec.get("doc") or "").strip()
    if not (title and doc):
        continue  # records lacking a title or a doc are ignored
    collected = wiki_map_raw.get(title)
    if collected is None:
        wiki_map_raw[title] = doc
    elif doc not in collected:
        wiki_map_raw[title] = collected + "\n\n" + doc

# Normalized title -> list of the original titles that normalize to it
norm_wiki_groups = {}
for ot in wiki_map_raw:
    group_key = norm_title(ot)
    if group_key not in norm_wiki_groups:
        norm_wiki_groups[group_key] = []
    norm_wiki_groups[group_key].append(ot)

# ---------------- Merge Logic ----------------
# Outcome buckets for the report. appended/replaced/unchanged hold
# (base title, base index, tag) tuples; not_found holds
# (list of original wiki titles, reason) tuples.
appended, replaced, unchanged, not_found = [], [], [], []

for ntitle, original_title_list in norm_wiki_groups.items():
    # Final wiki text: all variants of the same normalized title, joined by blank lines
    combined_wiki_text = "\n\n".join(
        wiki_map_raw[ot] for ot in original_title_list if wiki_map_raw[ot].strip()
    ).strip()
    if not combined_wiki_text:
        continue

    # No base title normalizes to this key: record it and move on
    if ntitle not in norm_index:
        not_found.append((original_title_list, "NO_MATCH_IN_BASE"))
        continue

    # Apply the wiki text to every base entry sharing this normalized title
    for idx in norm_index[ntitle]:
        current_doc = base_docs[idx]

        if SEPARATOR in current_doc:
            # Split at the first separator only: `pre` is the text before it,
            # `post` is everything after it.
            pre, post = current_doc.split(SEPARATOR, 1)
            if REPLACE_EXISTING:
                # Drop everything after the first separator and attach the new text
                base_docs[idx] = pre.rstrip() + SEPARATOR + combined_wiki_text
                replaced.append((base_titles[idx], idx, "REPLACED"))
            else:
                # Non-replacing mode: check whether this text is already present,
                # judged by the first MIN_WIKI_SNIPPET_CHK characters of the new text
                if combined_wiki_text[:MIN_WIKI_SNIPPET_CHK] in post:
                    unchanged.append((base_titles[idx], idx, "UNCHANGED_ALREADY_PRESENT"))
                else:
                    if FORCE_APPEND:
                        # Adds a further separator block after the existing ones
                        base_docs[idx] = current_doc.rstrip() + SEPARATOR + combined_wiki_text
                        appended.append((base_titles[idx], idx, "APPENDED_EXTRA"))
                    else:
                        unchanged.append((base_titles[idx], idx, "SKIPPED_FORCE_OFF"))
        else:
            # No previous SEPARATOR → first append
            base_docs[idx] = current_doc.rstrip() + SEPARATOR + combined_wiki_text
            appended.append((base_titles[idx], idx, "FIRST_APPEND"))

# ---------------- Save Back ----------------
# Write the merged lists back over the base file
merged_payload = {"titles": base_titles, "docs": base_docs}
with ALL_DOCS_PATH.open("w", encoding="utf-8") as merged_file:
    json.dump(merged_payload, merged_file, ensure_ascii=False, indent=2)

# ---------------- Reporting ----------------
def summary(label, lst):
    """Return the report line "<label>: <number of items in lst>"."""
    count = len(lst)
    return "{}: {}".format(label, count)

print("===== FINAL MERGE REPORT =====")
print(f"Wiki original distinct titles    : {len(wiki_map_raw)}")
print(f"Normalized title groups          : {len(norm_wiki_groups)}")
# One count line per outcome bucket, in a fixed order
for bucket_label, bucket in (
    ("Appended (new blocks)", appended),
    ("Replaced (previous blocks)", replaced),
    ("Unchanged (already same)", unchanged),
    ("Not found (after normalization)", not_found),
):
    print(summary(bucket_label, bucket))
print(f"Total affected (append+replace)  : {len(appended)+len(replaced)} / {len(wiki_map_raw)} wiki titles\n")

# List at most ten wiki title groups that matched nothing in the base file
if not_found:
    print("⚠️ Not Found Groups (up to 10):")
    for group, reason in not_found[:10]:
        print("  - Variants:", group, "| Reason:", reason)
    print()

def show_samples(lst, title):
    """Print up to SHOW_SAMPLES entries of one outcome bucket.

    `lst` holds (title, base index, tag) tuples. For each shown entry the
    text after the first SEPARATOR in `base_docs[index]` is previewed with
    newlines flattened to spaces and cut to 380 characters (with "..." when
    longer); a placeholder is printed when the doc has no separator. An
    empty bucket prints a single "(<title>: none)" line.
    """
    if not lst:
        print(f"({title}: none)")
        return

    print(f"--- {title} (showing up to {SHOW_SAMPLES}) ---")
    for number, (entry_title, doc_idx, tag) in enumerate(lst[:SHOW_SAMPLES], start=1):
        _, sep_hit, after = base_docs[doc_idx].partition(SEPARATOR)
        if sep_hit:
            snippet = after[:400].replace("\n", " ")
        else:
            snippet = "(Separator missing?)"
        ellipsis = '...' if len(snippet) > 380 else ''
        print(f"[{number}] {entry_title} (idx {doc_idx}) [{tag}] → {snippet[:380]}{ellipsis}")
    print()

# Preview each outcome bucket in turn
for bucket, heading in (
    (appended, "APPENDED"),
    (replaced, "REPLACED"),
    (unchanged, "UNCHANGED"),
):
    show_samples(bucket, heading)

print("✅ Done. File updated at:", ALL_DOCS_PATH)


===== FINAL MERGE REPORT =====
Wiki original distinct titles    : 76
Normalized title groups          : 76
Appended (new blocks): 76
Replaced (previous blocks): 0
Unchanged (already same): 0
Not found (after normalization): 0
Total affected (append+replace)  : 76 / 76 wiki titles

--- APPENDED (showing up to 7) ---
[1] 122nd SS-Standarte (idx 700) [FIRST_APPEND] → In Nazi Germany , the Standarte (pl. Standarten ) was a paramilitary unit of Nazi Party (NSDAP), Sturmabteilung (SA), NSKK , NSFK , and Schutzstaffel (SS). Translated literally as "Regimental standard ", the name refers to the flag paramilitary formations carried in formations and parades.  The Sturmabteilung (SA) was organized into several large regional groups ( Gruppen ). Ea...
[2] A Place to Live (idx 1540) [FIRST_APPEND] → A Place to Live is a 1941 documentary film directed by Irving Lerner and produced by the Philadelphia Housing Association, a nonprofit affordable housing advocacy group. The film aimed to call attentio

In [15]:
# 🔍 Inspect merged wiki text for selected titles

import json, re, html
from pathlib import Path

ALL_DOCS_PATH = Path("/content/Curious_LLM_DATA/HotpotQA/all_docs.json")
SEPARATOR = "\n\n---[WIKI_APPEND]---\n\n"

TARGET_TITLES = [
    "Lake George, New York",
    "Fucking, Austria",
    "Bruce Bryant",
]

assert ALL_DOCS_PATH.exists(), f"Base file not found: {ALL_DOCS_PATH}"

# Read the merged snapshot that show_title() inspects
with ALL_DOCS_PATH.open("r", encoding="utf-8") as docs_file:
    data = json.load(docs_file)

titles, docs = data["titles"], data["docs"]

def show_title(title):
    """Print size statistics and leading excerpts for one title's document.

    `title` is looked up by exact match in the module-level `titles` list
    (first occurrence) and the doc at the same position in `docs` is split
    at the first SEPARATOR. Character and word counts are printed for the
    full doc and, when a separator exists, for the parts before and after
    it, followed by the first 500 characters of the leading part and the
    first 1200 characters of the appended part (newlines shown as spaces).
    A title that is not present only produces a warning line.
    """
    def wc(text):  # number of non-empty whitespace-delimited pieces
        pieces = re.split(r"\s+", text.strip())
        return sum(1 for piece in pieces if piece)

    try:
        idx = titles.index(title)
    except ValueError:
        print(f"⚠️ Title not found: {title}")
        return

    full_doc = docs[idx]
    # Without a separator, partition() leaves the whole doc in `pre` and an
    # empty string in `wiki_part`.
    pre, sep_hit, wiki_part = full_doc.partition(SEPARATOR)
    has_sep = bool(sep_hit)

    banner = "="*90
    print(banner)
    print(f"🔎 Title: {title}")
    print(f"📍 Index: {idx}")
    print(f"📏 Full doc: {len(full_doc)} chars | {wc(full_doc)} words")
    if not has_sep:
        print("⚠️ No separator found (wiki segment not appended or already merged differently).")
    else:
        print(f"📏 Original (before separator): {len(pre)} chars | {wc(pre)} words")
        print(f"📏 Appended wiki segment       : {len(wiki_part)} chars | {wc(wiki_part)} words")

    # Show the start of the main text, then the start of the wiki part
    print("\n--- Main (first 500 chars) ---")
    main_ellipsis = "..." if len(pre) > 500 else ""
    print(pre[:500].replace("\n", " ") + main_ellipsis)

    if wiki_part:
        print("\n--- Wiki appended (first 1200 chars) ---")
        wiki_ellipsis = "..." if len(wiki_part) > 1200 else ""
        print(wiki_part[:1200].replace("\n", " ") + wiki_ellipsis)

    print(banner + "\n")

# Inspect each configured title in turn
for target_title in TARGET_TITLES:
    show_title(target_title)


🔎 Title: Lake George, New York
📍 Index: 52
📏 Full doc: 4042 chars | 746 words
📏 Original (before separator): 167 chars | 30 words
📏 Appended wiki segment       : 3852 chars | 715 words

--- Main (first 500 chars) ---
Lake George, New York may refer to one of several locations in New York State in the United States:  The town is part of the Glens Falls Metropolitan Statistical Area.

--- Wiki appended (first 1200 chars) ---
Lake George is a town in Warren County , New York , United States. The population was 3,502 at the 2020 census. The town is named after the lake, Lake George . The town is part of the Glens Falls Metropolitan Statistical Area .  The first European to visit the lake was Father Isaac Jogues in August 1642. He was later captured by Mohawks , escaped, and returned home to France . In 1646, he was sent on a political mission to the Iroquois to propose a treaty of peace, and at that point dubbed the lake "Lac du Saint Sacrement" (Lake of the Blessed Sacrament) .  In 1755, 

In [16]:
# 📦 Mount Google Drive & copy all_docs.json into Drive

from google.colab import drive
from pathlib import Path
from datetime import datetime
import shutil, json, os

# Step 1: mount Google Drive under /content/drive
drive.mount('/content/drive')

# Step 2: source file (must exist) and destination folder (created if needed)
SOURCE_FILE = Path("/content/Curious_LLM_DATA/HotpotQA/all_docs.json")
assert SOURCE_FILE.exists(), f"Source file not found: {SOURCE_FILE}"

DEST_DIR = Path("/content/drive/MyDrive/HotpotQA_snapshot")
DEST_DIR.mkdir(parents=True, exist_ok=True)

# Step 3: "current" copy under a fixed name
DEST_FILE = DEST_DIR.joinpath("all_docs.json")
shutil.copy2(SOURCE_FILE, DEST_FILE)

# Step 4: second copy whose name carries the local date and time
ts = datetime.now().strftime("%Y%m%d_%H%M%S")
ARCHIVE_FILE = DEST_DIR.joinpath(f"all_docs_{ts}.json")
shutil.copy2(SOURCE_FILE, ARCHIVE_FILE)

size_kb = DEST_FILE.stat().st_size / 1024
print(f"✅ Copied current all_docs.json to: {DEST_FILE}")
print(f"🗃 Archived snapshot: {ARCHIVE_FILE}")
print(f"📏 File size: {size_kb:.1f} KB")


Mounted at /content/drive
✅ Copied current all_docs.json to: /content/drive/MyDrive/HotpotQA_snapshot/all_docs.json
🗃 Archived snapshot: /content/drive/MyDrive/HotpotQA_snapshot/all_docs_20250719_222359.json
📏 File size: 50941.3 KB


In [17]:
# 📊 Comprehensive statistical analysis of all_docs.json

import json, re, math, statistics, collections, csv
from pathlib import Path
from pprint import pprint
from datetime import datetime

ALL_DOCS_PATH = Path("/content/Curious_LLM_DATA/HotpotQA/all_docs.json")
SEPARATOR = "---[WIKI_APPEND]---"

assert ALL_DOCS_PATH.exists(), f"File not found: {ALL_DOCS_PATH}"

# Read the snapshot to analyse
with ALL_DOCS_PATH.open("r", encoding="utf-8") as stats_file:
    data = json.load(stats_file)

titles, docs = data["titles"], data["docs"]
n = len(docs)  # number of documents; used as the denominator for per-doc averages
assert len(titles) == n, "titles/docs mismatch!"

# ---------- Helpers ----------
# A "word" is a maximal run of \w characters bounded by word boundaries.
word_pattern = re.compile(r"\b\w+\b", flags=re.UNICODE)

def tokenize(text: str):
    """Return the lower-cased words of `text`, in order of appearance.

    Deliberately simple: words are whatever `word_pattern` matches in the
    lower-cased text. More advanced preprocessing can be added later.
    """
    lowered = text.lower()
    return [match.group(0) for match in word_pattern.finditer(lowered)]

def safe_stat(fn, seq, default=float('nan')):
    """Apply `fn` to `seq`, returning `default` if it raises StatisticsError.

    Only `statistics.StatisticsError` is absorbed; any other exception
    raised by `fn` propagates to the caller.
    """
    try:
        result = fn(seq)
    except statistics.StatisticsError:
        result = default
    return result

# ---------- Basic Length Metrics ----------
char_lengths = [len(d) for d in docs]

# Tokenize every document exactly once and feed both the per-document word
# counts and the corpus vocabulary from the same token list, instead of
# running the tokenizer over the whole corpus twice.
word_lengths = []
vocab_counter = collections.Counter()
for d in docs:
    doc_tokens = tokenize(d)
    word_lengths.append(len(doc_tokens))
    vocab_counter.update(doc_tokens)

# Pseudo token lengths (can be swapped for tiktoken later).
# Documents with zero words count as one word in the denominator.
avg_chars_per_word = sum(char_lengths) / sum(wl if wl>0 else 1 for wl in word_lengths)
tokens_lengths = word_lengths  # at this stage the word count itself is used as the proxy

# ---------- Title Metrics ----------
title_char_lengths = [len(t) for t in titles]
title_word_lengths = [len(t.strip().split()) for t in titles]

# ---------- Separator (wiki append) presence ----------
has_separator = [SEPARATOR in d for d in docs]
num_with_separator = sum(has_separator)

# ---------- Vocabulary ----------
total_tokens = sum(vocab_counter.values())
unique_tokens = len(vocab_counter)
type_token_ratio = unique_tokens / total_tokens if total_tokens else 0

# ---------- Extreme Documents ----------
SHORT_WORD_THRESHOLD = 150
LONG_WORD_THRESHOLD  = 5000

# Indices of documents strictly below / strictly above the word thresholds
short_indices = [i for i, wl in enumerate(word_lengths) if wl < SHORT_WORD_THRESHOLD]
long_indices  = [i for i, wl in enumerate(word_lengths) if wl > LONG_WORD_THRESHOLD]

# ---------- Percentiles ----------
def percentiles(values, ps):
    """Compute linearly interpolated percentiles of ``values``.

    Args:
        values: iterable of numbers; it is sorted internally.
        ps: iterable of percentile points on a 0-100 scale.

    Returns:
        dict mapping each point in ``ps`` to its percentile value. The
        fractional rank ``(len - 1) * p / 100`` is interpolated between the
        two neighbouring sorted values; when the rank is a whole number the
        sorted element itself is returned. Every point maps to NaN when
        ``values`` is empty.
    """
    ordered = sorted(values)
    if not ordered:
        return {p: float('nan') for p in ps}

    result = {}
    for p in ps:
        rank = (len(ordered)-1) * (p/100)
        lo = math.floor(rank)
        hi = math.ceil(rank)
        if lo == hi:
            # Rank lands exactly on an element: no interpolation needed.
            result[p] = ordered[lo]
        else:
            result[p] = ordered[lo] + (ordered[hi]-ordered[lo]) * (rank - lo)
    return result

# Percentile points reported for both character and word lengths.
percentile_points = [5, 10, 25, 50, 75, 90, 95, 99]
char_percentiles = percentiles(char_lengths, percentile_points)
word_percentiles = percentiles(word_lengths, percentile_points)

# ---------- Per-title duplication check (should be unique if dedup applied) ----------
from collections import Counter
title_freq = Counter(titles)
# Keep only the titles that occur more than once, with their counts.
duplicate_titles = {}
for title, count in title_freq.items():
    if count > 1:
        duplicate_titles[title] = count

# ---------- Character composition ----------
def char_composition(text):
    """Count the characters of ``text`` by class.

    Returns:
        ``(letters, digits, spaces, others)`` where the first three count the
        characters for which ``str.isalpha``, ``str.isdigit`` and
        ``str.isspace`` are true, and ``others`` is the remainder of
        ``len(text)``.
    """
    n_letters = n_digits = n_spaces = 0
    for ch in text:
        # Each predicate is tallied independently (bools add as 0/1).
        n_letters += ch.isalpha()
        n_digits += ch.isdigit()
        n_spaces += ch.isspace()
    n_others = len(text) - n_letters - n_digits - n_spaces
    return n_letters, n_digits, n_spaces, n_others

# Accumulate the per-document character-class counts over the whole corpus.
letters_total = digits_total = spaces_total = others_total = 0
for doc_text in docs:
    n_letters, n_digits, n_spaces, n_others = char_composition(doc_text)
    letters_total += n_letters
    digits_total += n_digits
    spaces_total += n_spaces
    others_total += n_others

# Absolute counts plus each class's share of all characters in the corpus.
all_chars = sum(char_lengths)
char_comp = {
    "letters": letters_total,
    "digits": digits_total,
    "spaces": spaces_total,
    "others": others_total,
    "letters_ratio": letters_total / all_chars,
    "digits_ratio": digits_total / all_chars,
    "spaces_ratio": spaces_total / all_chars,
    "others_ratio": others_total / all_chars,
}

# ---------- Aggregate Statistics ----------
# `datetime.utcnow()` is deprecated since Python 3.12; take a timezone-aware
# UTC "now" instead and render its "+00:00" offset as the same trailing "Z"
# the report has always used.
from datetime import timezone

generated_at = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")

# Corpus totals are reused several times below, so compute them once.
total_chars = sum(char_lengths)
total_words = sum(word_lengths)

# Flat summary of the corpus; nested dicts (percentiles, character
# composition) are stored as-is under their own keys.
stats = {
    "timestamp": generated_at,
    "documents": n,
    "with_separator": num_with_separator,
    "with_separator_ratio": num_with_separator / n,
    "total_chars": total_chars,
    "total_words": total_words,
    "avg_chars_per_doc": total_chars/n,
    "avg_words_per_doc": total_words/n,
    "median_chars_per_doc": safe_stat(statistics.median, char_lengths),
    "median_words_per_doc": safe_stat(statistics.median, word_lengths),
    # pstdev: population (not sample) standard deviation.
    "stdev_chars_per_doc": safe_stat(statistics.pstdev, char_lengths),
    "stdev_words_per_doc": safe_stat(statistics.pstdev, word_lengths),
    "min_chars": min(char_lengths),
    "max_chars": max(char_lengths),
    "min_words": min(word_lengths),
    "max_words": max(word_lengths),
    "char_percentiles": char_percentiles,
    "word_percentiles": word_percentiles,
    "short_docs_threshold_words": SHORT_WORD_THRESHOLD,
    "short_docs_count": len(short_indices),
    "long_docs_threshold_words": LONG_WORD_THRESHOLD,
    "long_docs_count": len(long_indices),
    "vocab_total_tokens": total_tokens,
    "vocab_unique_tokens": unique_tokens,
    "type_token_ratio": type_token_ratio,
    "avg_chars_per_word_global": avg_chars_per_word,
    "title_avg_chars": sum(title_char_lengths)/n,
    "title_median_chars": safe_stat(statistics.median, title_char_lengths),
    "title_avg_words": sum(title_word_lengths)/n,
    "title_median_words": safe_stat(statistics.median, title_word_lengths),
    "duplicate_titles_count": len(duplicate_titles),
    "char_composition": char_comp
}

# ---------- Top Frequent Tokens ----------
TOP_K = 30
top_tokens = vocab_counter.most_common(TOP_K)

# ---------- Output ----------
print("=== DATASET STATISTICS SUMMARY ===")
for stat_name, stat_value in stats.items():
    if not isinstance(stat_value, dict):
        print(f"{stat_name}: {stat_value}")
        continue
    # Nested dicts are printed as an indented list under their key.
    print(f"{stat_name}:")
    for sub_name, sub_value in stat_value.items():
        print(f"  - {sub_name}: {sub_value}")

print("\n=== TOP TOKENS (most common) ===")
for token, freq in top_tokens:
    print(f"{token:>15} : {freq}")

if not duplicate_titles:
    print("\n✅ No duplicate titles detected.")
else:
    print("\n⚠️ Duplicate titles detected (should be 0 if dedup was correct):")
    # Only the first 20 duplicated titles are listed.
    for title, count in list(duplicate_titles.items())[:20]:
        print(f" - {title} ({count})")

print(f"\nShort docs (<{SHORT_WORD_THRESHOLD} words): {len(short_indices)}")
print(f"Long docs (>{LONG_WORD_THRESHOLD} words): {len(long_indices)}")

# ---------- Save stats to JSON ----------
# Written next to the corpus snapshot.
STATS_JSON = ALL_DOCS_PATH.parent / "docs_stats.json"
stats_payload = {
    "stats": stats,
    "top_tokens": top_tokens,
    "short_doc_indices": short_indices[:500],  # truncate for safety
    "long_doc_indices": long_indices[:500],
}
with STATS_JSON.open("w", encoding="utf-8") as f:
    json.dump(stats_payload, f, ensure_ascii=False, indent=2)

# ---------- Save per-doc lengths CSV ----------
# One row per document, in corpus order; the separator flag is stored as 0/1.
CSV_PATH = ALL_DOCS_PATH.parent / "docs_lengths.csv"
with CSV_PATH.open("w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["index","title","char_length","word_length","has_separator"])
    per_doc_columns = zip(titles, char_lengths, word_lengths, has_separator)
    writer.writerows(
        [row_idx, title, n_chars, n_words, int(sep_flag)]
        for row_idx, (title, n_chars, n_words, sep_flag) in enumerate(per_doc_columns)
    )

print(f"\n📝 Saved stats JSON → {STATS_JSON}")
print(f"🗂 Saved lengths CSV → {CSV_PATH}")

# ---------- (Optional) Histogram (uncomment to visualize) ----------
# import matplotlib.pyplot as plt
# import math
# plt.figure(figsize=(6,4))
# plt.hist([wl for wl in word_lengths if wl < 5000], bins=60)
# plt.xlabel("Words per doc (<5000 clipped)")
# plt.ylabel("Count")
# plt.title("Document Word Count Distribution")
# plt.show()


=== DATASET STATISTICS SUMMARY ===
timestamp: 2025-07-19T22:25:21.791727Z
documents: 4482
with_separator: 76
with_separator_ratio: 0.016956715751896476
total_chars: 51759812
total_words: 8770692
avg_chars_per_doc: 11548.373940205265
avg_words_per_doc: 1956.8701472556895
median_chars_per_doc: 5522.5
median_words_per_doc: 945.0
stdev_chars_per_doc: 16223.195139888927
stdev_words_per_doc: 2726.8960364123786
min_chars: 87
max_chars: 202004
min_words: 17
max_words: 35291
char_percentiles:
  - 5: 553.2
  - 10: 846.0
  - 25: 1963.5
  - 50: 5522.5
  - 75: 14059.0
  - 90: 30082.9
  - 95: 44610.19999999997
  - 99: 77639.9199999999
word_percentiles:
  - 5: 94.0
  - 10: 143.10000000000002
  - 25: 331.0
  - 50: 945.0
  - 75: 2397.75
  - 90: 5081.9
  - 95: 7498.9499999999925
  - 99: 13007.86999999997
short_docs_threshold_words: 150
short_docs_count: 478
long_docs_threshold_words: 5000
long_docs_count: 463
vocab_total_tokens: 8770692
vocab_unique_tokens: 159674
type_token_ratio: 0.018205405001110515
