In [1]:
# Mount Google Drive at /content/drive so files under MyDrive are reachable.
from google.colab import drive
drive.mount('/content/drive')

import json, os, math, sys
from collections import Counter
import pandas as pd


# ---- Configuration for the inspection cell ----
FOLDER_NAME = "HotpotQA_snapshot"          # folder (under BASE_PATH) expected to contain the file
FILENAME    = "all_docs.json"              # name of the JSON file to inspect
BASE_PATH   = "/content/drive/MyDrive"     # root directory used for the direct path and the fallback search
FILE_PATH   = os.path.join(BASE_PATH, FOLDER_NAME, FILENAME)

SEARCH_IF_NOT_FOUND = True   # if FILE_PATH is missing, walk BASE_PATH looking for FILENAME
MAX_PREVIEW_ITEMS   = 3      # default number of list items / dict entries shown per level by preview_structure
# ==============================================

# Show the path that will be tried first.
print("🗂 مسیر هدف:", FILE_PATH)

def search_file(filename, root="/content/drive"):
    """Return every path under ``root`` whose file name equals ``filename``.

    The tree is traversed with ``os.walk``; results are listed in the order
    the walk visits the directories. An empty list means nothing matched.
    """
    return [
        os.path.join(folder, filename)
        for folder, _subdirs, names in os.walk(root)
        if filename in names
    ]

# Resolve the input path. When the direct path is missing, either fail right
# away or (if enabled) search BASE_PATH for a file with the same name.
if not os.path.isfile(FILE_PATH):
    if not SEARCH_IF_NOT_FOUND:
        raise FileNotFoundError(f"فایل در مسیر {FILE_PATH} پیدا نشد.")

    print("⚠️ فایل در مسیر مستقیم پیدا نشد؛ در حال جستجو...")
    candidates = search_file(FILENAME, BASE_PATH)
    if not candidates:
        raise FileNotFoundError(f"فایل {FILENAME} پیدا نشد. مسیر را بررسی کنید.")

    # Hits whose path contains the expected folder name win over other hits.
    prioritized = [p for p in candidates if FOLDER_NAME in p]
    chosen = (prioritized or candidates)[0]

    print("مسیرهای یافت شده:")
    for position, found_path in enumerate(candidates, start=1):
        print(f"  {position}. {found_path}")
    print(f"✅ مسیر انتخاب‌شده: {chosen}")
    FILE_PATH = chosen

# Report the on-disk size in mebibytes.
file_size_mb = os.path.getsize(FILE_PATH) / 2**20
print(f"\n📏 اندازه فایل: {file_size_mb:.2f} MB")

print("\n📥 در حال بارگذاری JSON ...")
with open(FILE_PATH, "rb") as raw:
    raw_bytes = raw.read()

# Drop a leading UTF-8 byte-order mark, if present, before decoding.
BOM_UTF8 = b'\xef\xbb\xbf'
if raw_bytes[:len(BOM_UTF8)] == BOM_UTF8:
    print("ℹ️ BOM UTF-8 شناسایی شد؛ حذف می‌گردد.")
    raw_bytes = raw_bytes[len(BOM_UTF8):]

text_data = raw_bytes.decode("utf-8")

data = json.loads(text_data)
print("✅ بارگذاری موفق.")

def preview_structure(obj, depth=0, max_items=MAX_PREVIEW_ITEMS):
    """Print an indented outline of a nested JSON-like value.

    Lists and dicts are described by type and size, then their first
    ``max_items`` members are previewed recursively (two indent levels
    deeper). Any other value is printed as a string cut to 80 characters.

    Args:
        obj: The value to describe.
        depth: Current indent level; each level is two spaces.
        max_items: How many list items / dict entries to show per container.
    """
    indent = "  " * depth

    if isinstance(obj, list):
        total = len(obj)
        print(f"{indent}- نوع: list | طول: {total}")
        for idx, member in enumerate(obj[:max_items]):
            print(f"{indent}  [#{idx}] نوع عضو: {type(member).__name__}")
            preview_structure(member, depth + 2, max_items)
        if total > max_items:
            print(f"{indent}  ... ({total - max_items} عضو دیگر)")
        return

    if isinstance(obj, dict):
        print(f"{indent}- نوع: dict | تعداد کلیدها: {len(obj)}")
        shown_pairs = list(obj.items())[:max_items]
        for key, value in shown_pairs:
            print(f"{indent}  کلید نمونه: {key!r} -> نوع مقدار: {type(value).__name__}")
            preview_structure(value, depth + 2, max_items)
        hidden = len(obj) - max_items
        if hidden > 0:
            print(f"{indent}  ... ({hidden} کلید دیگر)")
        return

    # Scalar leaf: show at most 80 characters (77 plus an ellipsis).
    rendered = str(obj)
    if len(rendered) > 80:
        rendered = rendered[:77] + "..."
    print(f"{indent}- مقدار ساده ({type(obj).__name__}): {rendered}")

def summarize_records(records):
    """Load ``records`` into a DataFrame and print a short profile of it.

    Prints the column names and row count, displays the first five rows, and
    then prints length statistics: title statistics when a ``title`` column
    exists, and text-length statistics for the first of ``text``, ``content``
    or ``body`` that is present.

    Args:
        records: Anything accepted by ``pd.DataFrame`` (here: a list of dicts).
    """
    frame = pd.DataFrame(records)
    columns = frame.columns
    print("\nستون‌ها:", list(columns))
    print("تعداد رکورد:", len(frame))
    display(frame.head(5))

    stats = {}
    if "title" in columns:
        title_series = frame["title"]
        stats["unique_titles"] = title_series.nunique()
        stats["avg_title_len"] = title_series.astype(str).str.len().mean()

    # Pick the first known text-bearing column, in priority order.
    text_col = None
    for candidate in ("text", "content", "body"):
        if candidate in columns:
            text_col = candidate
            break

    if text_col:
        lengths = frame[text_col].astype(str).str.len()
        stats["text_col"] = text_col
        stats["avg_text_length"] = lengths.mean()
        stats["median_text_length"] = lengths.median()
        stats["min_text_length"] = lengths.min()
        stats["max_text_length"] = lengths.max()

    print("\nآمار متنی:")
    for name, value in stats.items():
        print(f"  {name}: {value}")

# Print the recursive outline of the loaded JSON first.
print("\n==== پیش‌نمایش ساختار کلی ====")
preview_structure(data)

# Classify the root value and print a matching diagnosis. Branch order matters:
# the first condition that holds decides which message is shown.
if isinstance(data, list):
    # Non-empty list whose members are all dicts -> tabular records.
    if data and all(isinstance(x, dict) for x in data):
        print("\n🔎 تشخیص: لیستی از دیکشنری‌ها (records).")
        summarize_records(data)
    # Non-empty list whose members are all strings.
    elif data and all(isinstance(x, str) for x in data):
        print("\n🔎 تشخیص: لیستی از رشته‌ها.")
        print("تعداد آیتم‌ها:", len(data))
        print("نمونه‌ها:", data[:5])
    # Empty list or mixed member types.
    else:
        print("\n🔎 ساختار لیست ترکیبی/متفاوت است؛ برای تحلیل بیشتر نمونه‌ها را بررسی کنید.")
elif isinstance(data, dict):
    # Non-empty dict whose values are all strings -> treated as title -> text.
    if data and all(isinstance(v, str) for v in data.values()):
        print("\n🔎 تشخیص: دیکشنری نگاشت title -> text.")
        print("تعداد رکورد:", len(data))
        for k, v in list(data.items())[:5]:
            print(f"- عنوان: {k!r} | طول متن: {len(v)}")
    # At least one value is a non-empty list whose first element is a dict.
    elif any(isinstance(v, list) and v and isinstance(v[0], dict) for v in data.values()):
        print("\n🔎 تشخیص: دیکشنری سطح بالا با یک یا چند لیست رکورد.")
        for k, v in data.items():
            # Only the first element is inspected to decide a list holds records.
            if isinstance(v, list) and v and isinstance(v[0], dict):
                print(f"\n> تحلیل لیست رکورد زیر کلید: {k}")
                summarize_records(v)
    # Any other dict shape (including an empty dict).
    else:
        print("\n🔎 دیکشنری با ساختار غیرمتداول / ترکیبی. خروجی پیش‌نمایش را بررسی کنید.")
else:
    # Root is a scalar JSON value.
    print("\n🔎 مقدار ساده‌ی JSON (نه لیست و نه dict):", type(data).__name__)

print("\n✅ تحلیل اولیه انجام شد.")

def quick_summary(obj):
    """Return a one-line description of the root JSON value.

    For a list: its length and a count of member type names among the first
    100 items. For a dict: the number of keys and up to five sample keys.
    For anything else: the type name of the value.
    """
    if not isinstance(obj, (list, dict)):
        return f"ریشه نوع ساده: {type(obj).__name__}"

    if isinstance(obj, dict):
        return f"نوع ریشه: dict | تعداد کلید: {len(obj)} | چند کلید نمونه: {list(obj.keys())[:5]}"

    # Only the first 100 members are sampled for the type histogram.
    head_types = Counter(map(lambda member: type(member).__name__, obj[:100]))
    return f"نوع ریشه: list | طول کل: {len(obj)} | انواع 100 عضو اول: {dict(head_types)}"

# One-line recap of the root value produced by quick_summary.
print("\n📌 خلاصه سریع:", quick_summary(data))


Mounted at /content/drive
🗂 مسیر هدف: /content/drive/MyDrive/HotpotQA_snapshot/all_docs.json

📏 اندازه فایل: 49.75 MB

📥 در حال بارگذاری JSON ...
✅ بارگذاری موفق.

==== پیش‌نمایش ساختار کلی ====
- نوع: dict | تعداد کلیدها: 2
  کلید نمونه: 'titles' -> نوع مقدار: list
    - نوع: list | طول: 4482
      [#0] نوع عضو: str
        - مقدار ساده (str): Anthony Avent
      [#1] نوع عضو: str
        - مقدار ساده (str): Newark, New Jersey
      [#2] نوع عضو: str
        - مقدار ساده (str): Dražen Dalipagić
      ... (4479 عضو دیگر)
  کلید نمونه: 'docs' -> نوع مقدار: list
    - نوع: list | طول: 4482
      [#0] نوع عضو: str
        - مقدار ساده (str): Anthony Avent (born October 18, 1969) is an American former professional bask...
      [#1] نوع عضو: str
        - مقدار ساده (str): Newark (/ˈnjuːərk/ NEW-ərk, locally [nʊəɹk]) is the most populous city in the...
      [#2] نوع عضو: str
        - مقدار ساده (str): Dražen "Praja" Dalipagić (Serbian Cyrillic: Дражен "Праја" Далипагић; born ...
      

In [2]:
# Chunking HotpotQA all_docs.json  

import os, json
from typing import List

# ---- Configuration for the chunking cell ----
FOLDER_NAME           = "HotpotQA_snapshot"         # folder under BASE_PATH for both input and outputs
INPUT_FILENAME        = "all_docs.json"             # source file with "titles" and "docs" lists
OUTPUT_FILENAME       = "all_docs_chunks.json"      # output written as a single JSON array
OUTPUT_JSONL_FILENAME = "all_docs_chunks.jsonl"     # output written as one JSON object per line
BASE_PATH             = "/content/drive/MyDrive"

CHUNK_SIZE_WORDS      = 150     # chunk size passed to chunk_text (whitespace-separated words)
CHUNK_OVERLAP_WORDS   = 20      # overlap passed to chunk_text (words shared by consecutive chunks)
PRINT_SAMPLE          = 3       # how many chunk records to preview at the end
WRITE_JSON_ARRAY      = True    # keep all records in memory and dump them to OUTPUT_PATH
WRITE_JSONL           = False   # stream each record to OUTPUT_JSONL_PATH as it is produced
# -----------------------------------------

# Full paths derived from the settings above.
INPUT_PATH        = os.path.join(BASE_PATH, FOLDER_NAME, INPUT_FILENAME)
OUTPUT_PATH       = os.path.join(BASE_PATH, FOLDER_NAME, OUTPUT_FILENAME)
OUTPUT_JSONL_PATH = os.path.join(BASE_PATH, FOLDER_NAME, OUTPUT_JSONL_FILENAME)

print(f"📥 در حال خواندن فایل ورودی: {INPUT_PATH}")

if not os.path.isfile(INPUT_PATH):
    raise FileNotFoundError("فایل ورودی پیدا نشد.")

# "utf-8-sig" reads plain UTF-8 and also strips a leading byte-order mark.
# With plain "utf-8" a BOM-prefixed file makes json.load fail, even though the
# inspection cell of this notebook accepts such files.
with open(INPUT_PATH, "r", encoding="utf-8-sig") as f:
    raw = json.load(f)

# A non-dict root (e.g. a bare list) has no .get(); map it to None so the
# structure check below raises the intended ValueError instead of AttributeError.
titles = raw.get("titles") if isinstance(raw, dict) else None
docs   = raw.get("docs") if isinstance(raw, dict) else None

# Both keys must hold lists of equal length so they can be zipped pairwise.
if not (isinstance(titles, list) and isinstance(docs, list) and len(titles) == len(docs)):
    raise ValueError("ساختار فایل مطابق انتظار نیست (لیست‌های titles و docs با طول برابر).")

n_records = len(titles)
print(f"✅ تعداد رکوردهای اصلی: {n_records}")

def chunk_text(text: str, chunk_size: int, overlap: int) -> List[str]:
    """Split ``text`` into overlapping windows of whitespace-separated words.

    Consecutive chunks start ``chunk_size - overlap`` words apart, so each
    chunk repeats the last ``overlap`` words of the previous one. Words are
    re-joined with single spaces. The final chunk may be shorter than
    ``chunk_size``.

    Args:
        text: Input text; split on any whitespace.
        chunk_size: Maximum number of words per chunk. Must be at least 1.
        overlap: Number of words shared between consecutive chunks. Must be
            smaller than ``chunk_size``. A negative value skips that many
            words between chunks.

    Returns:
        The list of chunk strings, or an empty list when ``text`` has no words.

    Raises:
        ValueError: If ``chunk_size`` < 1 or ``overlap`` >= ``chunk_size``.
            Such settings would never advance the window and loop forever.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")
    if overlap >= chunk_size:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    words = text.split()
    n = len(words)
    stride = chunk_size - overlap  # strictly positive after the checks above

    chunks = []
    for start in range(0, n, stride):
        end = start + chunk_size
        chunks.append(" ".join(words[start:end]))
        # Stop once a window reaches the last word; later windows would only
        # repeat the tail already emitted.
        if end >= n:
            break
    return chunks

# Open the optional JSON Lines sink up front so records can be streamed to it.
jsonl_f = None
if WRITE_JSONL:
    os.makedirs(os.path.dirname(OUTPUT_JSONL_PATH), exist_ok=True)
    jsonl_f = open(OUTPUT_JSONL_PATH, "w", encoding="utf-8")

all_chunk_records = []   # filled only when WRITE_JSON_ARRAY is enabled
global_id = 0            # running id across all chunks of all documents

print("🔧 در حال تولید قطعات ... (ممکن است کمی زمان ببرد)")

try:
    for original_index, (title, doc) in enumerate(zip(titles, docs)):
        chunks = chunk_text(doc, CHUNK_SIZE_WORDS, CHUNK_OVERLAP_WORDS)
        total_chunks = len(chunks)
        for ci, ch in enumerate(chunks):
            rec = {
                "id": global_id,            # unique id
                "title_chunk": title,       # as requested
                "doc_chunk": ch,
                "chunk_index": ci,
                "total_chunks": total_chunks,
                "original_index": original_index
            }
            if WRITE_JSON_ARRAY:
                all_chunk_records.append(rec)
            if jsonl_f is not None:
                jsonl_f.write(json.dumps(rec, ensure_ascii=False) + "\n")
            global_id += 1
finally:
    # Close the handle even when chunking fails part-way, so it is not leaked
    # and whatever was written so far is flushed.
    if jsonl_f is not None:
        jsonl_f.close()

if WRITE_JSONL:
    print(f"💾 فایل JSONLines ذخیره شد: {OUTPUT_JSONL_PATH}")

print(f"✅ تعداد کل قطعات: {global_id}")

# Dump all collected records as one JSON array and report the file size in MiB.
if WRITE_JSON_ARRAY:
    os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
    with open(OUTPUT_PATH, "w", encoding="utf-8") as f_out:
        # ensure_ascii=False keeps non-ASCII characters unescaped in the output.
        json.dump(all_chunk_records, f_out, ensure_ascii=False)
    out_size_mb = os.path.getsize(OUTPUT_PATH) / (1024*1024)
    print(f"💾 فایل JSON (آرایه‌ای) ذخیره شد: {OUTPUT_PATH}")
    print(f"📏 اندازه فایل خروجی: {out_size_mb:.2f} MB")

print(f"\n🔍 پیش‌نمایش {PRINT_SAMPLE} رکورد اول:")
# Take the preview from memory when available; otherwise read the first lines
# back from the JSONL file that was just written.
preview_list = all_chunk_records[:PRINT_SAMPLE] if WRITE_JSON_ARRAY else []
if not preview_list and WRITE_JSONL:
    with open(OUTPUT_JSONL_PATH, "r", encoding="utf-8") as f_preview:
        for _ in range(PRINT_SAMPLE):
            line = f_preview.readline()
            # An empty string from readline() means end of file.
            if not line:
                break
            preview_list.append(json.loads(line))

for sample in preview_list:
    print("-" * 60)
    print(f"id: {sample['id']}")
    print(f"title_chunk: {sample['title_chunk']}")
    # chunk_index is stored 0-based; shown 1-based next to the total.
    print(f"chunk_index: {sample['chunk_index'] + 1} / {sample['total_chunks']}")
    print(f"original_index: {sample['original_index']}")
    print(f"doc_chunk (کلمات ~{len(sample['doc_chunk'].split())}):")
    # Show at most 400 characters, with an ellipsis when the text was cut.
    print(sample['doc_chunk'][:400] + ("..." if len(sample['doc_chunk']) > 400 else ""))

print("\n✅ کار تمام شد.")


📥 در حال خواندن فایل ورودی: /content/drive/MyDrive/HotpotQA_snapshot/all_docs.json
✅ تعداد رکوردهای اصلی: 4482
🔧 در حال تولید قطعات ... (ممکن است کمی زمان ببرد)
✅ تعداد کل قطعات: 66237
💾 فایل JSON (آرایه‌ای) ذخیره شد: /content/drive/MyDrive/HotpotQA_snapshot/all_docs_chunks.json
📏 اندازه فایل خروجی: 64.99 MB

🔍 پیش‌نمایش 3 رکورد اول:
------------------------------------------------------------
id: 0
title_chunk: Anthony Avent
chunk_index: 1 / 2
original_index: 0
doc_chunk (کلمات ~150):
Anthony Avent (born October 18, 1969) is an American former professional basketball player who was selected by the Atlanta Hawks in the first round (15th pick overall) of the 1991 NBA draft. Born in Rocky Mount, North Carolina, Avent played for the Milwaukee Bucks, Orlando Magic, Vancouver Grizzlies, Utah Jazz and Los Angeles Clippers in six NBA seasons. He played collegiately at Seton Hall Univer...
------------------------------------------------------------
id: 1
title_chunk: Anthony Avent
chunk_index

In [3]:
# Print the preview records again, this time with each chunk's complete text.
# Relies on preview_list built by the chunking cell.
separator = "-" * 60
for sample in preview_list:
    print(separator)
    print(f"id: {sample['id']}")
    print(f"title_chunk: {sample['title_chunk']}")
    # chunk_index is stored 0-based; shown 1-based next to the total.
    print(f"chunk_index: {sample['chunk_index'] + 1} / {sample['total_chunks']}")
    print(f"original_index: {sample['original_index']}")
    chunk_body = sample['doc_chunk']
    print(f"doc_chunk (کلمات ~{len(chunk_body.split())}):")
    print(chunk_body)

print("\n✅ کار تمام شد.")

------------------------------------------------------------
id: 0
title_chunk: Anthony Avent
chunk_index: 1 / 2
original_index: 0
doc_chunk (کلمات ~150):
Anthony Avent (born October 18, 1969) is an American former professional basketball player who was selected by the Atlanta Hawks in the first round (15th pick overall) of the 1991 NBA draft. Born in Rocky Mount, North Carolina, Avent played for the Milwaukee Bucks, Orlando Magic, Vancouver Grizzlies, Utah Jazz and Los Angeles Clippers in six NBA seasons. He played collegiately at Seton Hall University where he played in the 1989 NCAA championship game. Prior to Seton Hall, Avent played at Malcolm X Shabazz High School in Newark, New Jersey.Upon being drafted 15th overall by the Bucks, Avent went on to instead sign with Phonola Caserta of the Italian League. He made this decision after failing to reach a satisfactory contract with the Bucks. After one season in Italy, Avent signed a four-year deal with the Bucks, beginning with a $500