In [1]:
import json
import math
import random
import re
import uuid
from collections import defaultdict
from pathlib import Path

In [18]:
# Locations are relative to the notebook's working directory.
DATA_DIR = Path("../data/processed_files")
OUT_DIR = Path("../data/fine_tune")
OUT_DIR.mkdir(exist_ok=True, parents=True)  # created on first run, no-op afterwards

# Processed statements to read, one per fiscal year, oldest first.
FILES = [
    DATA_DIR.joinpath(file_name)
    for file_name in (
        "financial_statement_fixed_2024.json",  # FY24 (year-ended Mar-31-2024)
        "financial_statement_fixed_2025.json",  # FY25 (year-ended Mar-31-2025)
    )
]

In [41]:
# ----------------------------
# Load rows from JSON
# ----------------------------
def load_rows(file_path: Path):
    """Read a statement JSON file and split off its header row.

    The document must contain a top-level "rows" list; its first entry is
    treated as the column headers.

    Args:
        file_path: Location of the JSON file (read as UTF-8).

    Returns:
        A ``(headers, rows)`` tuple. ``rows`` is the complete list and still
        contains the header row at index 0.

    Raises:
        KeyError: If the document has no "rows" key.
        IndexError: If "rows" is empty.
    """
    with open(file_path, mode="r", encoding="utf-8") as handle:
        table = json.loads(handle.read())["rows"]
    return table[0], table

In [42]:
# ----------------------------
# Direct Questions (KPI in year)
# ----------------------------
def make_direct_questions(headers, rows, source_file):
    """Build one "What was the <KPI> in <period>?" QA pair per populated cell.

    Args:
        headers: Header row; ``headers[0]`` labels the KPI column and the
            remaining entries name the periods.
        rows: Full table including the header row at index 0 (skipped here).
            ``row[0]`` is the KPI label, the remaining cells are values.
        source_file: Path-like object; only its ``.name`` is recorded.

    Returns:
        A list of dicts with keys "id" (random UUID4 string), "question",
        "answer" and "meta" ({"source", "kpi"}).

    Cells are paired with headers via ``zip``, so extra cells or extra headers
    are ignored. A cell is skipped when it is null, blank, or the "-"
    placeholder; the check is made on the whitespace-trimmed text, so a padded
    " - " is skipped too and padded values appear trimmed in the answer.
    Non-string cells (e.g. JSON numbers) are rendered with ``str()``.
    Empty rows and rows whose label cell is null are skipped.
    """
    qas = []
    for row in rows[1:]:
        # An empty row or a null label has no KPI to ask about.
        if not row or row[0] is None:
            continue
        kpi = str(row[0]).strip()
        for col_name, raw in zip(headers[1:], row[1:]):
            if raw is None:
                continue
            # Trim before testing so " - " and "   " count as missing.
            val = str(raw).strip()
            if not val or val == "-":
                continue
            qas.append({
                "id": str(uuid.uuid4()),
                "question": f"What was the {kpi} in {col_name}?",
                "answer": f"The {kpi} in {col_name} was {val}.",
                "meta": {"source": source_file.name, "kpi": kpi}
            })
    return qas

In [43]:
# ----------------------------
# Comparison Questions (between two periods)
# ----------------------------
def safe_float(x: str):
    """Parse a numeric string with optional thousands separators.

    Commas are removed before parsing, so "61,237" becomes 61237.0.

    Args:
        x: Cell text to parse.

    Returns:
        The parsed float, or None when ``x`` is not a string, is not numeric
        (e.g. "-" or ""), or parses to a non-finite value ("nan", "inf"),
        which cannot be ordered meaningfully against other values.
    """
    try:
        value = float(x.replace(",", ""))
    # Only conversion failures are swallowed; a bare except would also trap
    # KeyboardInterrupt and SystemExit.
    except (AttributeError, TypeError, ValueError):
        return None
    return value if math.isfinite(value) else None

In [44]:
def make_comparison_questions(headers, rows, source_file):
    """Build yes/no "was KPI higher in A than in B?" QA pairs.

    For every data row, each ordered pair of period columns (earlier column
    index first) whose cells both parse via ``safe_float`` yields one question.
    The answer is "Yes" only when the first value is strictly greater; equal
    values answer "No".

    Args:
        headers: Header row; entries from index 1 onward name the periods.
        rows: Full table including the header row at index 0 (skipped here).
        source_file: Path-like object; only its ``.name`` is recorded.

    Returns:
        A list of dicts with keys "id" (random UUID4 string), "question",
        "answer" and "meta" ({"source", "kpi"}).

    Rows shorter than the header are compared over the columns they actually
    have instead of raising IndexError. Empty rows and rows whose label cell
    is null are skipped.
    """
    qas = []
    for row in rows[1:]:
        # An empty row or a null label has no KPI to ask about.
        if not row or row[0] is None:
            continue
        kpi = str(row[0]).strip()

        # Ragged rows: only columns present in both header and row are usable.
        n_cols = min(len(headers), len(row))
        # Parse each cell once rather than once per pair; slot 0 is the label.
        parsed = [None] + [safe_float(cell) for cell in row[1:n_cols]]

        for i in range(1, n_cols):
            na = parsed[i]
            if na is None:
                continue  # skip invalids like "-"
            for j in range(i + 1, n_cols):
                nb = parsed[j]
                if nb is None:
                    continue  # skip invalids like "-"

                pa, pb = headers[i], headers[j]
                qas.append({
                    "id": str(uuid.uuid4()),
                    "question": f"Was '{kpi}' higher in {pa} than in {pb}?",
                    "answer": "Yes" if na > nb else "No",
                    "meta": {"source": source_file.name, "kpi": kpi}
                })
    return qas


In [46]:
# ----------------------------
# Build QAs for all files
# ----------------------------
# NOTE(review): FILES is also assigned in the configuration cell with the same
# two paths; this assignment rebinds it.
FILES = [
    Path("../data/processed_files/financial_statement_fixed_2024.json"),
    Path("../data/processed_files/financial_statement_fixed_2025.json"),
]

# Accumulate direct questions, then comparison questions, file by file.
all_qas = []
for fp in FILES:
    if fp.exists():
        headers, rows = load_rows(fp)
        direct = make_direct_questions(headers, rows, fp)
        comp = make_comparison_questions(headers, rows, fp)
        all_qas += direct + comp
    else:
        # A missing input is reported and skipped rather than aborting the run.
        print("❌ Missing:", fp)

print(f"✅ Total QA generated: {len(all_qas)}")
print(json.dumps(all_qas[:5], indent=2))  # preview

✅ Total QA generated: 3752
[
  {
    "id": "4da623ae-3ccf-453b-a23a-dd214f7dff62",
    "question": "What was the Revenue from operations in Q4 FY24?",
    "answer": "The Revenue from operations in Q4 FY24 was 61,237.",
    "meta": {
      "source": "financial_statement_fixed_2024.json",
      "kpi": "Revenue from operations"
    }
  },
  {
    "id": "944f27ff-e4ed-4365-8d45-b7fd6ae56cf1",
    "question": "What was the Revenue from operations in Q3 FY23?",
    "answer": "The Revenue from operations in Q3 FY23 was 60,583.",
    "meta": {
      "source": "financial_statement_fixed_2024.json",
      "kpi": "Revenue from operations"
    }
  },
  {
    "id": "db49c9a4-5786-4afd-8ae5-e21f183a172b",
    "question": "What was the Revenue from operations in Q4 FY23?",
    "answer": "The Revenue from operations in Q4 FY23 was 59,162.",
    "meta": {
      "source": "financial_statement_fixed_2024.json",
      "kpi": "Revenue from operations"
    }
  },
  {
    "id": "4413f228-5853-4efc-bbb7-e5c80

In [48]:
# ----------------------------
# Save outputs
# ----------------------------
out_dir = Path("../data/qa_datasets")
out_dir.mkdir(exist_ok=True, parents=True)

rag_path, ft_path, test_path = (
    out_dir / file_name
    for file_name in ("rag_qas.json", "t5_finetune.jsonl", "test_qas.json")
)

# RAG-style JSON: every record with id, question, answer and meta.
rag_path.write_text(
    json.dumps(all_qas, ensure_ascii=False, indent=2),
    encoding="utf-8",
)

# Fine-tune JSONL: one {"input", "output"} object per line.
with ft_path.open("w", encoding="utf-8") as f:
    for qa in all_qas:
        record = {"input": qa["question"], "output": qa["answer"]}
        f.write(json.dumps(record) + "\n")

# Test subset: the first 10 generated records, same format as the RAG file.
test_path.write_text(
    json.dumps(all_qas[:10], ensure_ascii=False, indent=2),
    encoding="utf-8",
)

print("✅ Saved:")
for label, saved_path in (
    ("  RAG QAs (rich):", rag_path),
    ("  T5 Fine-tune set:", ft_path),
    ("  Test (10):", test_path),
):
    print(label, saved_path)

✅ Saved:
  RAG QAs (rich): ..\data\qa_datasets\rag_qas.json
  T5 Fine-tune set: ..\data\qa_datasets\t5_finetune.jsonl
  Test (10): ..\data\qa_datasets\test_qas.json
