In [1]:
import os
import json
import re

# Paths
input_dir = r"D:\Data\mpep_finetune"
jsonl_file = r"D:\Data\mpep_finetune\mpep_data.jsonl"


def section_from_filename(filename):
    """Return the section number encoded in a section file name, or None.

    A section file is named ``s<digits>.json`` (e.g. ``s704.json`` gives
    ``'704'``). The entire name must match: an unanchored search would also
    accept names such as ``summary_s12.json`` and report a section that has
    no file of its own.
    """
    match = re.fullmatch(r"s(\d+)\.json", filename)
    return match.group(1) if match else None


# Step 1: Collect all section numbers from the .json filenames
expected_sections = set()

# Only the file names matter here; the directory path and sub-directory
# list yielded by os.walk are intentionally ignored.
for _root, _dirs, files in os.walk(input_dir):
    for file in files:
        section = section_from_filename(file)
        if section is not None:
            expected_sections.add(section)  # Section number as a string (e.g., '704')

# Step 2: Extract sections mentioned in mpep_data.jsonl
found_sections = set()

# Compiled once outside the loop; captures the digits that follow "MPEP ".
mpep_pattern = re.compile(r"MPEP (\d+)")

with open(jsonl_file, "r", encoding="utf-8") as f:
    for line in f:
        # Blank lines (commonly a trailing one) are not valid JSON and would
        # make json.loads raise, aborting the whole scan; skip them instead.
        if not line.strip():
            continue
        entry = json.loads(line)
        instruction = entry["instruction"]
        # Only the first "MPEP <number>" mention in the instruction is recorded.
        match = mpep_pattern.search(instruction)
        if match:
            found_sections.add(match.group(1))

# Step 3: Compare and find missing sections
# Sections that have a source file but are never mentioned in the JSONL data.
missing_sections = expected_sections - found_sections

# Step 4: Report missing sections
if missing_sections:
    print("Missing sections:")
    # Section numbers are digit strings; sort by numeric value so that
    # e.g. 704 is listed before 1000 (plain string sorting reverses them).
    for section in sorted(missing_sections, key=int):
        print(f"MPEP {section}")
else:
    print("All sections are covered in mpep_data.jsonl.")



log_file = r"D:\Data\mpep_finetune\missing_sections.log"

# Write the coverage report to disk; the file is overwritten on every run.
with open(log_file, "w", encoding="utf-8") as log:
    if missing_sections:
        log.write("Missing sections:\n")
        # Section numbers are digit strings; sort by numeric value so that
        # e.g. 704 is listed before 1000 (plain string sorting reverses them).
        log.writelines(
            f"MPEP {section}\n" for section in sorted(missing_sections, key=int)
        )
        print(f"Missing sections logged to {log_file}")
    else:
        log.write("All sections are covered in mpep_data.jsonl.\n")
        print("All sections are covered. Log created.")


All sections are covered in mpep_data.jsonl.
All sections are covered. Log created.
