In [None]:
# --- Imports ---

import pandas as pd
from pathlib import Path
from tqdm.notebook import tqdm
from transformers import AutoTokenizer

In [None]:
# Do word count analysis on markdown files

data_dir = Path("../data/NICE_Guidelines_MD")


def collect_word_counts(directory):
    """Count whitespace-separated words in each ``*.md`` file of ``directory``.

    Args:
        directory: Folder searched (non-recursively) for ``*.md`` files.

    Returns:
        A list of ``{'guideline_id': str, 'word_count': int}`` records in
        sorted filename order. Files that cannot be read or decoded as UTF-8
        are reported on stdout and left out of the result.
    """
    records = []
    # glob() yields files in filesystem order; sorting keeps re-runs reproducible.
    for md_file in sorted(Path(directory).glob("*.md")):
        try:
            content = md_file.read_text(encoding='utf-8')
        except (OSError, UnicodeError) as e:
            print(f"Error processing {md_file}: {e}")
            continue

        records.append({
            # The stem without the '_structured_document' marker is used as the ID.
            'guideline_id': md_file.stem.replace('_structured_document', ''),
            'word_count': len(content.split())
        })
    return records


if not data_dir.is_dir():
    # The path is relative, so it depends on the kernel's working directory.
    print(f"Warning: {data_dir} is not a directory (resolved to {data_dir.resolve()})")

word_counts = collect_word_counts(data_dir)

# Explicit columns keep the frame well-formed even when nothing was read,
# so the statistics below cannot fail with a KeyError on 'word_count'.
df = pd.DataFrame(word_counts, columns=['guideline_id', 'word_count'])

print(f"Total files: {len(df)}")
if df.empty:
    print("No markdown files were read; skipping word count statistics.")
else:
    print(f"Total words: {df['word_count'].sum():,}")
    print(f"Average words per file: {df['word_count'].mean():.0f}")
    print(f"Min words: {df['word_count'].min()}")
    print(f"Max words: {df['word_count'].max()}")

    print("\nWord count distribution:")
    print(df['word_count'].describe())

In [None]:
# Do word count analysis on CG and NG guidelines
# (only files whose name starts with 'CG' or 'NG', case-insensitively)

word_counts = []

# glob() yields files in filesystem order; sorting keeps re-runs reproducible.
for md_file in sorted(data_dir.glob("*.md")):
    if not md_file.stem.upper().startswith(('CG', 'NG')):
        continue

    try:
        content = md_file.read_text(encoding='utf-8')
    except (OSError, UnicodeError) as e:
        print(f"Error processing {md_file}: {e}")
        continue

    word_counts.append({
        # The stem without the '_structured_document' marker is used as the ID.
        'guideline_id': md_file.stem.replace('_structured_document', ''),
        'word_count': len(content.split())
    })

# Explicit columns keep the frame well-formed even when nothing matched,
# so neither the statistics below nor a later merge on 'guideline_id'
# fail with a KeyError.
df = pd.DataFrame(word_counts, columns=['guideline_id', 'word_count'])

print(f"Total CG/NG files: {len(df)}")
if df.empty:
    print("No CG/NG markdown files were read; skipping word count statistics.")
else:
    print(f"Total words: {df['word_count'].sum():,}")
    print(f"Average words per file: {df['word_count'].mean():.0f}")
    print(f"Min words: {df['word_count'].min()}")
    print(f"Max words: {df['word_count'].max()}")

    print("\nWord count distribution:")
    print(df['word_count'].describe())

    print("\nBreakdown by type:")
    cg_files = df[df['guideline_id'].str.upper().str.startswith('CG')]
    ng_files = df[df['guideline_id'].str.upper().str.startswith('NG')]
    print(f"CG files: {len(cg_files)} (Total words: {cg_files['word_count'].sum():,})")
    print(f"NG files: {len(ng_files)} (Total words: {ng_files['word_count'].sum():,})")

In [None]:
# Token counts on NG and CG guidelines
# Tokenizes every CG/NG markdown file and attaches the token count to `df`
# (the CG/NG word-count frame) as a `token_count` column.

tokenizer = AutoTokenizer.from_pretrained("voyageai/voyage-3-large", use_fast=True)

token_counts = []
# Sorted so that progress output and row order are reproducible across runs.
files = sorted(f for f in data_dir.glob("*.md") if f.stem.upper().startswith(("CG", "NG")))
for md_file in tqdm(files, desc="Tokenizing files", unit="file"):
    try:
        content = md_file.read_text(encoding="utf-8")
        # Special tokens are excluded so the count covers the document text only.
        input_ids = tokenizer(content, add_special_tokens=False)["input_ids"]
        token_counts.append({
            "guideline_id": md_file.stem.replace("_structured_document", ""),
            "token_count": len(input_ids)
        })
    except Exception as e:
        # Deliberately broad: one unreadable or untokenizable file should not
        # abort the whole run. Such files are reported as missing below.
        print(f"Error processing {md_file}: {e}")

# Explicit columns keep the merge key present even if every file failed.
df_tokens = pd.DataFrame(token_counts, columns=["guideline_id", "token_count"])

# Drop a token_count column left over from a previous execution of this cell;
# otherwise the merge would create token_count_x / token_count_y and the
# lookups below would raise a KeyError on re-run.
df = df.drop(columns=["token_count"], errors="ignore").merge(
    df_tokens, on="guideline_id", how="left"
)

print(f"Total CG/NG files: {len(df)}")

has_tokens = df["token_count"].notna()
missing_count = int((~has_tokens).sum())
if missing_count:
    print(f"Files without a token count: {missing_count}")

if not has_tokens.any():
    print("No files were tokenized; skipping token count statistics.")
else:
    # The left merge turns the column into floats when any file is missing;
    # cast the available values back to integers for display only.
    token_values = df.loc[has_tokens, "token_count"].astype("int64")

    print(f"Total tokens: {token_values.sum():,}")
    print(f"Average tokens per file: {token_values.mean():.0f}")
    print(f"Min tokens: {token_values.min()}")
    print(f"Max tokens: {token_values.max()}")

    print("\nToken count distribution:")
    print(token_values.describe())

    print("\nBreakdown by type:")
    cg_files = df[df["guideline_id"].str.upper().str.startswith("CG")]
    ng_files = df[df["guideline_id"].str.upper().str.startswith("NG")]
    print(f"CG files: {len(cg_files)} (Total tokens: {int(cg_files['token_count'].sum()):,})")
    print(f"NG files: {len(ng_files)} (Total tokens: {int(ng_files['token_count'].sum()):,})")
