# GenAI SOP Rewriter (MVP)

This notebook demonstrates a minimal end-to-end pipeline:
1) Load SOP PDFs  
2) Minimal text cleaning  
3) Send to an LLM using a strict SOP template  
4) Save rewritten SOPs to `.txt` and `.docx`

⚠️ **Draft output only** — requires engineer review.


In [ ]:
!pip -q install langchain-community pymupdf python-docx openai


In [ ]:
import os, re
from langchain_community.document_loaders import PyMuPDFLoader

# Source SOP PDFs to process. The names are relative paths, so they are
# resolved against the current working directory.
FILES = ["SOP1.pdf", "SOP2.pdf", "SOP3.pdf"]

def minimal_clean(t: str) -> str:
    """Flatten raw PDF text into one whitespace-normalised line.

    Steps, in order:

    1. Normalise CRLF / CR line endings to LF so the later steps see a
       single newline convention.
    2. Re-join words that were hyphenated across a line break
       ("haz-\\nard" -> "hazard"). Only a hyphen sitting between two
       letters is treated as a line-break hyphen; a hyphen next to a digit
       is kept, so a wrapped numeric range such as "2-\\n8" is not fused
       into "28".
    3. Remove "Page N of M" footers (case-insensitive).
    4. Collapse every run of whitespace to a single space and strip the
       ends.

    Limitation: a genuinely hyphenated compound that happens to wrap at the
    hyphen ("well-\\nknown") is still joined into one word.

    Args:
        t: Text as extracted from the PDF.

    Returns:
        The cleaned, single-line text.
    """
    # Must run before de-hyphenation, otherwise "-\r\n" is never matched.
    t = t.replace("\r\n", "\n").replace("\r", "\n")
    # [^\W\d_] is "word character minus digits and underscore", i.e. a letter.
    t = re.sub(r"(?<=[^\W\d_])-\n(?=[^\W\d_])", "", t)
    t = re.sub(r"\bPage\s+\d+\s+of\s+\d+\b", " ", t, flags=re.IGNORECASE)
    # \s+ already covers newlines, so no separate newline pass is needed.
    return re.sub(r"\s+", " ", t).strip()

def load_pdf_as_text(path: str) -> str:
    """Load the PDF at *path* and return its page texts joined by newlines.

    Args:
        path: Location of the PDF handed to PyMuPDFLoader.

    Returns:
        The ``page_content`` of every loaded document, in the order the
        loader returned them, separated by a single newline.
    """
    documents = PyMuPDFLoader(path).load()
    page_texts = (document.page_content for document in documents)
    return "\n".join(page_texts)

# Extract and clean every SOP once, keyed by its source path.
cleaned = {}
for pdf_path in FILES:
    cleaned[pdf_path] = minimal_clean(load_pdf_as_text(pdf_path))

# Preview the first 800 characters of the first SOP as a sanity check.
print(cleaned[FILES[0]][:800])


In [ ]:
# Instruction text placed in front of each source SOP by build_prompt.
# It fixes the 13 section headings of the rewritten SOP and the rules the
# model must follow while rewriting.
# NOTE(review): sections 3 and 13 are marked "only if present", while the
# last rule asks for "Not specified in source SOP." when a section is
# missing — confirm which behaviour is intended for those two sections.
SOP_TEMPLATE_V1 = """
Rewrite the SOP into this structure (use headings exactly):

1. Purpose
2. Scope
3. Definitions (only if present)
4. Responsibilities
5. Hazards and Risks
6. Engineering Controls
7. PPE
8. Materials / Equipment
9. Procedure (numbered steps)
10. Storage Requirements
11. Waste Handling and Disposal
12. Spill / Emergency Response
13. References (only if present)

Rules:
- Keep all safety requirements from the source.
- Do not invent requirements or chemicals not in the source.
- Use clear, unambiguous language (avoid "ensure", "as needed", "as appropriate").
- If a section is not in the source, write "Not specified in source SOP."
"""


In [ ]:
from openai import OpenAI

# API key setup. OpenAI() is called without arguments, so the key must be
# discoverable by the client before this cell runs (presumably through the
# OPENAI_API_KEY environment variable — confirm against the openai docs).
# - In Colab, store the key via the sidebar: Secrets → OPENAI_API_KEY.
#   NOTE(review): Colab secrets may have to be copied into os.environ
#   explicitly before this line — verify.
# - Or set it here (do not commit a real key to GitHub):
#   os.environ["OPENAI_API_KEY"] = "..."

# Shared client used by call_llm.
client = OpenAI()

def call_llm(prompt: str, model: str = "gpt-4o-mini") -> str:
    """Send *prompt* to the chat-completions API and return the reply text.

    Args:
        prompt: Full user message (instruction template plus source SOP).
        model: Chat model name passed through to the API.

    Returns:
        The text content of the first returned choice.

    Raises:
        RuntimeError: If the first choice carries no text content. Failing
            here keeps ``None`` from reaching the file writers.

    Warns:
        UserWarning: If the reply stopped because the token limit was hit,
            since a truncated SOP may be missing its later sections.
    """
    import warnings

    resp = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a careful chemical engineering documentation assistant."},
            {"role": "user", "content": prompt},
        ],
        temperature=0.2,
    )
    choice = resp.choices[0]
    content = choice.message.content
    if content is None:
        raise RuntimeError(
            f"LLM returned no text content (finish_reason={choice.finish_reason!r})"
        )
    if choice.finish_reason == "length":
        warnings.warn(
            "LLM output was cut off at the token limit; the rewritten SOP may be incomplete.",
            stacklevel=2,
        )
    return content

def build_prompt(cleaned_text: str) -> str:
    """Return the full LLM prompt for one SOP.

    The prompt is the SOP_TEMPLATE_V1 instructions, a blank line, a
    "SOURCE SOP:" marker line, and then *cleaned_text*.
    """
    return "{}\n\nSOURCE SOP:\n{}".format(SOP_TEMPLATE_V1, cleaned_text)


In [ ]:
from docx import Document

# Folder that receives the rewritten .txt and .docx files.
OUTPUT_DIR = "rewritten_sops"
# exist_ok=True lets this cell be re-run when the folder already exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

def save_txt(path: str, text: str):
    """Write *text* to *path* as UTF-8, replacing any existing file.

    Args:
        path: Destination file path.
        text: Content to write.
    """
    with open(path, mode="w", encoding="utf-8") as out_file:
        out_file.write(text)

def save_docx(path: str, text: str):
    """Save *text* as a .docx file, one paragraph per newline-separated line.

    Args:
        path: Destination .docx path.
        text: Content to write; it is split on "\\n" and each piece
            (including empty ones) becomes its own paragraph.
    """
    document = Document()
    paragraph_texts = text.split("\n")
    for paragraph_text in paragraph_texts:
        document.add_paragraph(paragraph_text)
    document.save(path)

# Rewrite every SOP with the LLM and write both output formats.
# `results` keeps the rewritten text in memory, keyed by source PDF path.
results = {}
for f in FILES:
    # File name without directory or extension, e.g. "SOP1.pdf" -> "SOP1".
    base, _ext = os.path.splitext(os.path.basename(f))

    prompt = build_prompt(cleaned[f])
    rewritten = call_llm(prompt)
    results[f] = rewritten

    txt_path = os.path.join(OUTPUT_DIR, f"{base}_rewritten.txt")
    docx_path = os.path.join(OUTPUT_DIR, f"{base}_rewritten.docx")
    save_txt(txt_path, rewritten)
    save_docx(docx_path, rewritten)

print("Done. Files saved to:", OUTPUT_DIR)
