In [1]:
!pip install langchain
!pip install langchain-community
!pip install pymupdf
!pip install langchain-core
!pip install langchain-text-splitters
!pip install lxml
!pip install python-docx
!pip -q install openai



In [2]:
import re
from langchain_community.document_loaders import PyMuPDFLoader


# Source SOP PDFs to process, given as paths relative to the working directory.
files = [
    "SOP1.pdf",
    "SOP2.pdf",
    "SOP3.pdf",
]


def clean_text(t):
    """Flatten raw PDF-extracted text into one whitespace-normalized line.

    Steps, in order:
      1. Normalize CRLF / CR line endings to LF.
      2. Rejoin words hyphenated across a line break
         ("chemi-<newline>cal" -> "chemical").
      3. Keep the hyphen when the break splits an identifier or range that
         involves a digit ("EHS-<newline>00005" -> "EHS-00005").
      4. Remove "Page X of Y" footers (case-insensitive).
      5. Turn every remaining newline into a space and collapse whitespace runs.

    Limitation: a genuinely hyphenated compound that happens to break at the
    hyphen ("well-<newline>known") is still joined without its hyphen; the two
    cases cannot be told apart from the text alone.

    Args:
        t: Raw text, e.g. page contents joined with newlines.

    Returns:
        The cleaned text with no leading or trailing whitespace.
    """
    # Line endings must be normalized BEFORE de-hyphenation; otherwise a
    # hyphen followed by "\r\n" never matches the hyphen+newline pattern.
    t = t.replace("\r\n", "\n").replace("\r", "\n")

    # Typesetting hyphenation: letters on both sides of the break, so the
    # hyphen is dropped. Blanks around the break are tolerated.
    t = re.sub(r"(?<=[^\W\d_])-[ \t]*\n[ \t]*(?=[^\W\d_])", "", t)

    # Whatever still matches here has a non-letter word character next to the
    # break (document numbers, numeric ranges). Keep the hyphen and drop only
    # the break, so "10-\n20" stays "10-20" instead of becoming "1020".
    t = re.sub(r"(?<=\w)-[ \t]*\n[ \t]*(?=\w)", "-", t)

    # Remove "Page X of Y" (common PDF artifact).
    t = re.sub(r"\bPage\s+\d+\s+of\s+\d+\b", " ", t, flags=re.IGNORECASE)

    # Unwrap all remaining lines and normalize whitespace in one pass
    # (\s already covers newlines).
    return re.sub(r"\s+", " ", t).strip()


def load_pdf_as_text(path):
    """Load the PDF at ``path`` with PyMuPDFLoader and return its text as one string.

    The ``page_content`` of every loaded item is concatenated in loader order,
    with a single newline between consecutive items.
    """
    documents = PyMuPDFLoader(path).load()
    page_texts = (document.page_content for document in documents)
    return "\n".join(page_texts)


# Build {filename: cleaned text}. map() is lazy, so each PDF is still loaded
# and cleaned one at a time, in the order given by `files`.
cleaned_sops = {}
for f, raw in zip(files, map(load_pdf_as_text, files)):
    cleaned_sops[f] = clean_text(raw)

# Preview the first 800 characters of SOP1 as a quick sanity check.
print(cleaned_sops["SOP1.pdf"][:800])



Doc. No. & Rev. EHS-00005 R17 Hard copy of this document, if not marked “CONTROLLED” in red, is by definition uncontrolled and may be out of date. NY Creates Processed by Document Control on September 26, 2025 Standard Operating Procedure for Chemical Handling and Storage REVISION Rev No. DCN No. Change Summary Release Date DCN Initiator Document Owner 17 DCN4493 Updates for branding and logo. October 2025 C. Treacy D. Brookhart K. Rydberg Prior revision history, if applicable, is available from the Document Control Office. Standard Operating Procedure for Chemical Handling and Storage EHS-00005 R17 Printed copies are considered uncontrolled. Verify revision prior to use. DCN4493 NY Creates 1. PURPOSE & SCOPE The purpose of this document is to provide minimum requirements for the safe hand


In [3]:
# Prompt template (v1) for the SOP rewrite: it fixes the 13 section headings the
# model must emit and the rewriting rules. build_prompt() places this text ahead
# of the cleaned source SOP.
SOP_TEMPLATE_V1 = """
Rewrite the SOP into this structure (use headings exactly):

1. Purpose
2. Scope
3. Definitions (only if present)
4. Responsibilities
5. Hazards and Risks
6. Engineering Controls
7. PPE
8. Materials / Equipment
9. Procedure (numbered steps)
10. Storage Requirements
11. Waste Handling and Disposal
12. Spill / Emergency Response
13. References (only if present)

Rules:
- Keep all safety requirements from the source.
- Do not invent requirements or chemicals not in the source.
- Use clear, unambiguous language (avoid "ensure", "as needed", "as appropriate").
- If a section is not in the source, write "Not specified in source SOP."
"""


In [5]:
import os
from google.colab import userdata

# Copy the key from Colab's userdata store into the process environment so it is
# never written into the notebook itself.
os.environ["OPENAI_API_KEY"] = userdata.get("OPENAI_API_KEY")


from openai import OpenAI
# Module-level client used by call_llm(). No key is passed explicitly; presumably
# it is read from the OPENAI_API_KEY environment variable — confirm against the
# openai package documentation.
client = OpenAI()

def call_llm(
    prompt,
    model="gpt-4o-mini",
    temperature=0.2,
    system_prompt="You are a careful chemical engineering documentation assistant.",
):
    """Send one user prompt to the chat-completions endpoint and return the reply text.

    Args:
        prompt: Text sent as the user message.
        model: Chat model name.
        temperature: Sampling temperature forwarded to the API.
        system_prompt: Text sent as the system message.

    Returns:
        The content of the first choice's message, as a string.

    Raises:
        RuntimeError: If the first choice carries no text content. The finish
            reason reported by the API is included in the message.

    Warns:
        UserWarning: If the API reports ``finish_reason == "length"``, i.e. the
            returned text was cut off and is incomplete.
    """
    resp = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
        temperature=temperature,
    )
    choice = resp.choices[0]
    content = choice.message.content

    if content is None:
        # Fail here with context instead of handing None to callers that
        # expect text (writing it to a file would raise a less helpful error).
        raise RuntimeError(
            f"Model {model!r} returned no text content "
            f"(finish_reason={choice.finish_reason!r})."
        )

    if choice.finish_reason == "length":
        # A truncated rewrite silently drops whatever came after the cut, so
        # surface it while still returning the partial text as before.
        import warnings

        warnings.warn(
            "Model output was cut off at the token limit "
            "(finish_reason='length'); the returned text is incomplete.",
            stacklevel=2,
        )

    return content


In [6]:
import os
from docx import Document

# Directory that receives the rewritten SOP files. It is created up front, and
# exist_ok=True keeps a re-run of this cell from failing when it already exists.
OUTPUT_DIR = "rewritten_sops"
os.makedirs(OUTPUT_DIR, exist_ok=True)

def save_txt(path: str, text: str):
    """Write ``text`` to ``path`` as UTF-8, replacing any existing file content."""
    with open(path, mode="w", encoding="utf-8") as out_file:
        out_file.write(text)

def save_docx(path: str, text: str):
    """Save ``text`` as a Word document at ``path``, one paragraph per line.

    ``text`` is split on newline characters only; blank lines are passed
    through as empty paragraphs.
    """
    document = Document()
    paragraphs = text.split("\n")
    for paragraph in paragraphs:
        document.add_paragraph(paragraph)
    document.save(path)

def build_prompt(cleaned_text: str) -> str:
    """Assemble the full LLM prompt for one SOP.

    Layout: the SOP_TEMPLATE_V1 instructions, a blank line, a "SOURCE SOP:"
    label on its own line, the cleaned source text, and a trailing newline.
    """
    return f"{SOP_TEMPLATE_V1}\n\nSOURCE SOP:\n{cleaned_text}\n"

# Rewritten SOP text per source filename, kept in memory alongside the saved files.
results = {}

for f in files:
    # One LLM call per SOP: template plus cleaned source in, restructured SOP out.
    prompt = build_prompt(cleaned_sops[f])
    rewritten = results[f] = call_llm(prompt)

    # "SOP1.pdf" -> "SOP1"; both output files share this stem inside OUTPUT_DIR.
    base = os.path.splitext(os.path.basename(f))[0]
    txt_path = os.path.join(OUTPUT_DIR, f"{base}_rewritten.txt")
    docx_path = os.path.join(OUTPUT_DIR, f"{base}_rewritten.docx")

    # Persist the same text in both formats, plain text first.
    save_txt(txt_path, rewritten)
    save_docx(docx_path, rewritten)

    print(f"Saved: {txt_path} and {docx_path}")

Saved: rewritten_sops/SOP1_rewritten.txt and rewritten_sops/SOP1_rewritten.docx
Saved: rewritten_sops/SOP2_rewritten.txt and rewritten_sops/SOP2_rewritten.docx
Saved: rewritten_sops/SOP3_rewritten.txt and rewritten_sops/SOP3_rewritten.docx
