# Document Generation with DSPy

In [1]:
from pathlib import Path
import pandas as pd
from dotenv import load_dotenv
import mlflow
assert(load_dotenv())
import dspy
import random
from pydantic import BaseModel, Field
from typing import Optional
import utils

# Reproducibility: this seeds Python's built-in `random` module only.
random.seed(313)

# Language model: hand DSPy the LM instance to use for this session.
dspy.configure(lm=dspy.LM(model="openai/gpt-4o-mini"))

# Experiment tracking: point MLflow at the server on localhost, select the
# experiment, and switch on DSPy autologging with evals, compiles and
# compile-time traces all enabled.
mlflow.set_tracking_uri(uri="http://localhost:5000")
mlflow.set_experiment(experiment_name="Astrik Doc Gen")
mlflow.dspy.autolog(
    log_traces_from_compile=True,
    log_compiles=True,
    log_evals=True,
)

# Keep this call last so that its return value is what the cell displays.
utils.hello_world()

'Hello, world!'

## Load Synthea Data

In [2]:
def load_csv_tables(data_dir: Path) -> dict[str, pd.DataFrame]:
    """Read every ``*.csv`` file directly inside ``data_dir`` into a DataFrame.

    Args:
        data_dir: Directory holding the CSV exports.

    Returns:
        Mapping from file stem (``"patients"`` for ``patients.csv``) to the
        parsed DataFrame, ordered alphabetically by file path. The mapping is
        empty when no CSV file is found.
    """
    return {
        csv_file.stem: pd.read_csv(
            csv_file,
            header=0,
            # ZIP codes are identifiers, not numbers: with type inference the
            # column becomes an integer and a leading zero is dropped (the
            # profile dump shows "1013" for a Massachusetts address). Tables
            # without a ZIP column are unaffected by this mapping.
            dtype={"ZIP": str},
        )
        # Path.glob yields entries in filesystem order; sort so that the dict
        # order (and the listing printed below) is stable across machines.
        for csv_file in sorted(data_dir.glob("*.csv"))
    }


data_dir = Path("data/synthea")
dataframes = load_csv_tables(data_dir)

# One line per table: its name followed by its column names, sorted.
for key, df in dataframes.items():
    print(f"{key}:\t{sorted(df.columns.tolist())}")


allergies:	['CODE', 'DESCRIPTION', 'ENCOUNTER', 'PATIENT', 'START', 'STOP']
careplans:	['CODE', 'DESCRIPTION', 'ENCOUNTER', 'Id', 'PATIENT', 'REASONCODE', 'REASONDESCRIPTION', 'START', 'STOP']
conditions:	['CODE', 'DESCRIPTION', 'ENCOUNTER', 'PATIENT', 'START', 'STOP']
devices:	['CODE', 'DESCRIPTION', 'ENCOUNTER', 'PATIENT', 'START', 'STOP', 'UDI']
encounters:	['BASE_ENCOUNTER_COST', 'CODE', 'DESCRIPTION', 'ENCOUNTERCLASS', 'Id', 'ORGANIZATION', 'PATIENT', 'PAYER', 'PAYER_COVERAGE', 'PROVIDER', 'REASONCODE', 'REASONDESCRIPTION', 'START', 'STOP', 'TOTAL_CLAIM_COST']
imaging_studies:	['BODYSITE_CODE', 'BODYSITE_DESCRIPTION', 'DATE', 'ENCOUNTER', 'Id', 'MODALITY_CODE', 'MODALITY_DESCRIPTION', 'PATIENT', 'SOP_CODE', 'SOP_DESCRIPTION']
immunizations:	['BASE_COST', 'CODE', 'DATE', 'DESCRIPTION', 'ENCOUNTER', 'PATIENT']
medications:	['BASE_COST', 'CODE', 'DESCRIPTION', 'DISPENSES', 'ENCOUNTER', 'PATIENT', 'PAYER', 'PAYER_COVERAGE', 'REASONCODE', 'REASONDESCRIPTION', 'START', 'STOP', 'TOTALCOS

In [3]:


# Example: turn the first row of the patients table into a profile object.
first_patient = utils.row_to_patient_profile(dataframes["patients"].iloc[0])

# Headline fields first: name, zero-padded birthdate, and whether a death
# date is recorded.
print("Patient: {0.prefix} {0.first} {0.last}".format(first_patient))
print("Birthdate: {0.year}-{0.month:02d}-{0.day:02d}".format(first_patient.birthdate))
print("Alive: " + str(first_patient.deathdate is None))

# Then the complete profile as indented JSON.
print("\nFull profile:")
print(first_patient.model_dump_json(indent=2))

1989-05-25
Patient: Mr. José Eduardo181 Gómez206
Birthdate: 1989-05-25
Alive: True

Full profile:
{
  "id": "1d604da9-9a81-4ba9-80c2-de3375d59b40",
  "birthdate": {
    "year": 1989,
    "month": 5,
    "day": 25
  },
  "ssn": "999-76-6866",
  "first": "José Eduardo181",
  "last": "Gómez206",
  "marital": "M",
  "race": "white",
  "ethnicity": "hispanic",
  "gender": "M",
  "birthplace": "Marigot  Saint Andrew Parish  DM",
  "address": "427 Balistreri Way Unit 19",
  "city": "Chicopee",
  "state": "Massachusetts",
  "county": "Hampden County",
  "deathdate": null,
  "drivers": "S99984236",
  "passport": "X19277260X",
  "prefix": "Mr.",
  "suffix": null,
  "maiden": null,
  "zip": "1013",
  "lat": 42.22835382315942,
  "lon": -72.56295055096882,
  "healthcare_expenses": 271227.08,
  "healthcare_coverage": 1334.88
}


In [None]:
# DSPy signature for the narration task: one structured patient profile in,
# one free-text narrative out.
# NOTE(review): DSPy is understood to pass a Signature's docstring and each
# field's `desc` to the LM as instructions, so treat edits to that text as
# prompt changes rather than documentation changes — confirm against the
# DSPy docs before rewording.
class PatientProfileNarrator(dspy.Signature):
    """Reads a structured patient profile and narrates it as a medical professional would do in the context of a conversation with a colleague. The narrator wants to give a concise summary of the patient profile, focusing on the most relevant attributes for a medical professional in a clinical setting. The colleague should be able to use the information for generating a medical report or for further analysis of the patient profile."""

    # Input: the profile object (utils.PatientProfile) to be narrated.
    patient_profile: utils.PatientProfile = dspy.InputField(
        desc="A structured patient profile from the EHR system. The profile includes relevant attributes like name, gender, birthdate etc. but also attributes that might not be relevant for the conversation like social security number. Some attributes like Address may only be relevant if it is something unusual like homelessness or if its in a tropical country for example.")

    # Output: the generated narrative as a plain string.
    narrative: str = dspy.OutputField(desc="The narrative summary of the patient profile.")

# Wrap the narration signature in dspy.ChainOfThought to get a callable module.
narrator = dspy.ChainOfThought(PatientProfileNarrator)

# Draw one patient row reproducibly. `random.seed(...)` seeds only Python's
# `random` module, which `DataFrame.sample` does not use, so the seed has to
# be handed to `sample` explicitly. 313 matches the notebook-level seed;
# change it to narrate a different patient.
# `.iloc[0]` reduces the one-row frame to a single row (a Series).
# NOTE(review): the error recorded for this cell (a printed
# {910: '1999-12-14'} followed by "'dict' object has no attribute 'split'")
# looks like what passing the one-row DataFrame itself would cause — confirm
# by re-running the cell.
sample_patient = dataframes["patients"].sample(n=1, random_state=313).iloc[0]

patient_profile = utils.row_to_patient_profile(sample_patient)
narrative = narrator(patient_profile=patient_profile).narrative
print(narrative)


{910: '1999-12-14'}


AttributeError: 'dict' object has no attribute 'split'