In [0]:
%pip install --upgrade "mlflow[databricks]>=3.1.0" openai "databricks-connect>=16.1"
%pip install faker
# Restart the Python process after the installs above; presumably needed so the
# freshly installed/upgraded packages are the ones imported by the following cells.
dbutils.library.restartPython()

In [0]:
import json
import os
import random
from datetime import datetime, timedelta
from string import Formatter

import faker
import mlflow
import pandas as pd
from faker import Faker
from openai import OpenAI

In [0]:
# Seed both random sources so that every run of this cell produces the same synthetic data.
SEED = 42
random.seed(SEED)
Faker.seed(SEED)

# Instantiate through the imported class rather than `faker.Faker()`: this assignment
# rebinds the name `faker` from the module to the generator instance, so the old form
# raised AttributeError whenever the cell was executed a second time.
faker = Faker()
def random_datetime(start, end):
    """Pick a random datetime in the closed interval [start, end].

    The result is ``start`` plus a uniformly drawn whole number of seconds, so it
    never exceeds ``end``. ``end`` is expected to be no earlier than ``start``.

    Args:
        start: Lower bound (datetime).
        end: Upper bound (datetime).

    Returns:
        A datetime offset from ``start`` by a random whole number of seconds.
    """
    # Fractional seconds in the span are truncated before drawing the offset.
    span_seconds = int((end - start).total_seconds())
    offset = timedelta(seconds=random.randint(0, span_seconds))
    return start + offset

# Generation parameters: number of visits and the date window they are drawn from.
num_records = 200
start_date = datetime(2024, 1, 1)
end_date = datetime(2025, 6, 1)


def _make_visit_record():
    """Build one synthetic patient visit as a dict keyed by output column name."""
    # Values are drawn in a fixed order (patient name, id, date, doctor, department)
    # so the random stream is consumed the same way for every record.
    name_of_patient = faker.name()
    id_of_patient = faker.uuid4()
    visited_at = random_datetime(start_date, end_date)
    name_of_doctor = faker.name()
    chosen_department = random.choice(
        ["Cardiology", "Neurology", "Oncology", "Orthopedics", "General Medicine"]
    )
    return {
        "patient_id": str(id_of_patient),
        "patient_name": name_of_patient,
        "visit_datetime": visited_at.isoformat(),  # stored as an ISO-8601 string
        "doctor_name": name_of_doctor,
        "department": chosen_department,
    }


print(f"Generating {num_records} synthetic patient visits...")
visit_data = [_make_visit_record() for _ in range(num_records)]

# Go through pandas first, then hand the frame to Spark for the table write.
# `spark` is not defined in this notebook; presumably the session provided by the runtime.
visit_df = pd.DataFrame(visit_data)
visit_spark_df = spark.createDataFrame(visit_df)

# Overwrite mode: each run replaces the table contents.
visit_spark_df.write.mode("overwrite").saveAsTable("mzervou.healthcare.synthetic_patient_visits")

print("✅ Patient visit data written to table: mzervou.healthcare.synthetic_patient_visits")


In [0]:
# Catalog and schema that prefix the output table name written by this notebook.
catalog, schema = "mzervou", "healthcare"

# Endpoint name handed to ai_query() when generating the transcripts.
model_name = "databricks-meta-llama-3-3-70b-instruct"

In [0]:
from pyspark.sql.functions import expr

# For a quick trial run, replace `visit_spark_df` below with `visit_spark_df.limit(5)`.

# Prompt sent to the model for every visit. Each {placeholder} names a column of
# visit_spark_df; the SQL expression below is generated from this template so the
# prompt text lives in exactly one place.
prompt_template = """You are a doctor named {doctor_name} in the {department} department. You just completed a consultation with patient {patient_name} on {visit_datetime}. Record an audio note in natural, spoken style summarizing the encounter. Include symptoms, findings, impressions, and plan. Do not use SOAP format. Begin with: 'Okay, today I saw patient {patient_name} ...'"""


def _prompt_to_sql_concat(template):
    """Translate a ``str.format``-style template into a SQL ``concat(...)`` expression.

    Literal text becomes single-quoted SQL string literals (backslashes and single
    quotes are backslash-escaped) and each ``{name}`` placeholder becomes a bare
    column reference, so the prompt is assembled per row by the SQL engine.

    Args:
        template: Prompt text whose placeholders are plain column names.

    Returns:
        The SQL expression as a string, e.g. ``concat('Hi ', name, '!')``.
        An empty template yields the empty SQL literal ``''``.

    Raises:
        ValueError: If a placeholder is not a plain identifier or carries a
            conversion / format spec, which cannot be expressed as a column reference.
    """
    parts = []
    for literal, field, spec, conversion in Formatter().parse(template):
        if literal:
            # Escape backslashes first so the quote escapes added next are not doubled.
            escaped = literal.replace("\\", "\\\\").replace("'", "\\'")
            parts.append(f"'{escaped}'")
        if field is None:
            continue  # trailing literal with no placeholder after it
        if not field.isidentifier() or spec or conversion:
            raise ValueError(f"Unsupported placeholder in prompt template: {{{field}}}")
        parts.append(field)
    if not parts:
        return "''"
    return f"concat({', '.join(parts)})"


# Add one model-generated transcript per visit row via ai_query().
prompt_sql = _prompt_to_sql_concat(prompt_template)
df_transcripts = visit_spark_df.withColumn(
    "audio_transcript",
    expr(f"ai_query('{model_name}', {prompt_sql})"),
)

# Save to the target table (overwrite mode replaces any previous contents).
transcripts_table = f"{catalog}.{schema}.synthetic_audio_transcripts"
df_transcripts.write.mode("overwrite").saveAsTable(transcripts_table)

print(f"✅ Audio transcripts written using ai_query() to table: {transcripts_table}")