<a href="https://colab.research.google.com/github/markumreed/health_disparities_role_of_gen_ai/blob/main/ccda_and_trends_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🧬 Health Disparities & AI: CCDA Generator + Google Trends Tracker
Generate synthetic CCDA patient records and track public interest trends on AI in healthcare.

In [2]:
!pip install faker lxml pytrends

Collecting faker
  Downloading faker-37.1.0-py3-none-any.whl.metadata (15 kB)
Collecting pytrends
  Downloading pytrends-4.9.2-py3-none-any.whl.metadata (13 kB)
Downloading faker-37.1.0-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m63.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pytrends-4.9.2-py3-none-any.whl (15 kB)
Installing collected packages: faker, pytrends
Successfully installed faker-37.1.0 pytrends-4.9.2


In [12]:
import pandas as pd
import random
from faker import Faker
from lxml import etree
from pytrends.request import TrendReq
from pytrends import exceptions
from IPython.display import display, Markdown
import time

# Setup
fake = Faker()
ethnicities = ["Black", "White", "Hispanic", "Asian", "AIAN", "NHPI"]
CCDA_CODES = {
    "demographics": "21112-8",
    "social_history": "29762-2",
    "conditions": "11450-4",
    "medications": "10160-0",
    "procedures": "47519-4"
}


In [5]:
# Load dataset
df = pd.read_csv("healthcare_dataset.csv")
df.head()


Unnamed: 0,Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Room Number,Admission Type,Discharge Date,Medication,Test Results
0,Bobby JacksOn,30,Male,B-,Cancer,2024-01-31,Matthew Smith,Sons and Miller,Blue Cross,18856.281306,328,Urgent,2024-02-02,Paracetamol,Normal
1,LesLie TErRy,62,Male,A+,Obesity,2019-08-20,Samantha Davies,Kim Inc,Medicare,33643.327287,265,Emergency,2019-08-26,Ibuprofen,Inconclusive
2,DaNnY sMitH,76,Female,A-,Obesity,2022-09-22,Tiffany Mitchell,Cook PLC,Aetna,27955.096079,205,Emergency,2022-10-07,Aspirin,Normal
3,andrEw waTtS,28,Female,O+,Diabetes,2020-11-18,Kevin Wells,"Hernandez Rogers and Vang,",Medicare,37909.78241,450,Elective,2020-12-18,Ibuprofen,Abnormal
4,adrIENNE bEll,43,Female,AB+,Cancer,2022-09-19,Kathleen Hanna,White-White,Aetna,14238.317814,458,Urgent,2022-10-09,Penicillin,Abnormal


In [6]:
def create_ccda_from_row(row):
    root = etree.Element("ClinicalDocument")

    demo = etree.SubElement(root, "section", code=CCDA_CODES["demographics"])
    etree.SubElement(demo, "name").text = row["Name"].title()
    etree.SubElement(demo, "age").text = str(row["Age"])
    etree.SubElement(demo, "gender").text = row["Gender"]
    etree.SubElement(demo, "blood_type").text = row["Blood Type"]
    etree.SubElement(demo, "ethnicity").text = random.choice(ethnicities)

    social = etree.SubElement(root, "section", code=CCDA_CODES["social_history"])
    etree.SubElement(social, "insurance").text = row["Insurance Provider"]
    etree.SubElement(social, "admission_type").text = row["Admission Type"]
    etree.SubElement(social, "billing_amount").text = f"{row['Billing Amount']:.2f}"

    conditions = etree.SubElement(root, "section", code=CCDA_CODES["conditions"])
    etree.SubElement(conditions, "condition").text = row["Medical Condition"]
    etree.SubElement(conditions, "test_result").text = row["Test Results"]

    meds = etree.SubElement(root, "section", code=CCDA_CODES["medications"])
    etree.SubElement(meds, "medication").text = row["Medication"]

    procs = etree.SubElement(root, "section", code=CCDA_CODES["procedures"])
    etree.SubElement(procs, "procedure").text = f"Admitted on {row['Date of Admission']} for {row['Admission Type']}"

    return etree.tostring(root, pretty_print=True).decode("utf-8")


In [7]:
# Generate and display a sample XML
sample_xml = create_ccda_from_row(df.iloc[0])
print(sample_xml)


<ClinicalDocument>
  <section code="21112-8">
    <name>Bobby Jackson</name>
    <age>30</age>
    <gender>Male</gender>
    <blood_type>B-</blood_type>
    <ethnicity>Hispanic</ethnicity>
  </section>
  <section code="29762-2">
    <insurance>Blue Cross</insurance>
    <admission_type>Urgent</admission_type>
    <billing_amount>18856.28</billing_amount>
  </section>
  <section code="11450-4">
    <condition>Cancer</condition>
    <test_result>Normal</test_result>
  </section>
  <section code="10160-0">
    <medication>Paracetamol</medication>
  </section>
  <section code="47519-4">
    <procedure>Admitted on 2024-01-31 for Urgent</procedure>
  </section>
</ClinicalDocument>



In [15]:
keywords = ["AI in healthcare", "health disparities", "artificial intelligence bias"]
pytrends = TrendReq(hl='en-US', tz=360)
pytrends.build_payload(keywords, cat=0, timeframe='today 5-y', geo='', gprop='')
data = pytrends.interest_over_time()

TooManyRequestsError: The request failed: Google returned a response with code 429