<a href="https://colab.research.google.com/github/nattaran/HealthTequity-LLM/blob/main/generateBloodPressureData.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

🩺 Generate Synthetic Blood Pressure Dataset
This notebook generates a realistic 30-day synthetic blood pressure dataset
for a single individual (columns: date, age, sex, systolic, diastolic, and BP category)
and saves it as a CSV file whose name includes the individual's age and sex.

Example output file:
synthetic_bp_45_female.csv

Author: Nasrin Attaran
Created: 2025-10-19
Project: HealthTequity Voice Pipeline


# **🧰 1. Setup & Imports**

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from pathlib import Path
from google.colab import drive


# **📁 2. Mount Google Drive**

In [None]:
# Attach Google Drive to the Colab filesystem so the generated CSV can be
# written to it.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

# **📂 3. Define Output Folder**

In [None]:
# Folder on the mounted Drive where the synthetic CSV is written.
DATA_DIR = Path("/content/drive/MyDrive/HealthTequity-LLM/data/synthetic_csv")

# Create the folder (and any missing parents); a no-op when it already exists.
DATA_DIR.mkdir(exist_ok=True, parents=True)

# **⚙️ 4. Define User Parameters**

In [None]:
# Synthetic patient profile. `age` and `sex` are written to every row of the
# dataset and are embedded in the output filename.
age = 77
sex = "male"

# Number of consecutive daily readings to generate.
num_days = 30

# Seed for NumPy's global random generator. Every random draw in the cells
# below uses this generator, so a fixed seed makes a Restart & Run All
# reproduce the same readings.
SEED = 42
np.random.seed(SEED)

# **📆 5. Generate Dates**

In [None]:
# Build one entry per day for the `num_days`-day window that ends today.
# "Now" is truncated to midnight so each entry is a clean calendar date
# instead of a timestamp carrying the wall-clock time at which this cell ran.
today = datetime.today().replace(hour=0, minute=0, second=0, microsecond=0)

# The window includes today, hence `num_days - 1` days back for the first entry.
start_date = today - timedelta(days=num_days - 1)
dates = pd.date_range(start=start_date, periods=num_days, freq="D")

# **🔀 6. Simulate Blood Pressure States**

In [None]:
# Draw an independent state label for every day: "normal" with probability
# 0.6 and "hypertensive" with probability 0.4.
state_labels = ["normal", "hypertensive"]
state_probabilities = [0.6, 0.4]
states = np.random.choice(state_labels, size=num_days, p=state_probabilities)



# **🧪 7. Generate Blood Pressure Readings**


In [None]:
# (mean, standard deviation) in mmHg of the normal distribution each reading
# is drawn from, keyed by the day's simulated state.
BP_PARAMS = {
    "normal": {"systolic": (118, 6), "diastolic": (76, 4)},
    "hypertensive": {"systolic": (148, 10), "diastolic": (96, 6)},
}

systolic = []
diastolic = []

for state in states:
    # Any label other than "normal" falls back to the hypertensive parameters.
    params = BP_PARAMS["normal" if state == "normal" else "hypertensive"]

    # Draw systolic first, then diastolic: the order of draws determines which
    # values come out of the seeded global generator.
    # Descriptive names are used so the standard-library module name `sys`
    # is not shadowed in the notebook's global namespace.
    systolic_reading = np.random.normal(*params["systolic"])
    diastolic_reading = np.random.normal(*params["diastolic"])

    # Store readings as whole mmHg values.
    systolic.append(round(systolic_reading))
    diastolic.append(round(diastolic_reading))


# **🧾 8. Create DataFrame**

In [None]:
# Assemble the dataset: one row per day. The scalar `age` and `sex` values are
# broadcast to every row; `bp_category` holds the simulated state label.
date_strings = list(dates.strftime("%Y-%m-%d"))

columns = {
    "date": date_strings,
    "age": age,
    "sex": sex,
    "systolic_mmHg": systolic,
    "diastolic_mmHg": diastolic,
    "bp_category": states,
}
df = pd.DataFrame(columns)


# **💾 9. Save Dataset to Drive**

In [None]:
# The output name encodes the patient's age and lower-cased sex.
filename = f"synthetic_bp_{age}_{sex.lower()}.csv"
output_csv = DATA_DIR.joinpath(filename)

# Write without the row index; "utf-8-sig" prepends a byte-order mark.
df.to_csv(output_csv, encoding="utf-8-sig", index=False)

print(f"✅ File saved to Google Drive: {output_csv.resolve()}")


# **👀 10. Display Sample & Stats**

In [None]:
# Show the first 10 rows as a rich table.
preview_rows = df.head(10)
display(preview_rows)

# Count how many days fall into each simulated category.
category_counts = df["bp_category"].value_counts()
print("\n📊 Category distribution:")
print(category_counts)
