In [None]:
import os

import pandas as pd
from sqlalchemy import create_engine, text
from sqlalchemy.engine import URL

# ----------------------------
# Course database defaults
# ----------------------------
# Fallback connection settings, used whenever the environment variable of
# the same name is not set.
DB_DEFAULTS = dict(
    PGHOST="db",
    PGPORT="5432",
    PGDATABASE="data_science",
    PGUSER="dataScience_user",
    PGPASSWORD="data_science",
)

# Environment values take precedence over the defaults. The unpacking order
# follows the insertion order of DB_DEFAULTS.
PGHOST, PGPORT, PGDATABASE, PGUSER, PGPASSWORD = (
    os.environ.get(name, fallback) for name, fallback in DB_DEFAULTS.items()
)

SCHEMA = "data_science"  # keep schema explicit for teaching

# Build the URL from its parts instead of interpolating them into a string:
# URL.create() escapes characters such as "@", ":" or "/" in the user name or
# password, which would otherwise corrupt the connection string.
engine = create_engine(
    URL.create(
        "postgresql+psycopg2",
        username=PGUSER,
        password=PGPASSWORD,
        host=PGHOST,
        port=int(PGPORT),  # settings are read as strings; the URL takes an int
        database=PGDATABASE,
    ),
    pool_pre_ping=True,
)

# Fail fast if DB connection isn't ready
with engine.connect() as conn:
    conn.execute(text("SELECT 1;"))

def run_sql(query: str, params=None) -> pd.DataFrame:
    """Run a SQL query on the module-level ``engine`` and return a DataFrame.

    Args:
        query: SQL text to execute.
        params: Optional bind parameters, forwarded unchanged to
            ``pandas.read_sql``. The placeholder syntax depends on the
            database driver. Prefer this over formatting values into
            ``query``. Defaults to ``None`` (no parameters), which matches
            the previous behaviour.

    Returns:
        The query result as a DataFrame.
    """
    # `engine` is resolved at call time, so a cell that rebinds `engine`
    # changes which database this helper talks to.
    return pd.read_sql(query, engine, params=params)

print("✅ Connected to:", PGDATABASE, "as", PGUSER, "on", PGHOST)


✅ Connected to: data_science as dataScience_user on db


In [3]:
# Rebinds SCHEMA, replacing the value assigned in the setup cell.
SCHEMA = "public"


In [None]:
# List the tables of the schema selected by SCHEMA. The schema name is sent
# as a bound parameter instead of being hard-coded in the SQL, so changing
# SCHEMA is enough to inspect a different schema.
pd.read_sql(
    text(
        """
SELECT table_schema, table_name
FROM information_schema.tables
WHERE table_schema = :schema
ORDER BY table_name;
"""
    ),
    engine,
    params={"schema": SCHEMA},
)



Unnamed: 0,table_schema,table_name


In [6]:
from sqlalchemy import create_engine, text

# Re-create the engine from the settings resolved in the setup cell rather
# than from a literal URL, so PG* environment overrides are honoured here as
# well and the credentials are not repeated in the notebook.
engine = create_engine(
    URL.create(
        "postgresql+psycopg2",
        username=PGUSER,
        password=PGPASSWORD,
        host=PGHOST,
        port=int(PGPORT),  # settings are read as strings; the URL takes an int
        database=PGDATABASE,
    ),
    pool_pre_ping=True,
)

# Fail fast if the database is not reachable
with engine.connect() as conn:
    conn.execute(text("SELECT 1;"))

engine




Engine(postgresql+psycopg2://dataScience_user:***@db:5432/data_science)

In [None]:
# Shape of the dataset
patients.shape


In [None]:
# Column names
patients.columns


In [None]:
# Basic info
patients.info()


In [None]:
from pathlib import Path
import pandas as pd

DATA_DIR = Path("..") / "data"
patients = pd.read_csv(DATA_DIR / "patients.csv")

patients.head()


In [None]:
# The queries that follow read from the `hi5304` database, so the shared
# `engine` name is re-pointed here. Credentials can be overridden through the
# environment (the literals are only fallbacks), and URL.create() escapes any
# special characters they contain. Host and port reuse the setup-cell values,
# whose defaults are the same "db" / 5432 used previously.
engine = create_engine(
    URL.create(
        "postgresql+psycopg2",
        username=os.getenv("HI5304_USER", "hi5304_user"),
        password=os.getenv("HI5304_PASSWORD", "hi5304_password"),
        host=PGHOST,
        port=int(PGPORT),
        database=os.getenv("HI5304_DATABASE", "hi5304"),
    ),
    pool_pre_ping=True,  # same liveness check as the other engines in this notebook
)


In [None]:
# Preview: first five rows of hi5304.patients
query = """
SELECT *
FROM hi5304.patients
LIMIT 5;
"""

# run_sql() issues the same pd.read_sql call against the current engine
df_patients = run_sql(query)
df_patients


In [None]:
# Join each blood-pressure reading to its patient, ordered by reading date
query = """
SELECT
    p.patient_id,
    p.first_name,
    p.last_name,
    b.reading_date,
    b.systolic,
    b.diastolic,
    b.heart_rate
FROM hi5304.patients p
JOIN hi5304.bp_readings b
  ON p.patient_id = b.patient_id
ORDER BY b.reading_date;
"""

# run_sql() issues the same pd.read_sql call against the current engine
df_bp = run_sql(query)
df_bp.head()



In [None]:
# Number of medication rows per patient, largest count first
query = """
SELECT
    patient_id,
    COUNT(*) AS medication_count
FROM hi5304.medications
GROUP BY patient_id
ORDER BY medication_count DESC;
"""

# run_sql() issues the same pd.read_sql call against the current engine
df_meds = run_sql(query)
df_meds


In [None]:
# Overall averages and row count for the blood-pressure readings table
query = """
SELECT
    AVG(systolic) AS avg_systolic,
    AVG(diastolic) AS avg_diastolic,
    COUNT(*) AS total_readings
FROM hi5304.bp_readings;
"""

# run_sql() issues the same pd.read_sql call against the current engine
df_summary = run_sql(query)
df_summary


In [None]:
# Summary statistics for the joined patient / blood-pressure frame
df_bp.describe()


In [None]:
# Mean systolic and diastolic pressure for each patient
(
    df_bp
    .groupby("patient_id")[["systolic", "diastolic"]]
    .agg("mean")
)


In [None]:
import matplotlib.pyplot as plt

# Line chart of systolic pressure against reading date
df_bp.plot.line(x="reading_date", y="systolic")
plt.show()


In [None]:
from sqlalchemy import create_engine
import pandas as pd


In [None]:
# Load hi5304.cardio1, keeping only rows where both sbp and cardio are present
query = """
SELECT * 
FROM hi5304.cardio1
WHERE sbp IS NOT NULL
  AND cardio IS NOT NULL;
"""
# run_sql() issues the same pd.read_sql call against the current engine
df = run_sql(query)

df.head()


In [None]:
# Number of rows for each value of the cardio column
df["cardio"].value_counts()


In [None]:
# Descriptive statistics of SBP for each cardio group, computed directly from
# `df` so this cell does not depend on variables that are only assigned in a
# later cell. The result is one table with a row per cardio value.
df.groupby("cardio")["sbp"].describe()


In [None]:
from scipy.stats import f_oneway


In [None]:
# Split SBP by cardio status here, at the point of first use, so the test
# runs top-to-bottom on a fresh kernel.
sbp_cardio_0 = df.loc[df["cardio"] == 0, "sbp"]
sbp_cardio_1 = df.loc[df["cardio"] == 1, "sbp"]

# One-way ANOVA on SBP between the two cardio groups
f_stat, p_value = f_oneway(sbp_cardio_0, sbp_cardio_1)

f_stat, p_value


In [None]:
import matplotlib.pyplot as plt

# Box plot of SBP for each cardio group
df.boxplot(column="sbp", by="cardio")

# Label the axes that boxplot() just drew on
plt.gca().set(
    xlabel="Cardio (0 = No CVD, 1 = CVD)",
    ylabel="Systolic Blood Pressure (mmHg)",
    title="SBP by Cardiovascular Disease Status",
)
# Blank out the figure-level title
plt.gcf().suptitle("")
plt.show()


In [None]:
# SBP values split by cardio status (0 and 1)
sbp_cardio_0 = df.loc[df["cardio"].eq(0), "sbp"]
sbp_cardio_1 = df.loc[df["cardio"].eq(1), "sbp"]


In [None]:
from scipy.stats import f_oneway

# One-way ANOVA on SBP between the two cardio groups.
# NOTE(review): this is the same call as the earlier ANOVA cell; here it runs
# after sbp_cardio_0 / sbp_cardio_1 have been assigned.
f_stat, p_value = f_oneway(sbp_cardio_0, sbp_cardio_1)
f_stat, p_value
