In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from src.config import DATA_PATH
from src.utils import print_banner


In [None]:
# Announce the stage, then read the patient CSV into `df`, the frame that
# every later cell in this notebook works from.
# NOTE(review): DATA_PATH is imported at the top of the notebook but this cell
# reads a hard-coded relative path instead, so the load depends on the
# kernel's working directory -- confirm which of the two is intended.
# NOTE(review): the banner says "raw" while the file lives under
# data/preprocessed/ -- confirm the wording.
print_banner("Loading raw patient data")
patient_csv = "data/preprocessed/synthetic_patient_data.csv"
df = pd.read_csv(filepath_or_buffer=patient_csv)

# Final expression of the cell: preview the first rows.
df.head()

In [None]:
# Structural overview of the frame. DataFrame.info() prints its report
# directly, so it is visible regardless of its position in the cell.
df.info()

# A notebook cell only renders its final expression automatically. A bare
# `df.describe()` in the middle of the cell would be computed and thrown away,
# so the summary statistics are handed to display() explicitly.
display(df.describe())

# Missing-value count per column; as the last expression it is rendered as
# the cell's output.
df.isnull().sum()

In [None]:
# Bar chart of how many rows fall into each `disease_risk` value.
# countplot returns the Axes it drew on, so the labelling is applied to that
# object directly.
target_ax = sns.countplot(data=df, x="disease_risk")
target_ax.set_title("Target Distribution")
target_ax.set_xlabel("Disease Risk")
target_ax.set_ylabel("Count")
target_ax.grid(True)
plt.show()

In [None]:
# Names of every numeric column in `df`; `numeric_cols` is reused by the
# correlation heatmap cell further down.
numeric_cols = df.select_dtypes(include=np.number).columns.tolist()

# One 30-bin histogram per numeric column, laid out on a single 12x10 figure.
numeric_frame = df[numeric_cols]
numeric_frame.hist(bins=30, figsize=(12, 10))

# The figure created by DataFrame.hist is the current one, so the shared
# title is attached to it.
plt.gcf().suptitle("Feature Distributions", fontsize=16)
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between the numeric columns
# (relies on `numeric_cols` from the feature-distribution cell).
plt.figure(figsize=(10, 8))
corr_matrix = df[numeric_cols].corr()
heatmap_ax = sns.heatmap(
    corr_matrix,
    annot=True,   # write the coefficient inside each cell
    fmt=".2f",    # two decimals per annotation
    cmap="coolwarm",
)
heatmap_ax.set_title("Feature Correlation Matrix")
plt.show()

In [None]:
# Pair plot of a hand-picked subset of columns, coloured by the target
# column `disease_risk`.
key_features = ["age", "bmi", "glucose", "blood_pressure", "disease_risk"]
sns.pairplot(df[key_features], hue="disease_risk")

# Finish with plt.show() like every other plotting cell. Without it the
# cell's last expression is the PairGrid returned by pairplot, whose repr is
# emitted as cell output alongside the figure.
plt.show()