In [2]:
import os

import pandas as pd

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
import joblib

In [5]:
# Location of the cleaned heart-disease CSV. The HEART_DATA_CSV environment
# variable overrides it so the notebook can run on another machine; when the
# variable is unset, the original absolute path is used unchanged.
heart_csv_path = os.environ.get(
    "HEART_DATA_CSV",
    "/Users/manuelrodriguezsutil/Developer/health-risk/data/heart_disease_cleaned.csv",
)
df_heart = pd.read_csv(heart_csv_path)

In [8]:
# Split the frame into the label column and the model inputs.
# "target" is the label; "source" is excluded from the inputs as well.
non_feature_columns = ["target", "source"]
y = df_heart["target"]
X = df_heart.drop(columns=non_feature_columns)

In [10]:
# Columns routed to the "num" branch of the preprocessor.
num_features = [
    "age",
    "trestbps",
    "chol",
    "thalach",
    "oldpeak",
    "ca",
]

# Columns routed to the "cat" branch of the preprocessor.
cat_features = [
    "cp",
    "slope",
    "thal",
    "exang",
]

In [14]:
# Per-column preprocessing: standardise the numeric columns and one-hot encode
# the categorical ones, dropping the first level of each categorical column.
numeric_scaler = StandardScaler()
# NOTE(review): handle_unknown is left at its default, so transforming a
# category that was not present during fit raises — confirm this is intended
# for inference-time inputs.
categorical_encoder = OneHotEncoder(drop="first")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_scaler, num_features),
        ("cat", categorical_encoder, cat_features),
    ],
    # Explicit spelling of the default: any column of X that is in neither
    # list is discarded before the classifier sees it.
    remainder="drop",
)

In [16]:
# Random-forest hyperparameters, collected in one place so they are easy to tune.
rf_params = {
    "n_estimators": 200,
    "max_depth": 5,
    "min_samples_split": 5,
    "max_features": "log2",
    "random_state": 42,  # fixed seed for reproducible forests
}

# End-to-end model: preprocessing followed by the classifier, so the fitted
# object can be applied directly to raw feature frames.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", RandomForestClassifier(**rf_params)),
    ]
)

In [18]:
# Hold out 20% of the rows, stratified on the label, with a fixed seed so the
# split is reproducible.
split_options = {
    "test_size": 0.2,
    "stratify": y,
    "random_state": 42,
}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [20]:
# Fit the preprocessing steps and the classifier on the training split only.
pipeline.fit(X=X_train, y=y_train)

In [23]:
# Destination for the fitted pipeline. The HEART_MODEL_PATH environment
# variable overrides it so the notebook can run on another machine; when the
# variable is unset, the original absolute path is used unchanged.
model_path = os.environ.get(
    "HEART_MODEL_PATH",
    "/Users/manuelrodriguezsutil/Developer/health-risk/models/heart_rf_pipeline.pkl",
)
# joblib.dump does not create missing directories, so make sure the parent
# exists first ("." covers a bare file name with no directory component).
os.makedirs(os.path.dirname(model_path) or ".", exist_ok=True)
# Last expression of the cell: displays the list of files written.
joblib.dump(pipeline, model_path)

['/Users/manuelrodriguezsutil/Developer/health-risk/models/heart_rf_pipeline.pkl']