# Projet MLOps – Automation

Pipeline automatisé d'entraînement, d'évaluation et de sauvegarde du modèle.

In [5]:
import pandas as pd
import numpy as np
import os, json
from datetime import datetime
import joblib
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

In [6]:
def run_pipeline(data_path, *, target="income", artifacts_dir="artifacts",
                 test_size=0.2, random_state=42):
    """Train, evaluate and persist a logistic-regression classifier.

    The CSV at ``data_path`` is loaded, split into stratified train/test
    sets, and a ``Pipeline`` (one-hot encoding followed by a class-balanced
    ``LogisticRegression``) is fitted on the training part. The fitted
    pipeline and its test metrics are written to a fresh, timestamped
    sub-directory of ``artifacts_dir``.

    Args:
        data_path: Location of the CSV file, passed to ``pd.read_csv``.
        target: Name of the label column (defaults to ``"income"``).
        artifacts_dir: Directory under which the run folder is created.
        test_size: Fraction of rows held out for evaluation.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1_macro"`` (plain floats)
        computed on the held-out test set.

    Raises:
        KeyError: If ``target`` is not a column of the CSV.
    """
    df = pd.read_csv(data_path)
    # Fail early, before any training, with a message naming the column.
    if target not in df.columns:
        raise KeyError(
            f"target column {target!r} not found in {data_path!r}"
        )

    X = df.drop(columns=[target])
    y = df[target]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )

    # NOTE(review): every feature column is one-hot encoded, so numeric
    # columns (if any) are treated as categories too - confirm this is
    # intended for the input dataset.
    preprocess = ColumnTransformer([
        ("cat", OneHotEncoder(handle_unknown="ignore"), X.columns)
    ])

    model = LogisticRegression(max_iter=2000, class_weight="balanced")

    pipe = Pipeline([
        ("prep", preprocess),
        ("model", model)
    ])

    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    # Cast to built-in floats so the JSON file and the returned dict do not
    # depend on NumPy scalar types.
    metrics = {
        "accuracy": float(accuracy_score(y_test, y_pred)),
        "f1_macro": float(f1_score(y_test, y_pred, average="macro")),
    }

    # The run id only has one-second resolution: create the directory
    # exclusively and add a numeric suffix on collision, so two runs that
    # finish in the same second never overwrite each other's artifacts.
    run_id = datetime.now().strftime("%Y%m%d_%H%M%S")
    run_dir = os.path.join(artifacts_dir, run_id)
    attempt = 0
    while True:
        try:
            os.makedirs(run_dir)
            break
        except FileExistsError:
            attempt += 1
            run_dir = os.path.join(artifacts_dir, f"{run_id}_{attempt}")

    joblib.dump(pipe, os.path.join(run_dir, "model.joblib"))
    with open(os.path.join(run_dir, "metrics.json"), "w", encoding="utf-8") as f:
        json.dump(metrics, f, indent=2)

    return metrics

In [7]:
# The dataset location is machine-specific: allow overriding it through the
# DATA_PATH environment variable, falling back to the original local file.
data_path = os.environ.get(
    "DATA_PATH", "C:/Users/adamt/Downloads/data_modeling.csv"
)
metrics = run_pipeline(data_path)
metrics  # bare expression so the notebook cell displays the metrics dict


{'accuracy': 0.7883099600777971, 'f1_macro': 0.752022335293755}