In [None]:
import warnings 
warnings.filterwarnings('ignore')

import os
import numpy as np
import pandas as pd

import mlflow
from mlflow.models import infer_signature

from datetime import datetime, timedelta
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, median_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sqlalchemy import create_engine
from typing import Any, Dict, Literal, NoReturn

In [None]:
# Show the artifact-store endpoint visible to this kernel (None when unset)
os.environ.get("MLFLOW_S3_ENDPOINT_URL")

In [None]:
# Show the tracking-server URI visible to this kernel (None when unset)
os.environ.get("MLFLOW_TRACKING_URI")

In [None]:
# Columns used as model inputs and the column used as the regression target
FEATURES = [
    "MedInc", "HouseAge", "AveRooms", "AveBedrms", "Population", "AveOccup",
    "Latitude", "Longitude"
]
TARGET = "MedHouseVal"

# Fixed seed so the stochastic estimators produce the same metrics on every run
RANDOM_STATE = 42

# Candidate regressors keyed by the name used for their MLflow run and artifact
models = {
    "RandomForest": RandomForestRegressor(random_state=RANDOM_STATE),
    "LinearRegression": LinearRegression(),
    "HistGB": HistGradientBoostingRegressor(random_state=RANDOM_STATE),
}

In [None]:
def get_data(connection_uri=None):
    """Load the whole ``california_housing`` table into a DataFrame.

    Args:
        connection_uri: Optional SQLAlchemy connection URI. When omitted, the
            ``DATABASE_URL`` environment variable is used; if that is not set
            either, the local development database is used.

    Returns:
        pandas.DataFrame with every row and column of ``california_housing``.
    """
    # Resolution order: explicit argument > environment > local dev default.
    # Real credentials should be supplied via DATABASE_URL, not edited in here.
    uri = connection_uri or os.getenv(
        "DATABASE_URL",
        "postgresql://postgres:postgres@localhost:5432/postgres",
    )

    engine = create_engine(uri)
    try:
        return pd.read_sql_query("SELECT * FROM california_housing", engine)
    finally:
        # Release the pooled connections so re-running the cell does not
        # accumulate open connections to the database
        engine.dispose()

In [None]:
 data = get_data()

In [None]:
# Preview the first rows of the loaded table as a quick sanity check
data.head()

In [None]:
def preprocess_data(data, test_size=0.2, random_state=42):
    """Split the dataset into train/test parts and standardize the features.

    The scaler is fitted on the training features only and then applied to
    the test features, so test-set statistics never influence the scaling.

    Args:
        data: DataFrame containing every column listed in ``FEATURES`` and
            the ``TARGET`` column.
        test_size: Size of the held-out test part, forwarded to
            ``train_test_split``. Defaults to 0.2.
        random_state: Seed for the shuffle done by ``train_test_split``.
            Defaults to 42.

    Returns:
        Tuple ``(X_train_fitted, X_test_fitted, y_train, y_test)``: the
        standardized train and test features as returned by
        ``StandardScaler``, followed by the matching target values.
    """
    # Separate the input columns from the target column
    features, target = data[FEATURES], data[TARGET]

    # Hold out part of the rows for evaluation
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        target,
        test_size=test_size,
        random_state=random_state,
    )

    # Fit the scaler on train only, then reuse its statistics for test
    scaler = StandardScaler()
    X_train_fitted = scaler.fit_transform(X_train)
    X_test_fitted = scaler.transform(X_test)

    return X_train_fitted, X_test_fitted, y_train, y_test

In [None]:
# Build the standardized train/test features and their targets once;
# the training loop later in the notebook reuses them for every model
X_train_fitted, X_test_fitted, y_train, y_test = preprocess_data(data)

In [None]:
def train_model(model, name, X_train, X_test, y_train, y_test):
    """Fit a model, log it to MLflow and record its regression metrics.

    Must be called inside an active MLflow run: the model artifact and the
    evaluation results are logged to that run.

    Args:
        model: Estimator exposing ``fit`` and ``predict``; it is fitted in place.
        name: Artifact path under which the model is logged.
        X_train: Training features.
        X_test: Test features, used for the signature and the evaluation.
        y_train: Training targets.
        y_test: Test targets; a pandas Series, NumPy array or list.
    """
    # Fit the model
    model.fit(X_train, y_train)

    # Predict on the held-out features
    y_pred = model.predict(X_test)

    # Describe the model's input/output schema from real data
    signature = infer_signature(X_test, y_pred)

    # Store the fitted model in the artifact store
    model_info = mlflow.sklearn.log_model(model, name, signature=signature)

    # Compute and log the model's metrics.
    # np.asarray accepts a Series as well as an ndarray or list, whereas
    # `.values` only exists on pandas objects.
    mlflow.evaluate(
        model_info.model_uri,
        data=X_test,
        targets=np.asarray(y_test),
        model_type="regressor",
        evaluators=["default"],
    )

In [None]:
# Create the experiment on the first run and reuse it afterwards:
# mlflow.create_experiment raises if an experiment with this name already
# exists, which would break every re-run of this cell.
exp_name = "parent_run_experiment"
existing_experiment = mlflow.get_experiment_by_name(exp_name)
if existing_experiment is None:
    experiment_id = mlflow.create_experiment(
        exp_name,
        artifact_location=f"s3://lizvladi-mlflow-artifacts/{exp_name}",
    )
else:
    experiment_id = existing_experiment.experiment_id
mlflow.set_experiment(exp_name)

In [None]:
# Train every candidate model in its own nested child run under one parent run.
# The runs use the id of the experiment set up in the previous cell instead of
# a hard-coded number, which is only valid on one particular tracking server.
with mlflow.start_run(run_name="parent_run", experiment_id=experiment_id, description="parent") as parent_run:
    for model_name, model in models.items():
        with mlflow.start_run(run_name=model_name, experiment_id=experiment_id, nested=True) as child_run:
            train_model(model, model_name, X_train_fitted, X_test_fitted, y_train, y_test)