In [1]:
import os
from urllib.parse import quote_plus

import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
import sklearn.metrics as metrics
from dotenv import load_dotenv
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Load variables from a .env file into the process environment. Later cells
# read DB_USER, DB_PSW and ROOT via os.getenv (presumably defined in that
# file — confirm).
load_dotenv()

True

In [2]:
# Connection settings are read from the environment. Fail fast with a clear
# message instead of hitting `None + str` (TypeError) for ROOT or silently
# building a URI that contains the literal text "None" for the credentials.
_missing_env = [name for name in ('DB_USER', 'DB_PSW', 'ROOT') if os.getenv(name) is None]
if _missing_env:
    raise RuntimeError(f"Missing required environment variables: {', '.join(_missing_env)}")

mlflow_user = os.getenv('DB_USER')
mlflow_psw = os.getenv('DB_PSW')
# Passed as the artifact location when the experiment is created.
artifact = os.getenv('ROOT') + '/mlruns'

# Percent-encode the credentials so characters such as '@', ':' or '/' in the
# user name or password cannot break the structure of the connection URI.
tracking_uri = (
    f"postgresql://{quote_plus(mlflow_user)}:{quote_plus(mlflow_psw)}"
    "@127.0.0.1:5432/winedb"
)
mlflow.set_tracking_uri(tracking_uri)

In [3]:
experiment_name = 'RandomForests_model'

# Query the tracking store once. Only when the experiment does not exist yet
# is it created (with the configured artifact location) and fetched again,
# which avoids a redundant second lookup on the common "already exists" path.
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    mlflow.create_experiment(name=experiment_name, artifact_location=artifact)
    experiment = mlflow.get_experiment_by_name(experiment_name)

In [4]:
# Read the red-wine CSV from the project tree rooted at $ROOT.
csv_path = os.getenv('ROOT') + '/backend/data/winequality-red.csv'
data = pd.read_csv(csv_path)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
1187,7.0,0.43,0.30,2.0,0.085,6.0,39.0,0.99346,3.33,0.46,11.90
199,6.9,1.09,0.06,2.1,0.061,12.0,31.0,0.99480,3.51,0.43,11.40
503,10.5,0.26,0.47,1.9,0.078,6.0,24.0,0.99760,3.18,1.04,10.90
743,11.6,0.41,0.58,2.8,0.096,25.0,101.0,1.00024,3.13,0.53,10.00
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.80
...,...,...,...,...,...,...,...,...,...,...,...
308,10.3,0.43,0.44,2.4,0.214,5.0,12.0,0.99940,3.19,0.63,9.50
55,7.7,0.62,0.04,3.8,0.084,25.0,45.0,0.99780,3.34,0.53,9.50
1521,6.9,0.48,0.20,1.9,0.082,9.0,23.0,0.99585,3.39,0.43,9.05
940,9.6,0.33,0.52,2.2,0.074,13.0,25.0,0.99509,3.36,0.76,12.40


In [10]:
# Train a random-forest classifier on `data` and record its parameters,
# hold-out metrics and the fitted model as a single MLflow run.
with mlflow.start_run(experiment_id=experiment.experiment_id, run_name=f"run_{experiment_name}"):
    # A fresh seed is drawn for every run; it is logged below so the exact
    # split and model can be reproduced from the run's parameters.
    rand_state = np.random.randint(1, 100)
    # Fraction of rows held out for evaluation (logged together with the seed).
    test_size = 0.4

    # NOTE: despite its name, Y_train is the full target column before
    # splitting; the name is kept in case later cells reference it.
    Y_train = data['quality']
    features = data.drop(['quality'], axis=1)
    X_train, X_test, y_train, y_test = train_test_split(
        features, Y_train, test_size=test_size, random_state=rand_state
    )

    model = RandomForestClassifier(random_state=rand_state)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    # Macro averaging is used because, for a single-label target, micro-averaged
    # precision, recall and F1 are all equal to accuracy, which would log four
    # identical numbers. zero_division=0 scores classes that are never predicted
    # as 0 without emitting an undefined-metric warning.
    test_metrics = {
        'accuracy': metrics.accuracy_score(y_test, y_pred),
        'precision': metrics.precision_score(y_test, y_pred, average='macro', zero_division=0),
        'recall': metrics.recall_score(y_test, y_pred, average='macro', zero_division=0),
        'f-score': metrics.f1_score(y_test, y_pred, average='macro', zero_division=0)
    }

    params = {
        'random_state': rand_state,
        'test_size': test_size,
    }

    # Log the cheap scalar records first so they are kept even if serialising
    # or uploading the model artifact fails.
    mlflow.log_params(params)
    mlflow.log_metrics(test_metrics)
    mlflow.sklearn.log_model(model, 'random forest classifier')