In [29]:
import os

import mlflow
import pandas as pd
import plotly_express as px
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


In [None]:
# --- MLflow tracking configuration ------------------------------------------
# Point the client at the tracking server and select the experiment under
# which runs started from this notebook are recorded.
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("exploration")

# Enable autologging, but do not log the fitted models themselves.
mlflow.autolog(log_models=False)

# End any run that is still active in this kernel (e.g. after re-running the
# notebook without a restart). This is best-effort: a failure is reported
# rather than silently swallowed, but it must not stop the notebook.
try:
    mlflow.end_run()
except Exception as exc:
    print(f"Could not end the active MLflow run: {exc!r}")

# Credentials for the S3-compatible artifact store (presumably a local MinIO
# instance, judging by the user name -- confirm).
# They can be supplied from outside the notebook through MINIO_ACCESS_KEY /
# MINIO_SECRET_KEY; the literals below are only the local-development
# fallback and keep the previous behaviour when the overrides are unset.
# NOTE(review): never put real credentials into these fallbacks.
os.environ["AWS_ACCESS_KEY_ID"] = os.environ.get("MINIO_ACCESS_KEY", "minio_mlflow_user")
os.environ["AWS_SECRET_ACCESS_KEY"] = os.environ.get("MINIO_SECRET_KEY", "secret!")
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "http://localhost:9000"

2024/11/20 09:37:27 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


In [11]:
# Load the gzip-compressed feature table and index it by its "time" column.
features_raw = pd.read_csv("data/features.csv", encoding="utf-8", compression="gzip")
features_indexed = features_raw.set_index("time", drop=True).drop(
    columns=["_dlt_load_id", "_dlt_id"]
)

# Parse the index labels into timestamps so the frame can be resampled.
features_indexed.index = pd.to_datetime(features_indexed.index)

# Collapse to one row per calendar day, keeping the maximum of every column
# observed on that day.
df = features_indexed.resample(rule="1d").max()

# Model inputs (measurement columns) and target (kept as a one-column frame).
feature_columns = [
    "carbon_monoxide",
    "nitrogen_dioxide",
    "sulphur_dioxide",
    "ozone",
    "dust",
    "european_aqi",
]
X = df[feature_columns]
y = df[["is_sick"]]

In [14]:
# Find consecutive runs of sick days.
# `grp` counts the non-sick days seen so far; the counter does not advance on
# sick days, so every uninterrupted run of sick days shares one `grp` value.
sick_flag = y["is_sick"]
grp = sick_flag.eq(False).cumsum()


def _period_bounds(days):
    """Return ``[first, last]`` index label of one run of sick days."""
    return [days.index.min(), days.index.max()]


# One entry per run of sick days: a two-element list [start, end].
arr = (
    grp.loc[sick_flag.eq(True)]
    .groupby(grp)
    .apply(_period_bounds)
)

# Line chart of all measurement columns over time.
plotted_columns = [
    "carbon_monoxide",
    "nitrogen_dioxide",
    "sulphur_dioxide",
    "ozone",
    "dust",
    "european_aqi",
]
fig = px.line(df, y=plotted_columns)

# Shade every sick period in translucent red.
# NOTE(review): a period consisting of a single day has start == end, so its
# rectangle has zero width -- confirm whether that is intended.
for period_start, period_end in arr.values:
    fig.add_vrect(
        x0=period_start,
        x1=period_end,
        fillcolor="red",
        opacity=0.2,
        line_width=0,
    )
fig.show()


In [15]:
# Pair each target row with the feature values of the previous row:
# `shift(1)` moves every feature row one position down. The first row is
# all-NaN after the shift and is dropped; the target is sliced identically so
# both frames keep the same index.
# NOTE(review): the last row is dropped as well (`[1:-1]`) -- presumably
# because the most recent day is incomplete; confirm.
X_shifted = X.shift(1).iloc[1:-1]
y_shifted = y.iloc[1:-1]

# Fixed seed so that the split -- and every metric computed from it -- is
# reproducible across kernel restarts.
SPLIT_SEED = 42

# Stratified hold-out split: both parts keep the class ratio of `is_sick`.
# NOTE(review): this shuffles rows of a time-indexed frame; neighbouring days
# can end up on both sides of the split -- confirm this is acceptable.
X_train, X_test, y_train, y_test = train_test_split(
    X_shifted,
    y_shifted,
    test_size=0.33,
    stratify=y_shifted["is_sick"],
    random_state=SPLIT_SEED,
)
print(X_train.shape)
print(X_test.shape)

(198, 6)
(99, 6)


In [None]:
# Fixed seed so that repeated runs on the same split fit the same model.
MODEL_SEED = 42
model = GradientBoostingClassifier(n_estimators=300, random_state=MODEL_SEED)

with mlflow.start_run():
    # The target is a one-column frame; flatten it to the 1-D array that
    # `fit` expects.
    model.fit(X=X_train, y=y_train.values.ravel())
    # Keep the id of this run available after the `with` block has closed it.
    training_run_id = mlflow.active_run().info.run_id

    # Evaluate on the held-out part and show the full report.
    y_pred = model.predict(X=X_test)
    class_report = classification_report(y_true=y_test, y_pred=y_pred, output_dict=True)
    print(class_report)

    # Flatten the nested report into "<class or average>_<metric>" names.
    # "accuracy" is the only top-level scalar, so it is popped first; the
    # remaining entries are all dicts of metric -> value.
    eval_metrics = {"accuracy": class_report.pop("accuracy")}
    for class_or_avg, metrics_dict in class_report.items():
        for metric, value in metrics_dict.items():
            eval_metrics[class_or_avg + '_' + metric] = value

    # Log everything in a single batch instead of one call per metric.
    mlflow.log_metrics(eval_metrics)

2024/11/20 09:39:44 INFO mlflow.tracking._tracking_service.client: 🏃 View run rogue-koi-68 at: http://localhost:5000/#/experiments/2/runs/8fe6c2738fa94d8fa598bf78b04e31e2.
2024/11/20 09:39:44 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/2.


{'0': {'precision': 0.6666666666666666, 'recall': 0.5, 'f1-score': 0.5714285714285714, 'support': 24.0}, '1': {'precision': 0.8518518518518519, 'recall': 0.92, 'f1-score': 0.8846153846153846, 'support': 75.0}, 'accuracy': 0.8181818181818182, 'macro avg': {'precision': 0.7592592592592593, 'recall': 0.71, 'f1-score': 0.728021978021978, 'support': 99.0}, 'weighted avg': {'precision': 0.8069584736251403, 'recall': 0.8181818181818182, 'f1-score': 0.8086913086913086, 'support': 99.0}}
