In [115]:
import os

import mlflow
import pandas as pd
import plotly_express as px
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


In [None]:
# Point the MLflow client at the local tracking server, select the experiment
# and enable autologging with model logging turned off.
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("exploration")
mlflow.autolog(log_models=False)

# Re-running this cell may find a run left open by an earlier execution.
# Close it explicitly, and report (rather than silently swallow) any failure
# so a broken tracking connection is visible without aborting the notebook.
if mlflow.active_run() is not None:
    try:
        mlflow.end_run()
    except Exception as exc:
        print(f"Could not close the previous MLflow run: {exc!r}")

# Credentials for the S3-compatible artifact endpoint configured below.
# `setdefault` keeps values that are already present in the environment, so
# real secrets can be supplied from outside the notebook; the literals are
# only fallbacks for a clean local environment.
os.environ.setdefault("AWS_ACCESS_KEY_ID", "secret!")
os.environ.setdefault("AWS_SECRET_ACCESS_KEY", "secret!")
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "http://localhost:9000"

2024/11/15 20:30:31 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


In [110]:
# Daily-maximum AQI above this value marks the day as "sick".
SICK_AQI_THRESHOLD = 30

# Columns used both as model inputs and for plotting.
FEATURE_COLUMNS = [
    "carbon_monoxide",
    "nitrogen_dioxide",
    "sulphur_dioxide",
    "ozone",
    "dust",
    "european_aqi",
]

# Load the gzip-compressed feature export, index it by timestamp and drop the
# two `_dlt_*` bookkeeping columns.
df = (
    pd.read_csv("data/features.csv", encoding="utf-8", compression="gzip")
    .set_index("time", drop=True)
    .drop(columns=["_dlt_load_id", "_dlt_id"])
)
df.index = pd.to_datetime(df.index)

# Collapse to one row per calendar day, keeping each column's daily maximum.
# The uppercase "D" alias is used because the lowercase form is deprecated in
# newer pandas releases.
df = df.resample(rule="1D").max()

# Binary label: 1 when the day's maximum AQI exceeds the threshold, else 0.
# NOTE(review): a day without readings has NaN after resampling and NaN > x is
# False, so such days would be labelled 0 — confirm the data has no gaps.
target = pd.DataFrame(
    index=df.index,
    columns=["is_sick"],
    data=(df["european_aqi"] > SICK_AQI_THRESHOLD).astype(int).values,
)

X = df[FEATURE_COLUMNS]
y = target

In [111]:
# Identify contiguous streaks of sick days. The running count of non-sick days
# stays constant across a streak of sick days, so it works as a streak id.
grp = y["is_sick"].eq(False).cumsum()
arr = (
    grp.loc[y["is_sick"].eq(True)]
    .groupby(grp)
    .apply(lambda x: [x.index.min(), x.index.max()])
)

# Plot the same columns that are used as model features.
fig = px.line(df, y=list(X.columns))

# Shade every sick streak. Each band is padded by half a day on both sides so
# that it is centred on the daily data points; without the padding a streak of
# a single day would have x0 == x1 and be invisible.
half_day = pd.Timedelta(hours=12)
for sick_period in arr.values:
    fig.add_vrect(
        x0=sick_period[0] - half_day,
        x1=sick_period[1] + half_day,
        fillcolor="red",
        opacity=0.2,
        line_width=0,
    )
fig.show()


In [112]:
# Pair each day's label with the previous day's measurements: after shift(1)
# row t holds the features of day t-1. The first row is all-NaN after the
# shift and is dropped; the last row is dropped as well.
# NOTE(review): presumably the final day is excluded because it is incomplete
# — confirm.
X_shifted = X.shift(1).iloc[1:-1]
y_shifted = y.iloc[1:-1]

# Stratified hold-out split. A fixed random_state makes the split (and every
# metric computed from it) reproducible across Restart & Run All.
# NOTE(review): the split is shuffled, so test days are interleaved in time
# with training days — confirm this is acceptable for time-ordered data.
X_train, X_test, y_train, y_test = train_test_split(
    X_shifted,
    y_shifted,
    test_size=0.33,
    stratify=y_shifted["is_sick"],
    random_state=42,
)
print(X_train.shape)
print(X_test.shape)

(35, 6)
(18, 6)


In [118]:
# Fixed random_state so repeated runs build the same forest and report the
# same metrics.
model = RandomForestClassifier(n_estimators=300, min_samples_split=4, random_state=42)

with mlflow.start_run() as run:
    # Keep the run id so the run can be referenced after the block exits.
    training_run_id = run.info.run_id

    # Fitting inside the run lets the autologging enabled earlier attach its
    # records to this run.
    model.fit(X=X_train, y=y_train.values.ravel())

    y_pred = model.predict(X=X_test)
    class_report = classification_report(y_true=y_test, y_pred=y_pred, output_dict=True)
    print(class_report)

    # Flatten the nested report into "<class or average>_<metric>" names and
    # send everything in one call. "accuracy" is a bare float rather than a
    # dict, so it is popped out first and logged under its own name.
    test_metrics = {"accuracy": class_report.pop("accuracy")}
    for class_or_avg, metrics_dict in class_report.items():
        for metric, value in metrics_dict.items():
            test_metrics[class_or_avg + '_' + metric] = value
    mlflow.log_metrics(test_metrics)

2024/11/15 20:33:11 INFO mlflow.tracking._tracking_service.client: 🏃 View run tasteful-colt-925 at: http://localhost:5000/#/experiments/2/runs/6c0701640c9c4d509fd1bdeb8500e1cd.
2024/11/15 20:33:11 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/2.


{'0': {'precision': 0.8, 'recall': 0.5714285714285714, 'f1-score': 0.6666666666666666, 'support': 7.0}, '1': {'precision': 0.7692307692307693, 'recall': 0.9090909090909091, 'f1-score': 0.8333333333333334, 'support': 11.0}, 'accuracy': 0.7777777777777778, 'macro avg': {'precision': 0.7846153846153847, 'recall': 0.7402597402597402, 'f1-score': 0.75, 'support': 18.0}, 'weighted avg': {'precision': 0.7811965811965812, 'recall': 0.7777777777777778, 'f1-score': 0.7685185185185186, 'support': 18.0}}
