## Modelling

In this notebook we start modelling with data from our database (DB).

- To do this, we connect to our local DB using the `duckdb` library.
- Once a connection has been made, we can start retrieving data from the DB.


### Setup


In [None]:
import duckdb
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the `sql` IPython extension, which provides the %sql magic used below
%load_ext sql
# Open a connection to the project's DuckDB database file
# (the path is relative to the notebook's working directory)
conn = duckdb.connect(database="../dsp-dagster/data_systems_project.duckdb")
# Hand the open connection to the %sql magic under the alias "duckdb"
%sql conn --alias duckdb

In [None]:
%sql SHOW ALL TABLES; # shows all available tables

In [None]:
## We can use SQL magic to retrieve data from our DB like so:
# %sql res << SELECT * FROM joined.deployment_incident_vehicles_weather
# res

In [None]:
# Or the more Pythonic way:

# Here we retrieve a table where KNMI weather data and Fire Department data is combined
# df = conn.execute(
#     """
#     SELECT * FROM joined.incidents_buurten """
# ).pl()

# Close the database connection.
# NOTE(review): the only statement that loads `df` is commented out above, yet
# the modelling cell further down reads `df`. Once the connection is closed
# here, `df` can no longer be loaded through `conn`, so run the query before
# this point.
conn.close()

In [None]:
# df.head()

In [None]:
def plot_feature_importances(model, feature_names, top_n=20, title="Feature Importances"):
    """
    Draw a horizontal bar chart of a model's highest-ranked feature importances.

    :param model: Fitted estimator exposing a ``feature_importances_`` attribute
    :param feature_names: Feature names, in the same order as the importances
    :param top_n: How many of the highest-ranked features to show
    :param title: Title placed above the chart
    :return: The ``matplotlib.pyplot`` module, with the new figure as current figure
    """
    # Pair every feature name with its importance score
    ranked = pd.DataFrame(
        {"Feature": feature_names, "Importance": model.feature_importances_}
    )

    # Most important first, then keep only the leading rows
    ranked = ranked.sort_values(by="Importance", ascending=False)
    top_features = ranked.head(top_n)

    # One bar per feature, importance on the x-axis
    plt.figure(figsize=(10, 6))
    sns.barplot(data=top_features, x="Importance", y="Feature", palette="viridis")
    plt.title(title)
    plt.xlabel("Relative Importance")
    plt.ylabel("Feature")
    plt.tight_layout()

    return plt

### XGBoost


In [None]:
# Third-party names used in this cell. They are imported here because the
# notebook's setup cell only brings in duckdb, pandas, matplotlib and seaborn,
# so without these lines the cell fails with NameError.
import mlflow
import mlflow.xgboost
import numpy as np
import polars as pl
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

# NOTE(review): this cell reads `df`, which must already exist as a polars
# DataFrame containing the columns named below. The only visible statement that
# loads it is commented out in an earlier cell — confirm before running.

# Weather measurement columns kept as model features
weather_cols = [
    "Dd",
    "Fh",
    "Ff",
    "Fx",
    "T",
    "T10n",
    "Td",
    "Sq",
    "Q",
    "Dr",
    "Rh",
    "P",
    "Vv",
    "N",
    "U",
    "Ww",
    "Ix",
    "M",
    "R",
    "S",
    "O",
    "Y",
]
group_cols = ["Date", "Hour", "Service_Area", "Damage_Type"] + weather_cols

# Count incidents per (date, hour, service area, damage type, weather) group.
# NOTE(review): newer polars releases spell this method `group_by`; switch to
# that spelling if the installed version rejects `groupby`.
agg_df = (
    df.groupby(group_cols)
    .agg(Incident_Count=pl.count("Incident_ID"))
    .sort(["Date", "Hour"])
)

# Date and Hour were only needed for grouping and ordering, not as features
agg_df = agg_df.drop(["Date", "Hour"])

# Encode categorical variables using one-hot encoding
agg_df = agg_df.to_dummies(columns=["Service_Area", "Damage_Type"])

# Split into target (incident count) and feature columns
y = agg_df["Incident_Count"]
X = agg_df.drop("Incident_Count")

# Convert to pandas objects before handing the data to train_test_split / XGBoost
X_pd = X.to_pandas()
y_pd = y.to_pandas()

# Hold out 20% of the rows for evaluation; fixed seed for a reproducible split
X_train, X_test, y_train, y_test = train_test_split(
    X_pd, y_pd, test_size=0.2, random_state=42
)

# Train XGBoost model
model = xgb.XGBRegressor(
    objective="count:poisson"
)  # Using Poisson regression for count data

model.fit(X_train, y_train)

# Evaluate on the held-out rows
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)

# Print the raw importances and plot the 20 largest
feature_importances = model.feature_importances_

print(feature_importances)
plot_feature_importances(model, X_train.columns, top_n=20)


# Set the MLflow tracking URI
mlflow.set_tracking_uri("http://dsp-mlflow:5001")

# Record the trained model, its parameters and the evaluation metrics in one run
with mlflow.start_run(run_name="Incident Prediction Model"):
    # Log model
    mlflow.xgboost.log_model(model, "xgboost-model")

    # Log parameters
    mlflow.log_params(model.get_params())

    # Log metrics
    mlflow.log_metric("MSE", mse)
    mlflow.log_metric("RMSE", rmse)
    mlflow.log_metric("MAE", mae)