In [None]:
# ============================================================
# Notebook setup: run this before everything
# ============================================================
# -- Copied from lecture
%load_ext autoreload
%config IPCompleter.greedy=True
%autoreload 1
%aimport util
import logging

import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler

from util import util

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Control figure size
interactive_figures = False
if interactive_figures:
    # Normal behavior
    %matplotlib widget
    figsize=(9, 3)
else:
    # PDF export behavior
    figsize=(14, 4)

raw_data = util.load_dataset('7_gecco2019_train_water_quality.csv')

# Gaussian Mixture Models
Gaussian Mixture Models (GMM) are a popular unsupervised learning algorithm that can be used to model the distribution of a dataset. In the context of anomaly detection, GMM can be used to find clusters of normal data points and identify anomalies. GMMs describe the distribution via a weighted sum of Gaussian components.

GMMs assume that the data is generated by the following probabilistic model:
$$
X_Z,
$$
where both $Z$ and $X_Z$ are random variables. $Z$ is a latent variable that represents the component of the data, while $X_Z$ is the observed data. The latent variable $Z$ is assumed to be generated by a probability distribution $p(Z)$, while $X_Z$ follows a multivariate Gaussian distribution.

In mathematical terms, a GMM is a probability distribution that can be represented as:
$$
g(x, \mu, \Sigma, \tau) = \sum_{k=1}^{n} \tau_{k} f(x, \mu_{k}, \Sigma_{k}),
$$
where $n$ is the number of components, $\tau$ is the vector of component weights $\tau_k$ (non-negative and summing to one), $\mu_k$ and $\Sigma_k$ are the mean vector and covariance matrix of the $k$-th component, and $f$ is the (multivariate) Gaussian probability density function.

## Preprocessing
In order to use a GMM for anomaly detection, we first need to make sure that our data is free from missing values. As seen before, linear interpolation yields the best results, so we use it to fill in the missing values. Then, we apply a sliding window approach using the `aggregation_length` parameter explained above and aggregate the data into windows. This makes sure that we capture temporal correlations between data points and additionally reduces noise in the data. The final step of our preprocessing pipeline is to standardize the data.

In [None]:
# Preprocess the training data: fill in missing values, then aggregate the series into
# sliding windows (both steps are implemented in the project's util module).
gmm_data_df = util.impute_missing_values(raw_data)
gmm_data_df = util.apply_sliding_window_and_aggregate(gmm_data_df)

# Columns that serve as model inputs, as reported by the project helper.
gmm_features = util.get_feature_columns(gmm_data_df)

# Standardize every feature to zero mean and unit variance so that no feature dominates
# the fit merely because of its scale. Standardizing does not make the data Gaussian.
# The fitted scaler is kept so the identical transformation can be applied to new data.
gmm_scaler = StandardScaler()
gmm_data_df[gmm_features] = gmm_scaler.fit_transform(gmm_data_df[gmm_features])

# Preview the result; as the last expression of the cell it is rendered as a rich table.
gmm_data_df.head()

## Determine the number of Gaussians
Next, we need to determine the number of Gaussians to use for our GMM. We can do this using the Bayesian Information Criterion (BIC) or the elbow method. The BIC rewards goodness of fit but penalizes the number of model parameters, so lower values indicate a better trade-off between fit and complexity, while the elbow method is a visual tool that helps us determine the optimal number of Gaussians. In our case, we will use the BIC and pick the number of components with the lowest value.

In [None]:
# Feature matrix used for model selection (the feature columns of the preprocessed data).
X = gmm_data_df[gmm_features]

# Model selection by BIC: fit one mixture per candidate component count and remember
# the count that yields the smallest criterion value.
best_k, lowest_bic = None, np.inf

for k in range(1, 11):  # candidate component counts 1..10
    gmm = GaussianMixture(n_components=k, covariance_type='full', random_state=42).fit(X)
    bic = gmm.bic(X)

    # Only a strictly smaller BIC replaces the current best, so ties keep the smaller k.
    if not bic < lowest_bic:
        continue
    best_k, lowest_bic = k, bic

print(f"Best GMM with {best_k} components, BIC: {lowest_bic:.2f}")

## Train the GMM
Next, we train the GMM with the chosen number of components. Additionally, we compute the log-likelihood score of each data point. This score measures how likely it is that the data point was generated by the fitted mixture: higher scores indicate typical (normal) points, while low scores indicate potential anomalies.

In [None]:
# Fit the final mixture on the training features.
# NOTE(review): the component count is fixed to 7 here, which replaces whatever value of
# best_k the BIC search produced - confirm that this override is intended.
best_k = 7
final_gmm = GaussianMixture(
    n_components=best_k,
    covariance_type='full',
    random_state=42,
).fit(X)

# Per-sample log-likelihood under the fitted mixture, evaluated on the training data.
log_likelihood = final_gmm.score_samples(X)

## Threshold optimization
Now, we need to define a threshold on the log-likelihood scores to separate normal data from anomalous data: points scoring below the threshold are flagged as anomalies. We will use a simple threshold optimization approach. First, we define the percentiles of the score distribution to test. Then, we compute precision, recall, and F1-score for each percentile. These are the preferred metrics when working with large class imbalances. Finally, we select the percentile with the highest F1-score.

In [None]:
# Candidate percentiles 0.1, 0.2, ..., 2.0 of the log-likelihood scores.
# linspace with an explicit point count (rather than arange with a 0.1 float step)
# guarantees exactly 20 candidates, and rounding to one decimal removes float noise
# such as 0.30000000000000004 from the values reported below.
percentiles = np.round(np.linspace(0.1, 2.0, 20), 1)

# One (percentile, threshold, precision, recall, f1) tuple per candidate.
results = []

for p in percentiles:
    # Predictions and the corresponding score threshold for this percentile.
    y_pred, threshold = util.get_predictions_from_log_likelihood(log_likelihood, p)

    # Score the predictions against the 'Event' column of the training frame.
    f1, precision, recall = util.compute_model_performance(y_pred, gmm_data_df['Event'])

    results.append((p, threshold, precision, recall, f1))

# Collect all candidates in a table for inspection.
df_results = pd.DataFrame(results, columns=['Percentile', 'Threshold', 'Precision', 'Recall', 'F1-score'])

# The row with the highest F1-score defines the percentile used downstream.
gmm_best_percentile = df_results.loc[df_results['F1-score'].idxmax()]
print(df_results)
print(f"Best GMM model with percentile {gmm_best_percentile['Percentile']} and threshold {gmm_best_percentile['Threshold']} achieves an F1-score of {gmm_best_percentile['F1-score']}")

## Compute number of anomalies
Now, we compute the number of anomalies for the identified threshold.

In [None]:
# Recompute the cut-off from the best-scoring percentile; every sample whose
# log-likelihood lies strictly below it is flagged as an anomaly.
best_percentile = gmm_best_percentile['Percentile']
threshold = np.percentile(log_likelihood, best_percentile)
anomalies = log_likelihood < threshold

# Keep the raw score and the resulting flag next to the data they belong to.
gmm_data_df['Anomaly_Score'] = log_likelihood
gmm_data_df['Anomaly'] = anomalies

# Report the cut-off, the number of flagged samples and a preview of the scores.
print(f"Anomaly threshold: {threshold:.2f}")
print(f"Number of anomalies: {np.sum(anomalies)}")
print(gmm_data_df['Anomaly_Score'].head())

## Confusion Matrix and Classification Report
To measure the performance of our anomaly detection model on the training data, we can use the confusion matrix and classification report.

In [None]:
# Integer-encode the reference labels ('Event') and the model's flags ('Anomaly')
# so that both metric functions receive the same 0/1 label type.
y_true = gmm_data_df['Event'].astype(int)
y_pred = gmm_data_df['Anomaly'].astype(int)

# Compute both summaries first, then print them.
train_confusion = confusion_matrix(y_true, y_pred)
train_report = classification_report(y_true, y_pred)

print("Confusion Matrix:\n", train_confusion)
print("Classification Report:\n", train_report)

The GMM reaches a high overall accuracy of 99% on the training data, but this is primarily due to the class imbalance that favors normal samples. The model correctly classified nearly all normal cases (TN = 131,397), and the precision for the normal class is 1.00 (rounded), i.e. almost every sample predicted as normal really is normal. Relative to the number of normal samples there are also few false positives (FP = 745). However, while the model detected 56% of the actual anomalies (recall = 0.56), it still missed 44% (FN = 146). The precision for anomalies (0.20) means that only one-fifth of the detected anomalies were true contaminations, so most alarms are false alarms. The F1-score of 0.29 reflects this weak balance between precision and recall. We can conclude that the model is only moderately effective at detecting anomalies in the training set, and further improvements may be needed for generalization to unseen data.

## Performance on Test Data
The lack of generalization can be seen when we test the trained GMM on the test data. First, we have to perform the same preprocessing steps as with the training data. Afterwards, we can apply the GMM to the test data and compute the confusion matrix and classification report.

In [None]:
# Load the test file through the project helper.
test_data = util.load_dataset('6_gecco2019_test_water_quality.csv')

# Same preprocessing pipeline as for the training data: impute, then window/aggregate.
gmm_test_data_df = util.apply_sliding_window_and_aggregate(
    util.impute_missing_values(test_data)
)

# Re-use the scaler fitted on the training data (transform only, no re-fit).
gmm_test_data_df[gmm_features] = gmm_scaler.transform(gmm_test_data_df[gmm_features])

# Score the test rows with the trained mixture (higher = more normal, lower = more anomalous).
X_test = gmm_test_data_df[gmm_features]
log_likelihood_test = final_gmm.score_samples(X_test)

# NOTE(review): the cut-off is the 1st percentile of the *test* scores themselves, not the
# threshold tuned on the training data, so roughly 1% of the test rows are flagged by
# construction - confirm that this is the intended evaluation protocol.
threshold_test = np.percentile(log_likelihood_test, 1)
anomalies_test = log_likelihood_test < threshold_test

# Keep the raw score and the resulting flag next to the test data.
gmm_test_data_df['Anomaly_Score'] = log_likelihood_test
gmm_test_data_df['Anomaly'] = anomalies_test

# Integer-encode the reference labels and the model's flags for the metric functions.
y_true = gmm_test_data_df['Event'].astype(int)
y_pred = gmm_test_data_df['Anomaly'].astype(int)

# Compute both summaries first, then print them.
test_confusion = confusion_matrix(y_true, y_pred)
test_report = classification_report(y_true, y_pred)

print("Confusion Matrix:\n", test_confusion)
print("Classification Report:\n", test_report)

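On the test data, anomaly recall dropped to 31%: the model failed to detect most contamination events (FN = 206). Normal samples were still well classified (99% accuracy for class 0), but anomaly precision was only 29%, meaning that most raised alarms are false alarms. The F1-score of 0.30 is on the same level as on the training data (0.29), so compared to training the model trades recall for precision rather than improving overall; note that the cut-off used here (the 1% percentile of the test scores) differs from the percentile tuned on the training data, which contributes to this shift. Overall detection quality remains low, which suggests that the model struggles to generalize, possibly due to overfitting on the training data. These results suggest that the model needs better generalization techniques, such as adjusting the threshold, retraining with a more balanced dataset, or considering alternative anomaly detection methods.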