In [1]:
# ============================================================
# Notebook setup: run this before everything
# ============================================================
# -- Copied from lecture
%load_ext autoreload
%config IPCompleter.greedy=True
%autoreload 1
%aimport util
import logging

import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler

from util import util

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Control figure size
interactive_figures = False
if interactive_figures:
    # Normal behavior
    %matplotlib widget
    figsize=(9, 3)
else:
    # PDF export behavior
    figsize=(14, 4)

# Gaussian Mixture Models
Gaussian Mixture Models (GMM) are a popular unsupervised learning algorithm that can be used to model the distribution of a dataset. In the context of anomaly detection, GMM can be used to find clusters of normal data points and identify anomalies. GMMs describe the distribution via a weighted sum of Gaussian components.

GMMs assume that the data is generated by the following probabilistic model:
$$
X_Z,
$$
where both $Z$ and $X_Z$ are random variables. $Z$ is a latent variable that represents the component of the data, while $X_Z$ is the observed data. The latent variable $Z$ is assumed to be generated by a probability distribution $p(Z)$, while $X_Z$ follows a multivariate Gaussian distribution.

In mathematical terms, a GMM is a probability distribution whose density can be written as:
$$
g(x, \mu, \Sigma, \tau) = \sum_{k=1}^{n} \tau_{k} f(x, \mu_{k}, \Sigma_{k}),
$$
where $n$ is the number of components, $\tau$ is the vector of mixture weights (non-negative and summing to one), $\mu_k$ and $\Sigma_k$ are the mean vector and covariance matrix of the $k$-th component, and $f$ is the Gaussian probability density function.

## Preprocessing
In order to use GMM for anomaly detection, we first need to make sure that our training data is free from missing values and anomalies. As seen before, a linear interpolation approach yields the best results. Therefore, we will interpolate missing values using this method. Then, we apply a sliding window approach using the `aggregation_length` parameter explained above and aggregate the data into windows. This ensures that we capture temporal correlations between data points and additionally reduces noise in the data. The final step of our preprocessing pipeline is to standardize the data.

In [2]:
# Preprocessing pipelines. The "clean" variant additionally runs
# util.impute_anomalies and is used for the split the GMM is fitted on.
preprocess_clean = [
    util.impute_missing_values,
    util.impute_anomalies,
    util.apply_sliding_window_and_aggregate,
]
preprocess = [
    util.impute_missing_values,
    util.apply_sliding_window_and_aggregate,
]

# Load every split; the training file is loaded twice (with and without anomaly imputation)
X_train_clean, y_train_clean = util.load_dataset_xy(
    '7_gecco2019_train_water_quality.csv', preprocess=preprocess_clean
)
X_train, y_train = util.load_dataset_xy(
    '7_gecco2019_train_water_quality.csv', preprocess=preprocess
)
X_val, y_val = util.load_dataset_xy(
    '8_gecco2019_valid_water_quality.csv', preprocess=preprocess
)
X_test, y_test = util.load_dataset_xy(
    '6_gecco2019_test_water_quality.csv', preprocess=preprocess
)

# Columns that are fed to the scaler
features = util.get_feature_columns(X_train_clean)

# Standardize: estimate mean/scale on the clean training split only,
# then apply that same transform to every split (including the clean one).
scaler = StandardScaler().fit(X_train_clean[features])
for split_frame in (X_train_clean, X_train, X_val, X_test):
    split_frame[features] = scaler.transform(split_frame[features])

print(X_train_clean.head())

                     window_0  window_1  window_2  window_3  window_4  \
Time                                                                    
2017-07-01 00:09:00 -1.232383  1.187793 -0.392840 -0.809330 -1.697608   
2017-07-01 00:10:00 -1.243568  1.158432 -0.359859 -0.372582 -1.686283   
2017-07-01 00:11:00 -1.232383  1.099867 -0.352313 -0.622491 -1.709695   
2017-07-01 00:12:00 -1.232383  1.099867 -0.342251 -0.723540 -1.694760   
2017-07-01 00:13:00 -1.232383  1.129229 -0.337220 -0.138949 -1.710721   

                     window_5  window_6  window_7  window_8  window_9  ...  \
Time                                                                   ...   
2017-07-01 00:09:00 -2.181413 -1.243574  1.158442 -0.359853 -0.372584  ...   
2017-07-01 00:10:00 -2.209374 -1.232388  1.099877 -0.352306 -0.622493  ...   
2017-07-01 00:11:00 -2.195118 -1.232388  1.099877 -0.342244 -0.723542  ...   
2017-07-01 00:12:00 -2.215171 -1.232388  1.129239 -0.337213 -0.138951  ...   
2017-07-01 00:13:00 

## Determine the Number of Gaussians
Next, we need to determine the number of Gaussians to use for our GMM. We can do this using the Bayesian Information Criterion (BIC) or the elbow method. The BIC measures the model's goodness of fit while penalizing the number of model parameters (lower values are better), whereas the elbow method is a visual tool that helps us determine the optimal number of Gaussians. In our case, we will use the BIC method.

In [3]:
# Model selection: scan the number of mixture components (1..10) and keep
# the candidate with the smallest BIC (lower is better).
# NOTE(review): the model is fitted on X_train_clean but the BIC is evaluated
# on X_val; BIC is usually computed on the data used for fitting -- confirm
# this is intended.
# NOTE(review): the whole frame is passed to fit(), not X_train_clean[features];
# if the frame holds non-feature columns they enter the model unscaled -- verify.
best_k, lowest_bic = None, np.inf

for k in range(1, 11):
    gmm = GaussianMixture(n_components=k, covariance_type='full', random_state=42).fit(X_train_clean)
    bic = gmm.bic(X_val)

    # Strict comparison: on ties the smaller component count is kept.
    if bic < lowest_bic:
        best_k, lowest_bic = k, bic

print(f"Best GMM with {best_k} components, BIC: {lowest_bic:.2f}")

Best GMM with 4 components, BIC: 5556969130.96


## Train the GMM
Next, we train the GMM with the optimal number of components. The trained model lets us compute a log-likelihood score for each data point, which we do in the following sections. This score measures how likely a data point is to have been generated by the fitted mixture distribution: higher scores indicate a higher likelihood, while very low scores point to potential anomalies.

In [4]:
# Refit one mixture with the component count selected via BIC, using the
# same covariance type and seed as during the search.
final_gmm = GaussianMixture(
    n_components=best_k,
    covariance_type='full',
    random_state=42,
)
final_gmm.fit(X_train_clean)

## Threshold Optimization
Now, we need to define a threshold to separate normal data from anomalous data. We will use a simple threshold optimization approach. First, we define the percentiles to test. Then, we compute precision, recall, and F1-score for each percentile. These are the preferred metrics when working with big class imbalances. Finally, we select the percentile with the highest F1-score.

In [5]:
# Per-sample log-likelihood of the validation data under the fitted mixture.
log_likelihood_val = final_gmm.score_samples(X_val)

# Candidate percentiles 0.1, 0.2, ..., 2.0 (20 values).
# Built with linspace + rounding rather than a float-step arange: the number
# of candidates is explicit and the values are clean decimals (0.3 instead of
# 0.30000000000000004), which also keeps the printed summary readable.
percentiles = np.round(np.linspace(0.1, 2.0, num=20), 1)

# One (percentile, threshold, precision, recall, f1) tuple per candidate.
results = []

for p in percentiles:
    # Predictions and the threshold the helper derived for this percentile.
    y_pred, threshold = util.get_predictions_from_log_likelihood(log_likelihood_val, p)

    # Score the predictions against the validation labels.
    f1, precision, recall = util.compute_model_performance(y_pred, y_val)

    results.append((p, threshold, precision, recall, f1))

# Tabulate the sweep for display.
df_results = pd.DataFrame(results, columns=['Percentile', 'Threshold', 'Precision', 'Recall', 'F1-score'])

# `best_percentile` is the full result row (a Series) with the highest F1-score;
# later cells read its 'Percentile' entry.
best_percentile = df_results.loc[df_results['F1-score'].idxmax()]
print(df_results)
print(f"Best GMM model with percentile {best_percentile['Percentile']} and threshold {best_percentile['Threshold']} achieves an F1-score of {best_percentile['F1-score']}")

    Percentile     Threshold  Precision    Recall  F1-score
0          0.1 -2.400630e+07   0.581818  0.096677  0.165803
1          0.2 -7.842527e+06   0.390909  0.129909  0.195011
2          0.3 -4.660417e+06   0.345455  0.172205  0.229839
3          0.4 -3.218652e+06   0.401826  0.265861  0.320000
4          0.5 -1.416382e+06   0.434307  0.359517  0.393388
5          0.6 -7.554792e+05   0.428571  0.425982  0.427273
6          0.7 -1.533860e+05   0.450521  0.522659  0.483916
7          0.8 -4.419262e+04   0.420091  0.555891  0.478544
8          0.9 -4.188588e+04   0.375254  0.558912  0.449029
9          1.0 -3.977271e+04   0.337591  0.558912  0.420933
10         1.1 -3.771574e+04   0.306799  0.558912  0.396146
11         1.2 -3.571500e+04   0.281583  0.558912  0.374494
12         1.3 -3.380550e+04   0.259831  0.558912  0.354746
13         1.4 -3.215464e+04   0.241199  0.558912  0.336976
14         1.5 -3.034745e+04   0.225061  0.558912  0.320902
15         1.6 -2.865872e+04   0.211187 

## Performance on Training Data
The confusion matrix and the classification report give valuable insights on the success of the model's training.

In [6]:
# Score the training split (loaded without anomaly imputation) and derive
# predictions at the percentile that maximised F1 on the validation data.
# NOTE(review): the helper only receives the training scores and a percentile,
# so the threshold is presumably re-derived from the training scores rather
# than taken from best_percentile['Threshold'] -- confirm this is intended.
log_likelihood_train = final_gmm.score_samples(X_train)
y_pred_train, threshold_train = util.get_predictions_from_log_likelihood(
    log_likelihood_train,
    best_percentile['Percentile'],
)

# Report the threshold returned for the training scores
print(f'Ideal threshold: {threshold_train:.2f}')

# Confusion matrix first, then the per-class report
cm_train = confusion_matrix(y_train, y_pred_train)
print("Confusion Matrix:\n", cm_train)
report_train = classification_report(y_train, y_pred_train)
print("Classification Report:\n", report_train)

Confusion Matrix:
 [[131449    693]
 [    94    235]]
Classification Report:
               precision    recall  f1-score   support

         0.0       1.00      0.99      1.00    132142
         1.0       0.25      0.71      0.37       329

    accuracy                           0.99    132471
   macro avg       0.63      0.85      0.69    132471
weighted avg       1.00      0.99      1.00    132471



The model achieves a very high overall accuracy of 99%, driven primarily by its excellent performance on the dominant class (class 0), where it correctly identifies almost all cases (99% recall, nearly 100% precision). However, this high accuracy masks a significant weakness in detecting the minority class (class 1). For class 1, although the recall is moderately high at 71% (meaning that most actual positives are captured), the precision is notably low at just 25%, indicating that a considerable number of false positives are being flagged. This imbalance suggests that while the GMM is very effective at recognizing the majority class, it struggles to reliably identify the minority class, likely due to the imbalanced nature of the training data.

## Performance on Test Data
The lack of generalization can be seen when we test the trained GMM on the test data. First, we have to perform the same preprocessing steps as with the training data. Afterwards, we can apply the GMM to the test data and compute the confusion matrix and classification report.

In [7]:
# Score the held-out test split and derive predictions at the percentile
# that maximised F1 on the validation data.
# NOTE(review): the helper only receives the test scores and a percentile, so
# the threshold is presumably re-derived from the test scores themselves rather
# than reusing the validation threshold -- confirm this is intended.
log_likelihood_test = final_gmm.score_samples(X_test)
y_pred_test, threshold_test = util.get_predictions_from_log_likelihood(
    log_likelihood_test,
    best_percentile['Percentile'],
)

# Confusion matrix first, then the per-class report
cm_test = confusion_matrix(y_test, y_pred_test)
print("Confusion Matrix:\n", cm_test)
report_test = classification_report(y_test, y_pred_test)
print("Classification Report:\n", report_test)

Confusion Matrix:
 [[31195   147]
 [  224    75]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.99      1.00      0.99     31342
         1.0       0.34      0.25      0.29       299

    accuracy                           0.99     31641
   macro avg       0.67      0.62      0.64     31641
weighted avg       0.99      0.99      0.99     31641



On the test data, the model maintains a high overall accuracy of 99% and continues to perform exceptionally well for the majority class (class 0), with nearly perfect precision and recall. However, when comparing the training and test results, a notable shift in the performance for the minority class (class 1) emerges. In training, class 1 achieved a moderately high recall of approximately 71% (albeit with a very low precision of around 25%), indicating that most true positives were captured even though many false positives were flagged. On the test set, however, the recall for class 1 drops sharply to about 25%, meaning the model is missing a substantial number of true positives. In contrast, the precision for class 1 improves to roughly 34% on the test data, suggesting a reduction in false positives. Despite this slight improvement in precision, the overall F1 score for class 1 remains low at approximately 0.29. These differences point to a potential overfitting issue or a limited ability of the model to generalize for the minority class when faced with unseen data.