In [1]:
# ============================================================
# Notebook setup: run this before everything
# ============================================================
# -- Copied from lecture
%load_ext autoreload
%config IPCompleter.greedy=True
%autoreload 1
%aimport util
import logging

import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler

from util import util

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Control figure size
interactive_figures = False
if interactive_figures:
    # Normal behavior
    %matplotlib widget
    figsize=(9, 3)
else:
    # PDF export behavior
    figsize=(14, 4)

# Load datasets
raw_data = util.load_dataset('7_gecco2019_train_water_quality.csv')
val_data = util.load_dataset('8_gecco2019_valid_water_quality.csv')
test_data = util.load_dataset('6_gecco2019_test_water_quality.csv')

# Gaussian Mixture Models
Gaussian Mixture Models (GMM) are a popular unsupervised learning algorithm that can be used to model the distribution of a dataset. In the context of anomaly detection, GMM can be used to find clusters of normal data points and identify anomalies. GMMs describe the distribution via a weighted sum of Gaussian components.

GMMs assume that the data is generated by the following probabilistic model:
$$
X_Z,
$$
where both $Z$ and $X_Z$ are random variables. $Z$ is a latent variable that represents the component of the data, while $X_Z$ is the observed data. The latent variable $Z$ is assumed to be generated by a probability distribution $p(Z)$, while $X_Z$ follows a multivariate Gaussian distribution.

In mathematical terms, a GMM is a probability distribution that can be represented as:
$$
g(x, \mu, \Sigma, \tau) = \sum_{k=1}^{n} \tau_{k} f(x, \mu_{k}, \Sigma_{k}),
$$
where $n$ is the number of components, $\tau$ is the vector of component weights (non-negative and summing to one), $\mu_{k}$ and $\Sigma_{k}$ are the mean vector and covariance matrix of the $k$-th component, and $f$ is the multivariate Gaussian probability density function.

## Preprocessing
In order to use GMM for anomaly detection, we first need to make sure that our training data is free from missing values and anomalies. As seen before, a linear interpolation approach yields the best results. Therefore, we will interpolate missing values using this method. Then, we have to apply a sliding window approach using the `aggregation_length` parameter explained above and aggregate the data into windows. This makes sure that we capture temporal correlations between data points and additionally removes noise in the data. The final step of our preprocessing pipeline is to standardize the data.

In [2]:
# --- Training data ---
# Fill the gaps in the raw training data
gmm_data_df = util.impute_missing_values(raw_data)

# Blank out the feature values of every row labelled as an event and impute them
# again, so the model is fitted on data without the labelled anomalies
event_rows = gmm_data_df['Event'].eq(True)
event_feature_columns = util.get_feature_columns(gmm_data_df)
gmm_data_df.loc[event_rows, event_feature_columns] = np.nan
gmm_data_df = util.impute_missing_values(gmm_data_df)

# Turn the series into aggregated sliding windows
gmm_data_df = util.apply_sliding_window_and_aggregate(gmm_data_df)

# Feature columns of the windowed frame; these are the GMM inputs
gmm_features = util.get_feature_columns(gmm_data_df)

# Standardize the features; the scaler is fitted on the training data only
# and reused unchanged for the other splits
gmm_scaler = StandardScaler().fit(gmm_data_df[gmm_features])
gmm_data_df[gmm_features] = gmm_scaler.transform(gmm_data_df[gmm_features])

print(gmm_data_df.head())

# --- Validation data ---
# Same imputation and windowing, but event rows are left untouched
gmm_val_data_df = util.apply_sliding_window_and_aggregate(
    util.impute_missing_values(val_data)
)

# Scale with the statistics learned from the training data
gmm_val_data_df[gmm_features] = gmm_scaler.transform(gmm_val_data_df[gmm_features])

                     window_0  window_1  window_2  window_3  window_4  \
Time                                                                    
2017-07-01 00:09:00 -1.232383  1.187793 -0.392840 -0.809330 -1.697608   
2017-07-01 00:10:00 -1.243568  1.158432 -0.359859 -0.372582 -1.686283   
2017-07-01 00:11:00 -1.232383  1.099867 -0.352313 -0.622491 -1.709695   
2017-07-01 00:12:00 -1.232383  1.099867 -0.342251 -0.723540 -1.694760   
2017-07-01 00:13:00 -1.232383  1.129229 -0.337220 -0.138949 -1.710721   

                     window_5  window_6  window_7  window_8  window_9  ...  \
Time                                                                   ...   
2017-07-01 00:09:00 -2.181413 -1.243574  1.158442 -0.359853 -0.372584  ...   
2017-07-01 00:10:00 -2.209374 -1.232388  1.099877 -0.352306 -0.622493  ...   
2017-07-01 00:11:00 -2.195118 -1.232388  1.099877 -0.342244 -0.723542  ...   
2017-07-01 00:12:00 -2.215171 -1.232388  1.129239 -0.337213 -0.138951  ...   
2017-07-01 00:13:00 

## Determine the number of Gaussians
Next, we need to determine the number of Gaussians to use for our GMM. We can do this using the Bayesian Information Criterion (BIC) or the elbow method. The BIC trades off the model's goodness of fit against its number of parameters (lower values are better), while the elbow method is a visual tool that helps us determine the optimal number of Gaussians. In our case, we will use the BIC method.

In [3]:
# Model inputs: the feature columns of the training and validation frames.
X = gmm_data_df[gmm_features]
X_val = gmm_val_data_df[gmm_features]

# Try GMMs with 1 to 10 components and record the BIC of each one.
# NOTE(review): each model is fitted on X but its BIC is evaluated on X_val;
# BIC is usually computed on the data the model was fitted to - confirm this is intended.
bic_by_k = {}
for n_components in range(1, 11):
    candidate = GaussianMixture(n_components=n_components, covariance_type='full', random_state=42)
    candidate.fit(X)
    bic_by_k[n_components] = candidate.bic(X_val)

# Keep the component count with the strictly lowest BIC (the first one wins on ties).
best_k, lowest_bic = None, np.inf
for n_components, candidate_bic in bic_by_k.items():
    if candidate_bic < lowest_bic:
        best_k, lowest_bic = n_components, candidate_bic

print(f"Best GMM with {best_k} components, BIC: {lowest_bic:.2f}")

Best GMM with 4 components, BIC: 5556969130.96


## Train the GMM
Next, we train the GMM with the optimal number of components. Additionally, we compute the log-likelihood score for each data point. This score measures how likely a data point is under the fitted mixture model: higher scores indicate points that the model considers more typical, while low scores indicate potential anomalies.

In [4]:
# Fit the final mixture with the selected number of components (fit returns the estimator).
final_gmm = GaussianMixture(
    n_components=best_k,
    covariance_type='full',
    random_state=42,
).fit(X)

# Per-sample log-likelihood of the training data under the fitted model.
log_likelihood = final_gmm.score_samples(X)

## Threshold Optimization
Now, we need to define a threshold to separate normal data from anomalous data. We will use a simple threshold optimization approach. First, we define the percentiles to test. Then, we compute precision, recall, and F1-score for each percentile. These are the preferred metrics when working with big class imbalances. Finally, we select the percentile with the highest F1-score.

In [5]:
# Compute per-sample log-likelihood scores for the validation data.
log_likelihood_val = final_gmm.score_samples(X_val)

# Define percentiles to test: 0.1, 0.2, ..., 2.0.
# The grid is built from integers so every value is the closest float to its decimal;
# a float-step np.arange(0.1, 2.1, 0.1) yields values such as 0.30000000000000004 and
# 0.7000000000000001, which then leak into the results table and the printed summary.
percentiles = np.arange(1, 21) / 10

# For storing the results, one tuple per tested percentile.
results = []

for p in percentiles:
    # Get predictions and threshold for this percentile
    # (presumably samples scoring below the p-th percentile are flagged - see util).
    y_pred, threshold = util.get_predictions_from_log_likelihood(log_likelihood_val, p)

    # Compute performance against the validation event labels
    f1, precision, recall = util.compute_model_performance(y_pred, gmm_val_data_df['Event'])

    # Store results.
    results.append((p, threshold, precision, recall, f1))

# Convert to DataFrame for better visualization.
df_results = pd.DataFrame(results, columns=['Percentile', 'Threshold', 'Precision', 'Recall', 'F1-score'])

# Select the row with the highest F1-score and display results.
gmm_best_percentile = df_results.loc[df_results['F1-score'].idxmax()]
print(df_results)
print(f"Best GMM model with percentile {gmm_best_percentile['Percentile']} and threshold {gmm_best_percentile['Threshold']} achieves an F1-score of {gmm_best_percentile['F1-score']}")

    Percentile     Threshold  Precision    Recall  F1-score
0          0.1 -2.400630e+07   0.581818  0.096677  0.165803
1          0.2 -7.842527e+06   0.390909  0.129909  0.195011
2          0.3 -4.660417e+06   0.345455  0.172205  0.229839
3          0.4 -3.218652e+06   0.401826  0.265861  0.320000
4          0.5 -1.416382e+06   0.434307  0.359517  0.393388
5          0.6 -7.554792e+05   0.428571  0.425982  0.427273
6          0.7 -1.533860e+05   0.450521  0.522659  0.483916
7          0.8 -4.419262e+04   0.420091  0.555891  0.478544
8          0.9 -4.188588e+04   0.375254  0.558912  0.449029
9          1.0 -3.977271e+04   0.337591  0.558912  0.420933
10         1.1 -3.771574e+04   0.306799  0.558912  0.396146
11         1.2 -3.571500e+04   0.281583  0.558912  0.374494
12         1.3 -3.380550e+04   0.259831  0.558912  0.354746
13         1.4 -3.215464e+04   0.241199  0.558912  0.336976
14         1.5 -3.034745e+04   0.225061  0.558912  0.320902
15         1.6 -2.865872e+04   0.211187 

## Compute Number of Anomalies
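Now, we compute the number of anomalies in the training data. The threshold is obtained by applying the best percentile identified above to the training log-likelihood scores.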

In [6]:
# Threshold = the selected percentile of the training log-likelihood scores.
# NOTE(review): this re-derives a threshold from the training scores; it is not the
# 'Threshold' value stored alongside the selected percentile - confirm this is intended.
best_percentile = gmm_best_percentile['Percentile']
threshold = np.percentile(log_likelihood, best_percentile)

# A sample is flagged when its score lies strictly below the threshold
anomalies = log_likelihood < threshold

print(f"Anomaly threshold: {threshold:.2f}")
print(f"Number of anomalies: {anomalies.sum()}")

# Attach the scores and the resulting flags to the training frame
gmm_data_df['Anomaly_Score'] = log_likelihood
gmm_data_df['Anomaly'] = anomalies

print(gmm_data_df['Anomaly_Score'].head())

Anomaly threshold: 76.12
Number of anomalies: 928
Time
2017-07-01 00:09:00    142.628872
2017-07-01 00:10:00    144.959728
2017-07-01 00:11:00    146.009770
2017-07-01 00:12:00    141.062214
2017-07-01 00:13:00    138.712673
Name: Anomaly_Score, dtype: float64


## Confusion Matrix and Classification Report
To measure the performance of our anomaly detection model on the training data, we can use the confusion matrix and classification report.

In [7]:
# Ground truth and predictions as 0/1 integers
# NOTE(review): the feature values of the event rows in this frame were blanked and
# re-imputed during preprocessing, so the labelled events may no longer look anomalous.
y_true = gmm_data_df['Event'].astype(int)  # Actual contamination events
y_pred = gmm_data_df['Anomaly'].astype(int)  # Detected anomalies

# Compute the metrics first, then print them
train_confusion = confusion_matrix(y_true, y_pred)
train_report = classification_report(y_true, y_pred)
print("Confusion Matrix:\n", train_confusion)
print("Classification Report:\n", train_report)

Confusion Matrix:
 [[131230    912]
 [   313     16]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      1.00    132142
           1       0.02      0.05      0.03       329

    accuracy                           0.99    132471
   macro avg       0.51      0.52      0.51    132471
weighted avg       1.00      0.99      0.99    132471



The GMM performed well on the training data with an overall accuracy of 99%, but this high accuracy is largely driven by the class imbalance that favors normal samples. The model correctly classified nearly all normal cases (TN = 131,230) with a precision of 1.00, resulting in relatively few false positives (FP = 912). However, when it comes to anomalies, the performance is much weaker. The model only detected 16 out of 329 anomalies (recall = 0.05), meaning that it missed 313 true anomalies. Moreover, the anomaly precision of 0.02 indicates that almost all of the flagged anomalies were false alarms, and the F1-score of 0.03 confirms that the model is practically unable to identify the labelled events in the training data. Note that this is to be expected to a large extent: during preprocessing, the feature values of all event rows in the training data were replaced by imputed values, so the labelled events no longer look anomalous.

## Performance on Test Data
Next, we evaluate how the trained GMM performs on unseen test data. First, we have to perform the same preprocessing steps as with the training data. Afterwards, we can apply the GMM to the test data and compute the confusion matrix and classification report.

In [8]:
# Test data goes through the same imputation and windowing as the other splits
gmm_test_data_df = util.apply_sliding_window_and_aggregate(
    util.impute_missing_values(test_data)
)

# Scale with the statistics learned from the training data
gmm_test_data_df[gmm_features] = gmm_scaler.transform(gmm_test_data_df[gmm_features])

# Score the test windows: higher is more normal, lower is more anomalous
X_test = gmm_test_data_df[gmm_features]
log_likelihood_test = final_gmm.score_samples(X_test)

# Threshold = the selected percentile of the test scores; flag everything strictly below it
# NOTE(review): the threshold is derived from the test score distribution itself rather
# than reusing the threshold found on the validation data - confirm this is intended.
best_percentile = gmm_best_percentile['Percentile']
threshold_test = np.percentile(log_likelihood_test, best_percentile)
anomalies_test = log_likelihood_test < threshold_test

# Attach the scores and the resulting flags to the test frame
gmm_test_data_df['Anomaly_Score'] = log_likelihood_test
gmm_test_data_df['Anomaly'] = anomalies_test

# Ground truth and predictions as 0/1 integers
y_true = gmm_test_data_df['Event'].astype(int)  # Actual contamination events
y_pred = gmm_test_data_df['Anomaly'].astype(int)  # Detected anomalies

# Compute the metrics first, then print them
test_confusion = confusion_matrix(y_true, y_pred)
test_report = classification_report(y_true, y_pred)
print("Confusion Matrix:\n", test_confusion)
print("Classification Report:\n", test_report)

Confusion Matrix:
 [[31195   147]
 [  224    75]]
Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99     31342
           1       0.34      0.25      0.29       299

    accuracy                           0.99     31641
   macro avg       0.67      0.62      0.64     31641
weighted avg       0.99      0.99      0.99     31641



On the test data, the GMM detected anomalies better than on the training data (anomaly F1-score of 0.29 compared to 0.03), but its performance remained weak. For normal samples (class 0), the model still performed very well with a precision of 0.99 and a recall of 1.00. However, for anomalies (class 1), the recall was only 25%, meaning that only one-quarter of the actual anomalies were correctly detected, while 224 anomalies were missed. Moreover, the anomaly precision was 34%, indicating that roughly two-thirds of the detected anomalies (147 out of 222) were false positives. The F1-score for anomalies of 0.29 further highlights the model’s struggle to effectively balance precision and recall. These results underscore the need for improved generalization techniques, such as adjusting the detection threshold, retraining with a more balanced dataset, or exploring alternative anomaly detection methods.