In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

In [None]:
# Training data: train.csv from the Kaggle dataset obtained with
# kagglehub.dataset_download("philanipro/mercedesbenz-greener-manufacturing").
# The path is relative, so train.csv must be in the notebook's working directory.
df = pd.read_csv('train.csv')

In [None]:
# Preview the first rows; a bare last expression uses the notebook's rich
# table display instead of the plain-text dump produced by print().
df.head()

In [None]:
def strings_to_numbers(strings):
    """
    Encodes a sequence of values as integer codes.

    Codes are assigned by pd.factorize in order of first appearance
    (the first distinct value becomes 0, the next 1, ...); missing
    values receive the sentinel code -1.

    Args:
        strings (array-like): (m,) Values to encode

    Returns:
        codes (ndarray): (m,) Integer code of each input value
    """
    codes = pd.Series(strings).factorize()[0]
    return codes

In [None]:
# Take the eight feature columns at positions 2..9 of the training frame,
# integer-encode each one independently and stack the codes back into an
# (m, 8) int64 matrix.
raw_train_features = df.iloc[:, list(range(2, 10))].values
X_train = np.column_stack(
    [strings_to_numbers(raw_column) for raw_column in raw_train_features.T]
).astype(np.int64)

# Column 'X10' is kept alongside as the label series.
y_train = df['X10']

In [None]:
# Data used as the cross-validation set further down; note it is read from
# test.csv (relative path, expected in the notebook's working directory).
df_val = pd.read_csv('test.csv')

In [None]:
# Encode the validation features with the category -> code mapping of the
# TRAINING data. Factorizing df_val on its own numbers categories by their
# order of first appearance in df_val, so the same string could receive
# different codes in X_train and X_val, and the Gaussian fitted on X_train
# would then be evaluated on incomparable values.
train_feature_positions = [2, 3, 4, 5, 6, 7, 8, 9]  # columns behind X_train
# Assumes these positions hold the same features in test.csv, as the original
# cell did -- TODO confirm against the CSV headers.
val_feature_positions = [1, 2, 3, 4, 5, 6, 7, 8]

encoded_val_columns = []
for train_pos, val_pos in zip(train_feature_positions, val_feature_positions):
    # pd.factorize lists the distinct training values in order of first
    # appearance, i.e. train_levels[k] is the value that X_train encodes as k.
    train_levels = pd.Index(pd.factorize(df.iloc[:, train_pos])[1])
    # get_indexer yields -1 for values (or NaN) never seen in the training
    # column, mirroring factorize's own -1 sentinel for missing values.
    encoded_val_columns.append(train_levels.get_indexer(df_val.iloc[:, val_pos]))

X_val = np.column_stack(encoded_val_columns).astype(np.int64)

# Ground-truth labels used to pick the anomaly threshold.
y_val = df_val['X10']

In [None]:
def estimate_gaussian(X):
    """
    Fits an independent Gaussian to every feature (column) of X.

    Args:
        X (ndarray): (m, n) Data matrix, one example per row

    Returns:
        mu (ndarray): (n,) Per-feature mean
        var (ndarray): (n,) Per-feature variance as computed by np.var
                       (population variance, i.e. divided by m)
    """
    # Both statistics are reduced over the example axis (axis 0).
    feature_means = np.mean(X, axis=0)
    feature_variances = np.var(X, axis=0)
    return feature_means, feature_variances

In [None]:
def multivariate_gaussian(X, mu, var):
    """
    Evaluates the multivariate Gaussian probability density of each row
    of X for the parameters mu and var.

    Args:
        X (ndarray): (m, n) Examples to evaluate, one per row
        mu (ndarray): (n,) Mean of the distribution
        var (ndarray): Either an (n,) vector of per-feature variances
                       (expanded to a diagonal covariance matrix) or a
                       full (n, n) covariance matrix

    Returns:
        p (ndarray): (m,) Density of each example
    """
    n_features = len(mu)

    # A 1-D `var` holds the diagonal of the covariance matrix.
    covariance = np.diag(var) if var.ndim == 1 else var

    centered = X - mu

    # Normalisation constant (2*pi)^(-n/2) * |Sigma|^(-1/2)
    normaliser = (2 * np.pi) ** (-n_features / 2) * np.linalg.det(covariance) ** (-0.5)

    # Row-wise quadratic form (x - mu)^T Sigma^+ (x - mu), using the pseudo-inverse
    quadratic_form = np.sum(
        np.matmul(centered, np.linalg.pinv(covariance)) * centered, axis=1
    )

    return normaliser * np.exp(-0.5 * quadratic_form)

In [None]:
def select_threshold(y_val, p_val):
    """
    Finds the best threshold to use for selecting outliers
    based on the results from a validation set (p_val)
    and the ground truth (y_val)

    An example is predicted anomalous when its density is strictly below
    the threshold. 1000 evenly spaced thresholds between min(p_val)
    (inclusive) and max(p_val) (exclusive) are tried and the first one
    reaching the highest F1 score is returned.

    Args:
        y_val (array-like): Ground truth on validation set (1 = anomaly)
        p_val (array-like): Results (densities) on validation set

    Returns:
        epsilon (float): Threshold chosen; 0 if no threshold produced a
                         true positive or no threshold grid could be built
        F1 (float):      F1 score by choosing epsilon as threshold; 0 in
                         the same fallback cases
    """
    is_anomaly = np.asarray(y_val).ravel() == 1
    densities = np.asarray(p_val).ravel()

    best_epsilon = 0
    best_F1 = 0

    # Nothing to scan on an empty validation set.
    if densities.size == 0:
        return best_epsilon, best_F1

    lowest, highest = densities.min(), densities.max()
    step_size = (highest - lowest) / 1000

    # Identical (or non-finite) densities give a zero/NaN step, for which
    # np.arange cannot build a grid; no threshold can separate anything.
    if not np.isfinite(step_size) or step_size <= 0:
        return best_epsilon, best_F1

    for epsilon in np.arange(lowest, highest, step_size):
        flagged = densities < epsilon

        # Count the outcomes directly: this works for any label/prediction
        # mix, including the all-negative case where a confusion matrix
        # would collapse to 1x1.
        tp = np.count_nonzero(flagged & is_anomaly)
        fp = np.count_nonzero(flagged & ~is_anomaly)
        fn = np.count_nonzero(~flagged & is_anomaly)

        # Without a true positive precision and recall are 0 or undefined,
        # so this threshold can never beat the current best.
        if tp == 0:
            continue

        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        F1 = 2 * precision * recall / (precision + recall)

        if F1 > best_F1:
            best_F1 = F1
            best_epsilon = epsilon

    return best_epsilon, best_F1

In [None]:
# Sanity check: display the integer-encoded training feature matrix.
X_train

In [None]:
# Fit the per-feature mean and variance on the training matrix
mu, var = estimate_gaussian(X_train)

# Density of every training example under the fitted Gaussian
p = multivariate_gaussian(X_train, mu, var)

# Density of every cross-validation example under the same model
p_val = multivariate_gaussian(X_val, mu, var)

# Choose the density cut-off with the highest F1 against the validation labels
epsilon, F1 = select_threshold(y_val, p_val)

print('Best epsilon found using cross-validation: %e'% epsilon)
print('Best F1 on Cross Validation Set:  %f'% F1)
# Training examples whose density falls strictly below the chosen cut-off
print('# Anomalies found: %d'% np.count_nonzero(p < epsilon))

In [None]:
# This F1 score looks quite bad, but there was never a guarantee that X10 in the dataset would be related to anomalies in the first 8 columns