In [7]:
import pandas as pd
import numpy as np

# Load the dataset
dataset = pd.read_excel("../data/datasets/dataset_backup.xlsx")
dataset.set_index(['date', 'region', 'phenomena', 'county'], inplace=True)

# Display the first few rows
dataset.head()

Unnamed: 0,date,region,phenomena,county,pm10_quality,air_pressure,temperature,humidity
0,2009-02-19,Banat,"['Snow', 'Wind']",TM,51.63,995.4,-4.095,85.5
1,2009-02-19,Dobrogea,"['Rain', 'Sleet', 'Wind', 'Snow', 'Flood', 'Fog']",CT,21.52,1005.875,6.745,96.75
2,2009-02-19,Moldova,"['Snow', 'Frost', 'Rain', 'Sleet', 'Wind', 'Fl...",IS,44.6,1010.7,-0.935,99.0
3,2009-02-19,Muntenia,"['Snow', 'Sleet', 'Rain', 'Wind', 'Flood', 'Fr...",B,34.428571,998.88,0.252,86.2
4,2009-02-20,Moldova,"['Wind', 'Snow']",IS,46.05,1014.1,-1.615,99.5


In [9]:
dataset.set_index(['date', 'region', 'phenomena', 'county'], inplace=True)

### IsolationForest

In [18]:
from sklearn.ensemble import IsolationForest

# Initialize the Isolation Forest model
iso_forest = IsolationForest(contamination=0.05, random_state=42)

# Fit the model and predict outliers
outliers = iso_forest.fit_predict(dataset)

# Mark the outliers in the dataset
dataset['is_outlier'] = [True if o == -1 else False for o in outliers]

# Display the rows that are marked as outliers
outlier_data = dataset[dataset['is_outlier'] == True]



### OneClassSVM

In [25]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM

# Scale the numerical data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(dataset)

# Initialize the One-Class SVM model
one_class_svm = OneClassSVM(nu=0.0001, kernel="rbf", gamma='scale')

# Fit the model and predict outliers
svm_outliers = one_class_svm.fit_predict(scaled_data)

# Mark the outliers in the dataset
dataset['svm_is_outlier'] = [True if o == -1 else False for o in svm_outliers]

# Display the rows that are marked as outliers by One-Class SVM
svm_outlier_data = dataset[dataset['svm_is_outlier'] == True]

### VAE

In [27]:
from sklearn.model_selection import train_test_split

# Split the data into training and validation sets
X_train, X_val = train_test_split(scaled_data, test_size=0.2, random_state=42)

X_train.shape, X_val.shape

((18387, 6), (4597, 6))

In [31]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Lambda
from tensorflow.keras.models import Model
from tensorflow.keras.losses import mse
from tensorflow.keras import backend as K

# Define VAE architecture parameters
input_dim = X_train.shape[1]
intermediate_dim = 10
latent_dim = 2

# Encoder architecture
inputs = Input(shape=(input_dim,), name='encoder_input')
x = Dense(intermediate_dim, activation='relu')(inputs)
z_mean = Dense(latent_dim, name='z_mean')(x)
z_log_var = Dense(latent_dim, name='z_log_var')(x)

# Reparameterization trick to sample z values
def sampling(args):
    z_mean, z_log_var = args
    batch = K.shape(z_mean)[0]
    dim = K.int_shape(z_mean)[1]
    epsilon = K.random_normal(shape=(batch, dim))
    return z_mean + K.exp(0.5 * z_log_var) * epsilon

z = Lambda(sampling, output_shape=(latent_dim,), name='z')([z_mean, z_log_var])

# Decoder architecture
decoder_h = Dense(intermediate_dim, activation='relu', name='decoder_h')
decoder_mean = Dense(input_dim, name='decoder_mean')
h_decoded = decoder_h(z)
x_decoded_mean = decoder_mean(h_decoded)

# VAE model
vae = Model(inputs, x_decoded_mean)

# Define VAE loss
xent_loss = input_dim * mse(inputs, x_decoded_mean)
kl_loss = - 0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
vae_loss = K.mean(xent_loss + kl_loss)

# Compile the model
vae.add_loss(vae_loss)
vae.compile(optimizer='adam')

# vae.summary()

In [36]:
# Step 1: Compute Reconstruction
reconstructed_data = vae.predict(X_val)

# Step 2: Calculate Reconstruction Error (MSE)
mse_errors = np.mean(np.square(X_val - reconstructed_data), axis=1)

# Step 3: Set a Threshold
# Here, we can use a high percentile (e.g., 95th percentile) of the MSE as a threshold
threshold = np.percentile(mse_errors, 95)

# Flag data points as outliers based on the threshold
outliers_vae = np.where(mse_errors > threshold)[0]



In [39]:
X_val

array([[-0.63726389,  0.47673483, -0.73458078,  0.98946019, -0.22949978,
        -0.22939472],
       [-1.14284805,  0.18467055, -0.43843936, -0.05498435, -0.22949978,
        -0.22939472],
       [-0.4050231 ,  0.57073791,  0.46994152, -0.03764917, -0.22949978,
        -0.22939472],
       ...,
       [ 0.33920306, -1.41595395,  0.70566922,  1.67853355, -0.22949978,
        -0.22939472],
       [-0.02196057, -1.54134984,  0.86526802,  0.24838128, -0.22949978,
        -0.22939472],
       [ 1.9511075 ,  0.03828765, -2.08597747, -1.03875576,  4.35730272,
        -0.22939472]])

In [41]:
# Convert X_train and X_val back to DataFrames to retrieve original indices
X_train_df = pd.DataFrame(X_train, index=dataset.index[:len(X_train)])
X_val_df = pd.DataFrame(X_val, index=dataset.index[len(X_train):])

# Retrieve the original indices of the outliers from the validation set DataFrame
original_indices_vae = X_val_df.iloc[outliers_vae].index

# Extract the outlier rows from the original dataset
outlier_rows_vae = dataset.loc[original_indices_vae]