# Training of GMM with preprocessed SmA Dataset
Visual evaluation of how precisely the GMM discretizes the data and how meaningful the learned categories are compared to the original states. <br>
Within the first subplot, we plot a selection of measurement values from the dataset. <br>
The second subplot includes the learned states from the GMM. <br>
The third subplot plots the log-likelihood of the input data under the GMM. <br>

To make a statement about the change in states and likelihood, one code cell plots nominal system behavior and the following cell plots anomalous system behavior. 

In [None]:
# IPython automagic: change to the parent directory. Presumably this is the
# repository root, so that the relative 'preprocessed_data/...' paths and the
# local `utils` import used below resolve -- verify against the repo layout.
cd ..

In [None]:
import pandas as pd 
from sklearn.mixture import GaussianMixture
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from utils import standardize_data


In [None]:
def plot_gmm(data, labels, like, labels_orig):
    """Show measurements, state labels and likelihood in three stacked subplots.

    Row 1 holds one marker trace per column of ``data``; row 2 overlays
    ``labels`` ('categories') and ``labels_orig`` ('original categories');
    row 3 holds ``like`` ('likelihood'). Every trace uses ``data.index`` as
    its x values and the three subplots share their x-axis.

    Args:
        data: DataFrame whose columns are drawn in the first row.
        labels: Values plotted as 'categories' in the second row.
        like: Values plotted as 'likelihood' in the third row.
        labels_orig: Values plotted as 'original categories' in the
            second row.
    """
    x_values = data.index
    figure = make_subplots(rows=3, cols=1, shared_xaxes=True)

    # One trace per measurement column, all in the top subplot.
    for column in data.columns:
        signal = go.Scatter(x=x_values, y=data[column], mode='markers', name=column)
        figure.add_trace(signal, row=1, col=1)

    # Remaining traces as (y values, legend name, subplot row), in draw order.
    lower_rows = (
        (labels, 'categories', 2),
        (labels_orig, 'original categories', 2),
        (like, 'likelihood', 3),
    )
    for values, title, row in lower_rows:
        figure.add_trace(
            go.Scatter(x=x_values, y=values, mode='markers', name=title),
            row=row, col=1,
        )

    figure.show()

In [None]:
# Locations of the preprocessed SmA CSV files (nominal run and anomalous run).
path_norm = 'preprocessed_data/SmA/id1_norm.csv'
path_anom = 'preprocessed_data/SmA/id2_anomaly.csv'

# Read both files into DataFrames using pandas' default CSV settings.
df_norm, df_anom = (pd.read_csv(csv_path) for csv_path in (path_norm, path_anom))

In [None]:
# Build a 5-component GMM with diagonal covariances and a fixed seed, then
# fit it on the nominal data only.
gmm = GaussianMixture(covariance_type='diag', n_components=5, random_state=0)
gmm.fit(df_norm)

In [None]:
# Predict the GMM state (component index) for every row of both datasets.
labels = gmm.predict(df_norm)
labels_anom = gmm.predict(df_anom)

# Original states taken from the 'CuStepNo ValueY' column, re-indexed from 0.
labels_orig = df_norm['CuStepNo ValueY'].reset_index(drop=True)
labels_orig_anom = df_anom['CuStepNo ValueY'].reset_index(drop=True)

# Per-sample log-likelihood under the GMM, smoothed with a rolling median over
# 10 samples. The first 9 rolling values are NaN and are back-filled from the
# first valid value. `.bfill()` is used instead of `fillna(method='bfill')`,
# whose `method` argument is deprecated in recent pandas releases.
like_gmm = pd.DataFrame(gmm.score_samples(df_norm)).rolling(10).median().bfill()
like_gmm_anom = pd.DataFrame(gmm.score_samples(df_anom)).rolling(10).median().bfill()


In [None]:
# plot of nominal data with a point anomaly
# All four arguments use the same row window (10000:15000) so the original
# categories line up with the predicted ones; labels_orig was previously
# sliced 5000:10000 and therefore drawn against the wrong rows.
plot_gmm(data=df_norm[10000:15000], labels=labels[10000:15000], like=like_gmm[0][10000:15000], labels_orig=labels_orig[10000:15000])

In [None]:
# Anomalous data, rows 45000-49999. Per the original note, the anomaly can be
# seen in the likelihood e.g. between index 46,000 and 46,500.
plot_gmm(
    labels_orig=labels_orig_anom[45000:50000],
    like=like_gmm_anom[0][45000:50000],
    labels=labels_anom[45000:50000],
    data=df_anom[45000:50000],
)