In [18]:
import os
import torch
import polars as pl
import numpy as np
import seaborn as sns
import warnings
from feature_eng.scalers import ranged_scaler
# Hide every UserWarning for the rest of the session.
# NOTE(review): this filter is global, so UserWarnings raised by any imported library are silenced too.
warnings.filterwarnings("ignore", category=UserWarning) 

In [19]:
# Configure the cuBLAS workspace through the process environment.
# NOTE(review): presumably set so CUDA matrix ops in torch can run deterministically;
# confirm it is applied before the first CUDA call, otherwise it may have no effect.
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"


In [20]:
# Load the raw time-series table from disk.
cats_df = pl.read_csv(source="data/data.csv", separator=",")

In [21]:
# Load the anomaly metadata table from disk.
metadata = pl.read_csv(source="data/metadata.csv", separator=",")

In [22]:
# Preview the first rows of the table as loaded from data/data.csv.
cats_df.head()

timestamp,aimp,amud,arnd,asin1,asin2,adbr,adfl,bed1,bed2,bfo1,bfo2,bso1,bso2,bso3,ced1,cfo1,cso1,y,category
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""2023-01-01 00:00:00""",0.0,1.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""2023-01-01 00:00:01""",0.0,1.0,20.080031,2e-05,0.0002,0.0,0.0,0.0,0.0,0.0,4.9939e-07,0.000789,0.0,0.0,0.0,2.1e-05,0.001229,0.0,0.0
"""2023-01-01 00:00:02""",0.0,1.0,20.276562,4e-05,0.0004,0.0,0.0,0.0,0.0,0.0,1e-06,0.003115,0.0,0.0,0.0,0.000104,0.004833,0.0,0.0
"""2023-01-01 00:00:03""",0.0,1.0,20.730938,6e-05,0.0006,0.0,0.0,0.0,0.0,0.0,3e-06,0.006914,0.0,0.0,0.0,0.000285,0.010688,0.0,0.0
"""2023-01-01 00:00:04""",0.0,1.0,21.118101,8e-05,0.0008,0.0,0.0,0.0,0.0,0.0,5e-06,0.012123,0.0,0.0,0.0,0.000601,0.018669,0.0,0.0


In [6]:
# Preview the first rows of the anomaly metadata (time window, root cause, affected channels, category).
metadata.head()

start_time,end_time,root_cause,affected,category
str,str,str,str,i64
"""2023-01-12 15:11:45""","""2023-01-12 15:20:05""","""bso3""","""['cfo1']""",12
"""2023-01-12 16:27:46""","""2023-01-12 17:51:06""","""bso3""","""['cfo1']""",1
"""2023-01-12 18:19:35""","""2023-01-12 18:36:15""","""bfo2""","""['cso1']""",8
"""2023-01-12 20:46:32""","""2023-01-12 20:51:32""","""bed2""","""['ced1']""",7
"""2023-01-13 05:57:10""","""2023-01-13 06:02:10""","""bfo1""","""['cfo1']""",9


# Feature Engineering

In [40]:
# Rescale every high-cardinality, non-string/non-temporal column with the
# project's ranged_scaler; all other columns are left untouched.
# NOTE(review): cats_df is overwritten, so re-running this cell feeds
# already-scaled data back into ranged_scaler.
MIN_UNIQUE_VALUES = 50  # columns with fewer distinct values are not scaled
bad_dtypes = [pl.String, pl.Date, pl.Datetime, pl.Utf8]

scaled_columns = [
    ranged_scaler(cats_df[col])
    for col in cats_df.columns
    if cats_df[col].dtype not in bad_dtypes
    and cats_df[col].n_unique() >= MIN_UNIQUE_VALUES
]
# Each scaled column replaces its source column; every scaler call only reads
# its own column, so applying them in a single with_columns is equivalent to
# applying them one by one.
cats_df = cats_df.with_columns(scaled_columns)

In [41]:
# Preview the table again to inspect the columns after the scaling loop.
cats_df.head()

timestamp,aimp,amud,arnd,asin1,asin2,adbr,adfl,bed1,bed2,bfo1,bfo2,bso1,bso2,bso3,ced1,cfo1,cso1,y,category
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""2023-01-01 00:00:00""",0.0,0.142857,-0.5,-4.1078e-14,2.0428e-14,0.0,0.0,-0.32802,-0.369237,-0.738163,-0.767181,-0.180547,-0.507953,-0.716059,-0.774361,0.100389,-0.186623,0.0,0.0
"""2023-01-01 00:00:01""",0.0,0.142857,-0.495998,2e-05,0.0002,0.0,0.0,-0.32802,-0.369237,-0.738163,-0.767181,-0.18054,-0.507953,-0.716059,-0.774361,0.100389,-0.186618,0.0,0.0
"""2023-01-01 00:00:02""",0.0,0.142857,-0.486172,4e-05,0.0004,0.0,0.0,-0.32802,-0.369237,-0.738163,-0.767181,-0.180519,-0.507953,-0.716059,-0.774361,0.10039,-0.186604,0.0,0.0
"""2023-01-01 00:00:03""",0.0,0.142857,-0.463453,6e-05,0.0006,0.0,0.0,-0.32802,-0.369237,-0.738163,-0.767181,-0.180484,-0.507953,-0.716059,-0.774361,0.100391,-0.18658,0.0,0.0
"""2023-01-01 00:00:04""",0.0,0.142857,-0.444095,8e-05,0.0008,0.0,0.0,-0.32802,-0.369237,-0.738163,-0.767181,-0.180437,-0.507953,-0.716059,-0.774361,0.100393,-0.186548,0.0,0.0


## Filter Cats by Anomalies Only

In [42]:
from mpge.rca import mpge_root_cause_diagnosis

In [43]:
# Score the MPGE root-cause diagnosis against the labelled root cause of every
# anomaly window listed in `metadata`.
# Each metadata row gets cause_1..cause_TOP_K: cause_k is 1 when the k-th ranked
# candidate equals the labelled root cause and 0 otherwise. The flags are always
# written so every row dict has the same keys and the resulting DataFrame
# always contains all cause_k columns.
TOP_K = 3
new_metadata = []
for iteration, row in enumerate(metadata.iter_rows(named=True)):
    start = row['start_time']
    end = row['end_time']
    root_cause = row['root_cause']
    # Both sides are strings, so this is a lexicographic comparison; it matches
    # chronological order only for zero-padded "YYYY-MM-DD HH:MM:SS" values.
    mod_df = cats_df.filter((pl.col('timestamp') >= start) & (pl.col('timestamp') <= end))
    rca_mod = mpge_root_cause_diagnosis(input_df=mod_df, cols_to_exclude=['timestamp', 'y', 'category'])
    rca_mod.fit()

    # NOTE(review): assumes root_rank_score holds at least TOP_K ranked entries,
    # each exposing the candidate name under 'Column' — confirm against mpge.rca.
    top_candidates = [rca_mod.root_rank_score[rank]['Column'][0] for rank in range(TOP_K)]
    for rank, candidate in enumerate(top_candidates, start=1):
        row[f'cause_{rank}'] = 1 if root_cause == candidate else 0
    new_metadata.append(row)
    # Lightweight progress indicator for the long-running loop.
    if iteration % 50 == 0:
        print(iteration)

0
50
100
150


In [44]:
# Build the per-window results table from the list of row dicts.
# The cause_k keys may be absent from many rows, so scan every row when
# inferring the schema (the default only inspects the first 100 rows, which can
# miss a column whose first occurrence comes later).
mpge_stats = pl.DataFrame(new_metadata, infer_schema_length=None)

In [45]:
# Total number of hits at each rank (rank 1, 2 and 3) across all anomaly windows.
agg_stats = mpge_stats.select(pl.col("cause_1", "cause_2", "cause_3").sum())

In [46]:
# Top-3 hit rate: hits summed over the three ranks, divided by the number of anomaly windows.
total_hits = agg_stats.select(pl.sum_horizontal(pl.all())).item()
total_hits / mpge_stats.height

0.23

In [None]:
# Preview only the first few result rows; displaying the whole list floods the notebook output.
new_metadata[:5]