In [5]:
import pandas as pd
import numpy as np
import os

import time
from datetime import datetime
from alibi_detect.od import SpectralResidual

In [3]:
def get_data():
    """Load every file of the host training-data directory into memory.

    Each directory entry is announced on stdout and then parsed with
    ``pd.read_csv``.

    Returns:
        dict: one DataFrame per directory entry, keyed by the entry name
        with its last four characters removed (the length of a ``.csv``
        suffix).
    """
    host_dir = '../../../../data/train_data/host'
    frames_by_name = {}
    for entry in os.listdir(host_dir):
        print(f'Reading {entry}')
        # The key is the entry name minus its trailing four characters.
        key = entry[:-4]
        frames_by_name[key] = pd.read_csv(f'{host_dir}/{entry}')
    return frames_by_name

def load_tiago_test(test):
    """Read the ``kpis.csv`` table of one "tiago" test run.

    Args:
        test: run identifier; its ``str()`` form is appended directly to the
            ``test`` directory prefix (e.g. ``3`` -> ``.../test3/kpis.csv``).

    Returns:
        pandas.DataFrame: the parsed contents of that run's ``kpis.csv``.
    """
    return pd.read_csv(f'../../../../data/tiago_tests/test{test!s}/kpis.csv')

def load_test():
    """Concatenate every CSV of the host test-data directory into one frame.

    Files are read in sorted name order so the row order is reproducible, and
    all frames are concatenated in a single ``pd.concat`` call.

    Returns:
        pandas.DataFrame: rows of all files stacked with a fresh 0..n-1 index.
        When the directory is empty, an empty frame carrying the expected KPI
        columns is returned so attribute access such as ``.cmdb_id`` still
        works.
    """
    data_path = '../../../../data/test_data/host'
    expected_columns = ['item_id', 'name', 'bomc_id', 'timestamp', 'value', 'cmdb_id']

    frames = [
        pd.read_csv(os.path.join(data_path, file))
        for file in sorted(os.listdir(data_path))
    ]
    if not frames:
        # The names must go to `columns=`; passing them as positional data
        # would turn them into six junk rows under a column named 0.
        return pd.DataFrame(columns=expected_columns)
    return pd.concat(frames, ignore_index=True)

In [28]:
def get_key_thresh(host):
    """Look up the per-KPI thresholds recorded for one host.

    Reads ``thresh.csv`` from the current working directory and keeps only
    the rows whose ``host`` column equals ``host``.

    Args:
        host: value to match against the ``host`` column.

    Returns:
        dict: maps each ``name`` of the matching rows to its ``thresh``
        value; empty when no row matches.
    """
    table = pd.read_csv('thresh.csv')
    host_rows = table.loc[table.host == host, ['name', 'thresh']]
    return host_rows.set_index('name')['thresh'].to_dict()

def find_anoms(
    hosts,
    df,
    info_path='/Users/baconbaker/Documents/Studium/ANM/anm-project/impl/models/kpi_detection/kpi_summary_info.data',
    thresh_path='thresh.csv',
):
    """Flag (KPI name, host) pairs whose recent samples look anomalous.

    For every (``cmdb_id``, ``name``) group of ``df`` whose host is in
    ``hosts``, the ``value`` series is resampled to the KPI's interval,
    z-score normalised and smoothed with a 10-point rolling mean. The
    smoothed series is then scored by a ``SpectralResidual`` detector that
    uses the threshold stored for that host/KPI.

    Args:
        hosts: container of host ids to analyse (tested with ``in``).
        df: KPI samples with ``cmdb_id``, ``name``, ``timestamp`` (epoch
            milliseconds) and ``value`` columns.
        info_path: CSV with ``kpi``, ``interval`` and ``is_flat`` columns.
            Defaults to the original author's local path; pass a portable
            path when running elsewhere.
        thresh_path: CSV with ``host``, ``name`` and ``thresh`` columns.

    Returns:
        list[tuple]: ``(kpi_name, host)`` for every pair reported anomalous.
    """
    start = time.time()

    df_info = pd.read_csv(info_path)
    df_thresh = pd.read_csv(thresh_path)

    # `== False` is kept on purpose: it matches only real boolean False
    # values, exactly as the comparison did before.
    not_flat = df_info['is_flat'] == False  # noqa: E712
    # Map each usable KPI to its resample rule. The 1-minute entries are
    # written last so they win if a KPI is listed under both intervals.
    # 'min' replaces the 'T' alias that pandas 2.2 deprecated.
    rule_by_kpi = dict.fromkeys(
        df_info.loc[not_flat & (df_info['interval'] == '5min'), 'kpi'], '5min')
    rule_by_kpi.update(dict.fromkeys(
        df_info.loc[not_flat & (df_info['interval'] == '1min'), 'kpi'], '1min'))

    print('Calculating rolling window')
    res = {}
    for key, group in df.groupby(['cmdb_id', 'name']):
        host, kpi = key
        if host not in hosts:
            continue
        # A constant series carries no signal to normalise.
        if group['value'].std() == 0:
            continue
        rule = rule_by_kpi.get(kpi)
        if rule is None:
            continue

        # datetime.fromtimestamp yields naive local-time stamps.
        # `assign` builds a new frame instead of writing into a groupby slice.
        stamps = group['timestamp'].apply(
            lambda ms: datetime.fromtimestamp(ms / 1000.0))
        indexed = group.assign(timestamp=stamps).set_index('timestamp').sort_index()

        d = indexed['value'].resample(rule).mean().interpolate()
        d = (d - d.mean()) / d.std()
        res[key] = d.rolling(10).mean()

    anoms = []
    for key, rolled in res.items():
        print('Determining threshold for', key)
        host, kpi = key
        match = df_thresh[(df_thresh['host'] == host) & (df_thresh['name'] == kpi)]
        if match.empty:
            # No stored threshold for a KPI that varies now is reported as an
            # anomaly. NOTE(review): per the message, thresholds appear to be
            # missing only for KPIs that were constant in training; confirm.
            print('Anomaly,  std in train was 0, now its not')
            anoms.append((kpi, host))
            continue

        d = rolled.dropna()
        if len(d) == 0:
            print('Rolling window data empty! Skipping')
            continue

        # Build the detector only once there is data to score.
        od = SpectralResidual(
            threshold=match['thresh'].values[0],
            window_amp=10,
            window_local=10,
            n_est_points=5,
            n_grad_points=5
        )
        outliers = od.predict(d.values)['data']
        # Only the flags at positions -5..-3 are inspected; the final two
        # points are ignored. NOTE(review): rationale for this window is not
        # visible here; confirm before changing it.
        if np.sum(outliers['is_outlier'][-5:-2]) > 0:
            print(outliers['is_outlier'])
            print("ST Threshold Anomaly!")
            anoms.append((kpi, host))

    print("It took", time.time()-start, "seconds to find", len(anoms),"anomalies")
    return anoms

# Read the test KPIs once and reuse the same frame for both the host list and
# the detection input, instead of parsing every CSV twice.
test_kpis = load_test()
print(find_anoms(test_kpis.cmdb_id.unique(), test_kpis))

0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
 0 0 1 1 0 0 0 0]
ST Threshold Anomaly!
Determining threshold for ('redis_008', 'used_cpu_sys')
Determining threshold for ('redis_008', 'used_cpu_user')
Determining threshold for ('redis_008', 'used_memory')
Determining threshold for ('redis_009', 'connected_clients')
Determining threshold for ('redis_009', 'instantaneous_input_kbps')
Determining threshold for ('redis_009', 'instantaneous_ops_per_sec')
Determining threshold for ('redis_009', 'instantaneous_output_kbps')
Determining threshold for ('redis_009', 'keyspace_hits')
[1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

In [20]:
# Keep only the 'Received_packets' samples of host 'os_006' from the test data.
df = load_test().loc[
    lambda kpis: (kpis.cmdb_id == 'os_006') & (kpis.name == 'Received_packets')
]

In [None]:
   # start = time.time()
    # anoms = []
    # for host in hosts:
    #     start_host = time.time()
    #     print('*'*60)
    #     print('Checking ', host)
    #     thresholds = get_key_thresh(host)
    #     for kpi in thresholds:
    #         thresh = thresholds[kpi]
    #         print('Threshold ', thresh, kpi)
    #         od = SpectralResidual(
    #             threshold=thresh,
    #             window_amp=10,
    #             window_local=10,
    #             n_est_points=5,
    #             n_grad_points=5
    #         )

    #         df_hk = df[df.cmdb_id == host][df.name == kpi]
            
    #         if len(df_hk) == 0:
    #             print('No valid data to use, skipping')
    #             continue
                
    #         data = get_past(df_hk, np.max(df_hk.timestamp.unique()), 30)['value']
    #         data = np.concatenate([data, data])
            
    #         if thresh == -1:
    #             if data[-1] > 0:
    #                 print("Non-Zero Anomaly!")
    #                 anoms.append((host, kpi))
    #         elif np.isnan(thresh):
    #             print('NaN, Skipping!')
    #         else:
    #             outliers = od.predict(data)['data']
    #             print(outliers['instance_score'])
    #             if np.mean(data) == 0 or np.std(data) == 0:
    #                 print("Mean is zero or std is zero")
    #                 anoms.append((host,kpi))
    #             if np.sum(outliers['is_outlier'][-31:-30]) > 0:
    #                 print(outliers['is_outlier'])
    #                 print("ST Threshold Anomaly!")
    #                 anoms.append((host,kpi))
    #     print(host, ' completed in ', time.time() - start_host)
    # print('Completed detection of ', len(hosts), 'hosts in ', time.time() - start, 'seconds')
    # return anoms