In [27]:
# encoding=utf-8
import os.path as osp
import os
import copy
import matplotlib.pyplot as plt
import torch
from torch.nn import Linear
from sklearn.metrics import average_precision_score, roc_auc_score
from torch_geometric.data import TemporalData

from torch_geometric.nn import TGNMemory, TransformerConv
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn.models.tgn import (LastNeighborLoader, IdentityMessage, MeanAggregator,
                                           LastAggregator)
from torch_geometric import *
from torch_geometric.utils import negative_sampling

from tqdm import tqdm
# from .autonotebook import tqdm as notebook_tqdm

import networkx as nx
import numpy as np
import math
import copy
import re
import time
import json
import pandas as pd
from random import choice
import gc
from graphviz import Digraph
import xxhash

from datetime import datetime, timezone
import time
import pytz
from time import mktime
from datetime import datetime
import time


from rich.progress import Progress
from rich.progress import (
    BarColumn,
    DownloadColumn,
    Progress,
    SpinnerColumn,
    TaskProgressColumn,
    TimeElapsedColumn,
    TimeRemainingColumn,
)


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def hashgen(l):
    """Generate a single hash value from a list. @l is a list of
    string values, which can be properties of a node/edge. This
    function returns a single hashed integer value."""
    hasher = xxhash.xxh64()
    for e in l:
        hasher.update(e)
    return hasher.intdigest()


def datetime_to_ns_time(date):
    """
    :param date: str   format: %Y-%m-%d %H:%M:%S   e.g. 2013-10-10 23:40:00
    :return: nano timestamp
    """
    date,ns=date.split('.')

    timeArray = time.strptime(date, '%Y-%m-%dT%H:%M:%S')
    timeStamp = int(time.mktime(timeArray))
    timeStamp = timeStamp * 1000000000
    timeStamp += int(ns.split('Z')[0])
    return timeStamp


def datetime_to_timestamp_US(date):
    """
    :param date: str   format: %Y-%m-%d %H:%M:%S   e.g. 2013-10-10 23:40:00
    :return: nano timestamp
    """
    date=date.replace('-04:00','')
    if '.' in date:
        date,ms=date.split('.')
    else:
        ms=0
    tz = pytz.timezone('Etc/GMT+4')
    timeArray = time.strptime(date, "%Y-%m-%dT%H:%M:%S")
    dt = datetime.fromtimestamp(mktime(timeArray))
    timestamp = tz.localize(dt)
    timestamp=timestamp.timestamp()
    timeStamp = timestamp*1000+int(ms)
    return int(timeStamp)


def timestamp_to_datetime_US(ns):
    """
    :param date: str   format: %Y-%m-%d %H:%M:%S   e.g. 2013-10-10 23:40:00
    :return: nano timestamp
    """
    tz = pytz.timezone('US/Eastern')
    ms=ns%1000
    ns/=1000
    dt = pytz.datetime.datetime.fromtimestamp(int(ns), tz)
    s = dt.strftime('%Y-%m-%d %H:%M:%S')
    s+='.'+str(ms)
#     s += '.' + str(int(int(ns) % 1000000000)).zfill(9)
    return s

pid_split_symble="#_"

host_split_symble="_@"


# Database setting (Make sure the database and tables are created)

In [None]:
import psycopg2

from psycopg2 import extras as ex
# Create a postgreSQL DB connection object for storing provenance graph edges into DB
# Original '/var/run/postgresql/' has been replaced with 'localhost' since we are using docker and accessing as a service in port 5437
connect = psycopg2.connect(database = 'optc_db',
                           host = 'localhost',
                           user = 'postgres',
                           password = 'postgres',
                           port = '5437'
                          )

cur = connect.cursor()

In [29]:
# Clear all data in the database. Run it carefully!

In [None]:
# Delete events/edges inserted in previous run
tt=cur.execute("""
    delete from event_table where 1=1;
""")
print(tt)
connect.commit()

None


In [None]:
# Delete messages inserted in previous run
tt=cur.execute("""
    delete from nodeid2msg where 1=1;
""")
print(tt)
connect.commit()

None


## Parse data

In [None]:
reverse_edge_type=[
    "READ",
]


# 3 types of nodes used in kairos
node_type_used=[
    'FILE',
 'FLOW',
 'PROCESS',
#  'SHELL',
]
# Parsing source node, destination node, edge, timestamp, hostname etc.
def process_raw_dic(raw_dic):
    ans_dic={}
    
    
    ans_dic['hostname']=raw_dic['hostname'].split('.')[0]
    
    ans_dic['edge_type']=raw_dic['action']
    ans_dic['src_id']=raw_dic['actorID']
    ans_dic['dst_id']=raw_dic['objectID']
    
    ans_dic['src_type']='PROCESS'
    ans_dic['timestamp']=datetime_to_timestamp_US(raw_dic['timestamp'])
    ans_dic['dst_type']=raw_dic['object']
    
    try:
        node_uuid2path[ans_dic['src_id']]=ans_dic['hostname']+host_split_symble+raw_dic['properties']['image_path']  
        
    
        if raw_dic['object']=='FLOW':
            temp_flow=f"{raw_dic['properties']['direction']}#{raw_dic['properties']['src_ip']}:{raw_dic['properties']['src_port']}->{raw_dic['properties']['dest_ip']}:{raw_dic['properties']['dest_port']}"
            node_uuid2path[ans_dic['dst_id']]=ans_dic['hostname']+host_split_symble+temp_flow

        if raw_dic['object']=='FILE':              
            node_uuid2path[ans_dic['dst_id']]=ans_dic['hostname']+host_split_symble+raw_dic['properties']['file_path']


    except:
        ans_dic={}
    
    return ans_dic

In [33]:
node_type={'FILE',
 'FLOW',
 'MODULE',
 'PROCESS',
 'REGISTRY',
 'SHELL',
 'TASK',
 'THREAD',
 'USER_SESSION'}

# Unzip data

In [34]:
from os import walk
 
# folder path
dir_path = '/home/shahidul2k9/data/optc/plain/'
 
# list to store files name
res = []
for (dir_path, dir_names, file_names) in walk(dir_path):
    if dir_path[-1]!='/':
        dir_path+='/'
#     print(f"{dir_path=}")
#     print(f"{file_names=}")
    for f in file_names:
        temp_file_path=dir_path+f
#         print(f"{temp_file_path=}")
     
        res.append(temp_file_path)

In [None]:
# Unzip compressed json files
for r in tqdm(res):
    if ("201-225" in r or "401-425" in r or "651-675" in r or "501-525" in r or "51-75" in r) and ".gz" in r:
        os.system(f"gzip -d {r}")
        print(f" {r} Finished！")

100%|██████████| 4481/4481 [04:32<00:00, 16.43it/s]

 /home/shahidul2k9/data/optc/plain/ecar/benign/17-18Sep19/AIA-51-75/AIA-51-75.ecar-last.json.gz Finished！





# Process the features of nodes and edges

## Edge features

In [None]:
# 10 types of edges
edge_set=['OPEN',
'READ',
'CREATE',
'MESSAGE',
'MODIFY',
'START',
'RENAME',
'DELETE',
'TERMINATE',
'WRITE',]

# Generate edge type one-hot
edgevec=torch.nn.functional.one_hot(torch.arange(0, len(edge_set)), num_classes=len(edge_set))


# Allocating One-hot encoding for each edge type
edge2vec={}
for e in range(len(edge_set)):
    edge2vec[edge_set[e]]=edgevec[e]

In [37]:
edge2vec

{'OPEN': tensor([1, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'READ': tensor([0, 1, 0, 0, 0, 0, 0, 0, 0, 0]),
 'CREATE': tensor([0, 0, 1, 0, 0, 0, 0, 0, 0, 0]),
 'MESSAGE': tensor([0, 0, 0, 1, 0, 0, 0, 0, 0, 0]),
 'MODIFY': tensor([0, 0, 0, 0, 1, 0, 0, 0, 0, 0]),
 'START': tensor([0, 0, 0, 0, 0, 1, 0, 0, 0, 0]),
 'RENAME': tensor([0, 0, 0, 0, 0, 0, 1, 0, 0, 0]),
 'DELETE': tensor([0, 0, 0, 0, 0, 0, 0, 1, 0, 0]),
 'TERMINATE': tensor([0, 0, 0, 0, 0, 0, 0, 0, 1, 0]),
 'WRITE': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1])}

In [None]:
# Edge label to real number mapping for indexing operation
rel2id={}
index=1
for i in edge_set:
    rel2id[index]=i
    rel2id[i]=index
    index+=1

In [39]:
rel2id

{1: 'OPEN',
 'OPEN': 1,
 2: 'READ',
 'READ': 2,
 3: 'CREATE',
 'CREATE': 3,
 4: 'MESSAGE',
 'MESSAGE': 4,
 5: 'MODIFY',
 'MODIFY': 5,
 6: 'START',
 'START': 6,
 7: 'RENAME',
 'RENAME': 7,
 8: 'DELETE',
 'DELETE': 8,
 9: 'TERMINATE',
 'TERMINATE': 9,
 10: 'WRITE',
 'WRITE': 10}

## Node features

In [40]:
from sklearn.feature_extraction import FeatureHasher
from torch_geometric.transforms import NormalizeFeatures

from sklearn import preprocessing
import numpy as np


encode_len=16

FH_string=FeatureHasher(n_features=encode_len,input_type="string")
FH_dict=FeatureHasher(n_features=encode_len,input_type="dict")


def path2higlist(p):
    l=[]
    spl=p.strip().split('/')
    for i in spl:
        if len(l)!=0:
            l.append(l[-1]+'/'+i)
        else:
            l.append(i)
#     print(l)
    return l

def ip2higlist(p):
    l=[]
    if "::" not in p:
        spl=p.strip().split('.')
        for i in spl:
            if len(l)!=0:
                l.append(l[-1]+'.'+i)
            else:
                l.append(i)
    #     print(l)
        return l
    else:
        spl=p.strip().split(':')
        for i in spl:
            if len(l)!=0:
                l.append(l[-1]+':'+i)
            else:
                l.append(i)
    #     print(l)
        return l
def list2str(l):
    s=''
    for i in l:
        s+=i
    return s

def str2tensor(msg_type,msg):
    if msg_type == 'FLOW':
        h_msg=list2str(ip2higlist(msg))
    else:
        h_msg=list2str(path2higlist(msg))
    vec=FH_string.transform([msg_type+h_msg]).toarray()
    vec=torch.tensor(vec).reshape(encode_len).float()
#     print(h_msg)
    return vec


class TimeEncoder(torch.nn.Module):
    def __init__(self, out_channels):
        super().__init__()
        self.out_channels = out_channels
        self.lin = Linear(1, out_channels)

    def reset_parameters(self):
        self.lin.reset_parameters()

    def forward(self, t):
        return self.lin(t.view(-1, 1)).cos()
    
time_enc=TimeEncoder(50)

# Store the benign data to database

In [None]:
# Hashmap for storing unique nodes with associated metadata
node_uuid2path={}

In [42]:
from os import walk
 
# folder path
dir_path = '/home/shahidul2k9/data/optc/plain/ecar/benign/'

res = []
for (dir_path, dir_names, file_names) in walk(dir_path):
    if dir_path[-1]!='/':
        dir_path+='/'
#     print(f"{dir_path=}")
#     print(f"{file_names=}")
    for f in file_names:
        temp_file_path=dir_path+f
#         print(f"{temp_file_path=}")
        if "201-225" in temp_file_path or ("20-23Sep19" in temp_file_path and ("401-425" in temp_file_path or "651-675" in temp_file_path or "501-525" in temp_file_path or "51-75" in temp_file_path)):
            res.append(temp_file_path)

In [None]:
# Unzip ecar benign compressed data file
for r in tqdm(res):
    if  ".gz" in r:
        #os.system(f"gzip -d {r}")
        print(f" {r} Finished！")

100%|██████████| 36/36 [00:00<00:00, 406994.46it/s]


In [None]:
# White  listed hosts names
def is_selected_hosts(line):
    hosts=[
        'SysClient0201',
        'SysClient0402',
        'SysClient0660',
        'SysClient0501',
        'SysClient0051',        
        'SysClient0209',
    ]
    flag=False
    for h in hosts:
        if h in line:
            flag=True
            break
    return flag

In [None]:
# Iterate though all ecar benign logs, parse it and insert into DB
for file_path in res:
    
    edge_list=[]

    with open(file_path) as f:
        for line in tqdm(f):
            line=line.replace('\\\\','/')
            temp_dic=json.loads(line.strip())
            hostname=temp_dic['hostname'].split('.')[0]
            if temp_dic['object'] in node_type_used and is_selected_hosts(hostname):
                edge_list.append(process_raw_dic(temp_dic))
    
        print(f'{len(edge_list)=}')
        data_list=[]
        for e in edge_list:
            try:
                data_list.append([
                    e['src_id'],
                    e['src_type'],
                    e['edge_type'],
                    e['dst_id'],
                    e['dst_type'],
                    e['hostname'],
                    e['timestamp'],
                    "benign",
                ])
            except:
                pass

        # write to database
        sql = '''insert into event_table
                             values %s
                '''
        ex.execute_values(cur,sql, data_list,page_size=10000)
        connect.commit()
        
        print(f"{file_path} Finished! ")
        # Clear the tmp variables to release the memory.
        del edge_list
        del data_list

6190907it [00:56, 110516.43it/s]


len(edge_list)=403686
/home/shahidul2k9/data/optc/plain/ecar/benign/19Sep19/AIA-201-225/AIA-201-225.ecar-2019-12-07T16-16-05.667.json Finished! 


23189497it [03:30, 110353.53it/s]


len(edge_list)=1744136
/home/shahidul2k9/data/optc/plain/ecar/benign/19Sep19/AIA-201-225/AIA-201-225.ecar-last.json Finished! 


76699983it [10:59, 116299.58it/s]


len(edge_list)=2957067
/home/shahidul2k9/data/optc/plain/ecar/benign/20-23Sep19/AIA-501-525/AIA-501-525.ecar-2019-11-15T09-43-35.856.json Finished! 


76706216it [10:47, 118536.90it/s]


len(edge_list)=2965137
/home/shahidul2k9/data/optc/plain/ecar/benign/20-23Sep19/AIA-501-525/AIA-501-525.ecar-2019-11-15T17-22-42.923.json Finished! 


20541672it [03:16, 104274.42it/s]


len(edge_list)=785228
/home/shahidul2k9/data/optc/plain/ecar/benign/20-23Sep19/AIA-501-525/AIA-501-525.ecar-last.json Finished! 


78048775it [11:08, 116744.64it/s]


len(edge_list)=4039788
/home/shahidul2k9/data/optc/plain/ecar/benign/20-23Sep19/AIA-501-525/AIA-501-525.ecar-2019-11-15T05-59-37.208.json Finished! 


76719146it [10:39, 119990.49it/s]


len(edge_list)=2876893
/home/shahidul2k9/data/optc/plain/ecar/benign/20-23Sep19/AIA-501-525/AIA-501-525.ecar-2019-11-15T13-29-59.064.json Finished! 


36440311it [04:33, 133396.33it/s]


len(edge_list)=843401
/home/shahidul2k9/data/optc/plain/ecar/benign/20-23Sep19/AIA-501-525/AIA-501-525.ecar-2019-11-15T03-10-00.546.json Finished! 


36333460it [05:06, 118559.92it/s]


len(edge_list)=1022401
/home/shahidul2k9/data/optc/plain/ecar/benign/20-23Sep19/AIA-651-675/AIA-651-675.ecar-2019-11-15T03-09-38.187.json Finished! 


76458599it [10:36, 120164.85it/s]


len(edge_list)=2610952
/home/shahidul2k9/data/optc/plain/ecar/benign/20-23Sep19/AIA-651-675/AIA-651-675.ecar-2019-11-15T13-28-16.876.json Finished! 


32029474it [04:15, 125258.73it/s]


len(edge_list)=1099540
/home/shahidul2k9/data/optc/plain/ecar/benign/20-23Sep19/AIA-651-675/AIA-651-675.ecar-last.json Finished! 


77786670it [10:47, 120131.87it/s]


len(edge_list)=3133116
/home/shahidul2k9/data/optc/plain/ecar/benign/20-23Sep19/AIA-651-675/AIA-651-675.ecar-2019-11-15T05-48-17.579.json Finished! 


76457220it [10:25, 122299.01it/s]


len(edge_list)=2644933
/home/shahidul2k9/data/optc/plain/ecar/benign/20-23Sep19/AIA-651-675/AIA-651-675.ecar-2019-11-15T09-37-46.741.json Finished! 


76425447it [10:30, 121194.31it/s]


len(edge_list)=2617847
/home/shahidul2k9/data/optc/plain/ecar/benign/20-23Sep19/AIA-651-675/AIA-651-675.ecar-2019-11-15T17-26-42.298.json Finished! 


76383303it [10:59, 115853.86it/s]


len(edge_list)=3024043
/home/shahidul2k9/data/optc/plain/ecar/benign/20-23Sep19/AIA-401-425/AIA-401-425.ecar-2019-12-07T20-18-48.097.json Finished! 


76389825it [10:41, 119053.23it/s]


len(edge_list)=2990117
/home/shahidul2k9/data/optc/plain/ecar/benign/20-23Sep19/AIA-401-425/AIA-401-425.ecar-2019-12-07T12-19-23.521.json Finished! 


76407480it [10:32, 120726.78it/s]


len(edge_list)=3002918
/home/shahidul2k9/data/optc/plain/ecar/benign/20-23Sep19/AIA-401-425/AIA-401-425.ecar-2019-12-07T16-09-39.085.json Finished! 


53285475it [07:04, 125617.14it/s]


len(edge_list)=2101814
/home/shahidul2k9/data/optc/plain/ecar/benign/20-23Sep19/AIA-401-425/AIA-401-425.ecar-last.json Finished! 


77020972it [11:03, 116146.47it/s]


len(edge_list)=3421514
/home/shahidul2k9/data/optc/plain/ecar/benign/20-23Sep19/AIA-401-425/AIA-401-425.ecar-2019-12-07T08-33-35.028.json Finished! 


21076091it [02:49, 124576.45it/s]


len(edge_list)=480749
/home/shahidul2k9/data/optc/plain/ecar/benign/20-23Sep19/AIA-401-425/AIA-401-425.ecar-2019-12-07T06-28-53.370.json Finished! 


25224989it [03:08, 133931.88it/s]


len(edge_list)=432115
/home/shahidul2k9/data/optc/plain/ecar/benign/20-23Sep19/AIA-51-75/AIA-51-75.ecar-2019-12-07T16-15-43.163.json Finished! 


76427896it [10:44, 118605.86it/s]


len(edge_list)=2892230
/home/shahidul2k9/data/optc/plain/ecar/benign/20-23Sep19/AIA-51-75/AIA-51-75.ecar-2019-12-07T21-31-30.259.json Finished! 


76404717it [10:27, 121694.22it/s]


len(edge_list)=2919561
/home/shahidul2k9/data/optc/plain/ecar/benign/20-23Sep19/AIA-51-75/AIA-51-75.ecar-2019-12-08T00-56-58.175.json Finished! 


42840935it [06:13, 114785.74it/s]


len(edge_list)=1616861
/home/shahidul2k9/data/optc/plain/ecar/benign/20-23Sep19/AIA-51-75/AIA-51-75.ecar-last.json Finished! 


76421197it [10:28, 121583.88it/s]


len(edge_list)=2824549
/home/shahidul2k9/data/optc/plain/ecar/benign/20-23Sep19/AIA-51-75/AIA-51-75.ecar-2019-12-08T04-30-36.852.json Finished! 


76638913it [10:52, 117444.88it/s]


len(edge_list)=3417179
/home/shahidul2k9/data/optc/plain/ecar/benign/20-23Sep19/AIA-51-75/AIA-51-75.ecar-2019-12-07T18-18-31.331.json Finished! 


53321795it [08:47, 101079.06it/s]


len(edge_list)=4793249
/home/shahidul2k9/data/optc/plain/ecar/benign/20-23Sep19/AIA-201-225/AIA-201-225.ecar-2019-12-07T19-16-05.788.json Finished! 


76648515it [11:22, 112239.98it/s]


len(edge_list)=5027745
/home/shahidul2k9/data/optc/plain/ecar/benign/20-23Sep19/AIA-201-225/AIA-201-225.ecar-2019-12-07T22-06-33.589.json Finished! 


74706481it [11:24, 109147.09it/s]


len(edge_list)=5727218
/home/shahidul2k9/data/optc/plain/ecar/benign/20-23Sep19/AIA-201-225/AIA-201-225.ecar-last.json Finished! 


76495083it [11:51, 107515.72it/s]


len(edge_list)=5879664
/home/shahidul2k9/data/optc/plain/ecar/benign/20-23Sep19/AIA-201-225/AIA-201-225.ecar-2019-12-08T01-57-30.012.json Finished! 


76469610it [11:33, 110258.74it/s]


len(edge_list)=5944960
/home/shahidul2k9/data/optc/plain/ecar/benign/20-23Sep19/AIA-201-225/AIA-201-225.ecar-2019-12-08T05-46-21.658.json Finished! 


32326513it [04:42, 114486.29it/s]


len(edge_list)=2449767
/home/shahidul2k9/data/optc/plain/ecar/benign/18-19Sep19/AIA-201-225/AIA-201-225.ecar-2019-12-07T10-37-17.942.json Finished! 


69051137it [10:31, 109378.71it/s]


len(edge_list)=4978166
/home/shahidul2k9/data/optc/plain/ecar/benign/18-19Sep19/AIA-201-225/AIA-201-225.ecar-last.json Finished! 


75743706it [11:38, 108369.47it/s]


len(edge_list)=5575885
/home/shahidul2k9/data/optc/plain/ecar/benign/17-18Sep19/AIA-201-225/AIA-201-225.ecar-2019-12-07T01-57-49.366.json Finished! 


43174191it [06:58, 103262.55it/s]


len(edge_list)=3393220
/home/shahidul2k9/data/optc/plain/ecar/benign/17-18Sep19/AIA-201-225/AIA-201-225.ecar-last.json Finished! 


75798918it [11:53, 106214.84it/s]


len(edge_list)=6295186
/home/shahidul2k9/data/optc/plain/ecar/benign/17-18Sep19/AIA-201-225/AIA-201-225.ecar-2019-12-07T06-00-00.251.json Finished! 


# Store the evaluation data to database

In [None]:
from os import walk
 # Evaluation dataset subfolder listing
# folder path
dir_path = '/home/shahidul2k9/data/optc/plain/ecar/evaluation/'

res = []
for (dir_path, dir_names, file_names) in walk(dir_path):
    if dir_path[-1]!='/':
        dir_path+='/'
    for f in file_names:
        temp_file_path=dir_path+f
#         print(f"{temp_file_path=}")
        if ("201-225" in temp_file_path or "401-425" in temp_file_path or "651-675" in temp_file_path or "501-525" in temp_file_path or "51-75" in temp_file_path):
            res.append(temp_file_path)

In [None]:
# Decompress evaluation log files
for r in tqdm(res):
    if  ".gz" in r:
        os.system(f"gzip -d {r}")
        print(f" {r} Finished！")

100%|██████████| 35/35 [00:00<00:00, 304565.64it/s]


In [None]:
# White listed hostnames for evaluation
def is_selected_hosts(line):
    hosts=[
        'SysClient0201',
        'SysClient0402',
        'SysClient0660',
        'SysClient0501',
        'SysClient0051',        
        'SysClient0207',
    ]
    flag=False
    for h in hosts:
        if h in line:
            flag=True
            break
    return flag

In [None]:
# Iterating through uncompressed evaluation dataset files, extracting log and finally inserting into DB
for file_path in res:
    
    edge_list=[]

    with open(file_path) as f:
        for line in tqdm(f):
            line=line.replace('\\\\','/')
            temp_dic=json.loads(line.strip())
            hostname=temp_dic['hostname'].split('.')[0]
            if temp_dic['object'] in node_type_used and is_selected_hosts(hostname):
                edge_list.append(process_raw_dic(temp_dic))
    
        print(f'{len(edge_list)=}')
        data_list=[]
        for e in edge_list:
            try:
                data_list.append([
                    e['src_id'],
                    e['src_type'],
                    e['edge_type'],
                    e['dst_id'],
                    e['dst_type'],
                    e['hostname'],
                    e['timestamp'],
                    "evaluation",
                ])
            except:
                pass

        sql = '''insert into event_table
                             values %s
                '''
        ex.execute_values(cur,sql, data_list,page_size=10000)
        connect.commit()
        
        print(f"{file_path} Finished! ")
        # Clear the tmp variables to release the memory.
        del edge_list
        del data_list

32570231it [04:12, 129201.09it/s]


len(edge_list)=1194622
/home/shahidul2k9/data/optc/plain/ecar/evaluation/23Sep19-red/AIA-501-525/AIA-501-525.ecar-last.json Finished! 


32614330it [04:44, 114534.90it/s]


len(edge_list)=1131200
/home/shahidul2k9/data/optc/plain/ecar/evaluation/23Sep19-red/AIA-651-675/AIA-651-675.ecar-last.json Finished! 


23066470it [02:57, 129809.09it/s]


len(edge_list)=947751
/home/shahidul2k9/data/optc/plain/ecar/evaluation/23Sep19-red/AIA-401-425/AIA-401-425.ecar-2019-12-08T01-29-39.403.json Finished! 


9855840it [01:16, 129404.07it/s]


len(edge_list)=391973
/home/shahidul2k9/data/optc/plain/ecar/evaluation/23Sep19-red/AIA-401-425/AIA-401-425.ecar-last.json Finished! 


32602194it [04:35, 118389.64it/s]


len(edge_list)=1179498
/home/shahidul2k9/data/optc/plain/ecar/evaluation/23Sep19-red/AIA-51-75/AIA-51-75.ecar-last.json Finished! 


34146068it [04:50, 117508.21it/s]


len(edge_list)=2151310
/home/shahidul2k9/data/optc/plain/ecar/evaluation/23Sep19-red/AIA-201-225/AIA-201-225.ecar-last.json Finished! 


1759566it [00:15, 109979.43it/s]


len(edge_list)=139907
/home/shahidul2k9/data/optc/plain/ecar/evaluation/23Sep19-red/AIA-201-225/AIA-201-225.ecar-2019-12-08T11-05-10.046.json Finished! 


30125538it [04:28, 112385.02it/s]


len(edge_list)=1274143
/home/shahidul2k9/data/optc/plain/ecar/evaluation/24Sep19/AIA-501-525/AIA-501-525.ecar-2019-11-17T04-01-58.625.json Finished! 


65758362it [09:04, 120713.36it/s]


len(edge_list)=2554194
/home/shahidul2k9/data/optc/plain/ecar/evaluation/24Sep19/AIA-501-525/AIA-501-525.ecar-last.json Finished! 


18306997it [02:21, 128999.66it/s]


len(edge_list)=568693
/home/shahidul2k9/data/optc/plain/ecar/evaluation/24Sep19/AIA-651-675/AIA-651-675.ecar-2019-11-17T03-25-23.290.json Finished! 


76274003it [10:13, 124375.17it/s]


len(edge_list)=2829980
/home/shahidul2k9/data/optc/plain/ecar/evaluation/24Sep19/AIA-651-675/AIA-651-675.ecar-last.json Finished! 


73190980it [10:05, 120953.63it/s]


len(edge_list)=2825428
/home/shahidul2k9/data/optc/plain/ecar/evaluation/24Sep19/AIA-401-425/AIA-401-425.ecar-2019-12-08T07-35-11.579.json Finished! 


23196562it [03:00, 128314.47it/s]


len(edge_list)=878620
/home/shahidul2k9/data/optc/plain/ecar/evaluation/24Sep19/AIA-401-425/AIA-401-425.ecar-last.json Finished! 


76327011it [10:17, 123697.85it/s]


len(edge_list)=2839067
/home/shahidul2k9/data/optc/plain/ecar/evaluation/24Sep19/AIA-51-75/AIA-51-75.ecar-2019-12-08T15-24-26.681.json Finished! 


7422309it [01:05, 113218.80it/s]


len(edge_list)=275431
/home/shahidul2k9/data/optc/plain/ecar/evaluation/24Sep19/AIA-51-75/AIA-51-75.ecar-2019-12-08T12-56-31.374.json Finished! 


12133760it [01:56, 104511.24it/s]


len(edge_list)=457651
/home/shahidul2k9/data/optc/plain/ecar/evaluation/24Sep19/AIA-51-75/AIA-51-75.ecar-last.json Finished! 


49333875it [07:05, 115981.27it/s]


len(edge_list)=3544607
/home/shahidul2k9/data/optc/plain/ecar/evaluation/24Sep19/AIA-201-225/AIA-201-225.ecar-2019-12-08T17-41-18.327.json Finished! 


46514809it [07:15, 106872.47it/s]


len(edge_list)=3497340
/home/shahidul2k9/data/optc/plain/ecar/evaluation/24Sep19/AIA-201-225/AIA-201-225.ecar-last.json Finished! 


13921814it [01:48, 128881.99it/s]


len(edge_list)=586663
/home/shahidul2k9/data/optc/plain/ecar/evaluation/25Sept/AIA-501-525/AIA-501-525.ecar-last.json Finished! 


10781277it [01:29, 120326.08it/s]


len(edge_list)=493144
/home/shahidul2k9/data/optc/plain/ecar/evaluation/25Sept/AIA-501-525/AIA-501-525.ecar-2019-11-17T15-04-02.073.json Finished! 


333939it [00:07, 47156.81it/s]


len(edge_list)=11430
/home/shahidul2k9/data/optc/plain/ecar/evaluation/25Sept/AIA-651-675/AIA-651-675.ecar-2019-11-17T14-50-25.754.json Finished! 


24895771it [03:24, 121958.03it/s]


len(edge_list)=1035074
/home/shahidul2k9/data/optc/plain/ecar/evaluation/25Sept/AIA-651-675/AIA-651-675.ecar-last.json Finished! 


27461034it [03:33, 128625.10it/s]


len(edge_list)=1049439
/home/shahidul2k9/data/optc/plain/ecar/evaluation/25Sept/AIA-401-425/AIA-401-425.ecar-last.json Finished! 


25186287it [03:41, 113871.11it/s]


len(edge_list)=1053016
/home/shahidul2k9/data/optc/plain/ecar/evaluation/25Sept/AIA-51-75/AIA-51-75.ecar-last.json Finished! 


27031942it [03:51, 116579.47it/s]


len(edge_list)=1839580
/home/shahidul2k9/data/optc/plain/ecar/evaluation/25Sept/AIA-201-225/AIA-201-225.ecar-last.json Finished! 


46406787it [06:14, 123758.54it/s]


len(edge_list)=1768209
/home/shahidul2k9/data/optc/plain/ecar/evaluation/23Sep-night/AIA-501-525/AIA-501-525.ecar-last.json Finished! 


23561343it [03:06, 126386.76it/s]


len(edge_list)=897205
/home/shahidul2k9/data/optc/plain/ecar/evaluation/23Sep-night/AIA-501-525/AIA-501-525.ecar-2019-11-16T23-22-29.234.json Finished! 


58296574it [08:11, 118513.83it/s]


len(edge_list)=2195435
/home/shahidul2k9/data/optc/plain/ecar/evaluation/23Sep-night/AIA-651-675/AIA-651-675.ecar-last.json Finished! 


11713601it [01:32, 125974.33it/s]


len(edge_list)=441649
/home/shahidul2k9/data/optc/plain/ecar/evaluation/23Sep-night/AIA-651-675/AIA-651-675.ecar-2019-11-16T23-07-40.716.json Finished! 


66585106it [09:06, 121837.70it/s]


len(edge_list)=2522019
/home/shahidul2k9/data/optc/plain/ecar/evaluation/23Sep-Night/AIA-401-425/AIA-401-425.ecar-2019-12-08T04-06-31.326.json Finished! 


3341987it [00:25, 128664.66it/s]


len(edge_list)=125517
/home/shahidul2k9/data/optc/plain/ecar/evaluation/23Sep-Night/AIA-401-425/AIA-401-425.ecar-last.json Finished! 


69126143it [09:30, 121237.36it/s]


len(edge_list)=2654927
/home/shahidul2k9/data/optc/plain/ecar/evaluation/23Sep-Night/AIA-51-75/AIA-51-75.ecar-last.json Finished! 


943962it [00:06, 138007.23it/s]


len(edge_list)=14520
/home/shahidul2k9/data/optc/plain/ecar/evaluation/23Sep-Night/AIA-51-75/AIA-51-75.ecar-2019-12-08T10-19-52.584.json Finished! 


27304413it [03:54, 116648.46it/s]


len(edge_list)=2064339
/home/shahidul2k9/data/optc/plain/ecar/evaluation/23Sep-Night/AIA-201-225/AIA-201-225.ecar-last.json Finished! 


42593860it [06:27, 109872.98it/s]


len(edge_list)=3222906
/home/shahidul2k9/data/optc/plain/ecar/evaluation/23Sep-Night/AIA-201-225/AIA-201-225.ecar-2019-12-08T14-19-51.427.json Finished! 


# Store the node data into database

In [None]:
# Insert temporarily stored nodes and associated metadata into DB
data_list=[]
for n in node_uuid2path:
    try:
        data_list.append([
            n,
             node_uuid2path[n]
        ])
    except:
        pass
    

sql = '''insert into nodeid2msg
                     values %s
        '''
ex.execute_values(cur,sql, data_list,page_size=10000)
connect.commit()

In [None]:
# Log total number of nodes ~19 millions
len(node_uuid2path)

18965643

# Load node data from database

In [None]:
# Construct the map between nodeid and msg
sql="select * from nodeid2msg;"
cur.execute(sql)
rows = cur.fetchall()

node_uuid2path={}  # nodeid => msg      node hash => nodeid
for i in tqdm(rows):
    # Map node UUID to metadata
    node_uuid2path[i[0]]=i[1]

100%|██████████| 18965643/18965643 [00:11<00:00, 1622800.85it/s]


# Generate the benign datasets

## h402  22

In [None]:
# Read benign dataset generated by host SysClient0402 on 22nd September 2019 and create temporal graph
for day in tqdm(range(22,23)):
    start_timestamp=datetime_to_timestamp_US('2019-09-'+str(day)+'T00:00:00')
    end_timestamp=datetime_to_timestamp_US('2019-09-'+str(day+1)+'T00:00:00')
    hostname='SysClient0402'
    datalabel='benign'

    # Create SQL query command
    sql=f"""
    select * from event_table
    where
          timestamp>{start_timestamp} and timestamp<{end_timestamp}
          and hostname='{hostname}' and data_label='{datalabel}' ORDER BY timestamp;
    """

    # Execute SQL query
    cur.execute(sql)
    # Fetch Edges
    events = cur.fetchall()
    print(f"{len(events)=}")
    
    
    # Generate local temporal graph node indexes mapping
    node_set=set()
    node_uuid2index={}
    temp_index=0
    for e in events:
        if e[3] not in node_uuid2path or e[0]  not in node_uuid2path:
            continue

        if e[0] in node_uuid2index:
            pass
        else:
            node_uuid2index[e[0]]=temp_index
            node_uuid2index[temp_index]=node_uuid2path[e[0]]
            temp_index+=1

        if e[3] in node_uuid2index:
            pass
        else:
            node_uuid2index[e[3]]=temp_index
            node_uuid2index[temp_index]=node_uuid2path[e[3]]
            temp_index+=1 

    torch.save(node_uuid2index,f'node_uuid2index_9_{day}_host={hostname}_datalabel={datalabel}')
       

    dataset = TemporalData()
    src = []
    dst = []
    msg = []
    t = []
    
    # Create temporal graph with nodes, edges, messages and times
    for e in (events):
        if e[3] in node_uuid2index and e[0] in node_uuid2index:
            # If the image path of the node is not recorded, then skip this edge
            src.append(node_uuid2index[e[0]])
            dst.append(node_uuid2index[e[3]])
        #     msg.append(torch.cat([torch.from_numpy(node2higvec_bn[i[0]]), rel2vec[i[2]], torch.from_numpy(node2higvec_bn[i[1]])] ))

            msg.append(torch.cat([str2tensor(e[1],node_uuid2path[e[0]]), 
                                  edge2vec[e[2]], 
                                  str2tensor(e[4],node_uuid2path[e[3]])
                                 ]))
            t.append(int(e[6]))

    dataset.src = torch.tensor(src)
    dataset.dst = torch.tensor(dst)
    dataset.t = torch.tensor(t)
    dataset.msg = torch.vstack(msg)
    dataset.src = dataset.src.to(torch.long)
    dataset.dst = dataset.dst.to(torch.long)
    dataset.msg = dataset.msg.to(torch.float)
    dataset.t = dataset.t.to(torch.long)
    # Store temporal graph in disk
    torch.save(dataset, f"/home/shahidul2k9/data/optc/out/evaluation/9_{day}_host={hostname}_datalabel={datalabel}.TemporalData")  
    

  0%|          | 0/1 [00:00<?, ?it/s]

len(events)=4410529


100%|██████████| 1/1 [25:57<00:00, 1557.68s/it]


## h660 22

In [None]:
# Read benign dataset generated by host SysClient0660 on 22nd September 2019 and create temporal graph
for day in tqdm(range(22,23)):
    start_timestamp=datetime_to_timestamp_US('2019-09-'+str(day)+'T00:00:00')
    end_timestamp=datetime_to_timestamp_US('2019-09-'+str(day+1)+'T00:00:00')
    hostname='SysClient0660'
    datalabel='benign'
    # Create SQL query command
    sql=f"""
    select * from event_table
    where
          timestamp>{start_timestamp} and timestamp<{end_timestamp}
          and hostname='{hostname}' and data_label='{datalabel}' ORDER BY timestamp;
    """
    
    # Execute SQL query
    cur.execute(sql)
    events = cur.fetchall()
    print(f"{len(events)=}")
    
    
    # Generate local temporal graph node indexes mapping
    node_set=set()
    node_uuid2index={}
    temp_index=0
    for e in events:
        if e[3] not in node_uuid2path or e[0]  not in node_uuid2path:
            continue

        if e[0] in node_uuid2index:
            pass
        else:
            node_uuid2index[e[0]]=temp_index
            node_uuid2index[temp_index]=node_uuid2path[e[0]]
            temp_index+=1

        if e[3] in node_uuid2index:
            pass
        else:
            node_uuid2index[e[3]]=temp_index
            node_uuid2index[temp_index]=node_uuid2path[e[3]]
            temp_index+=1 

    torch.save(node_uuid2index,f'node_uuid2index_9_{day}_host={hostname}_datalabel={datalabel}')
       

    dataset = TemporalData()
    src = []
    dst = []
    msg = []
    t = []
    # Create temporal graph with nodes, edges, messages and times
    for e in (events):
        if e[3] in node_uuid2index and e[0] in node_uuid2index:
            # If the image path of the node is not recorded, then skip this edge
            src.append(node_uuid2index[e[0]])
            dst.append(node_uuid2index[e[3]])
        #     msg.append(torch.cat([torch.from_numpy(node2higvec_bn[i[0]]), rel2vec[i[2]], torch.from_numpy(node2higvec_bn[i[1]])] ))

            msg.append(torch.cat([str2tensor(e[1],node_uuid2path[e[0]]), 
                                  edge2vec[e[2]], 
                                  str2tensor(e[4],node_uuid2path[e[3]])
                                 ]))
            t.append(int(e[6]))

    dataset.src = torch.tensor(src)
    dataset.dst = torch.tensor(dst)
    dataset.t = torch.tensor(t)
    dataset.msg = torch.vstack(msg)
    dataset.src = dataset.src.to(torch.long)
    dataset.dst = dataset.dst.to(torch.long)
    dataset.msg = dataset.msg.to(torch.float)
    dataset.t = dataset.t.to(torch.long)
    # Store temporal graph in disk
    torch.save(dataset, f"/home/shahidul2k9/data/optc/out/evaluation/9_{day}_host={hostname}_datalabel={datalabel}.TemporalData")  
    

  0%|          | 0/1 [00:00<?, ?it/s]

len(events)=3889699


100%|██████████| 1/1 [22:04<00:00, 1324.87s/it]


## h501 21

In [None]:
# Read benign dataset generated by host SysClient0501 on 21st September 2019 and create temporal graph
for day in tqdm(range(21,22)):
    start_timestamp=datetime_to_timestamp_US('2019-09-'+str(day)+'T00:00:00')
    end_timestamp=datetime_to_timestamp_US('2019-09-'+str(day+1)+'T00:00:00')
    hostname='SysClient0501'
    datalabel='benign'
    # Create SQL query command
    sql=f"""
    select * from event_table
    where
          timestamp>{start_timestamp} and timestamp<{end_timestamp}
          and hostname='{hostname}' and data_label='{datalabel}' ORDER BY timestamp;
    """
    # Execute SQL query
    cur.execute(sql)
    events = cur.fetchall()
    print(f"{len(events)=}")
    
    
    
    node_set=set()
    node_uuid2index={}
    temp_index=0
    for e in events:
        if e[3] not in node_uuid2path or e[0]  not in node_uuid2path:
            continue

        if e[0] in node_uuid2index:
            pass
        else:
            node_uuid2index[e[0]]=temp_index
            node_uuid2index[temp_index]=node_uuid2path[e[0]]
            temp_index+=1

        if e[3] in node_uuid2index:
            pass
        else:
            node_uuid2index[e[3]]=temp_index
            node_uuid2index[temp_index]=node_uuid2path[e[3]]
            temp_index+=1 

    torch.save(node_uuid2index,f'node_uuid2index_9_{day}_host={hostname}_datalabel={datalabel}')
       

    dataset = TemporalData()
    src = []
    dst = []
    msg = []
    t = []
    # Create temporal graph with nodes, edges, messages and times
    for e in (events):
        if e[3] in node_uuid2index and e[0] in node_uuid2index:
            # If the image path of the node is not recorded, then skip this edge
            src.append(node_uuid2index[e[0]])
            dst.append(node_uuid2index[e[3]])
        #     msg.append(torch.cat([torch.from_numpy(node2higvec_bn[i[0]]), rel2vec[i[2]], torch.from_numpy(node2higvec_bn[i[1]])] ))

            msg.append(torch.cat([str2tensor(e[1],node_uuid2path[e[0]]), 
                                  edge2vec[e[2]], 
                                  str2tensor(e[4],node_uuid2path[e[3]])
                                 ]))
            t.append(int(e[6]))

    dataset.src = torch.tensor(src)
    dataset.dst = torch.tensor(dst)
    dataset.t = torch.tensor(t)
    dataset.msg = torch.vstack(msg)
    dataset.src = dataset.src.to(torch.long)
    dataset.dst = dataset.dst.to(torch.long)
    dataset.msg = dataset.msg.to(torch.float)
    dataset.t = dataset.t.to(torch.long)
    # Store temporal graph in disk
    torch.save(dataset, f"/home/shahidul2k9/data/optc/out/evaluation/9_{day}_host={hostname}_datalabel={datalabel}.TemporalData")  
    

  0%|          | 0/1 [00:00<?, ?it/s]

len(events)=4337416


100%|██████████| 1/1 [23:52<00:00, 1432.56s/it]


## h501 22

In [None]:
# Read benign dataset generated by host SysClient0501 on 22nd September 2019 and create temporal graph
for day in tqdm(range(22,23)):
    start_timestamp=datetime_to_timestamp_US('2019-09-'+str(day)+'T00:00:00')
    end_timestamp=datetime_to_timestamp_US('2019-09-'+str(day+1)+'T00:00:00')
    hostname='SysClient0501'
    datalabel='benign'
    # Create SQL query command
    sql=f"""
    select * from event_table
    where
          timestamp>{start_timestamp} and timestamp<{end_timestamp}
          and hostname='{hostname}' and data_label='{datalabel}' ORDER BY timestamp;
    """
    # Execute SQL query
    cur.execute(sql)
    events = cur.fetchall()
    print(f"{len(events)=}")
    
    
    
    node_set=set()
    node_uuid2index={}
    temp_index=0
    # Generate local temporal graph node indexes mapping
    for e in events:
        if e[3] not in node_uuid2path or e[0]  not in node_uuid2path:
            continue

        if e[0] in node_uuid2index:
            pass
        else:
            node_uuid2index[e[0]]=temp_index
            node_uuid2index[temp_index]=node_uuid2path[e[0]]
            temp_index+=1

        if e[3] in node_uuid2index:
            pass
        else:
            node_uuid2index[e[3]]=temp_index
            node_uuid2index[temp_index]=node_uuid2path[e[3]]
            temp_index+=1 

    torch.save(node_uuid2index,f'node_uuid2index_9_{day}_host={hostname}_datalabel={datalabel}')

    dataset = TemporalData()
    src = []
    dst = []
    msg = []
    t = []
    # Create temporal graph with nodes, edges, messages and times
    for e in (events):
        if e[3] in node_uuid2index and e[0] in node_uuid2index:
            # If the image path of the node is not recorded, then skip this edge
            src.append(node_uuid2index[e[0]])
            dst.append(node_uuid2index[e[3]])
        #     msg.append(torch.cat([torch.from_numpy(node2higvec_bn[i[0]]), rel2vec[i[2]], torch.from_numpy(node2higvec_bn[i[1]])] ))

            msg.append(torch.cat([str2tensor(e[1],node_uuid2path[e[0]]), 
                                  edge2vec[e[2]], 
                                  str2tensor(e[4],node_uuid2path[e[3]])
                                 ]))
            t.append(int(e[6]))

    dataset.src = torch.tensor(src)
    dataset.dst = torch.tensor(dst)
    dataset.t = torch.tensor(t)
    dataset.msg = torch.vstack(msg)
    dataset.src = dataset.src.to(torch.long)
    dataset.dst = dataset.dst.to(torch.long)
    dataset.msg = dataset.msg.to(torch.float)
    dataset.t = dataset.t.to(torch.long)
    # Store temporal graph in disk
    torch.save(dataset, f"/home/shahidul2k9/data/optc/out/evaluation/9_{day}_host={hostname}_datalabel={datalabel}.TemporalData")  
    

  0%|          | 0/1 [00:00<?, ?it/s]

len(events)=4263136


100%|██████████| 1/1 [24:09<00:00, 1449.94s/it]


## h051 22

In [None]:
# Read benign dataset generated by host SysClient0051 on 22nd September 2019 and create temporal graph
for day in tqdm(range(22,23)):
    start_timestamp=datetime_to_timestamp_US('2019-09-'+str(day)+'T00:00:00')
    end_timestamp=datetime_to_timestamp_US('2019-09-'+str(day+1)+'T00:00:00')
    hostname='SysClient0051'
    datalabel='benign'
    # Create SQL query command
    sql=f"""
    select * from event_table
    where
          timestamp>{start_timestamp} and timestamp<{end_timestamp}
          and hostname='{hostname}' and data_label='{datalabel}' ORDER BY timestamp;
    """
    # Execute SQL query
    cur.execute(sql)
    events = cur.fetchall()
    print(f"{len(events)=}")
    
    
    
    node_set=set()
    node_uuid2index={}
    temp_index=0
    # Generate local temporal graph node indexes mapping
    for e in events:
        if e[3] not in node_uuid2path or e[0]  not in node_uuid2path:
            continue

        if e[0] in node_uuid2index:
            pass
        else:
            node_uuid2index[e[0]]=temp_index
            node_uuid2index[temp_index]=node_uuid2path[e[0]]
            temp_index+=1

        if e[3] in node_uuid2index:
            pass
        else:
            node_uuid2index[e[3]]=temp_index
            node_uuid2index[temp_index]=node_uuid2path[e[3]]
            temp_index+=1 

    torch.save(node_uuid2index,f'node_uuid2index_9_{day}_host={hostname}_datalabel={datalabel}')
       

    dataset = TemporalData()
    src = []
    dst = []
    msg = []
    t = []
    # Create temporal graph with nodes, edges, messages and times
    for e in (events):
        if e[3] in node_uuid2index and e[0] in node_uuid2index:
            # If the image path of the node is not recorded, then skip this edge
            src.append(node_uuid2index[e[0]])
            dst.append(node_uuid2index[e[3]])
        #     msg.append(torch.cat([torch.from_numpy(node2higvec_bn[i[0]]), rel2vec[i[2]], torch.from_numpy(node2higvec_bn[i[1]])] ))

            msg.append(torch.cat([str2tensor(e[1],node_uuid2path[e[0]]), 
                                  edge2vec[e[2]], 
                                  str2tensor(e[4],node_uuid2path[e[3]])
                                 ]))
            t.append(int(e[6]))

    dataset.src = torch.tensor(src)
    dataset.dst = torch.tensor(dst)
    dataset.t = torch.tensor(t)
    dataset.msg = torch.vstack(msg)
    dataset.src = dataset.src.to(torch.long)
    dataset.dst = dataset.dst.to(torch.long)
    dataset.msg = dataset.msg.to(torch.float)
    dataset.t = dataset.t.to(torch.long)
    # Store temporal graph in disk
    torch.save(dataset, f"/home/shahidul2k9/data/optc/out/evaluation/9_{day}_host={hostname}_datalabel={datalabel}.TemporalData")  
    

  0%|          | 0/1 [00:00<?, ?it/s]

len(events)=4074941


100%|██████████| 1/1 [22:32<00:00, 1352.34s/it]


## h209 22

In [None]:
# Read benign dataset generated by host SysClient0209 on 22nd September 2019 and create temporal graph
for day in tqdm(range(22,23)):
    start_timestamp=datetime_to_timestamp_US('2019-09-'+str(day)+'T00:00:00')
    end_timestamp=datetime_to_timestamp_US('2019-09-'+str(day+1)+'T00:00:00')
    hostname='SysClient0209'
    datalabel='benign'
    # Create SQL query command
    sql=f"""
    select * from event_table
    where
          timestamp>{start_timestamp} and timestamp<{end_timestamp}
          and hostname='{hostname}' and data_label='{datalabel}' ORDER BY timestamp;
    """
    cur.execute(sql)
    events = cur.fetchall()
    print(f"{len(events)=}")
    
    # Generate local temporal graph node indexes mapping
    node_set=set()
    node_uuid2index={}
    temp_index=0
    for e in events:
        if e[3] not in node_uuid2path or e[0]  not in node_uuid2path:
            continue

        if e[0] in node_uuid2index:
            pass
        else:
            node_uuid2index[e[0]]=temp_index
            node_uuid2index[temp_index]=node_uuid2path[e[0]]
            temp_index+=1

        if e[3] in node_uuid2index:
            pass
        else:
            node_uuid2index[e[3]]=temp_index
            node_uuid2index[temp_index]=node_uuid2path[e[3]]
            temp_index+=1 

    torch.save(node_uuid2index,f'node_uuid2index_9_{day}_host={hostname}_datalabel={datalabel}')
       

    dataset = TemporalData()
    src = []
    dst = []
    msg = []
    t = []
    # Create temporal graph with nodes, edges, messages and times
    for e in (events):
        if e[3] in node_uuid2index and e[0] in node_uuid2index:
            # If the image path of the node is not recorded, then skip this edge
            src.append(node_uuid2index[e[0]])
            dst.append(node_uuid2index[e[3]])
        #     msg.append(torch.cat([torch.from_numpy(node2higvec_bn[i[0]]), rel2vec[i[2]], torch.from_numpy(node2higvec_bn[i[1]])] ))

            msg.append(torch.cat([str2tensor(e[1],node_uuid2path[e[0]]), 
                                  edge2vec[e[2]], 
                                  str2tensor(e[4],node_uuid2path[e[3]])
                                 ]))
            t.append(int(e[6]))

    dataset.src = torch.tensor(src)
    dataset.dst = torch.tensor(dst)
    dataset.t = torch.tensor(t)
    dataset.msg = torch.vstack(msg)
    dataset.src = dataset.src.to(torch.long)
    dataset.dst = dataset.dst.to(torch.long)
    dataset.msg = dataset.msg.to(torch.float)
    dataset.t = dataset.t.to(torch.long)
    # Store temporal graph in disk
    torch.save(dataset, f"/home/shahidul2k9/data/optc/out/evaluation/9_{day}_host={hostname}_datalabel={datalabel}.TemporalData")  
    

  0%|          | 0/1 [00:00<?, ?it/s]

len(events)=3853947


100%|██████████| 1/1 [21:45<00:00, 1305.96s/it]


# Generate the validation set

## h209 23

In [None]:
# Read benign dataset generated by host SysClient0209 on 23rd September 2019 and create temporal graph
for day in tqdm(range(23,24)):
    start_timestamp=datetime_to_timestamp_US('2019-09-'+str(day)+'T00:00:00')
    end_timestamp=datetime_to_timestamp_US('2019-09-'+str(day+1)+'T00:00:00')
    hostname='SysClient0209'
    datalabel='benign'
    # Create SQL query command
    sql=f"""
    select * from event_table
    where
          timestamp>{start_timestamp} and timestamp<{end_timestamp}
          and hostname='{hostname}' and data_label='{datalabel}' ORDER BY timestamp;
    """
    # Execute SQL query
    cur.execute(sql)
    events = cur.fetchall()
    print(f"{len(events)=}")
    
    
    
    node_set=set()
    node_uuid2index={}
    temp_index=0
    # Generate local temporal graph node indexes mapping
    for e in events:
        if e[3] not in node_uuid2path or e[0]  not in node_uuid2path:
            continue

        if e[0] in node_uuid2index:
            pass
        else:
            node_uuid2index[e[0]]=temp_index
            node_uuid2index[temp_index]=node_uuid2path[e[0]]
            temp_index+=1

        if e[3] in node_uuid2index:
            pass
        else:
            node_uuid2index[e[3]]=temp_index
            node_uuid2index[temp_index]=node_uuid2path[e[3]]
            temp_index+=1 

    torch.save(node_uuid2index,f'node_uuid2index_9_{day}_host={hostname}_datalabel={datalabel}')
       

    dataset = TemporalData()
    src = []
    dst = []
    msg = []
    t = []
    # Create temporal graph with nodes, edges, messages and times
    for e in (events):
        if e[3] in node_uuid2index and e[0] in node_uuid2index:
            # If the image path of the node is not recorded, then skip this edge
            src.append(node_uuid2index[e[0]])
            dst.append(node_uuid2index[e[3]])
        #     msg.append(torch.cat([torch.from_numpy(node2higvec_bn[i[0]]), rel2vec[i[2]], torch.from_numpy(node2higvec_bn[i[1]])] ))

            msg.append(torch.cat([str2tensor(e[1],node_uuid2path[e[0]]), 
                                  edge2vec[e[2]], 
                                  str2tensor(e[4],node_uuid2path[e[3]])
                                 ]))
            t.append(int(e[6]))

    dataset.src = torch.tensor(src)
    dataset.dst = torch.tensor(dst)
    dataset.t = torch.tensor(t)
    dataset.msg = torch.vstack(msg)
    dataset.src = dataset.src.to(torch.long)
    dataset.dst = dataset.dst.to(torch.long)
    dataset.msg = dataset.msg.to(torch.float)
    dataset.t = dataset.t.to(torch.long)
    # Store temporal graph in disk
    torch.save(dataset, f"/home/shahidul2k9/data/optc/out/evaluation/9_{day}_host={hostname}_datalabel={datalabel}.TemporalData")  
    

  0%|          | 0/1 [00:00<?, ?it/s]

len(events)=1462775


100%|██████████| 1/1 [09:01<00:00, 541.35s/it]


# Generate the evaluation set

## h201 23-25

In [61]:
# Read evaluation dataset generated by host SysClient0201 on 23rd-25th September 2019 and create temporal graph
for day in tqdm(range(23,26)):
    start_timestamp=datetime_to_timestamp_US('2019-09-'+str(day)+'T00:00:00')
    end_timestamp=datetime_to_timestamp_US('2019-09-'+str(day+1)+'T00:00:00')
    hostname='SysClient0201'
    datalabel='evaluation'
    # Create SQL query command
    sql=f"""
    select * from event_table
    where
          timestamp>{start_timestamp} and timestamp<{end_timestamp}
          and hostname='{hostname}' and data_label='{datalabel}' ORDER BY timestamp;
    """
    # Execute SQL query
    cur.execute(sql)
    events = cur.fetchall()
    print(f"{len(events)=}")
    
    
    
    node_set=set()
    node_uuid2index={}
    temp_index=0
    # Generate local temporal graph node indexes mapping
    for e in events:
        if e[3] not in node_uuid2path or e[0]  not in node_uuid2path:
            continue

        if e[0] in node_uuid2index:
            pass
        else:
            node_uuid2index[e[0]]=temp_index
            node_uuid2index[temp_index]=node_uuid2path[e[0]]
            temp_index+=1

        if e[3] in node_uuid2index:
            pass
        else:
            node_uuid2index[e[3]]=temp_index
            node_uuid2index[temp_index]=node_uuid2path[e[3]]
            temp_index+=1 

    torch.save(node_uuid2index,f'node_uuid2index_9_{day}_host={hostname}_datalabel={datalabel}')

    dataset = TemporalData()
    src = []
    dst = []
    msg = []
    t = []
    # Create temporal graph with nodes, edges, messages and times
    for e in (events):
        if e[3] in node_uuid2index and e[0] in node_uuid2index:
            # If the image path of the node is not recorded, then skip this edge
            src.append(node_uuid2index[e[0]])
            dst.append(node_uuid2index[e[3]])
        #     msg.append(torch.cat([torch.from_numpy(node2higvec_bn[i[0]]), rel2vec[i[2]], torch.from_numpy(node2higvec_bn[i[1]])] ))

            msg.append(torch.cat([str2tensor(e[1],node_uuid2path[e[0]]), 
                                  edge2vec[e[2]], 
                                  str2tensor(e[4],node_uuid2path[e[3]])
                                 ]))
            t.append(int(e[6]))

    dataset.src = torch.tensor(src)
    dataset.dst = torch.tensor(dst)
    dataset.t = torch.tensor(t)
    dataset.msg = torch.vstack(msg)
    dataset.src = dataset.src.to(torch.long)
    dataset.dst = dataset.dst.to(torch.long)
    dataset.msg = dataset.msg.to(torch.float)
    dataset.t = dataset.t.to(torch.long)
    # Store temporal graph in disk
    torch.save(dataset, f"/home/shahidul2k9/data/optc/out/evaluation/9_{day}_host={hostname}_datalabel={datalabel}.TemporalData")  
    

  0%|          | 0/3 [00:00<?, ?it/s]

len(events)=2354159


 33%|███▎      | 1/3 [13:12<26:25, 792.93s/it]

len(events)=3720913


 67%|██████▋   | 2/3 [34:13<17:48, 1068.30s/it]

len(events)=2195398


100%|██████████| 3/3 [46:37<00:00, 932.51s/it] 


## h402 23-25

In [62]:
# Read evaluation dataset generated by host SysClient0402 on 23rd-25th September 2019 and create temporal graph
for day in tqdm(range(23,26)):
    start_timestamp=datetime_to_timestamp_US('2019-09-'+str(day)+'T00:00:00')
    end_timestamp=datetime_to_timestamp_US('2019-09-'+str(day+1)+'T00:00:00')
    hostname='SysClient0402'
    datalabel='evaluation'
    # Create SQL query command
    sql=f"""
    select * from event_table
    where
          timestamp>{start_timestamp} and timestamp<{end_timestamp}
          and hostname='{hostname}' and data_label='{datalabel}' ORDER BY timestamp;
    """
    # Execute SQL query
    cur.execute(sql)
    events = cur.fetchall()
    print(f"{len(events)=}")
    
    
    
    node_set=set()
    node_uuid2index={}
    temp_index=0
    # Generate local temporal graph node indexes mapping
    for e in events:
        if e[3] not in node_uuid2path or e[0]  not in node_uuid2path:
            continue

        if e[0] in node_uuid2index:
            pass
        else:
            node_uuid2index[e[0]]=temp_index
            node_uuid2index[temp_index]=node_uuid2path[e[0]]
            temp_index+=1

        if e[3] in node_uuid2index:
            pass
        else:
            node_uuid2index[e[3]]=temp_index
            node_uuid2index[temp_index]=node_uuid2path[e[3]]
            temp_index+=1 

    torch.save(node_uuid2index,f'node_uuid2index_9_{day}_host={hostname}_datalabel={datalabel}')
       

    dataset = TemporalData()
    src = []
    dst = []
    msg = []
    t = []
    # Create temporal graph with nodes, edges, messages and times
    for e in (events):
        if e[3] in node_uuid2index and e[0] in node_uuid2index:
            # If the image path of the node is not recorded, then skip this edge
            src.append(node_uuid2index[e[0]])
            dst.append(node_uuid2index[e[3]])
        #     msg.append(torch.cat([torch.from_numpy(node2higvec_bn[i[0]]), rel2vec[i[2]], torch.from_numpy(node2higvec_bn[i[1]])] ))

            msg.append(torch.cat([str2tensor(e[1],node_uuid2path[e[0]]), 
                                  edge2vec[e[2]], 
                                  str2tensor(e[4],node_uuid2path[e[3]])
                                 ]))
            t.append(int(e[6]))

    dataset.src = torch.tensor(src)
    dataset.dst = torch.tensor(dst)
    dataset.t = torch.tensor(t)
    dataset.msg = torch.vstack(msg)
    dataset.src = dataset.src.to(torch.long)
    dataset.dst = dataset.dst.to(torch.long)
    dataset.msg = dataset.msg.to(torch.float)
    dataset.t = dataset.t.to(torch.long)
    # Store temporal graph in disk
    torch.save(dataset, f"/home/shahidul2k9/data/optc/out/evaluation/9_{day}_host={hostname}_datalabel={datalabel}.TemporalData")  
    

  0%|          | 0/3 [00:00<?, ?it/s]

len(events)=2513800


 33%|███▎      | 1/3 [14:04<28:08, 844.37s/it]

len(events)=3844461


 67%|██████▋   | 2/3 [35:05<18:09, 1089.36s/it]

len(events)=2317807


100%|██████████| 3/3 [48:59<00:00, 979.86s/it] 


## h660 23-25

In [None]:
# Read evaluation dataset generated by host SysClient0660 on 23rd-25th September 2019 and create temporal graph
for day in tqdm(range(23,26)):
    start_timestamp=datetime_to_timestamp_US('2019-09-'+str(day)+'T00:00:00')
    end_timestamp=datetime_to_timestamp_US('2019-09-'+str(day+1)+'T00:00:00')
    hostname='SysClient0660'
    datalabel='evaluation'
    # Create SQL query command
    sql=f"""
    select * from event_table
    where
          timestamp>{start_timestamp} and timestamp<{end_timestamp}
          and hostname='{hostname}' and data_label='{datalabel}' ORDER BY timestamp;
    """
    # Execute SQL query
    cur.execute(sql)
    events = cur.fetchall()
    print(f"{len(events)=}")
    
    
    
    node_set=set()
    node_uuid2index={}
    temp_index=0
    # Generate local temporal graph node indexes mapping
    for e in events:
        if e[3] not in node_uuid2path or e[0]  not in node_uuid2path:
            continue

        if e[0] in node_uuid2index:
            pass
        else:
            node_uuid2index[e[0]]=temp_index
            node_uuid2index[temp_index]=node_uuid2path[e[0]]
            temp_index+=1

        if e[3] in node_uuid2index:
            pass
        else:
            node_uuid2index[e[3]]=temp_index
            node_uuid2index[temp_index]=node_uuid2path[e[3]]
            temp_index+=1 

    torch.save(node_uuid2index,f'node_uuid2index_9_{day}_host={hostname}_datalabel={datalabel}')
       

    dataset = TemporalData()
    src = []
    dst = []
    msg = []
    t = []
    # Create temporal graph with nodes, edges, messages and times
    for e in (events):
        if e[3] in node_uuid2index and e[0] in node_uuid2index:
            # If the image path of the node is not recorded, then skip this edge
            src.append(node_uuid2index[e[0]])
            dst.append(node_uuid2index[e[3]])
        #     msg.append(torch.cat([torch.from_numpy(node2higvec_bn[i[0]]), rel2vec[i[2]], torch.from_numpy(node2higvec_bn[i[1]])] ))

            msg.append(torch.cat([str2tensor(e[1],node_uuid2path[e[0]]), 
                                  edge2vec[e[2]], 
                                  str2tensor(e[4],node_uuid2path[e[3]])
                                 ]))
            t.append(int(e[6]))

    dataset.src = torch.tensor(src)
    dataset.dst = torch.tensor(dst)
    dataset.t = torch.tensor(t)
    dataset.msg = torch.vstack(msg)
    dataset.src = dataset.src.to(torch.long)
    dataset.dst = dataset.dst.to(torch.long)
    dataset.msg = dataset.msg.to(torch.float)
    dataset.t = dataset.t.to(torch.long)
    # Store temporal graph in disk
    torch.save(dataset, f"/home/shahidul2k9/data/optc/out/evaluation/9_{day}_host={hostname}_datalabel={datalabel}.TemporalData")  
    

  0%|          | 0/3 [00:00<?, ?it/s]

len(events)=2317440


 33%|███▎      | 1/3 [13:32<27:04, 812.45s/it]

len(events)=3558940


 67%|██████▋   | 2/3 [34:00<17:37, 1057.14s/it]

len(events)=2314759


## h501 23-25

In [None]:
# Read evaluation dataset generated by host SysClient0501 on 23rd-25th September 2019 and create temporal graph
for day in tqdm(range(23,26)):
    start_timestamp=datetime_to_timestamp_US('2019-09-'+str(day)+'T00:00:00')
    end_timestamp=datetime_to_timestamp_US('2019-09-'+str(day+1)+'T00:00:00')
    hostname='SysClient0501'
    datalabel='evaluation'
    # Create SQL query command
    sql=f"""
    select * from event_table
    where
          timestamp>{start_timestamp} and timestamp<{end_timestamp}
          and hostname='{hostname}' and data_label='{datalabel}' ORDER BY timestamp;
    """
    # Execute SQL query
    cur.execute(sql)
    events = cur.fetchall()
    print(f"{len(events)=}")
    
    
    
    node_set=set()
    node_uuid2index={}
    temp_index=0
    for e in events:
        if e[3] not in node_uuid2path or e[0]  not in node_uuid2path:
            continue

        if e[0] in node_uuid2index:
            pass
        else:
            node_uuid2index[e[0]]=temp_index
            node_uuid2index[temp_index]=node_uuid2path[e[0]]
            temp_index+=1

        if e[3] in node_uuid2index:
            pass
        else:
            node_uuid2index[e[3]]=temp_index
            node_uuid2index[temp_index]=node_uuid2path[e[3]]
            temp_index+=1 

    torch.save(node_uuid2index,f'node_uuid2index_9_{day}_host={hostname}_datalabel={datalabel}')
       

    dataset = TemporalData()
    src = []
    dst = []
    msg = []
    t = []
    # Generate local temporal graph node indexes mapping
    for e in (events):
        if e[3] in node_uuid2index and e[0] in node_uuid2index:
            # If the image path of the node is not recorded, then skip this edge
            src.append(node_uuid2index[e[0]])
            dst.append(node_uuid2index[e[3]])
        #     msg.append(torch.cat([torch.from_numpy(node2higvec_bn[i[0]]), rel2vec[i[2]], torch.from_numpy(node2higvec_bn[i[1]])] ))

            msg.append(torch.cat([str2tensor(e[1],node_uuid2path[e[0]]), 
                                  edge2vec[e[2]], 
                                  str2tensor(e[4],node_uuid2path[e[3]])
                                 ]))
            t.append(int(e[6]))

    dataset.src = torch.tensor(src)
    dataset.dst = torch.tensor(dst)
    dataset.t = torch.tensor(t)
    dataset.msg = torch.vstack(msg)
    dataset.src = dataset.src.to(torch.long)
    dataset.dst = dataset.dst.to(torch.long)
    dataset.msg = dataset.msg.to(torch.float)
    dataset.t = dataset.t.to(torch.long)
    # Store temporal graph in disk
    torch.save(dataset, f"/home/shahidul2k9/data/optc/out/evaluation/9_{day}_host={hostname}_datalabel={datalabel}.TemporalData")  
    

## h051 23-25

In [None]:
# Read evaluation dataset generated by host SysClient0051 on 23rd-25th September 2019 and create temporal graph
for day in tqdm(range(23,26)):
    start_timestamp=datetime_to_timestamp_US('2019-09-'+str(day)+'T00:00:00')
    end_timestamp=datetime_to_timestamp_US('2019-09-'+str(day+1)+'T00:00:00')
    hostname='SysClient0051'
    datalabel='evaluation'
    # Create SQL query command
    sql=f"""
    select * from event_table
    where
          timestamp>{start_timestamp} and timestamp<{end_timestamp}
          and hostname='{hostname}' and data_label='{datalabel}' ORDER BY timestamp;
    """
    # Execute SQL query
    cur.execute(sql)
    events = cur.fetchall()
    print(f"{len(events)=}")
    
    
    
    node_set=set()
    node_uuid2index={}
    temp_index=0
    # Generate local temporal graph node indexes mapping
    for e in events:
        if e[3] not in node_uuid2path or e[0]  not in node_uuid2path:
            continue

        if e[0] in node_uuid2index:
            pass
        else:
            node_uuid2index[e[0]]=temp_index
            node_uuid2index[temp_index]=node_uuid2path[e[0]]
            temp_index+=1

        if e[3] in node_uuid2index:
            pass
        else:
            node_uuid2index[e[3]]=temp_index
            node_uuid2index[temp_index]=node_uuid2path[e[3]]
            temp_index+=1 

    torch.save(node_uuid2index,f'node_uuid2index_9_{day}_host={hostname}_datalabel={datalabel}')
       

    dataset = TemporalData()
    src = []
    dst = []
    msg = []
    t = []
    # Create temporal graph with nodes, edges, messages and times
    for e in (events):
        if e[3] in node_uuid2index and e[0] in node_uuid2index:
            # If the image path of the node is not recorded, then skip this edge
            src.append(node_uuid2index[e[0]])
            dst.append(node_uuid2index[e[3]])
        #     msg.append(torch.cat([torch.from_numpy(node2higvec_bn[i[0]]), rel2vec[i[2]], torch.from_numpy(node2higvec_bn[i[1]])] ))

            msg.append(torch.cat([str2tensor(e[1],node_uuid2path[e[0]]), 
                                  edge2vec[e[2]], 
                                  str2tensor(e[4],node_uuid2path[e[3]])
                                 ]))
            t.append(int(e[6]))

    dataset.src = torch.tensor(src)
    dataset.dst = torch.tensor(dst)
    dataset.t = torch.tensor(t)
    dataset.msg = torch.vstack(msg)
    dataset.src = dataset.src.to(torch.long)
    dataset.dst = dataset.dst.to(torch.long)
    dataset.msg = dataset.msg.to(torch.float)
    dataset.t = dataset.t.to(torch.long)
    # Store temporal graph in disk
    torch.save(dataset, f"/home/shahidul2k9/data/optc/out/evaluation/9_{day}_host={hostname}_datalabel={datalabel}.TemporalData")  
    

## h207 23-25

In [None]:
# Read evaluation dataset generated by host SysClient0207 on 23rd-25th September 2019 and create temporal graph
for day in tqdm(range(23,26)):
    start_timestamp=datetime_to_timestamp_US('2019-09-'+str(day)+'T00:00:00')
    end_timestamp=datetime_to_timestamp_US('2019-09-'+str(day+1)+'T00:00:00')
    hostname='SysClient0207'
    datalabel='evaluation'
    # Create SQL query command
    sql=f"""
    select * from event_table
    where
          timestamp>{start_timestamp} and timestamp<{end_timestamp}
          and hostname='{hostname}' and data_label='{datalabel}' ORDER BY timestamp;
    """
    # Execute SQL query
    cur.execute(sql)
    events = cur.fetchall()
    print(f"{len(events)=}")
    
    
    
    node_set=set()
    node_uuid2index={}
    temp_index=0
    # Generate local temporal graph node indexes mapping
    for e in events:
        if e[3] not in node_uuid2path or e[0]  not in node_uuid2path:
            continue

        if e[0] in node_uuid2index:
            pass
        else:
            node_uuid2index[e[0]]=temp_index
            node_uuid2index[temp_index]=node_uuid2path[e[0]]
            temp_index+=1

        if e[3] in node_uuid2index:
            pass
        else:
            node_uuid2index[e[3]]=temp_index
            node_uuid2index[temp_index]=node_uuid2path[e[3]]
            temp_index+=1 

    torch.save(node_uuid2index,f'node_uuid2index_9_{day}_host={hostname}_datalabel={datalabel}')
       

    dataset = TemporalData()
    src = []
    dst = []
    msg = []
    t = []
    # Create temporal graph with nodes, edges, messages and times
    for e in (events):
        if e[3] in node_uuid2index and e[0] in node_uuid2index:
            # If the image path of the node is not recorded, then skip this edge
            src.append(node_uuid2index[e[0]])
            dst.append(node_uuid2index[e[3]])
        #     msg.append(torch.cat([torch.from_numpy(node2higvec_bn[i[0]]), rel2vec[i[2]], torch.from_numpy(node2higvec_bn[i[1]])] ))

            msg.append(torch.cat([str2tensor(e[1],node_uuid2path[e[0]]), 
                                  edge2vec[e[2]], 
                                  str2tensor(e[4],node_uuid2path[e[3]])
                                 ]))
            t.append(int(e[6]))

    dataset.src = torch.tensor(src)
    dataset.dst = torch.tensor(dst)
    dataset.t = torch.tensor(t)
    dataset.msg = torch.vstack(msg)
    dataset.src = dataset.src.to(torch.long)
    dataset.dst = dataset.dst.to(torch.long)
    dataset.msg = dataset.msg.to(torch.float)
    dataset.t = dataset.t.to(torch.long)
    # Store temporal graph in disk
    torch.save(dataset, f"/home/shahidul2k9/data/optc/out/evaluation/9_{day}_host={hostname}_datalabel={datalabel}.TemporalData")  
    

# A CSV file containing the ground truth nodes&edges

In [None]:
#Load ground truth labels
label_df=pd.read_csv("./labels.csv")

In [None]:
#Log ground truth labels
label_df

In [None]:

nodes_attack={}
edges_attack_list=[]
# Attack edge listing
for idx,row in label_df.iterrows():
    flag=False
    if row['objectID'] in node_uuid2path:
        nodes_attack[row['objectID']]=node_uuid2path[row['objectID']]
        flag=True
    if row['actorID'] in node_uuid2path:
        nodes_attack[row['actorID']]=node_uuid2path[row['actorID']]
        flag=True
    if flag and row['action'] in edge2vec:    
#         and row['action'] in edge2vec
        temp_dic={}
        temp_dic['src_uuid']=row['actorID']
        temp_dic['dst_uuid']=row['objectID']
        temp_dic['edge_type']=row['action']
        temp_dic['timestamp']=datetime_to_timestamp_US(row['timestamp'])

        edges_attack_list.append(temp_dic)


In [None]:
# Log number of attack edges
len(edges_attack_list)

In [None]:
# Log number of attack nodes
len(nodes_attack)

# Statistics (Num of nodes and edges)

In [None]:
# Load evaluation temporal graph
graph_9_22_h201=torch.load("/home/shahidul2k9/data/optc/out/evaluation/9_22_host=SysClient0201_datalabel=benign.TemporalData")
graph_9_22_h402=torch.load("/home/shahidul2k9/data/optc/out/evaluation/9_22_host=SysClient0402_datalabel=benign.TemporalData")
graph_9_22_h660=torch.load("/home/shahidul2k9/data/optc/out/evaluation/9_22_host=SysClient0660_datalabel=benign.TemporalData")
graph_9_22_h501=torch.load("/home/shahidul2k9/data/optc/out/evaluation/9_22_host=SysClient0501_datalabel=benign.TemporalData")
graph_9_22_h051=torch.load("/home/shahidul2k9/data/optc/out/evaluation/9_22_host=SysClient0051_datalabel=benign.TemporalData")
graph_9_22_h209=torch.load("/home/shahidul2k9/data/optc/out/evaluation/9_22_host=SysClient0209_datalabel=benign.TemporalData")

In [None]:
# Load evaluation temporal graph
graph_9_23_h201=torch.load("/home/shahidul2k9/data/optc/out/evaluation/9_23_host=SysClient0201_datalabel=evaluation.TemporalData")
graph_9_24_h201=torch.load("/home/shahidul2k9/data/optc/out/evaluation/9_24_host=SysClient0201_datalabel=evaluation.TemporalData")
graph_9_25_h201=torch.load("/home/shahidul2k9/data/optc/out/evaluation/9_25_host=SysClient0201_datalabel=evaluation.TemporalData")

In [None]:
# Load evaluation temporal graph
graph_9_23_h402=torch.load("/home/shahidul2k9/data/optc/out/evaluation/9_23_host=SysClient0402_datalabel=evaluation.TemporalData")
graph_9_24_h402=torch.load("/home/shahidul2k9/data/optc/out/evaluation/9_24_host=SysClient0402_datalabel=evaluation.TemporalData")
graph_9_25_h402=torch.load("/home/shahidul2k9/data/optc/out/evaluation/9_25_host=SysClient0402_datalabel=evaluation.TemporalData")

In [None]:
# Load evaluation temporal graph
graph_9_23_h660=torch.load("/home/shahidul2k9/data/optc/out/evaluation/9_23_host=SysClient0660_datalabel=evaluation.TemporalData")
graph_9_24_h660=torch.load("/home/shahidul2k9/data/optc/out/evaluation/9_24_host=SysClient0660_datalabel=evaluation.TemporalData")
graph_9_25_h660=torch.load("/home/shahidul2k9/data/optc/out/evaluation/9_25_host=SysClient0660_datalabel=evaluation.TemporalData")

In [None]:
# Load evaluation temporal graph
graph_9_23_h501=torch.load("/home/shahidul2k9/data/optc/out/evaluation/9_23_host=SysClient0501_datalabel=evaluation.TemporalData")
graph_9_24_h501=torch.load("/home/shahidul2k9/data/optc/out/evaluation/9_24_host=SysClient0501_datalabel=evaluation.TemporalData")
graph_9_25_h501=torch.load("/home/shahidul2k9/data/optc/out/evaluation/9_25_host=SysClient0501_datalabel=evaluation.TemporalData")

In [None]:
# Load evaluation temporal graph
graph_9_23_h051=torch.load("/home/shahidul2k9/data/optc/out/evaluation/9_23_host=SysClient0051_datalabel=evaluation.TemporalData")
graph_9_24_h051=torch.load("/home/shahidul2k9/data/optc/out/evaluation/9_24_host=SysClient0051_datalabel=evaluation.TemporalData")
graph_9_25_h051=torch.load("/home/shahidul2k9/data/optc/out/evaluation/9_25_host=SysClient0051_datalabel=evaluation.TemporalData")

In [None]:
# Load evaluation temporal graph
graph_9_23_h207=torch.load("/home/shahidul2k9/data/optc/out/evaluation/9_23_host=SysClient0207_datalabel=evaluation.TemporalData")
graph_9_24_h207=torch.load("/home/shahidul2k9/data/optc/out/evaluation/9_24_host=SysClient0207_datalabel=evaluation.TemporalData")
graph_9_25_h207=torch.load("/home/shahidul2k9/data/optc/out/evaluation/9_25_host=SysClient0207_datalabel=evaluation.TemporalData")

In [None]:
# Create an array of loaded temporal graphs
graphs=[
    graph_9_22_h201,
    graph_9_22_h402,
    graph_9_22_h660,
    graph_9_22_h501,
    graph_9_22_h051,
    graph_9_22_h209,
    
    graph_9_23_h201,
    graph_9_24_h201,
    graph_9_25_h201,
    
    graph_9_23_h402,
    graph_9_24_h402,
    graph_9_25_h402,
    
    graph_9_23_h660,
    graph_9_24_h660,
    graph_9_25_h660,
    
    graph_9_23_h501,
    graph_9_24_h501,
    graph_9_25_h501,
    
    graph_9_23_h051,
    graph_9_24_h051,
    graph_9_25_h051,
    
    graph_9_23_h207,
    graph_9_24_h207,
    graph_9_25_h207,
]

In [None]:
# Count total number of edges over all evaluation temporal graphs
edges_count=0
for g in graphs:
     edges_count+=len(g.t)

In [None]:
# Log temporal graphs edge count
edges_count

In [None]:
# Load unique node ID(UUID) to integer indexing
node_uuid2index_9_22_h201=torch.load("node_uuid2index_9_22_host=SysClient0201_datalabel=benign")
node_uuid2index_9_22_h402=torch.load("node_uuid2index_9_22_host=SysClient0402_datalabel=benign")
node_uuid2index_9_22_h660=torch.load("node_uuid2index_9_22_host=SysClient0660_datalabel=benign")
node_uuid2index_9_22_h501=torch.load("node_uuid2index_9_22_host=SysClient0501_datalabel=benign")
node_uuid2index_9_22_h051=torch.load("node_uuid2index_9_22_host=SysClient0051_datalabel=benign")
node_uuid2index_9_22_h209=torch.load("node_uuid2index_9_22_host=SysClient0209_datalabel=benign")


node_uuid2index_9_23_h201=torch.load("node_uuid2index_9_23_host=SysClient0201_datalabel=evaluation")
node_uuid2index_9_24_h201=torch.load("node_uuid2index_9_24_host=SysClient0201_datalabel=evaluation")
node_uuid2index_9_25_h201=torch.load("node_uuid2index_9_25_host=SysClient0201_datalabel=evaluation")

node_uuid2index_9_23_h402=torch.load("node_uuid2index_9_23_host=SysClient0402_datalabel=evaluation")
node_uuid2index_9_24_h402=torch.load("node_uuid2index_9_24_host=SysClient0402_datalabel=evaluation")
node_uuid2index_9_25_h402=torch.load("node_uuid2index_9_25_host=SysClient0402_datalabel=evaluation")

node_uuid2index_9_23_h660=torch.load("node_uuid2index_9_23_host=SysClient0660_datalabel=evaluation")
node_uuid2index_9_24_h660=torch.load("node_uuid2index_9_24_host=SysClient0660_datalabel=evaluation")
node_uuid2index_9_25_h660=torch.load("node_uuid2index_9_25_host=SysClient0660_datalabel=evaluation")

node_uuid2index_9_23_h501=torch.load("node_uuid2index_9_23_host=SysClient0501_datalabel=evaluation")
node_uuid2index_9_24_h501=torch.load("node_uuid2index_9_24_host=SysClient0501_datalabel=evaluation")
node_uuid2index_9_25_h501=torch.load("node_uuid2index_9_25_host=SysClient0501_datalabel=evaluation")

node_uuid2index_9_23_h051=torch.load("node_uuid2index_9_23_host=SysClient0051_datalabel=evaluation")
node_uuid2index_9_24_h051=torch.load("node_uuid2index_9_24_host=SysClient0051_datalabel=evaluation")
node_uuid2index_9_25_h051=torch.load("node_uuid2index_9_25_host=SysClient0051_datalabel=evaluation")

node_uuid2index_9_23_h207=torch.load("node_uuid2index_9_23_host=SysClient0207_datalabel=evaluation")
node_uuid2index_9_24_h207=torch.load("node_uuid2index_9_24_host=SysClient0207_datalabel=evaluation")
node_uuid2index_9_25_h207=torch.load("node_uuid2index_9_25_host=SysClient0207_datalabel=evaluation")





In [None]:
# Create a list of node indexing files
node_dics=[
    node_uuid2index_9_22_h201,
    node_uuid2index_9_22_h402,
    node_uuid2index_9_22_h660,
    node_uuid2index_9_22_h501,
    node_uuid2index_9_22_h051,
    node_uuid2index_9_22_h209,
    node_uuid2index_9_23_h201,
    node_uuid2index_9_24_h201,
    node_uuid2index_9_25_h201,
    node_uuid2index_9_23_h402,
    node_uuid2index_9_24_h402,
    node_uuid2index_9_25_h402,
    node_uuid2index_9_23_h660,
    node_uuid2index_9_24_h660,
    node_uuid2index_9_25_h660,
    node_uuid2index_9_23_h501,
    node_uuid2index_9_24_h501,
    node_uuid2index_9_25_h501,
    node_uuid2index_9_23_h051,
    node_uuid2index_9_24_h051,
    node_uuid2index_9_25_h051,
    node_uuid2index_9_23_h207,
    node_uuid2index_9_24_h207,
    node_uuid2index_9_25_h207,
]

In [None]:
# Create a set of unique nodes
nodes=set()
for dic in node_dics:
    for n in dic:
        if type(n)==str:
            nodes.add(n)

In [None]:
# Log nodes
len(nodes)