# Normal dataset

In [1]:
import os
import sys
import numpy as np
import pandas as pd
import json

sys.path.append('../')
sys.path.append('../src/')
sys.path.append('../spell/')

import Reader
import ParamsExtractor3
import DataPreprocessor
import EncodeCommand
import DeepLearningAnomalyDetection2

2024-09-05 12:12:55.369119: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-05 12:12:55.418876: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Log sources to process; each entry contributes one candidate path.
log_types = ['laurel']

# Every log type currently resolves to the same audit.log location.
file_paths = ['../data/new_laurel_conf/audit.log' for _ in log_types]

# Keep only the candidates that are actually present on disk.
fps = list(filter(os.path.exists, file_paths))
print(fps)

['../data/new_laurel_conf/audit.log']


In [3]:
# Work on the first audit log that was found on disk.
fp = fps[0]

data_list = []
lines_limit = 100000  # cap on the number of lines read, to stay within memory
i = 0

with open(fp, 'r') as file:
    for i, line in enumerate(file, start=1):
        if i > lines_limit:
            break
        # The JSON payload begins at the first '{'; lines without one are skipped.
        index = line.find('{')
        if index == -1:
            continue
        json_data = line[index:]
        data = json.loads(json_data)
        data_list.append(data)

# One row per parsed JSON record.
df = pd.DataFrame(data_list)

In [4]:
def read(fp, lines_limit=1000):
    """Load the JSON records of a log file into a DataFrame.

    Each line is searched for its first '{'; the text from that character to
    the end of the line is parsed as JSON. Lines without a '{' are skipped,
    but they still count towards the line limit.

    Args:
        fp: Path of the log file to read.
        lines_limit: Maximum number of lines consumed from the file. Defaults
            to 1000 (kept small because of memory constraints). Pass ``None``
            to read the whole file.

    Returns:
        pandas.DataFrame with one row per parsed record; empty when no line
        contained a JSON payload.

    Raises:
        json.JSONDecodeError: If the text following the first '{' of a line is
            not valid JSON.
    """
    data_list = []

    with open(fp, 'r') as file:
        for line_number, line in enumerate(file, start=1):
            if lines_limit is not None and line_number > lines_limit:
                break
            # Anything before the first '{' is a non-JSON prefix.
            index = line.find('{')
            if index != -1:
                data_list.append(json.loads(line[index:]))

    return pd.DataFrame(data_list)

In [5]:
def normalize(df, col):
    """Expand a column of nested records into flat columns.

    The values of ``col`` are flattened with ``pd.json_normalize``; ``col``
    itself is removed and the flattened columns are joined onto the remaining
    frame. When a flattened column name already exists in the frame, the new
    column gets the suffix ``_<col>``.

    Args:
        df: Input DataFrame (not modified).
        col: Label of the column to expand.

    Returns:
        A new DataFrame with ``col`` replaced by its flattened columns. If
        ``col`` is not present, an error is printed and ``df`` is returned
        unchanged so that chained calls keep working.
    """
    if col not in df.columns:
        print(f"Error, column {col} not in df")
        # Returning None here would make the next chained call fail on
        # `None.columns`; hand the frame back untouched instead.
        return df

    df_expanded = pd.json_normalize(df[col].tolist())
    # json_normalize of a list yields a fresh RangeIndex, while join() aligns
    # on index labels. Re-use the input index so rows stay matched
    # positionally even when `df` has a non-default index.
    if len(df_expanded) == len(df):
        df_expanded.index = df.index

    return df.drop(columns=[col]).join(df_expanded, rsuffix=f'_{col}')

In [6]:
def _count_ids_outside_groups(ids, groups):
    """Count how many values in ``ids`` are not contained in ``groups``.

    Returns 0 when ``groups`` does not support membership tests (for example
    a missing value), so one unusable row cannot break the whole column.
    """
    try:
        return sum(1 for value in ids if value not in groups)
    except TypeError:
        return 0


# Final Preprocessing Function
def preprocess_laurel(df):
    """Turn a raw laurel audit-log frame into the final feature table.

    Steps performed:
      1. Expand the nested record columns with ``normalize``.
      2. Build ``ARGV_PROCTITLE_str`` by joining ``ARGV_PROCTITLE`` with spaces
         (skipped when the column is absent or holds non-joinable values).
      3. Derive ``timedelta``, ``pid_timedelta`` and, when ``PPID.EVENT_ID``
         is usable, ``ppid_timedelta`` from the numeric part before ':' in
         the respective id columns.
      4. ``num_id_anomalies``: how many of the lower-case id columns equal 0.
      5. ``id_anomalies``: how many of the upper-case id columns are not
         contained in ``UID_GROUPS``.
      6. Keep only the feature columns.

    Args:
        df: DataFrame of raw records as produced by ``read``.

    Returns:
        DataFrame restricted to the feature columns that could be computed.

    Raises:
        KeyError: If a mandatory column (e.g. ``ID``, ``PID.EVENT_ID``, one of
            the id columns, ``UID_GROUPS`` or a kept feature column) is absent.
    """
    # 0 and 1 are presumably the positional columns produced when a record
    # column holds lists of records -- TODO confirm against the raw data.
    for col in ('CWD', 'PATH', 'SYSCALL', 'PROCTITLE', 'EXECVE', 0, 1):
        expanded = normalize(df, col)
        # normalize() may signal an absent column with None; in that case the
        # frame is kept as it is instead of breaking the chain.
        if expanded is not None:
            df = expanded
    print(df.head())

    columns_to_keep = ['cwd', 'exit', 'items', 'ppid', 'pid', 'comm', 'timedelta', 'pid_timedelta', 'ppid_timedelta', 'id_anomalies', 'num_id_anomalies', 'ARGV_PROCTITLE_str']
    columns_to_drop = ['ID', 'PID.EVENT_ID', 'PPID.EVENT_ID', 'unix_time', 'ARGV_PROCTITLE', 'time', 'pid_time', 'ppid_time']

    # Optional feature: KeyError -> column absent, TypeError -> a value that
    # cannot be joined (e.g. NaN or non-string items).
    try:
        df['ARGV_PROCTITLE_str'] = df['ARGV_PROCTITLE'].apply(lambda x: ' '.join(x))
    except (KeyError, TypeError):
        columns_to_keep.remove('ARGV_PROCTITLE_str')

    # The part of ID before ':' is interpreted as a unix timestamp in seconds.
    df['unix_time'] = pd.to_numeric(df['ID'].str.split(':').str[0])
    df['timedelta'] = df['unix_time'].diff()
    df['time'] = pd.to_datetime(df['unix_time'], unit='s')

    # Optional feature: only when the parent event id cannot be parsed is
    # 'ppid_timedelta' withdrawn from the output. (Previously the bookkeeping
    # was inverted: the feature was discarded on success and the failure path
    # crashed later on the missing columns.)
    try:
        ppid_time = pd.to_datetime(pd.to_numeric(df['PPID.EVENT_ID'].str.split(':').str[0]), unit='s')
    except (KeyError, AttributeError, ValueError, TypeError):
        columns_to_keep.remove('ppid_timedelta')
    else:
        df['ppid_time'] = ppid_time
        df['ppid_timedelta'] = ppid_time.diff()

    df['pid_time'] = pd.to_datetime(pd.to_numeric(df['PID.EVENT_ID'].str.split(':').str[0]), unit='s')
    df['pid_timedelta'] = df['pid_time'].diff()

    # Optional helper columns may not exist, so absent labels are ignored.
    df = df.drop(columns=columns_to_drop, errors='ignore')

    numeric_id_columns = ['auid', 'uid', 'gid', 'euid', 'suid', 'fsuid', 'egid', 'sgid', 'fsgid']
    # Vectorised equivalent of checking `value == 0` for every id in each row.
    df['num_id_anomalies'] = df[numeric_id_columns].eq(0).sum(axis=1)
    df = df.drop(columns=numeric_id_columns)

    named_id_columns = ['AUID', 'UID', 'GID', 'EUID', 'SUID', 'FSUID', 'EGID', 'SGID', 'FSGID']
    df['id_anomalies'] = [
        _count_ids_outside_groups(ids, groups)
        for ids, groups in zip(
            df[named_id_columns].itertuples(index=False, name=None),
            df['UID_GROUPS'],
        )
    ]
    df = df.drop(columns=named_id_columns + ['UID_GROUPS'])

    # filter final columns
    return df[columns_to_keep]

In [7]:
def save(df, save_fp):
    """Write ``df`` to ``save_fp`` as CSV, without the index column.

    Args:
        df: DataFrame to export.
        save_fp: Destination passed to ``DataFrame.to_csv``. When it is a
            filesystem path, missing parent directories are created first so
            that exporting into a not-yet-existing folder does not fail.
    """
    if isinstance(save_fp, (str, os.PathLike)):
        parent = os.path.dirname(os.fspath(save_fp))
        # An empty parent means "current directory": nothing to create.
        if parent:
            os.makedirs(parent, exist_ok=True)
    df.to_csv(save_fp, index=False)

In [8]:
an_df_list = []

# Read, preprocess and export every discovered log next to its source file.
for fp in fps:
    df = preprocess_laurel(read(fp))
    save(df, f'{fp}.csv')

                   ID SERVICE_STOP BPRM_FCAPS SERVICE_START SYSTEM_RUNLEVEL  \
0  1721464625.772:929          NaN        NaN           NaN             NaN   
1  1721464625.816:930          NaN        NaN           NaN             NaN   
2  1721464625.641:933          NaN        NaN           NaN             NaN   
3  1721464625.512:934          NaN        NaN           NaN             NaN   
4  1721464625.636:935          NaN        NaN           NaN             NaN   

   BPF USER_AUTH USER_ACCT CRED_ACQ LOGIN  ... rdev_1  \
0  NaN       NaN       NaN      NaN   NaN  ...  00:00   
1  NaN       NaN       NaN      NaN   NaN  ...  00:00   
2  NaN       NaN       NaN      NaN   NaN  ...  00:00   
3  NaN       NaN       NaN      NaN   NaN  ...  00:00   
4  NaN       NaN       NaN      NaN   NaN  ...  00:00   

                          obj_1 nametype_1 cap_fp_1 cap_fi_1 cap_fe_1  \
0  system_u:object_r:ld_so_t:s0     NORMAL      0x0      0x0      0.0   
1  system_u:object_r:ld_so_t:s0     

In [9]:
# `fp` and `df` hold the values left by the last iteration of the loop that
# processed `fps`.
normal_dataset_fp = f'{fp}.csv'
new_path = '../cleaned_data/normal.csv'
save(df, new_path)