# Analysis on laurel logs

In [1]:
import os
import sys
import numpy as np
import pandas as pd

In [2]:
# Add the parent directory and its src/ and spell/ subdirectories to the
# import search path so the local modules imported in the next cell resolve.
sys.path.extend(['../', '../src/', '../spell/'])

In [3]:
import Reader
import ParamsExtractor3
import DataPreprocessor
import DeepLearningAnomalyDetection2

2024-07-06 14:50:21.934123: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-06 14:50:21.934742: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-06 14:50:21.938465: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-06 14:50:21.984984: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Training

In [4]:
log_types = ['laurel']
dates = ['20240418', '20240420', '20240429']
login_node_numbers = ['01', '02', '03', '10']

# Enumerate every candidate log path: dates vary slowest, log types fastest.
# NOTE(review): the path starts with '/', so the '../' segments are resolved
# from the filesystem root — confirm an absolute location is intended.
file_paths = []
for date in dates:
    for num in login_node_numbers:
        for logtype in log_types:
            file_paths.append(f'/../../../temp_logs/{date}/login{num}.{logtype}.log')

# Drop the candidates that are not present on disk and show what is left.
existing_file_paths = list(filter(os.path.exists, file_paths))
print(existing_file_paths)

# Read each remaining file with Reader.read_file, one result per path.
df_list = [Reader.Reader(path).read_file(path) for path in existing_file_paths]

['/../../../temp_logs/20240418/login01.laurel.log', '/../../../temp_logs/20240418/login02.laurel.log', '/../../../temp_logs/20240418/login03.laurel.log', '/../../../temp_logs/20240418/login10.laurel.log', '/../../../temp_logs/20240420/login01.laurel.log', '/../../../temp_logs/20240420/login02.laurel.log', '/../../../temp_logs/20240420/login03.laurel.log', '/../../../temp_logs/20240420/login10.laurel.log']


In [5]:
# Number of frames loaded (one per candidate log file that existed on disk).
len(df_list)

8

In [6]:
# Extracted-parameter frames, kept separately in the same order as df_list.
df_after = []

for idx in range(len(df_list)):
    raw_frame = df_list[idx]
    extractor = ParamsExtractor3.ParamsExtractor(raw_frame)
    # The return value of convert_params is not used further down; the call is
    # kept because get_params() is invoked on the same extractor right after.
    extractor.convert_params(raw_frame)
    params_frame = extractor.get_params()
    df_after.append(params_frame)
    # Append the extracted columns next to the original ones (column-wise).
    df_list[idx] = pd.concat([raw_frame, params_frame], axis=1)

In [7]:
# Preview the first frame after the extracted parameter columns were appended.
df_list[0].head()

Unnamed: 0,host,ident,message,severity,facility,time,severity_numbers,facility_numbers,severity_scores,timedelta,suid,cap_fp,comm,parent_comm,n_dang,n_dang_no_cron,fp_length
0,login01,laurel,"""1713391196.350:55343008"",""NODE"":""login01"",""SY...",info,local6,2024-04-18 00:00:03 +0200,6,22,1.0,0.0,126858,0,squeue,bash,2,0,92
1,login01,laurel,"""1713391198.212:55343009"",""NODE"":""login01"",""SY...",info,local6,2024-04-18 00:00:03 +0200,6,22,1.0,0.0,0,0,ping,pacemaker-execd,0,0,38
2,login01,laurel,"""1713391198.214:55343010"",""NODE"":""login01"",""SY...",info,local6,2024-04-18 00:00:03 +0200,6,22,1.0,0.0,0,0,basename,ping,0,0,28
3,login01,laurel,"""1713391198.220:55343011"",""NODE"":""login01"",""SY...",info,local6,2024-04-18 00:00:03 +0200,6,22,1.0,0.0,0,0,egrep,ping,0,0,28
4,login01,laurel,"""1713391198.221:55343012"",""NODE"":""login01"",""SY...",info,local6,2024-04-18 00:00:03 +0200,6,22,1.0,0.0,0,0,grep,ping,0,0,21


In [8]:
# Inspect the full raw message of the first record.
# NOTE(review): the saved output of this cell contains usernames and
# environment variables — consider clearing it before sharing the notebook.
df_list[0].iloc[0]['message']

'"1713391196.350:55343008","NODE":"login01","SYSCALL":{"arch":"0xc000003e","syscall":59,"success":"yes","exit":0,"items":2,"ppid":29378,"pid":37632,"auid":126858,"uid":126858,"gid":25200,"euid":126858,"suid":126858,"fsuid":126858,"egid":25200,"sgid":25200,"fsgid":25200,"tty":"pts38","ses":703112,"comm":"squeue","exe":"/opt/slurm/23.11.5/bin/squeue","key":"auditcmd","ARCH":"x86_64","SYSCALL":"execve","AUID":"cmatteuc","UID":"cmatteuc","GID":"interactive","EUID":"cmatteuc","SUID":"cmatteuc","FSUID":"cmatteuc","EGID":"interactive","SGID":"interactive","FSGID":"interactive","ARGV":["0x564021c6aff0","0x564021c6bdf0","0x564021c34470","0x8"]},"EXECVE":{"argc":3,"ARGV":["squeue","-u","cmatteuc"],"ENV":{"LD_LIBRARY_PATH":"/g100_work/PROJECTS/spack/v0.17/prod/0.17.1/install/0.17/linux-centos8-skylake_avx512/gcc-8.4.1/anaconda3-2021.05-alnfpd33vtv6tt7sf6iq5ngwv7nqgeah/lib:/cineca/prod/opt/compilers/intel/oneapi-2022/binary/mpi/latest/libfabric/lib/prov:/cineca/prod/opt/compilers/intel/oneapi-2022

In [9]:
# Replace every frame, in place, with the result of
# DataPreprocessor.drop_and_hash_encode().
# NOTE(review): this overwrites df_list, so re-running the cell feeds
# already-encoded frames back into the preprocessor.
df_list[:] = [
    DataPreprocessor.DataPreprocessor(frame).drop_and_hash_encode()
    for frame in df_list
]

print(df_list[0].head())

   severity_scores  timedelta    suid  cap_fp      comm      parent_comm  \
0              1.0        0.0  126858       0    squeue             bash   
1              1.0        0.0       0       0      ping  pacemaker-execd   
2              1.0        0.0       0       0  basename             ping   
3              1.0        0.0       0       0     egrep             ping   
4              1.0        0.0       0       0      grep             ping   

   n_dang  n_dang_no_cron  fp_length  col_0  ...  col_10  col_11  col_12  \
0       2               0         92      0  ...       0       0       0   
1       0               0         38      0  ...       0       0       0   
2       0               0         28      0  ...       0       0       0   
3       0               0         28      0  ...       0       0       0   
4       0               0         21      0  ...       0       0       0   

   col_13  col_14  col_15  col_16  col_17  col_18  col_19  
0       0       0       0 

In [10]:
# Print the shape of every encoded frame; the frames are stacked row-wise in
# the next cell, so their column counts have to agree.
for df in df_list:
    print(df.shape)

(1000, 29)
(1000, 29)
(1000, 29)
(1000, 29)
(1000, 29)
(1000, 29)
(1000, 29)
(1000, 29)


In [11]:
# Stack all encoded frames row-wise into a single NumPy array.
# np.vstack accepts the whole list at once, which avoids re-copying the
# accumulated array once per frame. With a single frame this now also yields
# an ndarray (the loop version left the DataFrame untouched in that case).
normal_dataset = np.vstack(df_list)

In [12]:
# Shape of the stacked training array.
normal_dataset.shape

(8000, 29)

## Anomalies' dataset

In [13]:
log_types = ['laurel']

# Candidate anomalous log files, one per log type.
file_paths = [f'../data/linpeas_logs_1/{logtype}_logs.log' for logtype in log_types]

# Keep only the paths that exist on disk.
existing_file_paths = [path for path in file_paths if os.path.exists(path)]

# Fail fast when nothing was found: the following cells index an_df_list[0],
# which would otherwise surface later as an uninformative IndexError.
if not existing_file_paths:
    raise FileNotFoundError(f'No anomalous log file found; looked for: {file_paths}')

# Read each file with Reader.read_file_2, one result per path.
an_df_list = []

for file_path in existing_file_paths:
    r = Reader.Reader(file_path)
    df = r.read_file_2(file_path)
    an_df_list.append(df)

In [16]:
# Preview the first loaded anomalous frame.
# NOTE(review): the saved execution count of this cell (16) is higher than
# that of the cell below it (15), so the cells were run out of order —
# re-run the notebook top to bottom to confirm the results.
an_df_list[0].head()

In [15]:
# Extracted-parameter frames, kept separately in the same order as an_df_list.
# NOTE(review): the saved run of this cell ended with KeyError: 'severity';
# presumably the frame returned by read_file_2 lacks a 'severity' column that
# ParamsExtractor needs — confirm before relying on the cells below.
an_df_after = []

for idx in range(len(an_df_list)):
    raw_frame = an_df_list[idx]
    extractor = ParamsExtractor3.ParamsExtractor(raw_frame)
    # The return value of convert_params is not used further down; the call is
    # kept because get_params() is invoked on the same extractor right after.
    extractor.convert_params(raw_frame)
    params_frame = extractor.get_params()
    an_df_after.append(params_frame)
    # Append the extracted columns next to the original ones (column-wise).
    an_df_list[idx] = pd.concat([raw_frame, params_frame], axis=1)

KeyError: 'severity'

In [None]:
# Preview the anomalous frame after the extracted parameter columns were appended.
an_df_list[0].head()

In [None]:
# Replace every anomalous frame, in place, with the result of
# DataPreprocessor.drop_and_hash_encode().
# NOTE(review): this overwrites an_df_list, so re-running the cell feeds
# already-encoded frames back into the preprocessor.
an_df_list[:] = [
    DataPreprocessor.DataPreprocessor(frame).drop_and_hash_encode()
    for frame in an_df_list
]

print(an_df_list[0].head())

In [None]:
# Keep only the rows from position 230 onward as the anomalous test set.
# NOTE(review): the earlier comment here referred to "secure" logs and a
# detected password brute-forcing starting at line 230, but this notebook
# loads laurel logs from linpeas_logs_1 — confirm that 230 is still the
# right cut-off for this file.
anomalous_dataset = an_df_list[0][230:]

In [None]:
# Size of the anomalous test set after the slice above.
anomalous_dataset.shape

In [None]:
# Column names of the anomalous set; the next cell reuses them to label the
# stacked training array.
anomalous_dataset.columns

## Library implementation

In [None]:
# Wrap the stacked training array in a DataFrame, labelling its columns with
# the anomalous set's column names so both frames share the same schema.
# NOTE(review): normal_dataset comes from np.vstack over frames that include
# string columns (comm, parent_comm), so the columns here are likely all
# object dtype — confirm train_test_model copes with that.
normal_dataset_df = pd.DataFrame(normal_dataset, columns=anomalous_dataset.columns)

In [None]:
# Feature subset applied identically to the training and the anomalous frame.
# NOTE(review): 'port', 'log key' and 'log key spell' do not appear among the
# columns printed for the encoded laurel frames earlier in this notebook, and
# neither *_reduced frame is passed to the model calls below — confirm this
# cell is still needed.
reduced_columns = ['severity_scores', 'timedelta', 'port', 'log key', 'log key spell',
                   'n_dang', 'n_dang_no_cron', 'fp_length']

normal_dataset_reduced = normal_dataset_df[reduced_columns]
anomalous_dataset_reduced = anomalous_dataset[reduced_columns]

In [None]:
# Run train_test_model with the 'autoencoder' model type on the normal and
# anomalous frames, keeping the two arrays it returns.
# NOTE(review): the full frames are passed here, not the *_reduced selections
# built in the previous cell — confirm which is intended.
# NOTE(review): no random seed is set in the visible cells, so results may
# differ between runs — confirm.
d_an_det = DeepLearningAnomalyDetection2.DeepLearningAnomalyDetection()
true_anomalies, reconstructed_anomalies = d_an_det.train_test_model(normal_dataset_df, anomalous_dataset, 'autoencoder', plots=[0,0,1,1])

In [None]:
# Same call as for the autoencoder, but with the 'vae' model type and a fresh
# detector instance; d_an_det is rebound, so later cells use this instance.
d_an_det = DeepLearningAnomalyDetection2.DeepLearningAnomalyDetection()
true_anomalies_vae, reconstructed_anomalies_vae = d_an_det.train_test_model(normal_dataset_df, anomalous_dataset, 'vae', plots=[0,0,1,1])

## Ensemble method

In [None]:
# Run the detector's built-in ensemble method on the same two frames.
# When the cells run top to bottom, d_an_det is the instance created in the
# 'vae' cell.
d_an_det.ensemble_method(normal_dataset_df, anomalous_dataset, plots=[1])

In [None]:
# Predictions returned by the 'autoencoder' and the 'vae' runs above.
predictions_model1 = reconstructed_anomalies
predictions_model2 = reconstructed_anomalies_vae

# Combine with a logical AND: an entry is 1 only when BOTH models are truthy.
# NOTE(review): the earlier comment described this step as a logical OR; the
# code has always used np.logical_and — switch to np.logical_or if an
# "either model" vote was intended.
both_models_flag = np.logical_and(predictions_model1, predictions_model2)
combined_predictions = both_models_flag.astype(int)

# The combined vector is already 0/1, so thresholding at 0.5 leaves it
# unchanged; the step is kept so the cell's outputs stay the same.
threshold = 0.5
above_threshold = combined_predictions >= threshold
final_predictions = above_threshold.astype(int)

# Show both vectors.
print("Combined Predictions:", combined_predictions)
print("Final Anomaly Predictions:", final_predictions)

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Compare the combined final predictions against the true_anomalies array
# returned by the autoencoder run.
ground_truth = true_anomalies
predictions = final_predictions

# Pin the label order to [0, 1] so the matrix is always 2x2 and lines up with
# the two hard-coded tick labels, even when one class is absent from both
# vectors (without `labels`, the matrix would then collapse to 1x1).
# NOTE(review): assumes both vectors encode normal as 0 and anomaly as 1 — confirm.
cm = confusion_matrix(ground_truth, predictions, labels=[0, 1])

# Draw the matrix as an annotated heatmap on an explicit figure/axes pair.
class_names = ['Normal', 'Anomaly']
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()