# Imports

In [41]:
import pandas as pd
import json
from tqdm import tqdm

# Provenance Data

## Load original logs

In [42]:
# Provenance data paths
artifacts_vertices_path = "../provenance_data/app_artifact_vertices.csv"
processes_vertices_path = "../provenance_data/app_process_vertices.csv"
edges_path = "../provenance_data/app_edges.csv"

In [55]:
# Load provenance data to dataframes
artifacts_df = pd.read_csv(artifacts_vertices_path, index_col=0)
processes_df = pd.read_csv(processes_vertices_path, index_col=0)
edges_df = pd.read_csv(edges_path, index_col=0)

## Add generated attack simulation logs

In [44]:
generated_artifacts_path = "attack_scenario123_gen_artifacts.csv"
generated_processes_path = "attack_scenario123_gen_processes.csv"
generated_edges_path = "attack_scenario123_gen_edges.csv"

In [45]:
generated_artifacts_df = pd.read_csv(generated_artifacts_path)
generated_processes_df = pd.read_csv(generated_processes_path)
generated_edges_df = pd.read_csv(generated_edges_path)

In [46]:
artifacts_df = pd.concat([artifacts_df, generated_artifacts_df], ignore_index=True)

In [47]:
# Get the common columns between the two dataframes
common_columns = processes_df.columns.intersection(generated_processes_df.columns)

# Select only the common columns from both dataframes
processes_df = processes_df[common_columns]
generated_processes_df = generated_processes_df[common_columns]

# Now you can concatenate
processes_df = pd.concat([processes_df, generated_processes_df], ignore_index=True)

processes_df

Unnamed: 0,id,type,gid,euid,children pid namespace,pid,seen time,source,net namespace,ipc namespace,...,uid,egid,exe,mount namespace,name,user namespace,start time,command line,ns pid,svc
0,0ef366a12c22d68dcc35e410ab69719e,Process,1000,1000,-1.000000e+00,1711.0,,syscall,-1.000000e+00,-1.000000e+00,...,1000,1000,/bin/grpc_health_probe,-1.000000e+00,grpc_health_pro,-1.000000e+00,1.702120e+09,/bin/grpc_health_probe -addr=:8080,,frontend
1,3f921fd1ce1cc790417d2071cc0a3c34,Process,1000,1000,4.026533e+09,1723.0,,syscall,4.026532e+09,4.026533e+09,...,1000,1000,/bin/grpc_health_probe,4.026533e+09,grpc_health_pro,4.026532e+09,1.702120e+09,/bin/grpc_health_probe -addr=:8080,2489.0,frontend
2,e6103611b0969dad371a1d6e665772e7,Process,1000,1000,4.026533e+09,1724.0,,syscall,4.026532e+09,4.026533e+09,...,1000,1000,/bin/grpc_health_probe,4.026533e+09,grpc_health_pro,4.026532e+09,1.702120e+09,/bin/grpc_health_probe -addr=:8080,2490.0,frontend
3,ff8a80868c60e4a5a1503a31124cd340,Process,1000,1000,4.026533e+09,1725.0,,syscall,4.026532e+09,4.026533e+09,...,1000,1000,/bin/grpc_health_probe,4.026533e+09,grpc_health_pro,4.026532e+09,1.702120e+09,/bin/grpc_health_probe -addr=:8080,2491.0,frontend
4,b6acb3e5c8618efc89e85cd9e2ee9823,Process,1000,1000,4.026533e+09,1726.0,,syscall,4.026532e+09,4.026533e+09,...,1000,1000,/bin/grpc_health_probe,4.026533e+09,grpc_health_pro,4.026532e+09,1.702120e+09,/bin/grpc_health_probe -addr=:8080,2492.0,frontend
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
504,7a8b9c0d1e2f3a4b5c6d7e8f9a0b1c2d,Process,1000,1000,-1.000000e+00,1725.0,1.681923e+09,recommendationservice,4.026532e+09,4.026532e+09,...,1000,1000,/bin/sh,4.026532e+09,sh,4.026532e+09,1.681923e+09,/bin/sh -c /usr/sbin/tcpdump -i eth0 -w /tmp/r...,,recommendationservice
505,8f9a0b1c2d3e4f5a6b7c8d9e0a1b2c3d,Process,0,0,-1.000000e+00,9877.0,1.681924e+09,frontend,4.026532e+09,4.026532e+09,...,0,0,/usr/bin/sudo,4.026532e+09,sudo,4.026532e+09,1.681924e+09,/usr/bin/sudo -u root /bin/bash,,frontend
506,9a0b1c2d3e4f5a6b7c8d9e0a1b2c3d4e,Process,0,0,-1.000000e+00,9879.0,1.681924e+09,recommendationservice,4.026532e+09,4.026532e+09,...,0,0,,4.026532e+09,python3,4.026532e+09,1.681924e+09,/usr/bin/python3 /app/recommendation_server.py,,recommendationservice
507,a0b1c2d3e4f5a6b7c8d9e0a1b2c3d4e5f,Process,0,0,-1.000000e+00,9880.0,1.681924e+09,recommendationservice,4.026532e+09,4.026532e+09,...,0,0,,4.026532e+09,python3,4.026532e+09,1.681924e+09,/usr/bin/python3 /tmp/deserialization_exploit.py,,recommendationservice


In [48]:
# Concat the edges dataframes without 'svc_from' and 'svc_to' columns
edges_df = pd.concat([edges_df, generated_edges_df.drop(columns=['svc_from', 'svc_to'])], ignore_index=True)

## Save updated provenance data

In [49]:
# Save the updated provenance data
artifacts_df.to_csv('artifacts_with_generated.csv', index=True)
processes_df.to_csv('processes_with_generated.csv', index=True)
edges_df.to_csv('edges_with_generated.csv', index=True)

# Alerts Mechanism - CHECK MATCHES


In [32]:
sigma_rules_values_path = "../alerts_mechanism/sigma_rules_values_with_names.json"

with open(sigma_rules_values_path, 'r') as f:
    sigma_rules_values_with_names = json.load(f)

In [33]:
artifacts_matches = []

for index, row in tqdm(artifacts_df.iterrows(), total=len(artifacts_df)):
    for column, value in row.items():
        if value in list(sigma_rules_values_with_names.keys()) and value not in artifacts_matches:
            artifacts_matches.append(value)
                
artifacts_matches

100%|██████████| 1591/1591 [00:29<00:00, 54.80it/s]


['0.0.0.0', 'tcp', '127.0.0.1', 'udp', '/etc/sudoers']

In [34]:
processes_matches = []

for index, row in tqdm(processes_df.iterrows(), total=len(processes_df)):
    for column, value in row.items():
        if value in list(sigma_rules_values_with_names.keys()) and value not in processes_matches:
            processes_matches.append(value)
                
processes_matches

100%|██████████| 509/509 [00:09<00:00, 53.36it/s]


['Process', '/bin/sh', 'sh', 'sudo', '/bin/bash', 'bash']

In [35]:
edges_matches = []

for index, row in tqdm(edges_df.iterrows(), total=len(edges_df)):
    for column, value in row.items():
        if value in list(sigma_rules_values_with_names.keys()) and value not in edges_matches:
            edges_matches.append(value)
                
edges_matches

100%|██████████| 2585/2585 [00:21<00:00, 117.91it/s]


['accept', 'execve', 'open', 'connect']

In [36]:
# Initialize the list to store matches
artifacts_matches = []

# Iterate over the dataframe
for index, row in tqdm(artifacts_df.iterrows(), total=len(artifacts_df)):
    for column, value in row.items():
        # Check if the value is in the keys of the sigma_rules_values_with_names dictionary
        if value in sigma_rules_values_with_names.keys():
            # If a match is found, create a dictionary with the matched value, the dataframe row, and the associated rule
            match = {
                'matched_value': value,
                'dataframe_row': row.to_dict(),
                'associated_rule': sigma_rules_values_with_names[value]
            }
            # Append the match to the list of matches
            artifacts_matches.append(match)

# Print the matches
artifacts_matches

100%|██████████| 1591/1591 [00:00<00:00, 18932.66it/s]


[{'matched_value': '0.0.0.0',
  'dataframe_row': {'id': 'a41db92088a31a823a23ed48145dd661',
   'type': 'Artifact',
   'path': nan,
   'subtype': 'network socket',
   'permissions': nan,
   'epoch': 0.0,
   'source': 'syscall',
   'version': 0,
   'local address': '0.0.0.0',
   'remote port': 39612.0,
   'protocol': 'tcp',
   'remote address': '172.31.30.227',
   'local port': 6379.0,
   'net namespace': -1.0,
   'memory address': nan,
   'size': nan,
   'tgid': nan,
   'read fd': nan,
   'write fd': nan,
   'svc': 'redis-cart'},
  'associated_rule': ['net_connection_lnx_back_connect_shell_dev.yml',
   'zeek_dns_mining_pools.yml']},
 {'matched_value': 'tcp',
  'dataframe_row': {'id': 'a41db92088a31a823a23ed48145dd661',
   'type': 'Artifact',
   'path': nan,
   'subtype': 'network socket',
   'permissions': nan,
   'epoch': 0.0,
   'source': 'syscall',
   'version': 0,
   'local address': '0.0.0.0',
   'remote port': 39612.0,
   'protocol': 'tcp',
   'remote address': '172.31.30.227',
  

In [37]:
# Initialize the list to store matches
processes_matches = []

# Iterate over the dataframe
for index, row in tqdm(processes_df.iterrows(), total=len(processes_df)):
    for column, value in row.items():
        # Check if the value is in the keys of the sigma_rules_values_with_names dictionary
        if value in sigma_rules_values_with_names.keys():
            # If a match is found, create a dictionary with the matched value, the dataframe row, and the associated rule
            match = {
                'matched_value': value,
                'dataframe_row': row.to_dict(),
                'associated_rule': sigma_rules_values_with_names[value]
            }
            # Append the match to the list of matches
            processes_matches.append(match)

# Print the matches
for match in processes_matches:
    print(match['matched_value'])

100%|██████████| 509/509 [00:00<00:00, 12109.41it/s]

Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process
Process





In [38]:
# Initialize the list to store matches
edges_matches = []

# Iterate over the dataframe
for index, row in tqdm(edges_df.iterrows(), total=len(edges_df)):
    for column, value in row.items():
        # Check if the value is in the keys of the sigma_rules_values_with_names dictionary
        if value in sigma_rules_values_with_names.keys():
            # If a match is found, create a dictionary with the matched value, the dataframe row, and the associated rule
            match = {
                'matched_value': value,
                'dataframe_row': row.to_dict(),
                'associated_rule': sigma_rules_values_with_names[value]
            }
            # Append the match to the list of matches
            edges_matches.append(match)

# Print the matches
for match in edges_matches:
    print(match['matched_value'])

100%|██████████| 2585/2585 [00:00<00:00, 20676.21it/s]

accept
accept
execve
open
open
execve
open
open
open
open
connect
accept
connect
accept
execve
open
open
execve
open
open
accept
connect
open
execve
accept
open
connect
open
open
connect
accept
accept
execve
open
open
connect
accept
execve
open
open
connect
accept
accept
execve
open
open
accept
connect
execve
open
open
connect
accept
execve
open
open
connect
accept
execve
open
open
connect
accept
execve
open
open
connect
accept
execve
open
open
connect
accept
execve
open
open
connect
accept
accept
accept
execve
open
open
open
connect
accept
execve
open
open
open
connect
accept
accept
connect
accept
connect
execve
open
execve
open
open
accept
connect
execve
open
open
accept
connect
execve
open
open
connect
accept
accept
accept
execve
open
execve
open
open
open
connect
accept
open
open
connect
accept
execve
open
open
execve
open
open
accept
accept
connect
accept
execve
open
open
execve
open
connect
open
connect
accept
connect
accept
execve
open
open
open
accept
connect
execve
open
open
c




In [39]:
# Extract list of rule that were matched
artifacts_matched_rules = [match['associated_rule'] for match in artifacts_matches]
processes_matched_rules = [match['associated_rule'] for match in processes_matches]
edges_matched_rules = [match['associated_rule'] for match in edges_matches]

# Flatten the lists
artifacts_matched_rules = [rule for sublist in artifacts_matched_rules for rule in sublist]
processes_matched_rules = [rule for sublist in processes_matched_rules for rule in sublist]
edges_matched_rules = [rule for sublist in edges_matched_rules for rule in sublist]

In [40]:
# Create set of all matched rules
matched_rules = set(artifacts_matched_rules + processes_matched_rules + edges_matched_rules)
matched_rules

{'lnx_auditd_data_compressed.yml',
 'lnx_auditd_masquerading_crond.yml',
 'lnx_auditd_network_sniffing.yml',
 'lnx_auditd_omigod_scx_runasprovider_executeshellcommand.yml',
 'lnx_auditd_susp_cmds.yml',
 'lnx_auditd_web_rce.yml',
 'lnx_sudo_cve_2019_14287_user.yml',
 'net_connection_lnx_back_connect_shell_dev.yml',
 'net_connection_win_excel_outbound_network_connection.yml',
 'net_connection_win_powershell_network_connection.yml',
 'net_connection_win_python.yml',
 'net_connection_win_rdp_outbound_over_non_standard_tools.yml',
 'net_connection_win_remote_powershell_session_network.yml',
 'net_connection_win_susp_epmap.yml',
 'net_firewall_cleartext_protocols.yml',
 'proc_creation_lnx_cve_2022_26134_atlassian_confluence.yml',
 'proc_creation_lnx_gtfobin_vim.yml',
 'proc_creation_lnx_local_account.yml',
 'proc_creation_lnx_netcat_reverse_shell.yml',
 'proc_creation_lnx_omigod_scx_runasprovider_executeshellcommand.yml',
 'proc_creation_lnx_perl_reverse_shell.yml',
 'proc_creation_lnx_php_r