1. Imports and configuration

In [2]:
import pandas as pd
import sys
import json
from nfstream import NFStreamer
import os
import datetime


# ---------------------------------------------------------------------------
# Notebook-wide configuration: where the logs live and how they are organised.
# ---------------------------------------------------------------------------

# Root directory holding one sub-folder per scenario plus the labeled output.
root_folder = os.path.join("/", "RanD", "CREMEv2_Result", "20230207", "logs")

# Scenario names; scenario_folder below is index-aligned with this list.
scenario = [
    "mirai",
    "disk_wipe",
    "ransomware",
    "resource_hijacking",
    "end_point_dos",
]

# Each scenario folder is the scenario name prefixed with its 1-based,
# zero-padded position ("01_mirai", "02_disk_wipe", ...).
scenario_folder = [
    "{:02d}_{}".format(position, name)
    for position, name in enumerate(scenario, start=1)
]

# Sub-folder / file names used when building input and output paths.
traffic_folder = "traffic"
label_folder = "label_traffic"
filename = "traffic.pcap"

# Integer label ids listed per scenario.
# NOTE(review): presumably the ids of the attack stages each scenario goes
# through -- confirm against the labeling files.
label = {
    "mirai": [1, 2, 3, 5, 9, 11, 12, 13],
    "disk_wipe": [1, 2, 4, 6, 8, 14],
    "ransomware": [1, 2, 4, 6, 7, 12, 15],
    "resource_hijacking": [1, 2, 4, 6, 8, 12, 16],
    "end_point_dos": [1, 2, 4, 8, 10, 12, 17],
}

# IP addresses of the machines in the testbed, grouped by role.
ips = {
    "controller": ["192.168.56.111"],
    "data-logger": ["192.168.56.121"],
    "attacker": ["192.168.56.131"],
    "non-vulnerable": ["192.168.56.141", "192.168.56.142"],
    "vulnerable": ["192.168.56.151"],
    "malicious": ["192.168.56.161"],
    "benign": ["192.168.56.171"],
    "target": ["192.168.56.181"],
}

2. Extract each scenario's traffic capture into a flow dataset

In [20]:

# Convert every scenario's traffic capture into an NFStream flow CSV.
#
# Each scenario is handled in its own try/except so that one unreadable
# capture is reported without silently skipping all scenarios after it.
nfstream_output_dir = os.path.join(root_folder, label_folder, "nfstream")
# Make sure the destination folder exists before any export is attempted.
os.makedirs(nfstream_output_dir, exist_ok=True)

for scenario_name, scenario_dir in zip(scenario, scenario_folder):
    path = os.path.join(root_folder, scenario_dir, traffic_folder, filename)
    final_filename = "extract_traffic_{}_nfstream.csv".format(scenario_name)
    final_filepath = os.path.join(nfstream_output_dir, final_filename)
    print("Processing the "+scenario_name+" on "+path + " to " + final_filepath)
    try:
        # to_csv() returns the number of exported flows.
        flows_count = NFStreamer(source=path,
                                 n_dissections=0,
                                 statistical_analysis=True).to_csv(path=final_filepath,
                                                                   columns_to_anonymize=(),
                                                                   flows_per_file=0,
                                                                   rotate_files=0)
    except Exception as e:
        # Best effort: report the failing scenario and carry on with the next one.
        print("Failed to process the " + scenario_name + ": " + repr(e))
        continue
    print("==================================================================")
    print(flows_count)

Processing the mirai on /RanD/CREMEv2_Result/20230207/logs/01_mirai/traffic/traffic.pcap to /RanD/CREMEv2_Result/20230207/logs/label_traffic/nfstream/extract_traffic_mirai_nfstream.csv
271098
Processing the disk_wipe on /RanD/CREMEv2_Result/20230207/logs/02_disk_wipe/traffic/traffic.pcap to /RanD/CREMEv2_Result/20230207/logs/label_traffic/nfstream/extract_traffic_disk_wipe_nfstream.csv
196906
Processing the ransomware on /RanD/CREMEv2_Result/20230207/logs/03_ransomware/traffic/traffic.pcap to /RanD/CREMEv2_Result/20230207/logs/label_traffic/nfstream/extract_traffic_ransomware_nfstream.csv
240643
Processing the resource_hijacking on /RanD/CREMEv2_Result/20230207/logs/04_resource_hijacking/traffic/traffic.pcap to /RanD/CREMEv2_Result/20230207/logs/label_traffic/nfstream/extract_traffic_resource_hijacking_nfstream.csv
237974
Processing the end_point_dos on /RanD/CREMEv2_Result/20230207/logs/05_end_point_dos/traffic/traffic.pcap to /RanD/CREMEv2_Result/20230207/logs/label_traffic/nfstream/

3. Labeling

In [8]:
# Cut the mirai flow CSV down to the time window covered by the labeling file
# and store the result under <root>/<label_folder>/nfstream/labeled.
print("========================================================================================")
# A dedicated name is used on purpose: rebinding the global `filename` (the
# pcap name read by the extraction cell) would make that cell build a wrong
# source path when it is re-run.
extract_csv_path = os.path.join(root_folder, label_folder, "nfstream", "extract_traffic_mirai_nfstream.csv")
print(extract_csv_path)

labeling_abs_file = os.path.join(root_folder, "01_mirai", "labeling_file_path.txt")
with open(labeling_abs_file, "r") as fp:
    # JSON list of stages; index 3 / 4 of each stage hold its start / end time.
    labeling_list = json.load(fp)

print(labeling_list)

result_abs_path = os.path.join(root_folder, label_folder, "nfstream", "labeled")
result_file_name = "label_traffic_mirai_nfstream.csv"

# filter window: start of the first stage <-> end of the last stage
start = labeling_list[0][3]
print(start)
end = labeling_list[-1][4]
print(end)

df = pd.read_csv(extract_csv_path)

# Convert the millisecond timestamps to (10-digit) seconds so they are
# comparable with the labeling file. The column keeps its original "_ms" name,
# so the output CSV schema is unchanged.
first_seen_s = df["bidirectional_first_seen_ms"].astype("int64") / 1000
df["bidirectional_first_seen_ms"] = first_seen_s

# Keep only flows first seen inside the labeled window (both ends inclusive).
df = df[(first_seen_s >= start) & (first_seen_s <= end)].reset_index(drop=True)

os.makedirs(result_abs_path, exist_ok=True)
df.to_csv(os.path.join(result_abs_path, result_file_name), index=False)

/RanD/CREMEv2_Result/20230207/logs/label_traffic/nfstream/extract_traffic_mirai_nfstream.csv
[['Reconnaissance', 'Active Scanning', 'Scanning IP Blocks', 1675414494.0, 1675414913.0, ['192.168.56.132'], ['192.168.56.181'], ['192.168.56.171', '192.168.56.161', '192.168.56.151', '192.168.56.141', '192.168.56.142'], ['benign-server', 'vulnerable-machine', 'non-vulnerable-machine-1', 'non-vulnerable-machine-2'], ['target-server'], ['kworker'], [], 1], ['Initial Access', 'Exploit Public-Facing Application', 'Exploit Public-Facing Application', 1675414923.0, 1675414936.0, ['192.168.56.132'], ['192.168.56.181'], ['192.168.56.171', '192.168.56.161', '192.168.56.151', '192.168.56.141', '192.168.56.142'], ['benign-server', 'vulnerable-machine', 'non-vulnerable-machine-1', 'non-vulnerable-machine-2'], ['target-server'], ['kworker'], [], 2], ['Execution', 'Command and Scripting Interpreter', 'Unix Shell', 1675414937.0, 1675414939.0, ['192.168.56.161'], ['192.168.56.151'], ['192.168.56.141', '192.16

In [1]:
# Per-scenario labeling loop.
# NOTE(review): only the loading / timestamp-conversion steps are active; the
# labeling and export steps further down are still commented out, so this cell
# currently writes nothing to disk.
# `i` is kept because the disabled draft below still refers to it.
for i, (scenario_name, scenario_dir) in enumerate(zip(scenario, scenario_folder)):
    print("========================================================================================")
    # A dedicated name is used on purpose: rebinding the global `filename`
    # (the pcap name read by the extraction cell) would break re-running it.
    extract_csv_path = os.path.join(root_folder, label_folder, "nfstream", "extract_traffic_{}_nfstream.csv".format(scenario_name))
    print(extract_csv_path)

    labeling_abs_file = os.path.join(root_folder, scenario_dir, "labeling_file_path.txt")
    with open(labeling_abs_file, "r") as fp:
        # JSON list of stages; index 3 / 4 of each stage hold its start / end time.
        labeling_list = json.load(fp)

    print(labeling_list)

    result_abs_path = os.path.join(root_folder, label_folder, "nfstream", "labeled")
    result_file_name = "label_traffic_{}_nfstream.csv".format(scenario_name)

    # filter window: start of the first stage <-> end of the last stage
    start = labeling_list[0][3]
    print(start)
    end = labeling_list[-1][4]
    print(end)

    df = pd.read_csv(extract_csv_path)

    # Convert the millisecond timestamps to (10-digit) seconds so they are
    # comparable with the stage times of the labeling file.
    df["bidirectional_first_seen_ms"] = df["bidirectional_first_seen_ms"].astype("int64") / 1000

    # ---- disabled draft of the per-stage labeling (kept as-is) ----
    # NOTE(review): before re-enabling, check that the local `label` list below
    # does not clobber the global `label` dict, and that `df.reset_index(...)`
    # is assigned back to `df`.
    # df = df[(df['bidirectional_first_seen_ms'] >= start) & (df['bidirectional_first_seen_ms'] <= end)]
    
    # df.loc['bidirectional_first_seen_ms'].head()
    
    # # add label column
    # label = [-1]*len(df)  # -1: delete, 0: normal, 1: abnormal
    # df['Label'] = label
    # label_lifecycle = [-1]*len(df)  # -1: delete, 0: normal, 1: abnormal
    # df['Label_lifecycle'] = label_lifecycle
        

    # for stage_list in labeling_list:
    #     tactic_name = stage_list[0]
    #     technique_name = stage_list[1]
    #     sub_technique_name = stage_list[2]
    #     start_time = stage_list[3]
    #     end_time = stage_list[4]
    #     srcip_list = stage_list[5]
    #     dstip_list = stage_list[6]
    #     normalip_list = stage_list[7]
    #     label = stage_list[12]

    #     stage = df[(df['bidirectional_first_seen_ms'] >= start_time) & (df['bidirectional_first_seen_ms'] < end_time)]
    #     normal_idx = stage[stage['src_ip'].isin(normalip_list) | stage['dst_ip'].isin(normalip_list)].index
    #     df.loc[normal_idx, 'Label'] = 0
    #     df.loc[normal_idx, 'Label_lifecycle'] = 0
    #         # df.loc[normal_idx, 'Tactic'] = 'Normal'
    #         # df.loc[normal_idx, 'Technique'] = 'Normal'
    #         # df.loc[normal_idx, 'SubTechnique'] = 'Normal'

    #     abnormal_idx = stage[((stage['src_ip'].isin(srcip_list)) & (stage['dst_ip'].isin(dstip_list))) | ((stage['src_ip'].isin(dstip_list)) & (stage['dst_ip'].isin(srcip_list)))].index
    #     df.loc[abnormal_idx, 'Label'] = label
    #     df.loc[abnormal_idx, 'Label_lifecycle'] = i+1
    #         # df.loc[abnormal_idx, 'Tactic'] = tactic_name
    #         # df.loc[abnormal_idx, 'Technique'] = technique_name
    #         # df.loc[abnormal_idx, 'SubTechnique'] = sub_technique_name

    #     stage = df[(df['bidirectional_first_seen_ms'] >= start_time) & (df['bidirectional_first_seen_ms'] < end_time)]
    #     del_idx = stage[stage['Label'] == -1].index
    #     df = df.drop(del_idx)

    # df = df[df['Label'] != -1]

    # df.reset_index(drop=True)

    # if not os.path.exists(result_abs_path):
    #     os.makedirs(result_abs_path)
    # df.to_csv(os.path.join(result_abs_path, result_file_name), index=False)


NameError: name 'scenario' is not defined

In [None]:
def get_timestamps_non_mirai(scenario, log_folder, label_number=0):
    """Read the start/end timestamps of a scenario's time steps.

    For every step ``i`` in ``1..label_number`` the files
    ``time_step_<i>_start.txt`` and ``time_step_<i>_end.txt`` are read from
    ``log_folder``. Only the first 10 characters of the first line are used,
    i.e. the timestamp is cut down to whole seconds.

    Args:
        scenario: Scenario name. Currently unused; kept so existing callers
            keep working.
        log_folder: Directory containing the ``time_step_*`` files.
        label_number: Number of time steps to read. Defaults to 0, which
            reads nothing and yields an empty list.

    Returns:
        A list of integer timestamps ordered as
        ``[start_1, end_1, start_2, end_2, ...]``, or ``None`` when a file
        cannot be read or does not start with a number (the error is printed).
    """
    timestamp_namelist = []
    for step in range(1, label_number + 1):
        for edge in ("start", "end"):
            timestamp_namelist.append(
                os.path.join(log_folder, "time_step_" + str(step) + "_" + edge + ".txt"))

    try:
        timestamps = []
        for timestamp_file in timestamp_namelist:
            with open(timestamp_file, 'rt') as f:
                # read to sec: keep only the first 10 characters of the line
                timestamps.append(int(float(f.readline(10))))
    except (OSError, ValueError) as e:
        # Best effort: report the problem and signal failure with None.
        print(e)
        return None

    print(timestamps)
    return timestamps
        