In [None]:
# We should be using Zeek. 

In [4]:
import pandas as pd
import os

In [29]:
import pandas as pd
import os

def parse_zeek_log(file_path, separator='\t', chunksize=10000):
    """
    Parses a Zeek log file in chunks and returns a DataFrame.
    """
    with open(file_path, 'r') as file:
        # Read the header line starting with '#fields'
        for line in file:
            if line.startswith('#fields'):
                fields = line.strip().split(separator)[1:]
                break
    # Read the file in chunks
    chunks = pd.read_csv(file_path, sep=separator, comment='#', names=fields, chunksize=chunksize, low_memory=False)
    return chunks

def process_zeek_logs(log_directory, chunksize=10000):
    """
    Processes multiple Zeek log files in chunks, extracts key features, and merges them on 'uid'.
    Returns a single DataFrame combining all processed logs.
    """
    all_conn = []
    all_dns = []
    all_http = []

    # Read and process logs
    for log_file in os.listdir(log_directory):
        file_path = os.path.join(log_directory, log_file)
        if log_file.endswith('.log'):
            chunks = parse_zeek_log(file_path, chunksize=chunksize)
            for chunk in chunks:
                if 'conn.log' in log_file:
                    all_conn.append(extract_conn_features(chunk))
                elif 'dns.log' in log_file:
                    all_dns.append(extract_dns_features(chunk))
                elif 'http.log' in log_file:
                    all_http.append(extract_http_features(chunk))

    # Concatenate each log type separately
    df_conn = pd.concat(all_conn, ignore_index=True) if all_conn else pd.DataFrame()
    df_dns = pd.concat(all_dns, ignore_index=True) if all_dns else pd.DataFrame()
    df_http = pd.concat(all_http, ignore_index=True) if all_http else pd.DataFrame()

    # Merge logs on 'uid'
    df_combined = df_conn
    if not df_dns.empty:
        df_combined = df_combined.merge(df_dns, on='uid', how='left', suffixes=('', '_dns'))
    if not df_http.empty:
        df_combined = df_combined.merge(df_http, on='uid', how='left', suffixes=('', '_http'))

    return df_combined.fillna(0)  # Replace NaN values with 0 for ML compatibility

def extract_conn_features(df):
    """
    Extracts connection-related features.
    Adds a derived feature 'is_incomplete_conn' to indicate if the connection is incomplete.
    """
    required_cols = ['ts', 'uid', 'id.orig_h', 'id.resp_h', 'id.resp_p', 'proto', 'service', 
                     'duration', 'orig_bytes', 'resp_bytes', 'conn_state', 'history']
    available_cols = [col for col in required_cols if col in df.columns]
    df = df[available_cols].copy()

    df['ts'] = pd.to_numeric(df['ts'], errors='coerce')
    df['duration'] = pd.to_numeric(df['duration'], errors='coerce').fillna(0)
    df['orig_bytes'] = pd.to_numeric(df['orig_bytes'], errors='coerce').fillna(0)
    df['resp_bytes'] = pd.to_numeric(df['resp_bytes'], errors='coerce').fillna(0)
    df['log_type'] = 'conn'  

    # Derived feature: Check if connection is incomplete (e.g., missing conn_state)
    df['is_incomplete_conn'] = df['conn_state'].isna().astype(int)

    return df

def extract_dns_features(df):
    required_cols = ['ts', 'uid', 'id.orig_h', 'query']
    available_cols = [col for col in required_cols if col in df.columns]  
    df = df[available_cols].copy()
    
    df['log_type'] = 'dns'
    return df

def extract_http_features(df):
    required_cols = ['ts', 'uid', 'id.orig_h', 'id.resp_h', 'method']
    available_cols = [col for col in required_cols if col in df.columns]
    df = df[available_cols].copy()

    df['log_type'] = 'http'  
    return df


In [9]:
def make_zeek_log(folder, subfolder):
    if not os.path.exists(f"../logs/{subfolder}"):
        normal_capture_loc = f"data/{folder}/{subfolder}.pcap"
        os.system(f"mkdir ../logs/{subfolder}")
        os.system(f"zeek -r {normal_capture_loc} Log::default_logdir=logs/{subfolder}/")
    else:
        print(f"../logs/{subfolder} already exists.")

In [10]:
for pcap in ["attack_traffic", "flood\ attack", "port\ scan"]:
    make_zeek_log("Attack", pcap)

../logs/attack_traffic already exists.


mkdir: cannot create directory ‘../logs/flood attack’: File exists
fatal error in <params>, line 1: problem with trace file data/Attack/flood attack.pcap (unable to open data/Attack/flood attack.pcap: No such file or directory)
mkdir: cannot create directory ‘../logs/port scan’: File exists
fatal error in <params>, line 1: problem with trace file data/Attack/port scan.pcap (unable to open data/Attack/port scan.pcap: No such file or directory)


In [30]:
attack_traffic_df = process_zeek_logs("../logs/attack_traffic")
attack_traffic_df

Unnamed: 0,ts,uid,id.orig_h,id.resp_h,id.resp_p,proto,service,duration,orig_bytes,resp_bytes,conn_state,history,log_type,is_incomplete_conn,ts_dns,id.orig_h_dns,query,log_type_dns
0,1740775000.0,CJHIUf3KRCJvIMPQsh,10.0.2.15,52.168.112.67,443,tcp,-,0.479902,0.0,4839.0,SHR,^hadf,conn,0,0.0,0,0,0
1,1740775000.0,CLtzLi4vW9ay9hF5v8,10.0.2.15,20.69.137.228,443,tcp,-,0.0,0.0,0.0,RSTRH,^r,conn,0,0.0,0,0,0
2,1740775000.0,C1fCCw3NHcU4WAd9f5,10.0.2.15,10.0.2.3,53,udp,dns,0.0,0.0,0.0,SHR,^d,conn,0,1740775000.0,10.0.2.15,v10.events.data.microsoft.com,dns
3,1740775000.0,Cw4rur3X91prCEb74c,10.0.2.15,10.0.2.3,53,udp,dns,0.0,0.0,0.0,SHR,^d,conn,0,1740775000.0,10.0.2.15,v10.events.data.microsoft.com,dns
4,1740775000.0,CKJkLE2bBycuC8zUS4,10.0.2.15,10.0.2.3,53,udp,dns,0.0,0.0,0.0,SHR,^d,conn,0,1740775000.0,10.0.2.15,edge-consumer-static.azureedge.net,dns
5,1740775000.0,CjwmZE4GMoQNaiOuG7,10.0.2.15,10.0.2.3,53,udp,dns,0.0,0.0,0.0,SHR,^d,conn,0,1740775000.0,10.0.2.15,edge-consumer-static.azureedge.net,dns
6,1740775000.0,CzFUFO3dWZ0Q4XSXUk,10.0.2.15,10.0.2.3,53,udp,dns,0.0,0.0,0.0,SHR,^d,conn,0,1740775000.0,10.0.2.15,edge-consumer-static.azureedge.net,dns
7,1740775000.0,CXBlMA2fTjnh14qhv4,10.0.2.15,13.107.246.57,443,tcp,-,0.0,0.0,0.0,OTH,^a,conn,0,0.0,0,0,0
8,1740775000.0,CaDyTu18W6P0M0Xlh7,10.0.2.15,13.107.246.57,443,tcp,-,0.0,0.0,0.0,OTH,^a,conn,0,0.0,0,0,0
9,1740775000.0,CTkcdz3oz4VhsF682f,10.0.2.15,13.107.246.57,443,tcp,-,0.0,0.0,0.0,OTH,^a,conn,0,0.0,0,0,0


In [31]:
flood_attack_df = process_zeek_logs("../logs/flood attack")
flood_attack_df

Unnamed: 0,ts,uid,id.orig_h,id.resp_h,id.resp_p,proto,service,duration,orig_bytes,resp_bytes,conn_state,history,log_type,is_incomplete_conn,ts_dns,id.orig_h_dns,query,log_type_dns
0,1.741555e+09,CoqARw9uTGNyYiOT4,192.168.56.105,192.168.56.104,443,tcp,-,0.000000,0.0,0.0,S0,S,conn,0,0.0,0,0,0
1,1.741555e+09,CHY7Ll5AgR6S3niNa,192.168.56.105,192.168.56.104,443,tcp,-,0.000000,0.0,0.0,S0,S,conn,0,0.0,0,0,0
2,1.741555e+09,CWLMcU15wbSxErUA3c,192.168.56.105,192.168.56.104,443,tcp,-,0.000000,0.0,0.0,S0,S,conn,0,0.0,0,0,0
3,1.741555e+09,Cy8HWM6OzlQuHPi77,192.168.56.105,192.168.56.104,443,tcp,-,0.000000,0.0,0.0,S0,S,conn,0,0.0,0,0,0
4,1.741555e+09,CrQBDp3DyVkImHfna,192.168.56.105,192.168.56.104,443,tcp,-,0.000000,0.0,0.0,S0,S,conn,0,0.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74979,1.741555e+09,C96t51QpjFoQzDbX4,192.168.56.105,192.168.56.104,443,tcp,-,0.000000,0.0,0.0,S0,S,conn,0,0.0,0,0,0
74980,1.741555e+09,COeqfn31yDu6vekR14,192.168.56.105,192.168.56.104,443,tcp,-,0.000000,0.0,0.0,S0,S,conn,0,0.0,0,0,0
74981,1.741555e+09,CCuj1kXBiTavBOnXb,192.168.56.105,192.168.56.104,443,tcp,-,0.000000,0.0,0.0,S0,S,conn,0,0.0,0,0,0
74982,1.741555e+09,CV7YlC46BMofdxv8Df,192.168.56.105,192.168.56.104,443,tcp,-,0.000000,0.0,0.0,S0,S,conn,0,0.0,0,0,0


In [32]:
port_scan_df = process_zeek_logs("../logs/port scan")
port_scan_df

Unnamed: 0,ts,uid,id.orig_h,id.resp_h,id.resp_p,proto,service,duration,orig_bytes,resp_bytes,...,is_incomplete_conn,ts_dns,id.orig_h_dns,query,log_type_dns,ts_http,id.orig_h_http,id.resp_h_http,method,log_type_http
0,1.741721e+09,CEwCxf1afj0kWISkUf,fe80::51ce:219c:48cb:74aa,ff02::fb,5353,udp,dns,0.833550,804.0,0.0,...,0,1.741721e+09,fe80::51ce:219c:48cb:74aa,_dosvc._tcp.local,dns,0.0,0,0,0,0
1,1.741721e+09,CEwCxf1afj0kWISkUf,fe80::51ce:219c:48cb:74aa,ff02::fb,5353,udp,dns,0.833550,804.0,0.0,...,0,1.741721e+09,fe80::51ce:219c:48cb:74aa,target._dosvc._tcp.local,dns,0.0,0,0,0,0
2,1.741721e+09,CEwCxf1afj0kWISkUf,fe80::51ce:219c:48cb:74aa,ff02::fb,5353,udp,dns,0.833550,804.0,0.0,...,0,1.741721e+09,fe80::51ce:219c:48cb:74aa,target._dosvc._tcp.local,dns,0.0,0,0,0,0
3,1.741721e+09,CEwCxf1afj0kWISkUf,fe80::51ce:219c:48cb:74aa,ff02::fb,5353,udp,dns,0.833550,804.0,0.0,...,0,1.741721e+09,fe80::51ce:219c:48cb:74aa,target._dosvc._tcp.local,dns,0.0,0,0,0,0
4,1.741721e+09,CXIYLQ1jpDx7vl3We2,fe80::51ce:219c:48cb:74aa,fe80::fd7:361b:ca40:d330,5353,udp,dns,0.000000,0.0,0.0,...,0,1.741721e+09,fe80::51ce:219c:48cb:74aa,_dosvc._tcp.local,dns,0.0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2489,1.741721e+09,CEMEez3bPBBq7OsUd8,192.168.56.105,192.168.56.104,33131,udp,-,0.000000,0.0,0.0,...,0,0.000000e+00,0,0,0,0.0,0,0,0,0
2490,1.741721e+09,Cr556f4wKlU6HD3R74,192.168.56.105,192.168.56.104,1434,udp,-,0.000000,0.0,0.0,...,0,0.000000e+00,0,0,0,0.0,0,0,0,0
2491,1.741721e+09,C6b3VF1hlnwI7Crrme,fe80::fd7:361b:ca40:d330,ff02::16,0,icmp,-,0.113688,100.0,0.0,...,0,0.000000e+00,0,0,0,0.0,0,0,0,0
2492,1.741721e+09,CQgRlY2xkMneMj2UAj,192.168.56.102,192.168.56.100,67,udp,dhcp,0.016169,316.0,548.0,...,0,0.000000e+00,0,0,0,0.0,0,0,0,0


In [15]:
make_zeek_log("Normal", "normal_traffic")

../logs/normal_traffic already exists.


In [33]:
normal_traffic_df = process_zeek_logs("../logs/normal_traffic")
normal_traffic_df

Unnamed: 0,ts,uid,id.orig_h,id.resp_h,id.resp_p,proto,service,duration,orig_bytes,resp_bytes,...,is_incomplete_conn,ts_dns,id.orig_h_dns,query,log_type_dns,ts_http,id.orig_h_http,id.resp_h_http,method,log_type_http
0,1.740775e+09,CiDkso44FNpfcZ3j5e,10.0.2.15,20.69.137.228,443,tcp,-,0.063953,0.0,0.0,...,0,0.000000e+00,0,0,0,0.000000e+00,0,0,0,0
1,1.740775e+09,CJwd182PibrZUPl4Ma,fd00::94aa:11c6:a50d:cd41,2620:1ec:bdf::57,443,tcp,-,0.000720,0.0,0.0,...,0,0.000000e+00,0,0,0,0.000000e+00,0,0,0,0
2,1.740775e+09,CU4qcm32PuSUiYLPy4,fd00::94aa:11c6:a50d:cd41,2620:1ec:bdf::57,443,tcp,-,0.000457,0.0,0.0,...,0,0.000000e+00,0,0,0,0.000000e+00,0,0,0,0
3,1.740775e+09,CVcwvnNrDSIDFCgQf,10.0.2.15,10.0.2.3,53,udp,dns,0.000000,0.0,0.0,...,0,1.740775e+09,10.0.2.15,amazon.com,dns,0.000000e+00,0,0,0,0
4,1.740775e+09,Ck0w3t26Ebu5Jn1Lm3,10.0.2.15,10.0.2.3,53,udp,dns,0.000000,0.0,0.0,...,0,1.740775e+09,10.0.2.15,edge-consumer-static.azureedge.net,dns,0.000000e+00,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74,1.740775e+09,CfgWhN2Suc67Vyf2t3,10.0.2.15,52.94.236.248,80,tcp,-,35.444550,0.0,0.0,...,0,0.000000e+00,0,0,0,0.000000e+00,0,0,0,0
75,1.740775e+09,Crgn8y37NLoMZwSmpk,10.0.2.15,142.250.113.139,443,tcp,-,36.393371,0.0,0.0,...,0,0.000000e+00,0,0,0,0.000000e+00,0,0,0,0
76,1.740775e+09,CB3c1U2cqHVBeZGo11,10.0.2.15,142.250.138.91,80,tcp,-,50.705655,0.0,0.0,...,0,0.000000e+00,0,0,0,0.000000e+00,0,0,0,0
77,1.740775e+09,CTIZLf4XYOsSxEDi6k,10.0.2.15,23.198.7.173,443,udp,-,0.000000,0.0,0.0,...,0,0.000000e+00,0,0,0,0.000000e+00,0,0,0,0


In [18]:
normal_traffic_df.loc[normal_traffic_df['is_incomplete_conn'] == 0]

Unnamed: 0,ts,uid,id.orig_h,id.resp_h,proto,service,duration,orig_bytes,resp_bytes,conn_state,...,is_incomplete_conn,ts_dns,id.orig_h_dns,query,log_type_dns,ts_http,id.orig_h_http,id.resp_h_http,method,log_type_http
0,1.740775e+09,CiDkso44FNpfcZ3j5e,10.0.2.15,20.69.137.228,tcp,-,0.063953,0.0,0.0,SHR,...,0,0.000000e+00,0,0,0,0.000000e+00,0,0,0,0
1,1.740775e+09,CJwd182PibrZUPl4Ma,fd00::94aa:11c6:a50d:cd41,2620:1ec:bdf::57,tcp,-,0.000720,0.0,0.0,REJ,...,0,0.000000e+00,0,0,0,0.000000e+00,0,0,0,0
2,1.740775e+09,CU4qcm32PuSUiYLPy4,fd00::94aa:11c6:a50d:cd41,2620:1ec:bdf::57,tcp,-,0.000457,0.0,0.0,REJ,...,0,0.000000e+00,0,0,0,0.000000e+00,0,0,0,0
3,1.740775e+09,CVcwvnNrDSIDFCgQf,10.0.2.15,10.0.2.3,udp,dns,0.000000,0.0,0.0,SHR,...,0,1.740775e+09,10.0.2.15,amazon.com,dns,0.000000e+00,0,0,0,0
4,1.740775e+09,Ck0w3t26Ebu5Jn1Lm3,10.0.2.15,10.0.2.3,udp,dns,0.000000,0.0,0.0,SHR,...,0,1.740775e+09,10.0.2.15,edge-consumer-static.azureedge.net,dns,0.000000e+00,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74,1.740775e+09,CfgWhN2Suc67Vyf2t3,10.0.2.15,52.94.236.248,tcp,-,35.444550,0.0,0.0,S1,...,0,0.000000e+00,0,0,0,0.000000e+00,0,0,0,0
75,1.740775e+09,Crgn8y37NLoMZwSmpk,10.0.2.15,142.250.113.139,tcp,-,36.393371,0.0,0.0,S1,...,0,0.000000e+00,0,0,0,0.000000e+00,0,0,0,0
76,1.740775e+09,CB3c1U2cqHVBeZGo11,10.0.2.15,142.250.138.91,tcp,-,50.705655,0.0,0.0,S1,...,0,0.000000e+00,0,0,0,0.000000e+00,0,0,0,0
77,1.740775e+09,CTIZLf4XYOsSxEDi6k,10.0.2.15,23.198.7.173,udp,-,0.000000,0.0,0.0,SHR,...,0,0.000000e+00,0,0,0,0.000000e+00,0,0,0,0


In [34]:
malicious_dfs = [port_scan_df, flood_attack_df, attack_traffic_df]
for df in malicious_dfs:
    df['Malicious'] = 1
malicious_dfs = pd.concat(malicious_dfs, ignore_index=True)
malicious_dfs

Unnamed: 0,ts,uid,id.orig_h,id.resp_h,id.resp_p,proto,service,duration,orig_bytes,resp_bytes,...,ts_dns,id.orig_h_dns,query,log_type_dns,ts_http,id.orig_h_http,id.resp_h_http,method,log_type_http,Malicious
0,1.741721e+09,CEwCxf1afj0kWISkUf,fe80::51ce:219c:48cb:74aa,ff02::fb,5353,udp,dns,0.833550,804.0,0.0,...,1.741721e+09,fe80::51ce:219c:48cb:74aa,_dosvc._tcp.local,dns,0.0,0,0,0,0,1
1,1.741721e+09,CEwCxf1afj0kWISkUf,fe80::51ce:219c:48cb:74aa,ff02::fb,5353,udp,dns,0.833550,804.0,0.0,...,1.741721e+09,fe80::51ce:219c:48cb:74aa,target._dosvc._tcp.local,dns,0.0,0,0,0,0,1
2,1.741721e+09,CEwCxf1afj0kWISkUf,fe80::51ce:219c:48cb:74aa,ff02::fb,5353,udp,dns,0.833550,804.0,0.0,...,1.741721e+09,fe80::51ce:219c:48cb:74aa,target._dosvc._tcp.local,dns,0.0,0,0,0,0,1
3,1.741721e+09,CEwCxf1afj0kWISkUf,fe80::51ce:219c:48cb:74aa,ff02::fb,5353,udp,dns,0.833550,804.0,0.0,...,1.741721e+09,fe80::51ce:219c:48cb:74aa,target._dosvc._tcp.local,dns,0.0,0,0,0,0,1
4,1.741721e+09,CXIYLQ1jpDx7vl3We2,fe80::51ce:219c:48cb:74aa,fe80::fd7:361b:ca40:d330,5353,udp,dns,0.000000,0.0,0.0,...,1.741721e+09,fe80::51ce:219c:48cb:74aa,_dosvc._tcp.local,dns,0.0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77490,1.740775e+09,C3XRPn48yx1btjnaNe,10.0.2.15,13.107.246.57,443,tcp,-,0.049190,0.0,442.0,...,0.000000e+00,0,0,0,,,,,,1
77491,1.740775e+09,CaAGIi4h9mncZpH3lh,fd00::94aa:11c6:a50d:cd41,2620:1ec:bdf::57,443,tcp,-,0.000584,0.0,0.0,...,0.000000e+00,0,0,0,,,,,,1
77492,1.740775e+09,Cx4rokijkDX2RJRRk,10.0.2.15,20.59.87.227,443,tcp,-,0.059458,0.0,169.0,...,0.000000e+00,0,0,0,,,,,,1
77493,1.740775e+09,C6ZwHPR2b3AoKSsdf,10.0.2.15,23.198.7.176,443,udp,-,10.006460,0.0,121.0,...,0.000000e+00,0,0,0,,,,,,1


In [35]:
normal_traffic_df['Malicious'] = 0
total_dfs = pd.concat([malicious_dfs, normal_traffic_df], ignore_index=True)
total_dfs

Unnamed: 0,ts,uid,id.orig_h,id.resp_h,id.resp_p,proto,service,duration,orig_bytes,resp_bytes,...,ts_dns,id.orig_h_dns,query,log_type_dns,ts_http,id.orig_h_http,id.resp_h_http,method,log_type_http,Malicious
0,1.741721e+09,CEwCxf1afj0kWISkUf,fe80::51ce:219c:48cb:74aa,ff02::fb,5353,udp,dns,0.833550,804.0,0.0,...,1.741721e+09,fe80::51ce:219c:48cb:74aa,_dosvc._tcp.local,dns,0.000000e+00,0,0,0,0,1
1,1.741721e+09,CEwCxf1afj0kWISkUf,fe80::51ce:219c:48cb:74aa,ff02::fb,5353,udp,dns,0.833550,804.0,0.0,...,1.741721e+09,fe80::51ce:219c:48cb:74aa,target._dosvc._tcp.local,dns,0.000000e+00,0,0,0,0,1
2,1.741721e+09,CEwCxf1afj0kWISkUf,fe80::51ce:219c:48cb:74aa,ff02::fb,5353,udp,dns,0.833550,804.0,0.0,...,1.741721e+09,fe80::51ce:219c:48cb:74aa,target._dosvc._tcp.local,dns,0.000000e+00,0,0,0,0,1
3,1.741721e+09,CEwCxf1afj0kWISkUf,fe80::51ce:219c:48cb:74aa,ff02::fb,5353,udp,dns,0.833550,804.0,0.0,...,1.741721e+09,fe80::51ce:219c:48cb:74aa,target._dosvc._tcp.local,dns,0.000000e+00,0,0,0,0,1
4,1.741721e+09,CXIYLQ1jpDx7vl3We2,fe80::51ce:219c:48cb:74aa,fe80::fd7:361b:ca40:d330,5353,udp,dns,0.000000,0.0,0.0,...,1.741721e+09,fe80::51ce:219c:48cb:74aa,_dosvc._tcp.local,dns,0.000000e+00,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77569,1.740775e+09,CfgWhN2Suc67Vyf2t3,10.0.2.15,52.94.236.248,80,tcp,-,35.444550,0.0,0.0,...,0.000000e+00,0,0,0,0.000000e+00,0,0,0,0,0
77570,1.740775e+09,Crgn8y37NLoMZwSmpk,10.0.2.15,142.250.113.139,443,tcp,-,36.393371,0.0,0.0,...,0.000000e+00,0,0,0,0.000000e+00,0,0,0,0,0
77571,1.740775e+09,CB3c1U2cqHVBeZGo11,10.0.2.15,142.250.138.91,80,tcp,-,50.705655,0.0,0.0,...,0.000000e+00,0,0,0,0.000000e+00,0,0,0,0,0
77572,1.740775e+09,CTIZLf4XYOsSxEDi6k,10.0.2.15,23.198.7.173,443,udp,-,0.000000,0.0,0.0,...,0.000000e+00,0,0,0,0.000000e+00,0,0,0,0,0


In [22]:
import pandas as pd

# Load the CSV files into DataFrames
a_df = pd.read_csv('blended_packetsv2.csv')
b_df = pd.read_csv('blended_packets.csv')

# Check the number of rows where 'malicious' == 0 in both DataFrames
a_malicious_0_count = len(a_df[a_df['Malicious'] == 1])
b_malicious_0_count = len(b_df[b_df['Label'] == 0])

# Print the results
print(f"Number of rows with 'malicious' == 0 in a.csv: {a_malicious_0_count}")
print(f"Number of rows with 'malicious' == 0 in b.csv: {b_malicious_0_count}")

Number of rows with 'malicious' == 0 in a.csv: 77495
Number of rows with 'malicious' == 0 in b.csv: 95462


  a_df = pd.read_csv('blended_packetsv2.csv')


In [23]:
def normalizeTime(in_df, start_point = 0):
    df = in_df.copy()
    timetable = []
    start_time = float(df['ts'][0])
    
    for packet in df.itertuples():
        time_diff = float(packet[1]) - start_time
        timetable.append(start_point + time_diff)
    
    df['ts'] = timetable
    return df

normalizeTime(total_dfs)

Unnamed: 0,ts,uid,id.orig_h,id.resp_h,proto,service,duration,orig_bytes,resp_bytes,conn_state,...,ts_dns,id.orig_h_dns,query,log_type_dns,ts_http,id.orig_h_http,id.resp_h_http,method,log_type_http,Malicious
0,0.000000,CEwCxf1afj0kWISkUf,fe80::51ce:219c:48cb:74aa,ff02::fb,udp,dns,0.833550,804.0,0.0,S0,...,1.741721e+09,fe80::51ce:219c:48cb:74aa,_dosvc._tcp.local,dns,0.000000e+00,0,0,0,0,1
1,0.000000,CEwCxf1afj0kWISkUf,fe80::51ce:219c:48cb:74aa,ff02::fb,udp,dns,0.833550,804.0,0.0,S0,...,1.741721e+09,fe80::51ce:219c:48cb:74aa,target._dosvc._tcp.local,dns,0.000000e+00,0,0,0,0,1
2,0.000000,CEwCxf1afj0kWISkUf,fe80::51ce:219c:48cb:74aa,ff02::fb,udp,dns,0.833550,804.0,0.0,S0,...,1.741721e+09,fe80::51ce:219c:48cb:74aa,target._dosvc._tcp.local,dns,0.000000e+00,0,0,0,0,1
3,0.000000,CEwCxf1afj0kWISkUf,fe80::51ce:219c:48cb:74aa,ff02::fb,udp,dns,0.833550,804.0,0.0,S0,...,1.741721e+09,fe80::51ce:219c:48cb:74aa,target._dosvc._tcp.local,dns,0.000000e+00,0,0,0,0,1
4,0.000865,CXIYLQ1jpDx7vl3We2,fe80::51ce:219c:48cb:74aa,fe80::fd7:361b:ca40:d330,udp,dns,0.000000,0.0,0.0,S0,...,1.741721e+09,fe80::51ce:219c:48cb:74aa,_dosvc._tcp.local,dns,0.000000e+00,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77569,-945372.344382,CfgWhN2Suc67Vyf2t3,10.0.2.15,52.94.236.248,tcp,-,35.444550,0.0,0.0,S1,...,0.000000e+00,0,0,0,0.000000e+00,0,0,0,0,0
77570,-945367.027479,Crgn8y37NLoMZwSmpk,10.0.2.15,142.250.113.139,tcp,-,36.393371,0.0,0.0,S1,...,0.000000e+00,0,0,0,0.000000e+00,0,0,0,0,0
77571,-945382.822954,CB3c1U2cqHVBeZGo11,10.0.2.15,142.250.138.91,tcp,-,50.705655,0.0,0.0,S1,...,0.000000e+00,0,0,0,0.000000e+00,0,0,0,0,0
77572,-945382.637871,CTIZLf4XYOsSxEDi6k,10.0.2.15,23.198.7.173,udp,-,0.000000,0.0,0.0,SHR,...,0.000000e+00,0,0,0,0.000000e+00,0,0,0,0,0


In [36]:
total_dfs.columns

Index(['ts', 'uid', 'id.orig_h', 'id.resp_h', 'id.resp_p', 'proto', 'service',
       'duration', 'orig_bytes', 'resp_bytes', 'conn_state', 'history',
       'log_type', 'is_incomplete_conn', 'ts_dns', 'id.orig_h_dns', 'query',
       'log_type_dns', 'ts_http', 'id.orig_h_http', 'id.resp_h_http', 'method',
       'log_type_http', 'Malicious'],
      dtype='object')

In [40]:
# ts
columns = ['ts', 'uid', 'id.orig_h', 'id.resp_h', 'id.resp_p', 'proto', 'duration', 'orig_bytes', 'resp_bytes', 'is_incomplete_conn', 'history', 'Malicious']
output_df = total_dfs[columns]

In [41]:
output_df.to_csv('output.csv', index=False)