In [15]:
import pyshark as pys
import pandas as pd

In [16]:
# Needed for pyshark
import nest_asyncio
nest_asyncio.apply()

In [17]:
# PCAP file to DataFrame
def getSrcIP(packet):
    try:
        return packet.ip.src
    except:
        return None
def getDstIP(packet):
    try:
        return packet.ip.dst
    except:
        return None
def getProtocol(packet):
    try:
        if (packet.transport_layer):
            return packet.transport_layer
        else:
            return packet.highest_layer
    except:
        return None
def getSrcPort(packet):
    try:
        if (packet.transport_layer):
            return packet[packet.transport_layer].srcport
        else:
            return packet[packet.highest_layer].srcport
    except:
        return None
def getDstPort(packet):
    try:
        if (packet.transport_layer):
            return packet[packet.transport_layer].dstport
        else:
            return packet[packet.highest_layer].dstport
    except:
        return None
def getLength(packet):
    try:
        return packet.ip.len
    except:
        return None
def getTCPFlags(packet):
    try:
        if (packet.transport_layer):
            return packet[packet.transport_layer].flags
        else:
            return packet[packet.highest_layer].flags
    except:
        return None
def pcap2DF(capture, label):
    table = []
    for packet in capture:
        row = [packet.sniff_timestamp, 
               getSrcIP(packet), 
               getDstIP(packet), 
               getProtocol(packet), 
               getSrcPort(packet), 
               getDstPort(packet), 
               getLength(packet),
               getTCPFlags(packet),
               label]
        table.append(row)
    return pd.DataFrame(table, columns=['Timestamp', 'Source IP', 'Destination IP', 'Protocol', 'Source Port', 'Destination Port', 'Packet Length', 'TCP Flags', 'Label'])

In [18]:
normal_capture = pys.FileCapture("B:\\Downloads\\wg0-capture-2.pcapng")
norm_df = pcap2DF(normal_capture, 0)

In [36]:
attack_capture = pys.FileCapture("B:\\Downloads\\test2.pcap")
flood_df = pcap2DF(attack_capture, 1)

In [37]:
attack2_capture = pys.FileCapture("B:\\Downloads\\port scan.pcapng")
scan_df = pcap2DF(attack2_capture, 2)

In [38]:
# Backups made for debugging
df_archive_normal = norm_df.copy()
df_archive_flood = flood_df.copy()
df_archive_scan = scan_df.copy()

In [39]:
# DataFrame modifiers
def normalizeTime(in_df, start_point = 0):
    df = in_df.copy()
    timetable = []
    start_time = float(df['Timestamp'][0])
    
    for packet in df.itertuples():
        time_diff = float(packet[1]) - start_time
        timetable.append(start_point + time_diff)
    
    df['Timestamp'] = timetable
    return df
def noIP(in_df):
    df = in_df.copy()
    #  Network IDs replace IPs in dataset
    known_machines = {}
    
    i = 0
    for packet in df.itertuples():
        src = packet[2]
        dst = packet[3]
        
        if (not src in known_machines):
            known_machines[src] = i
            i += 1
        elif (not dst in known_machines):
            known_machines[dst] = i
            i += 1
            
    # Replace Source IP with Network ID
    new_src = []
    for src in df['Source IP']:
        new_src.append(known_machines[src])
    df['Source IP'] = new_src
    
    # Replace Destination IP with Network ID
    new_dst = []
    for dst in df['Destination IP']:
        new_dst.append(known_machines[dst])
    df['Destination IP'] = new_dst
    return df
def applyDFModifiers(in_df, start_point = 0):
    df = in_df.copy()
    return noIP(normalizeTime(df, start_point))

In [40]:
norm_df = applyDFModifiers(norm_df)
# Flood injected at 1000s
flood_df = applyDFModifiers(flood_df, norm_df['Timestamp'][37005])
# Scan injected at 500s
scan_df = applyDFModifiers(scan_df, norm_df['Timestamp'][15523])

In [41]:
mix_df = pd.concat([norm_df, flood_df, scan_df])

In [43]:
mix_df = mix_df.sort_values(by=['Timestamp'])

In [45]:
# Backup made for debugging
df_archive_mix = mix_df.copy()

In [87]:
df_archive_mix['Protocol'].value_counts()

Protocol
TCP       179529
UDP        20306
ICMP         643
IGMP         118
ARP           10
ICMPV6        10
Name: count, dtype: int64

In [47]:
# Start of cleaning
mix_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 200616 entries, 0 to 95461
Data columns (total 9 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Timestamp         200616 non-null  float64
 1   Source IP         200616 non-null  int64  
 2   Destination IP    200616 non-null  int64  
 3   Protocol          200616 non-null  object 
 4   Source Port       199835 non-null  object 
 5   Destination Port  199835 non-null  object 
 6   Packet Length     200446 non-null  object 
 7   TCP Flags         179529 non-null  object 
 8   Label             200616 non-null  int64  
dtypes: float64(1), int64(3), object(5)
memory usage: 15.3+ MB


In [55]:
# Protocol: obj -> int
new_col = []
# Numbers retrieved from IANA standards
for proto in mix_df['Protocol'].values:
    if (proto == 'TCP'):
        new_col.append(6)
    elif (proto == 'UDP' or proto == 'MDNS'):
        new_col.append(17)
    elif (proto == 'ICMP'):
        new_col.append(1)
    elif (proto == 'IGMP'):
        new_col.append(2)
    elif (proto == 'ICMPV6'):
        new_col.append(58)
    else:
        new_col.append(0)
mix_df['Protocol'] = new_col

In [56]:
mix_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 200616 entries, 0 to 95461
Data columns (total 9 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Timestamp         200616 non-null  float64
 1   Source IP         200616 non-null  int64  
 2   Destination IP    200616 non-null  int64  
 3   Protocol          200616 non-null  int64  
 4   Source Port       199835 non-null  object 
 5   Destination Port  199835 non-null  object 
 6   Packet Length     200446 non-null  object 
 7   TCP Flags         179529 non-null  object 
 8   Label             200616 non-null  int64  
dtypes: float64(1), int64(4), object(4)
memory usage: 15.3+ MB


In [66]:
# Source Port: Fill null && obj -> int -- Replace with 0
new_col = []
for item in mix_df['Source Port'].values:
    if (item == None):
        new_col.append(0)
    else:
        new_col.append(int(item))
mix_df['Source Port'] = new_col

In [68]:
mix_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 200616 entries, 0 to 95461
Data columns (total 9 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Timestamp         200616 non-null  float64
 1   Source IP         200616 non-null  int64  
 2   Destination IP    200616 non-null  int64  
 3   Protocol          200616 non-null  int64  
 4   Source Port       200616 non-null  int64  
 5   Destination Port  199835 non-null  object 
 6   Packet Length     200446 non-null  object 
 7   TCP Flags         179529 non-null  object 
 8   Label             200616 non-null  int64  
dtypes: float64(1), int64(5), object(3)
memory usage: 15.3+ MB


In [69]:
# Destination Port: Fill null && obj -> int
new_col = []
for item in mix_df['Destination Port'].values:
    if (item == None):
        new_col.append(0)
    else:
        new_col.append(int(item))
mix_df['Destination Port'] = new_col

In [70]:
mix_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 200616 entries, 0 to 95461
Data columns (total 9 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Timestamp         200616 non-null  float64
 1   Source IP         200616 non-null  int64  
 2   Destination IP    200616 non-null  int64  
 3   Protocol          200616 non-null  int64  
 4   Source Port       200616 non-null  int64  
 5   Destination Port  200616 non-null  int64  
 6   Packet Length     200446 non-null  object 
 7   TCP Flags         179529 non-null  object 
 8   Label             200616 non-null  int64  
dtypes: float64(1), int64(6), object(2)
memory usage: 15.3+ MB


In [78]:
# Packet Length: Fill null && obj -> int
new_col = []
for item in mix_df['Packet Length'].values:
    if (item == None):
        new_col.append(0)
    else:
        new_col.append(int(item))
mix_df['Packet Length'] = new_col

In [79]:
mix_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 200616 entries, 0 to 95461
Data columns (total 9 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Timestamp         200616 non-null  float64
 1   Source IP         200616 non-null  int64  
 2   Destination IP    200616 non-null  int64  
 3   Protocol          200616 non-null  int64  
 4   Source Port       200616 non-null  int64  
 5   Destination Port  200616 non-null  int64  
 6   Packet Length     200616 non-null  int64  
 7   TCP Flags         179529 non-null  object 
 8   Label             200616 non-null  int64  
dtypes: float64(1), int64(7), object(1)
memory usage: 15.3+ MB


In [81]:
new_col = []
for item in mix_df['TCP Flags']:
    try:
        new_col.append(item.int_value)
    except:
        new_col.append(0)
mix_df['TCP Flags'] = new_col

In [83]:
mix_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 200616 entries, 0 to 95461
Data columns (total 9 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Timestamp         200616 non-null  float64
 1   Source IP         200616 non-null  int64  
 2   Destination IP    200616 non-null  int64  
 3   Protocol          200616 non-null  int64  
 4   Source Port       200616 non-null  int64  
 5   Destination Port  200616 non-null  int64  
 6   Packet Length     200616 non-null  int64  
 7   TCP Flags         200616 non-null  int64  
 8   Label             200616 non-null  int64  
dtypes: float64(1), int64(8)
memory usage: 15.3 MB


In [84]:
mix_df

Unnamed: 0,Timestamp,Source IP,Destination IP,Protocol,Source Port,Destination Port,Packet Length,TCP Flags,Label
0,0.000000,0,1,6,5228,58250,1280,18,0
1,0.000723,0,1,6,5228,58250,425,18,0
2,0.344932,0,1,6,5228,58250,425,18,0
3,1.576458,0,1,6,5228,58250,1280,18,0
4,3.485089,2,1,6,443,46528,134,18,0
...,...,...,...,...,...,...,...,...,...
95457,2587.553109,41,67,6,57696,443,325,18,0
95458,2587.553136,41,67,6,57696,443,57,18,0
95459,2587.556349,67,41,6,443,57696,40,10,0
95460,2587.556358,67,41,6,443,57696,40,10,0


In [85]:
mix_df.to_csv('blended_packets.csv', index=False)