# Gen features

In [16]:
import pandas as pd
import numpy as np
import copy

import warnings
warnings.filterwarnings('ignore')

# Read data

In [17]:
# Dataset selectors: `af` is the tag used in the value-list file names and in
# the output file name; `dt` is the base name of the CSV that gets loaded.
af = 'vv'
dt = 't'

# Load the raw records from ./data/<dt>.csv into the working frame.
path = './data/{}.csv'.format(dt)
df = pd.read_csv(path)

# Label

In [18]:
# Keep the original textual label and derive an integer class code from it.
df['LabelStr'] = df['Label']

# (substring, code) pairs applied in order; a later match overrides an earlier one.
# The generic 'Normal-' pattern has to run FIRST: it is also a substring of the
# 'From-Normal-' / 'To-Normal-' labels, so when it ran last it overwrote the
# To-Normal code 4 with 3 and code 4 could never survive.
label_codes = [
    ('Normal-', 3),
    ('To-Backgro', 1),
    ('From-Backg', 2),
    ('From-Norma', 3),
    ('To-Normal-', 4),
    ('From-Botne', 5),
]

df['Label'] = 0  # rows matching none of the patterns keep code 0
for pattern, code in label_codes:
    # regex=False: match the text literally; na=False: a missing label is a
    # non-match instead of producing a NaN mask that .loc rejects.
    hits = df['LabelStr'].str.contains(pattern, regex=False, na=False)
    df.loc[hits, 'Label'] = code

# Calculate

In [19]:
# Zero denominators are swapped for +inf, so a finite numerator divided by
# them evaluates to 0 instead of inf/NaN.
dur_safe = df['Dur'].replace(0, np.inf)
pkts_safe = df['TotPkts'].replace(0, np.inf)

#? packets / bytes (both directions) / bytes (src -> dst) per second
df['PktsPerSec'] = df['TotPkts'] / dur_safe
df['BytesPerSec'] = df['TotBytes'] / dur_safe
df['SrcBytesPerSec'] = df['SrcBytes'] / dur_safe

#? bytes per packet
df['BytesPerPkt'] = df['TotBytes'] / pkts_safe

#? dst bytes = total bytes minus src bytes, and its per-second rate;
#? negative results are floored at 0 in both columns
dst_bytes = df['TotBytes'] - df['SrcBytes']
df['DstBytes'] = dst_bytes.clip(lower=0)
df['DstBytesPerSec'] = (dst_bytes / dur_safe).clip(lower=0)

# sTos & dTos

In [20]:
""" Some sTos and dTos has nan value """
# Encode missing sTos / dTos values as -1.
df['sTos'] = df['sTos'].fillna(-1)
df['dTos'] = df['dTos'].fillna(-1)

# Discard every record whose sTos or dTos equals 192.
# NOTE(review): the reason for excluding 192 is not stated in this notebook — confirm.
keep_rows = (df['sTos'] != 192.0) & (df['dTos'] != 192.0)
df = df.loc[keep_rows].copy()


def _read_tos_values(list_path):
    """Return the stripped, non-blank lines of a ToS value-list file.

    The file is opened in a ``with`` block so the handle is always closed
    (it was previously left open). Blank lines are skipped because every
    entry is later passed to ``float()``, which would raise on ''.
    """
    with open(list_path) as handle:
        stripped = (line.strip() for line in handle)
        return [value for value in stripped if value]


# One-hot encode sTos and dTos against the value lists stored on disk.
# NOTE(review): the '.tr.' in the file names presumably means "train set" — confirm.
stos_list = _read_tos_values(f'data/{af}1.tr.stos.txt')
dtos_list = _read_tos_values(f'data/{af}1.tr.dtos.txt')

for tos_column, tos_values in (('sTos', stos_list), ('dTos', dtos_list)):
    for tos in tos_values:
        # 1.0 where the column equals this listed value, 0.0 everywhere else.
        df[f'{tos_column}_{tos}'] = (df[tos_column] == float(tos)).astype(float)

In [21]:
# Print how often each ToS value occurs, with one blank line after each table.
for tos_col in ('sTos', 'dTos'):
    print(df[tos_col].value_counts(), end='\n\n')

 0.0    758468
-1.0      5604
 3.0       294
 2.0       126
 1.0        50
Name: sTos, dtype: int64

 0.0    710839
-1.0     53516
 3.0       102
 2.0        73
 1.0        12
Name: dTos, dtype: int64



# State

Just to calculate number of packets with certain states  

State column ->  S_  | CON |  A_  |  TCP  | ...

- CON = Connected (UDP); (A lot in Background)
- INT = Initial (UDP); 
- URP = Urgent Pointer (UDP); 
- F = FIN (TCP); 
- S = SYN = Synchronization (TCP); 
- P = Push (TCP); 
- A = ACK = Acknowledgement (TCP); 
- R = Reset (TCP); 
- FSPA = All flags : FIN, SYN, PUSH, ACK (TCP)

=> States made of combined TCP flags (FSPA, SRPA, etc.) are all converted to a single `alltcp` state

In [22]:
""" some records have NaN state value """
# Records without a State value get the 'nanvalue' placeholder string.
df['State'] = df['State'].fillna('nanvalue')

In [23]:
# print(df['Proto'].value_counts(), '\n')
# print(df['State'].value_counts(), '\n')

In [24]:
""" some state is just to indicate the tcp states, replace these states with alltcp and use Flag_ fields to indicate triggered flags """
# Preserve the raw state text in State_orig before State is rewritten.
df['State_orig'] = df['State']

# A state longer than two characters that contains '_' is collapsed to 'alltcp'.
raw_state = df['State_orig']
has_underscore = raw_state.str.contains('_')
is_long = raw_state.str.len() > 2
df.loc[is_long & has_underscore, 'State'] = 'alltcp'

In [25]:
""" this field shall be set based on seen_values from train set """
# Read the list of known states (one per line). The `with` block closes the
# file handle, which was previously left open.
with open(f'data/{af}1.tr.state.txt') as state_file:
    state_list = [line.strip() for line in state_file]

# Any state that is not in the list is folded into the 'other' bucket.
df.loc[~df['State'].isin(state_list), 'State'] = 'other'

# One-hot encode: one column per listed state, 1.0 where the record has that
# state and 0.0 otherwise.
# NOTE(review): a State_other column only appears if 'other' is itself listed
# in the state file — confirm the file contains it.
for state in state_list:
    df[f'State_{state}'] = (df['State'] == state).astype(float)

df.head()

Unnamed: 0,StreamID,StartTime,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,...,State_ECR,State_URH,State_TXD,State_URFIL,State_R_,State_URN,State_RSP,State_URHPRO,State_A_,State_other
0,1,2021-08-12 20:56:02.042618,0.248864,udp,150.35.87.62,59827,<->,150.35.83.12,53,CON,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,2021-08-12 20:56:02.051273,0.000458,udp,216.149.170.165,15135,<->,150.35.87.232,13365,CON,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,2021-08-12 20:56:02.056605,0.140449,tcp,81.114.119.142,3577,->,150.35.87.133,80,alltcp,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,2021-08-12 20:56:02.108284,0.000666,udp,150.35.87.141,55454,<->,150.35.83.12,53,CON,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,2021-08-12 20:56:02.108367,0.000749,udp,150.35.87.141,51411,<->,150.35.83.12,53,CON,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# TCP states to flag

In [26]:
# Flag_nan marks records whose original state was the 'nanvalue' placeholder.
orig_state = df['State_orig']
df['Flag_nan'] = (orig_state == 'nanvalue').astype('int64')

# Encode the TCP state field by counting how many times each flag letter
# occurs in the original state text.
flag_list = ['S', 'A', 'P', 'R', 'F']


def _flag_count(state_value, letter):
    """Return how often `letter` occurs in `state_value`, or 0 if it has no '_'."""
    if '_' not in state_value:
        return 0
    return state_value.count(letter)


for letter in flag_list:
    df[f'Flag_{letter}'] = orig_state.apply(_flag_count, args=(letter,))

# df = df.fillna(0)

# Proto

In [27]:
""" this field shall be set based on seen_values from train set """
# Read the list of known protocols (one per line). The `with` block closes the
# file handle, which was previously left open.
with open(f'data/{af}1.tr.proto.txt') as proto_file:
    proto_list = [line.strip() for line in proto_file]

# Any protocol that is not in the list is folded into the 'other' bucket.
df.loc[~df['Proto'].isin(proto_list), 'Proto'] = 'other'

# One-hot encode: one column per listed protocol, 1.0 where the record uses
# that protocol and 0.0 otherwise.
for proto in proto_list:
    df[f'Proto_{proto}'] = (df['Proto'] == proto).astype(float)

df.head()

Unnamed: 0,StreamID,StartTime,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,...,Flag_R,Flag_F,Proto_udp,Proto_tcp,Proto_icmp,Proto_rtp,Proto_rtcp,Proto_igmp,Proto_arp,Proto_other
0,1,2021-08-12 20:56:02.042618,0.248864,udp,150.35.87.62,59827,<->,150.35.83.12,53,CON,...,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,2021-08-12 20:56:02.051273,0.000458,udp,216.149.170.165,15135,<->,150.35.87.232,13365,CON,...,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,2021-08-12 20:56:02.056605,0.140449,tcp,81.114.119.142,3577,->,150.35.87.133,80,alltcp,...,0,2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,2021-08-12 20:56:02.108284,0.000666,udp,150.35.87.141,55454,<->,150.35.83.12,53,CON,...,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,2021-08-12 20:56:02.108367,0.000749,udp,150.35.87.141,51411,<->,150.35.83.12,53,CON,...,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Dport to Service

Keep common Dport only.  
Other Dport values are saved as `other`.

In [28]:
# Destination ports that get their own service category; every other port
# falls into 'other'.
common_dports = [80, 443, 21, 22, 25, 6667]
service_list = [str(port) for port in common_dports] + ['other']

# Compare ports numerically so 80, 80.0 and '80' are all recognised as port 80.
# Previously str() was applied to the raw value, so a float-typed Dport column
# produced '80.0', which matched none of the Service_* columns (not even
# Service_other), and string-typed ports never matched the integer list.
# Values that cannot be parsed as numbers become NaN and stay 'other'.
dport_num = pd.to_numeric(df['Dport'], errors='coerce')
is_common = dport_num.isin(common_dports)

df['Service'] = 'other'
df.loc[is_common, 'Service'] = dport_num[is_common].astype('int64').astype(str)

# One-hot encode: one column per service, 1.0 for the record's service and
# 0.0 otherwise.
for service in service_list:
    df[f'Service_{service}'] = (df['Service'] == service).astype(float)

df

Unnamed: 0,StreamID,StartTime,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,...,Proto_arp,Proto_other,Service,Service_80,Service_443,Service_21,Service_22,Service_25,Service_6667,Service_other
0,1,2021-08-12 20:56:02.042618,0.248864,udp,150.35.87.62,59827,<->,150.35.83.12,53,CON,...,0.0,0.0,other,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2,2021-08-12 20:56:02.051273,0.000458,udp,216.149.170.165,15135,<->,150.35.87.232,13365,CON,...,0.0,0.0,other,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,3,2021-08-12 20:56:02.056605,0.140449,tcp,81.114.119.142,3577,->,150.35.87.133,80,alltcp,...,0.0,0.0,80,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,2021-08-12 20:56:02.108284,0.000666,udp,150.35.87.141,55454,<->,150.35.83.12,53,CON,...,0.0,0.0,other,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,5,2021-08-12 20:56:02.108367,0.000749,udp,150.35.87.141,51411,<->,150.35.83.12,53,CON,...,0.0,0.0,other,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
764718,764719,2021-08-13 01:18:26.694269,0.020107,tcp,150.35.87.62,1118,->,150.35.83.16,80,alltcp,...,0.0,0.0,80,1.0,0.0,0.0,0.0,0.0,0.0,0.0
764719,764720,2021-08-13 01:18:26.744751,0.000309,udp,150.35.89.23,63043,<->,150.35.83.12,53,CON,...,0.0,0.0,other,0.0,0.0,0.0,0.0,0.0,0.0,1.0
764720,764721,2021-08-13 01:18:26.745285,0.077215,tcp,150.35.89.23,1313,->,191.141.87.242,80,alltcp,...,0.0,0.0,80,1.0,0.0,0.0,0.0,0.0,0.0,0.0
764721,764722,2021-08-13 01:18:26.750279,0.000243,tcp,81.97.170.121,65016,->,150.35.87.121,6882,alltcp,...,0.0,0.0,other,0.0,0.0,0.0,0.0,0.0,0.0,1.0


# Save the processed data

In [29]:
# Write the processed frame next to the input file, as <name>.<af>1.csv
out_path = path.replace('.csv', '.' + af + '1.csv')
df.to_csv(out_path)

In [30]:
#? cleanup: remove the `df` name from the notebook namespace once the data is saved
del df