# Generate features

In [3]:
import pandas as pd
import numpy as np
import copy

import warnings
warnings.filterwarnings('ignore')

# Read data

In [4]:
# Run configuration.
# `af` is a tag interpolated into the name of every artifact this notebook writes
# (the vocabulary .txt files and the processed CSV).
af = 'vv'
# `dt` selects which CSV under ./data is loaded -- presumably 'tr' is the training split; confirm.
dt = 'tr'
path = f'./data/{dt}.csv'
# Load the raw flow records; the following cells add feature columns to `df`.
df = pd.read_csv(path)

In [5]:
# Flows whose source ToS is 192. This selection is evaluated but not rendered:
# a notebook cell only displays its last expression.
df[df['sTos'].eq(192)]
# df.loc[(df['sTos'] == 192) & (df['Label'].str.contains('Bot'))]

# IGMP flows whose source ToS is anything other than 192 -- this is the cell's output.
df[df['sTos'].ne(192) & df['Proto'].eq('igmp')]

Unnamed: 0,StreamID,StartTime,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label
1390,1391,2021-08-08 19:25:01.091230,0.0,igmp,150.35.90.64,0,->,242.5.5.3,0,INT,0.0,,1,59,59,flow=Background
1420,1421,2021-08-08 19:25:01.249516,0.0,igmp,150.35.90.27,0,->,242.5.5.253,0,INT,0.0,,0,59,59,flow=Background
1819,1820,2021-08-08 19:25:04.238294,0.0,igmp,150.35.90.6,0,->,227.3.4.25,0,INT,0.0,,0,59,59,flow=Background
2448,2449,2021-08-08 19:25:09.189511,0.0,igmp,150.35.90.10,0,->,227.3.4.38,0,INT,0.0,,1,59,59,flow=Background
9104,9105,2021-08-08 19:26:00.325662,0.0,igmp,150.35.90.10,0,->,227.3.4.25,0,INT,0.0,,1,59,60,flow=Background
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1521859,1521860,2021-08-12 20:54:33.532909,0.0,igmp,150.35.90.10,0,->,227.3.4.38,0,INT,0.0,,1,59,60,flow=Background
1522178,1522179,2021-08-12 20:54:37.045394,0.0,igmp,150.35.90.41,0,->,242.5.5.253,0,INT,0.0,,0,60,59,flow=Background
1526737,1526738,2021-08-12 20:55:32.045717,0.0,igmp,150.35.90.41,0,->,242.5.5.253,0,INT,0.0,,1,60,60,flow=Background
1526763,1526764,2021-08-12 20:55:32.552933,0.0,igmp,150.35.90.7,0,->,227.3.4.25,0,INT,0.0,,1,59,59,flow=Background


# Label

In [6]:
# Keep the raw textual label, then overwrite `Label` with an integer class id.
df['LabelStr'] = df['Label']
df['Label'] = 0  # class for rows matching none of the patterns below

# Ordered (substring, class id) rules; a later rule overwrites an earlier one.
# NOTE(review): 'Normal-' is a substring of every 'To-Normal-' label, so the last
# rule resets those rows from 4 to 3 -- confirm that this is intended.
label_rules = [
    ('To-Backgro', 1),
    ('From-Backg', 2),
    ('From-Norma', 3),
    ('To-Normal-', 4),
    ('From-Botne', 5),
    ('Normal-', 3),
]
for pattern, label_id in label_rules:
    df.loc[df['LabelStr'].str.contains(pattern), 'Label'] = label_id

# Calculate

In [7]:
# Denominators with zeros swapped for +inf, so that a zero-duration flow
# (or a zero-packet flow) yields a rate of 0 instead of a division by zero.
safe_dur = df['Dur'].replace(0, np.inf)
safe_pkts = df['TotPkts'].replace(0, np.inf)

# Per-second rates: packets, bytes in both directions, bytes src -> dst.
df['PktsPerSec'] = df['TotPkts'] / safe_dur
df['BytesPerSec'] = df['TotBytes'] / safe_dur
df['SrcBytesPerSec'] = df['SrcBytes'] / safe_dur

# Average packet size.
df['BytesPerPkt'] = df['TotBytes'] / safe_pkts

# Bytes dst -> src are whatever is left of the total after the src -> dst bytes,
# plus the matching per-second rate.
df['DstBytes'] = df['TotBytes'] - df['SrcBytes']
df['DstBytesPerSec'] = df['DstBytes'] / safe_dur

# SrcBytes can exceed TotBytes in the data; clamp the resulting negatives to 0.
for dst_col in ('DstBytes', 'DstBytesPerSec'):
    df.loc[df[dst_col] < 0, dst_col] = 0

# sTos & dTos

In [8]:
""" some sTos and dTos has nan value """
# Missing ToS values are encoded as -1 so that "missing" gets its own one-hot column.
df['sTos'] = df['sTos'].fillna(-1)
df['dTos'] = df['dTos'].fillna(-1)

# ToS 192 is not of interest: drop every flow carrying it on either side.
# .copy() gives an independent frame so the column assignments below are unambiguous.
df = df[(df['sTos'] != 192.0) & (df['dTos'] != 192.0)].copy()

# One-hot encode sTos and dTos: one float 0/1 column per distinct value.
# Column suffixes are the integer form of the value, e.g. sTos_0, sTos_-1.
stos_list = [str(int(v)) for v in df['sTos'].unique()]
for tos in stos_list:
    df[f'sTos_{tos}'] = (df['sTos'] == float(tos)).astype(float)

dtos_list = [str(int(v)) for v in df['dTos'].unique()]
for tos in dtos_list:
    df[f'dTos_{tos}'] = (df['dTos'] == float(tos)).astype(float)

# Store the value lists so the test and validation sets can be encoded with the
# same columns. Context managers make sure the files are flushed and closed.
with open(f'data/{af}1.tr.stos.txt', 'w') as fh:
    fh.write('\n'.join(stos_list))
with open(f'data/{af}1.tr.dtos.txt', 'w') as fh:
    fh.write('\n'.join(dtos_list))

10

In [9]:
# Show how many flows carry each source / destination ToS value,
# with a blank line after each table.
for tos_col in ('sTos', 'dTos'):
    print(df[tos_col].value_counts())
    print()
# df.loc[df['sTos'] == 192]

 0.0    1521793
-1.0       6041
 2.0        580
 3.0        522
 1.0        229
Name: sTos, dtype: int64

 0.0    1273939
-1.0     254997
 2.0        111
 3.0        100
 1.0         18
Name: dTos, dtype: int64



# State

Encode the flow `State` so that the number of flows in each state can be counted.  

State column ->  S_  | CON |  A_  |  TCP  | ...

- CON = Connected (UDP); (A lot in Background)
- INT = Initial (UDP); 
- URP = Unreachable Port (ICMP); 
- F = FIN (TCP); 
- S = SYN = Synchronization (TCP); 
- P = Push (TCP); 
- A = ACK = Acknowledgement (TCP); 
- R = Reset (TCP); 
- FSPA = All flags : FIN, SYN, PUSH, ACK (TCP)

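=> Multi-flag TCP states (FSPA, SRPA, etc.) are collapsed into a single `alltcp` state; the individual flags are kept in the `Flag_*` columns.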

In [10]:
""" some records have NaN state value """
# Give missing states an explicit placeholder so the string operations below can run on every row.
df['State'] = df['State'].fillna('nanvalue')

In [11]:
# print(df['Proto'].value_counts(), '\n')
# print(df['State'].value_counts(), '\n')

In [12]:
""" some state is just to indicate the tcp states, replace these states with alltcp and use Flag_ fields to indicate triggered flags """
# Keep the untouched state text; the Flag_* features are derived from it later.
df['State_orig'] = df['State']

# A state longer than two characters that contains '_' is a TCP flag combination:
# collapse all of those into the single category 'alltcp'.
orig_state = df['State_orig']
is_flag_combo = (orig_state.str.len() > 2) & (orig_state.str.contains('_'))
df.loc[is_flag_combo, 'State'] = 'alltcp'

In [13]:
# print(df['Proto'].value_counts(), '\n')
# print(df['State'].value_counts(), '\n')

In [14]:
""" replace infrequent state with other """
# Frequency of each state value.
fr = df['State'].value_counts()
# States seen more than 100 times keep their own one-hot column.
filt = fr[fr > 100]
common = list(filt.index)

# Fold the remaining states into 'other'. Two kinds of rows are deliberately left
# untouched: the 'nanvalue' placeholder and any state whose original text contains '_'.
is_rare = (
    (df['State'] != 'nanvalue')
    & (~df['State_orig'].str.contains('_'))
    & (~df['State'].isin(common))
)
df.loc[is_rare, 'State'] = 'other'

# Common state list (plus 'other'), written to disk next to the ToS lists.
# The context manager makes sure the file is flushed and closed.
state_list = common + ['other']
with open(f'data/{af}1.tr.state.txt', 'w') as fh:
    fh.write('\n'.join(state_list))

# One-hot encode State: one float 0/1 column per entry of state_list.
for state in state_list:
    df[f'State_{state}'] = (df['State'] == state).astype(float)

df.head()

Unnamed: 0,StreamID,StartTime,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,...,State_ECR,State_URH,State_TXD,State_URFIL,State_R_,State_URN,State_RSP,State_URHPRO,State_A_,State_other
0,1,2021-08-08 19:24:49.898441,2414.425067,udp,216.194.26.34,11931,<->,150.35.87.232,13361,CON,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,2021-08-08 19:24:49.907604,602.160441,udp,192.124.197.236,51200,<->,150.35.87.232,13360,CON,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,2021-08-08 19:24:49.949670,0.188379,tcp,150.35.87.168,1573,->,220.166.24.44,80,alltcp,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,2021-08-08 19:24:49.966218,1897.815694,udp,81.144.184.224,34035,<->,150.35.87.232,13361,CON,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,2021-08-08 19:24:49.976393,0.0,tcp,125.180.172.204,10299,?>,150.35.87.174,6880,R_,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


# TCP states to flag

In [15]:
# 1 where the original state was missing (the 'nanvalue' placeholder), else 0.
df['Flag_nan'] = (df['State_orig'] == 'nanvalue').astype('int64')

# Encode the TCP state text as per-flag counts: Flag_<X> is how many times the
# letter X occurs in the original state, and 0 for states that contain no '_'.
flag_list = ['S', 'A', 'P', 'R', 'F']
for flag in flag_list:
    df[f'Flag_{flag}'] = [
        state.count(flag) if '_' in state else 0
        for state in df['State_orig']
    ]

# Extend the flag list with the missing-state indicator.
flag_list += ['nan']

# Proto

In [16]:
""" replace infrequent Proto with other """
# Frequency of each protocol.
fr = df['Proto'].value_counts()
# Protocols seen more than 100 times keep their own one-hot column.
filt = fr[fr > 100]
common = list(filt.index)

# Keep the original protocol, then fold every infrequent one into 'other'.
df['Proto_orig'] = df['Proto']
df.loc[~df['Proto_orig'].isin(common), 'Proto'] = 'other'

# Common protocol list (plus 'other'), written to disk next to the ToS lists.
# The context manager makes sure the file is flushed and closed.
proto_list = common + ['other']
with open(f'data/{af}1.tr.proto.txt', 'w') as fh:
    fh.write('\n'.join(proto_list))

# One-hot encode Proto: one float 0/1 column per entry of proto_list.
for proto in proto_list:
    df[f'Proto_{proto}'] = (df['Proto'] == proto).astype(float)

df.head()

Unnamed: 0,StreamID,StartTime,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,...,Flag_F,Proto_orig,Proto_udp,Proto_tcp,Proto_icmp,Proto_rtp,Proto_rtcp,Proto_igmp,Proto_arp,Proto_other
0,1,2021-08-08 19:24:49.898441,2414.425067,udp,216.194.26.34,11931,<->,150.35.87.232,13361,CON,...,0,udp,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,2021-08-08 19:24:49.907604,602.160441,udp,192.124.197.236,51200,<->,150.35.87.232,13360,CON,...,0,udp,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,2021-08-08 19:24:49.949670,0.188379,tcp,150.35.87.168,1573,->,220.166.24.44,80,alltcp,...,2,tcp,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,2021-08-08 19:24:49.966218,1897.815694,udp,81.144.184.224,34035,<->,150.35.87.232,13361,CON,...,0,udp,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,2021-08-08 19:24:49.976393,0.0,tcp,125.180.172.204,10299,?>,150.35.87.174,6880,R_,...,0,tcp,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


# Dport to Service

Keep common Dports only.  
Every other Dport is mapped to `other`.

In [17]:
# Destination ports that get their own Service category; everything else is 'other'.
common_dports = [80, 443, 21, 22, 25, 6667]
service_list = [str(port) for port in common_dports] + ['other']

# Compare ports numerically so the mapping also works when Dport was parsed as float
# (e.g. because the column contains NaN) or as text. Values that cannot be parsed
# as a number become NaN and therefore fall into 'other'.
dport_num = pd.to_numeric(df['Dport'], errors='coerce')
is_common = dport_num.isin(common_dports)

df['Service'] = 'other'
# Format through int so that a float port such as 80.0 is labelled '80' (not '80.0')
# and matches the Service_<port> column names built below.
df.loc[is_common, 'Service'] = dport_num[is_common].astype('int64').astype(str)

# One-hot encode Service: one float 0/1 column per entry of service_list.
for service in service_list:
    df[f'Service_{service}'] = (df['Service'] == service).astype(float)

df

Unnamed: 0,StreamID,StartTime,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,...,Proto_arp,Proto_other,Service,Service_80,Service_443,Service_21,Service_22,Service_25,Service_6667,Service_other
0,1,2021-08-08 19:24:49.898441,2414.425067,udp,216.194.26.34,11931,<->,150.35.87.232,13361,CON,...,0.0,0.0,other,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2,2021-08-08 19:24:49.907604,602.160441,udp,192.124.197.236,51200,<->,150.35.87.232,13360,CON,...,0.0,0.0,other,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,3,2021-08-08 19:24:49.949670,0.188379,tcp,150.35.87.168,1573,->,220.166.24.44,80,alltcp,...,0.0,0.0,80,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,2021-08-08 19:24:49.966218,1897.815694,udp,81.144.184.224,34035,<->,150.35.87.232,13361,CON,...,0.0,0.0,other,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,5,2021-08-08 19:24:49.976393,0.000000,tcp,125.180.172.204,10299,?>,150.35.87.174,6880,R_,...,0.0,0.0,other,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1529442,1529443,2021-08-12 20:56:01.987136,0.253840,tcp,65.171.5.189,54026,->,150.35.88.37,110,alltcp,...,0.0,0.0,other,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1529443,1529444,2021-08-12 20:56:02.023204,0.000703,udp,150.35.88.37,50217,<->,150.35.83.12,53,CON,...,0.0,0.0,other,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1529444,1529445,2021-08-12 20:56:02.024145,0.000915,udp,150.35.88.37,56110,<->,150.35.83.12,53,CON,...,0.0,0.0,other,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1529445,1529446,2021-08-12 20:56:02.025321,0.000604,udp,150.35.88.37,54662,<->,150.35.83.12,53,CON,...,0.0,0.0,other,0.0,0.0,0.0,0.0,0.0,0.0,1.0


# Save the processed data

In [18]:
# Write the feature-augmented frame next to the input file,
# e.g. ./data/tr.csv -> ./data/tr.vv1.csv for the settings above.
processed_path = path.replace('.csv', f'.{af}1.csv')
df.to_csv(processed_path)

In [19]:
# Cleanup: remove the notebook's reference to the DataFrame so its memory can be reclaimed.
del df