# Análise e Pré-processamento de dados

Lendo o dataset com nomes de colunas pré-definidos (não presentes no arquivo)

In [1]:
import pandas as pd
import numpy as np

# Names of the 45 per-flow fields, in file order. The CSV has no header row,
# so these names are supplied when the file is read.
columns = [
    'srcip', 'srcport', 'dstip', 'dstport', 'proto',
    'total_fpackets', 'total_fvolume', 'total_bpackets', 'total_bvolume',
    'min_fpktl', 'mean_fpktl', 'max_fpktl', 'std_fpktl',
    'min_bpktl', 'mean_bpktl', 'max_bpktl', 'std_bpktl',
    'min_fiat', 'mean_fiat', 'max_fiat', 'std_fiat',
    'min_biat', 'mean_biat', 'max_biat', 'std_biat',
    'duration',
    'min_active', 'mean_active', 'max_active', 'std_active',
    'min_idle', 'mean_idle', 'max_idle', 'std_idle',
    'sflow_fpackets', 'sflow_fbytes', 'sflow_bpackets', 'sflow_bbytes',
    'fpsh_cnt', 'bpsh_cnt', 'furg_cnt', 'burg_cnt',
    'total_fhlen', 'total_bhlen',
    'dscp',
]

# The file has no header row, so the column names are passed in explicitly.
df = pd.read_csv('Datasets/goflowtest-trainset.csv',names=columns)

In [18]:
# Show the first rows as a quick check that the supplied names line up with the values.
df.head()

Unnamed: 0,srcip,srcport,dstip,dstport,proto,total_fpackets,total_fvolume,total_bpackets,total_bvolume,min_fpktl,...,sflow_fbytes,sflow_bpackets,sflow_bbytes,fpsh_cnt,bpsh_cnt,furg_cnt,burg_cnt,total_fhlen,total_bhlen,dscp
0,147.32.84.180,1027,74.125.232.195,80,6,7,1082,3,205,40,...,1082,3,205,2,1,0,0,296,128,0
1,147.32.84.180,1040,94.63.149.152,80,6,32,1476,11,26238,40,...,1476,11,26238,2,2,0,0,1296,448,0
2,147.32.84.180,1041,60.190.223.75,2012,6,60,2618,28,57221,40,...,2618,28,57221,2,1,0,0,2416,1128,0
3,147.32.84.180,1044,60.190.223.75,888,6,9,1006,3,1882,40,...,1006,3,1882,2,2,0,0,376,128,0
4,147.32.84.180,1046,147.32.84.171,139,6,21,2906,11,1364,40,...,2906,11,1364,16,8,0,0,856,448,0


Existem dois tipos de protocolo no dataset, TCP e UDP. Como o foco da análise são os fluxos de Botnet do protocolo IRC, que por sua vez utiliza o protocolo TCP para transporte, convém manter somente os fluxos TCP.

In [19]:
# Keep TCP flows only (6 is the IP protocol number of TCP). The explicit copy
# makes the result an independent frame, so adding columns to it in later cells
# does not trigger pandas' SettingWithCopyWarning.
df = df[df['proto'] == 6].copy()

Verificando colunas cujos valores são todos iguais a zero

In [54]:
# Names of the columns in which every row holds the value 0.
df.columns[df.eq(0).all()]

Index(['std_active', 'min_idle', 'mean_idle', 'max_idle', 'std_idle',
       'furg_cnt', 'burg_cnt'],
      dtype='object')

## Rotulação de dados

Abaixo constam duas listas: uma com os IPs que pertencem a botnets e outra com os pares de endereços cujos fluxos bidirecionais correspondem a um ataque IRC. Fluxos que se enquadram nessas especificações serão rotulados como 1 (botnet); os demais, como 0 (não-botnet).

In [20]:
# Addresses of known bots: a flow with one of these as source or destination
# is labelled as botnet traffic.
malicious_ips = [
    '147.32.84.180',
    '147.32.84.170',
]

# Address pairs of IRC attack flows. A flow matches a pair in either direction.
irc_attacks = [
    ('192.168.2.112', '131.202.243.84'),
    ('192.168.5.122', '198.164.30.2'),
    ('192.168.2.110', '192.168.5.122'),
    ('192.168.4.118', '192.168.5.122'),
    ('192.168.2.113', '192.168.5.122'),
    ('192.168.1.103', '192.168.5.122'),
    ('192.168.4.120', '192.168.5.122'),
    ('192.168.2.112', '192.168.2.110'),
    ('192.168.2.112', '192.168.4.120'),
    ('192.168.2.112', '192.168.1.103'),
    ('192.168.2.112', '192.168.2.113'),
    ('192.168.2.112', '192.168.4.118'),
    ('192.168.2.112', '192.168.2.109'),
    ('192.168.2.112', '192.168.2.105'),
    ('192.168.1.105', '192.168.5.122'),
]

In [21]:
def flow_label(df):
    """Label every flow in ``df`` as botnet (1) or non-botnet (0).

    A flow gets label 1 when its source or destination address is listed in
    the module-level ``malicious_ips``, or when its (source, destination)
    pair appears in the module-level ``irc_attacks`` in either direction.
    Every other flow gets label 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``srcip`` and ``dstip`` columns.

    Returns
    -------
    list of int
        One label per row, in row order.
    """
    # Sets give constant-time membership tests; the reversed pairs are added
    # up front so a single lookup covers both directions of a flow.
    bot_ips = set(malicious_ips)
    attack_pairs = set(irc_attacks)
    attack_pairs.update((dst, src) for src, dst in irc_attacks)

    # Walking the two address columns directly avoids building a Series per
    # row, which is what made the iterrows() version slow.
    return [
        1 if (src in bot_ips or dst in bot_ips or (src, dst) in attack_pairs) else 0
        for src, dst in zip(df['srcip'], df['dstip'])
    ]

# Attach one label per flow as the new `label` column (1 = botnet, 0 = non-botnet).
df['label'] = flow_label(df)

## Eliminando fluxos de outras botnets

O dataset contém fluxos de botnets que utilizam outros tipos de protocolo. Sendo assim, esses fluxos serão eliminados tendo em vista que as características deles podem impactar a classificação.

In [22]:
# Addresses whose flows are removed from the dataset in the following cells.
other_botnets = [
    '147.32.84.160',
    '192.168.3.35',
    '192.168.3.25',
    '192.168.3.65',
    '172.29.0.116',
]

In [24]:
# Drop every flow whose source address is one of the other botnets.
df = df.loc[~df['srcip'].isin(other_botnets)]

In [25]:
# Drop every flow whose destination address is one of the other botnets.
df = df.loc[~df['dstip'].isin(other_botnets)]

## Balanceamento de dados

In [27]:
from collections import Counter

# Number of flows per label value.
Counter(df['label'])

Counter({0: 86695, 1: 6379})

Como observado acima, existe uma quantidade muito maior de fluxos que não são botnet do que de fluxos que são. Convém então balancear o conjunto de dados através do undersampling da classe 0.

In [14]:
# Target size for class 0: the number of class-1 rows, taken from the data
# instead of being hard-coded.
_underscore = int((df['label'] == 1).sum())
# Number of class-0 rows to remove. Clamped at zero because a negative count
# would make head() select the wrong rows.
_total = max(int((df['label'] == 0).sum()) - _underscore, 0)
# Index labels of the rows to remove: the first `_total` class-0 rows.
# NOTE(review): this is a deterministic cut by row order, not a random sample;
# confirm that keeping only the last class-0 rows is acceptable.
_df_underscore_index = df[df['label'] == 0].head(_total).index
# Remove those rows and renumber the index. Rebinding `df` instead of mutating
# in place keeps the cell safe to re-run.
df = df.drop(_df_underscore_index).reset_index(drop=True)

In [15]:
# Count the labels again to check the class sizes after undersampling.
Counter(df['label'])

Counter({0: 6379, 1: 6379})

## Geração de dados

O dataset gerado não contém todas as características a serem utilizadas no trabalho, e sendo assim é necessário criá-las.

In [31]:
def flow_total_bytes():
    """Add a ``total_bytes`` column to the module-level ``df``.

    The value is ``total_fvolume + total_bvolume`` for each row. The frame is
    modified in place and nothing is returned.
    """
    # Column arithmetic replaces the row-by-row iterrows() loop.
    df['total_bytes'] = df['total_fvolume'] + df['total_bvolume']
    
def flow_total_packets():
    """Add a ``total_packets`` column to the module-level ``df``.

    The value is ``total_fpackets + total_bpackets`` for each row. The frame
    is modified in place and nothing is returned.
    """
    # Column arithmetic replaces the row-by-row iterrows() loop.
    df['total_packets'] = df['total_fpackets'] + df['total_bpackets']
    
def flow_total_bits():
    """Add a ``total_bits`` column to the module-level ``df``.

    The value is ``total_bytes * 8`` for each row, so the ``total_bytes``
    column must already exist. The frame is modified in place and nothing is
    returned.
    """
    # Column arithmetic replaces the row-by-row iterrows() loop.
    df['total_bits'] = df['total_bytes'] * 8

def bytes_per_packet():
    """Add a ``bpp`` (bytes per packet) column to the module-level ``df``.

    The value is ``total_bytes / total_packets`` for each row. Rows whose
    ``total_packets`` is 0 get 0, the same convention the other ratio
    features in this notebook use. Both input columns must already exist.
    The frame is modified in place and nothing is returned.
    """
    total_bytes = df['total_bytes']
    total_packets = df['total_packets']
    # Divide column-wise, then overwrite the rows with a zero denominator
    # (which come out as inf/NaN) with 0.
    df['bpp'] = (total_bytes / total_packets).where(total_packets != 0, 0)
    
def bits_per_sec(duration_to_seconds=0.000006):
    """Add a ``bps`` (bits per second) column to the module-level ``df``.

    The value is ``total_bytes * 8 / (duration * duration_to_seconds)`` for
    each row. Rows whose scaled duration is 0 get 0. The frame is modified in
    place and nothing is returned.

    Parameters
    ----------
    duration_to_seconds : float, optional
        Factor that converts the ``duration`` column to seconds. The default
        is the value this notebook has always used.
        NOTE(review): if ``duration`` is stored in microseconds the factor
        would be 0.000001, not 0.000006. Confirm the unit before changing it.
    """
    secs = df['duration'] * duration_to_seconds
    bits = df['total_bytes'] * 8
    # Divide column-wise, then overwrite the zero-duration rows
    # (which come out as inf/NaN) with 0.
    df['bps'] = (bits / secs).where(secs != 0, 0)

def packets_per_sec(duration_to_seconds=0.000006):
    """Add a ``pps`` (packets per second) column to the module-level ``df``.

    The value is ``total_packets / (duration * duration_to_seconds)`` for each
    row. Rows whose scaled duration is 0 get 0. The frame is modified in place
    and nothing is returned.

    Parameters
    ----------
    duration_to_seconds : float, optional
        Factor that converts the ``duration`` column to seconds. The default
        is the value this notebook has always used.
        NOTE(review): if ``duration`` is stored in microseconds the factor
        would be 0.000001, not 0.000006. Confirm the unit before changing it.
    """
    secs = df['duration'] * duration_to_seconds
    packets = df['total_packets']
    # Divide column-wise, then overwrite the zero-duration rows
    # (which come out as inf/NaN) with 0.
    df['pps'] = (packets / secs).where(secs != 0, 0)
    
def avg_var_iat():
    """Add a ``var_iat`` column to the module-level ``df``.

    The value is ``(std_fiat**2 + std_biat**2) / 2`` for each row, i.e. the
    mean of the two squared standard deviations. The frame is modified in
    place and nothing is returned.
    """
    # Square in float64 so that large integer deviations cannot overflow int64.
    fwd_std = df['std_fiat'].astype('float64')
    bwd_std = df['std_biat'].astype('float64')
    df['var_iat'] = (fwd_std * fwd_std + bwd_std * bwd_std) / 2
    
def avg_iat():
    """Add an ``avg_iat`` column to the module-level ``df``.

    The value is ``(mean_fiat + mean_biat) / 2`` for each row. The frame is
    modified in place and nothing is returned.
    """
    # Column arithmetic replaces the row-by-row iterrows() loop.
    df['avg_iat'] = (df['mean_fiat'] + df['mean_biat']) / 2
    
def pct_packets_pushed():
    """Add a ``pct_packets_pushed`` column to the module-level ``df``.

    The value is ``total_fpackets / total_packets`` for each row, a fraction
    rather than a percentage. Rows whose ``total_packets`` is 0 get 0. The
    ``total_packets`` column must already exist. The frame is modified in
    place and nothing is returned.

    NOTE(review): "pushed" here is the forward packet count
    (``total_fpackets``), not the PSH counters (``fpsh_cnt`` / ``bpsh_cnt``)
    that the dataset also provides. Confirm this is the intended definition.
    """
    forward = df['total_fpackets']
    total = df['total_packets']
    # Divide column-wise, then overwrite the rows with a zero denominator
    # (which come out as inf/NaN) with 0.
    df['pct_packets_pushed'] = (forward / total).where(total != 0, 0)
    
def iopr():
    """Add an ``iopr`` column to the module-level ``df``.

    The value is ``total_bpackets / total_fpackets`` for each row. Rows whose
    ``total_fpackets`` is 0 get 0. The frame is modified in place and nothing
    is returned.
    """
    forward = df['total_fpackets']
    backward = df['total_bpackets']
    # Divide column-wise, then overwrite the rows with a zero denominator
    # (which come out as inf/NaN) with 0.
    df['iopr'] = (backward / forward).where(forward != 0, 0)
    
def avg_payload_length():
    """Add an ``avg_payload_length`` column to the module-level ``df``.

    For each row:

        payload_length     = total_bytes - (total_fhlen + total_bhlen)
        avg_payload_length = payload_length / total_packets

    Rows whose ``total_packets`` is 0 get 0. The ``total_bytes`` and
    ``total_packets`` columns must already exist. The frame is modified in
    place and nothing is returned.
    """
    header_bytes = df['total_bhlen'] + df['total_fhlen']
    packets = df['total_packets']
    payload_length = df['total_bytes'] - header_bytes
    # Divide column-wise, then overwrite the rows with a zero denominator
    # (which come out as inf/NaN) with 0.
    df['avg_payload_length'] = (payload_length / packets).where(packets != 0, 0)

In [32]:
# Totals first: several of the features below read the total_bytes and
# total_packets columns that these calls create.
flow_total_bytes()
flow_total_packets()
flow_total_bits()
# Ratio and timing features. Each function is called exactly once; the call
# order determines the order of the new columns.
bytes_per_packet()
bits_per_sec()
packets_per_sec()
avg_var_iat()
avg_iat()
pct_packets_pushed()
avg_payload_length()
iopr()

In [53]:
# List the columns to check that the label and the generated features are present.
df.columns

Index(['srcip', 'srcport', 'dstip', 'dstport', 'proto', 'total_fpackets',
       'total_fvolume', 'total_bpackets', 'total_bvolume', 'min_fpktl',
       'mean_fpktl', 'max_fpktl', 'std_fpktl', 'min_bpktl', 'mean_bpktl',
       'max_bpktl', 'std_bpktl', 'min_fiat', 'mean_fiat', 'max_fiat',
       'std_fiat', 'min_biat', 'mean_biat', 'max_biat', 'std_biat', 'duration',
       'min_active', 'mean_active', 'max_active', 'std_active', 'min_idle',
       'mean_idle', 'max_idle', 'std_idle', 'sflow_fpackets', 'sflow_fbytes',
       'sflow_bpackets', 'sflow_bbytes', 'fpsh_cnt', 'bpsh_cnt', 'furg_cnt',
       'burg_cnt', 'total_fhlen', 'total_bhlen', 'dscp', 'label',
       'total_bytes', 'total_packets', 'total_bits', 'bpp', 'bps', 'pps',
       'var_iat', 'avg_iat', 'pct_packets_pushed', 'avg_payload_length',
       'iopr'],
      dtype='object')

## Exportando dataset

In [55]:
# Write the processed frame to disk.
# NOTE(review): with the default arguments the row index is written as an
# unnamed first column; confirm that downstream readers expect it.
df.to_csv('Datasets/trainset.csv')