In [1]:
# Author: Antoine DELPLACE
# Last update: 17/01/2020
"""
Pre-processing program to extract window-related normalized entropy from Netflow files

Parameters
----------
window_width  : window width in seconds
window_stride : window stride in seconds
data          : pandas DataFrame of the Netflow file

Return
----------
Create 3 output files:
- data_window3_botnetx.h5         : DataFrame with the extracted data: Sport (RU), DstAddr (RU), Dport (RU)
- data_window_botnetx_id3.npy     : Numpy array containing SrcAddr
- data_window_botnetx_labels3.npy : Numpy array containing Label
"""

'\nPre-processing program to extract window-related normalized entropy from Netflow files\n\nParameters\n----------\nwindow_width  : window width in seconds\nwindow_stride : window stride in seconds\ndata          : pandas DataFrame of the Netflow file\n\nReturn\n----------\nCreate 3 output files:\n- data_window3_botnetx.h5         : DataFrame with the extracted data: Sport (RU), DstAddr (RU), Dport (RU)\n- data_window_botnetx_id3.npy     : Numpy array containing SrcAddr\n- data_window_botnetx_labels3.npy : Numpy array containing Label\n'

In [2]:
import pandas as pd
import numpy as np
import datetime
import h5py

In [3]:
from scipy.stats import mode

In [4]:
# Sliding-window parameters, both expressed in seconds. The stride is smaller
# than the width, so consecutive windows overlap and one flow can fall into
# several windows.
window_width = 120 # seconds
window_stride = 60 # seconds

In [5]:
print("Import data")
# Load the Netflow file into a DataFrame; the path is relative to the
# notebook's working directory.
data = pd.read_csv("CTU-13-Dataset/1/capture20110810.binetflow")
# Optional sanity check of the loaded frame (kept disabled):
#with pd.option_context('display.max_rows', None, 'display.max_columns', 15):
#    print(data.shape)
#    print(data.head())
#    print(data.dtypes)

Import data


In [6]:
print("Preprocessing")
def normalize_column(dt, column):
    """Standardize the selected column(s) of `dt` in place.

    Each selected column is replaced by (value - mean) / std, where mean and
    std are computed over that column. The mean and standard deviation are
    printed before the replacement.

    Parameters
    ----------
    dt     : pandas DataFrame, modified in place
    column : a column label or a list of column labels of `dt`

    Return
    ----------
    None
    """
    values = dt[column]
    center = values.mean()
    spread = values.std()
    print(center, spread)

    dt[column] = (values - center) / spread

Preprocessing


In [7]:
# Convert StartTime to float seconds since the Unix epoch.
# The int64 view of a datetime column counts ticks of the column's own
# resolution, so the resolution is pinned to nanoseconds before applying the
# 1e-9 factor; otherwise a non-nanosecond parse result would be mis-scaled.
data['StartTime'] = pd.to_datetime(data['StartTime']).astype('datetime64[ns]').astype(np.int64)*1e-9
# Earliest flow start, used as the origin of the window grid.
datetime_start = data['StartTime'].min()

In [8]:
# Map every flow to the half-open range [Window_lower, Window_upper_excl) of
# window indices that contain its start time. Window i spans
# [i*window_stride, i*window_stride + window_width) seconds after datetime_start.
#
# The lower bound is clipped with a plain assignment: calling
# clip(inplace=True) on the result of data[...] is a chained in-place
# operation, which is not guaranteed to write back to `data` (it is a no-op
# under pandas Copy-on-Write).
data['Window_lower'] = ((data['StartTime']-datetime_start-window_width)/window_stride+1).clip(lower=0)
data['Window_upper_excl'] = (data['StartTime']-datetime_start)/window_stride+1
# Truncate both bounds to integers (values are >= 0 here, so this is a floor).
data = data.astype({"Window_lower": int, "Window_upper_excl": int})
# The raw timestamp is no longer needed once the window bounds exist.
data = data.drop('StartTime', axis=1)

In [9]:
# Keep only the first 15 characters of each label, then replace the column by
# integer codes. `labels` receives the distinct truncated label strings, in
# code order (labels[code] gives the text of a code).
data['Label'], labels = pd.factorize(data['Label'].str.slice(0, 15))
#print(data.dtypes)

In [10]:
def RU(df):
    """Normalized entropy of the values in `df`.

    The Shannon entropy of the empirical value distribution is divided by the
    logarithm of the number of rows, so a sample made of all-distinct values
    yields 1 and a sample with a single repeated value yields 0.

    Parameters
    ----------
    df : pandas Series (or any object exposing `shape` and `value_counts`)

    Return
    ----------
    float : 1.0 for a single-row input (the ratio would be 0/0), otherwise the
            normalized entropy
    """
    n_rows = df.shape[0]
    if n_rows == 1:
        return 1.0

    # Empirical probability of each distinct value.
    freq = df.value_counts() / n_rows
    entropy = -(freq * np.log10(freq)).sum()
    return entropy / np.log10(n_rows)

In [11]:
# Table that will hold the per-window, per-source-address features.
X = pd.DataFrame()
# Largest exclusive upper window index over all flows, i.e. the number of
# windows to scan (window indices start at 0).
nb_windows = data['Window_upper_excl'].max()
print(nb_windows)

368


In [12]:
# For every window, group the flows it contains by source address and compute
# the normalized entropy (RU) of their source ports, destination addresses and
# destination ports.
#
# The per-window tables are collected in a list and concatenated once at the
# end: DataFrame.append no longer exists in pandas >= 2.0, and growing a frame
# inside the loop copies all previously gathered rows at every iteration.
window_frames = []
total_rows = 0
for i in range(0, nb_windows):
    # Flows whose [Window_lower, Window_upper_excl) range contains window i.
    gb = data.loc[(data['Window_lower'] <= i) & (data['Window_upper_excl'] > i)].groupby('SrcAddr')
    window_features = gb.agg({'Sport':[RU],
                              'DstAddr':[RU],
                              'Dport':[RU]}).reset_index()
    window_frames.append(window_features)
    total_rows += window_features.shape[0]
    # Progress: cumulative (rows, columns) of the features gathered so far.
    print((total_rows, window_features.shape[1]))

# The row index restarts at 0 for each window (no ignore_index), and the
# columns keep their two-level (column, aggregation) names.
X = pd.concat(window_frames)
del window_frames

(9787, 4)
(18611, 4)
(27165, 4)
(35618, 4)
(44085, 4)
(52590, 4)
(60893, 4)
(69155, 4)
(77348, 4)
(85099, 4)
(92554, 4)
(100050, 4)
(107669, 4)
(115314, 4)
(122838, 4)
(130249, 4)
(137568, 4)
(145031, 4)
(152558, 4)
(159813, 4)
(166800, 4)
(173642, 4)
(180612, 4)
(187548, 4)
(194282, 4)
(201025, 4)
(207690, 4)
(214395, 4)
(221232, 4)
(227866, 4)
(234204, 4)
(240528, 4)
(246838, 4)
(253249, 4)
(259729, 4)
(265888, 4)
(272041, 4)
(278226, 4)
(284193, 4)
(290107, 4)
(295903, 4)
(301515, 4)
(306980, 4)
(312316, 4)
(317715, 4)
(323153, 4)
(328527, 4)
(333850, 4)
(339170, 4)
(344346, 4)
(349213, 4)
(354009, 4)
(358784, 4)
(363468, 4)
(368133, 4)
(372748, 4)
(377513, 4)
(382344, 4)
(387061, 4)
(392084, 4)
(397408, 4)
(402838, 4)
(408440, 4)
(414104, 4)
(419600, 4)
(425073, 4)
(430617, 4)
(436259, 4)
(442075, 4)
(447944, 4)
(453791, 4)
(459609, 4)
(465532, 4)
(471410, 4)
(477195, 4)
(482926, 4)
(488755, 4)
(494764, 4)
(500735, 4)
(506756, 4)
(512858, 4)
(518825, 4)
(524757, 4)
(530697, 4)
(536

In [13]:
# The flow table is no longer needed; drop the reference so its memory can be reclaimed.
del data

In [14]:
# Flatten the two-level (column, aggregation) names into single strings such as
# "Sport_RU"; the address column, whose second level is empty, becomes "SrcAddr_".
X.columns = [x if not isinstance(x, tuple) else "_".join(x) for x in X.columns]
#print(X.columns.values)

In [15]:
#print(X.columns.values)
# Every column except the source-address identifier is a feature to standardize.
columns_to_normalize = X.columns.values.tolist()
columns_to_normalize.remove('SrcAddr_')

In [16]:
# Standardize all feature columns of X in place; the call also prints the
# per-column mean and standard deviation it used.
normalize_column(X, columns_to_normalize)

Sport_RU      0.994574
DstAddr_RU    0.963379
Dport_RU      0.968772
dtype: float64 Sport_RU      0.067928
DstAddr_RU    0.181867
Dport_RU      0.167205
dtype: float64


In [17]:
# Preview the feature table (display limited to 10 rows and 22 columns),
# preceded by its shape and followed by its column dtypes.
with pd.option_context('display.max_rows', 10, 'display.max_columns', 22):
    print(X.shape)
    print(X)
    print(X.dtypes)

(2226720, 4)
              SrcAddr_  Sport_RU  DstAddr_RU  Dport_RU
0              0.0.0.0  0.079879     0.20136  0.186766
1    00:15:17:2c:e5:2d  0.079879     0.20136  0.186766
2        1.144.156.226  0.079879     0.20136  0.186766
3           1.144.5.55  0.079879     0.20136  0.186766
4        1.155.150.224  0.079879     0.20136  0.186766
..                 ...       ...         ...       ...
787     99.192.158.141  0.079879     0.20136  0.186766
788       99.242.24.14  0.079879     0.20136  0.186766
789     99.245.126.143  0.079879     0.20136  0.186766
790     99.245.140.246  0.079879     0.20136  0.186766
791     99.254.184.154  0.079879     0.20136  0.186766

[2226720 rows x 4 columns]
SrcAddr_       object
Sport_RU      float64
DstAddr_RU    float64
Dport_RU      float64
dtype: object


with pd.option_context('display.max_rows', 10, 'display.max_columns', 20):
   print(X.loc[X['Label'] != 0])

In [18]:
# Write the three output files.
# NOTE(review): the outputs are named "botnet3" while the input is read from
# "CTU-13-Dataset/1" — confirm the scenario number.
# 1) Standardized features, without the source-address column, as HDF5 (overwrites the file).
X.drop('SrcAddr_', axis=1).to_hdf('data_window3_botnet3.h5', key="data", mode="w")
# 2) Source address of each feature row. The column has object dtype, so NumPy
#    stores it pickled and np.load needs allow_pickle=True to read it back.
np.save("data_window_botnet3_id3.npy", X['SrcAddr_'])
# 3) NOTE(review): `labels` is the list of distinct label strings produced by
#    pd.factorize, not one label per row of X — confirm this is what the
#    downstream code expects.
np.save("data_window_botnet3_labels3.npy", labels)