In [1]:
import numpy as np
import pandas as pd
import os

#设置行不限制数量
pd.set_option('display.max_rows',None)
#设置列不限制数量
pd.set_option('display.max_columns',None)
#设置value的显示长度为100，默认为50
pd.set_option('max_colwidth',100)

In [5]:
"""原始数据合并"""

data_path = "./data/origin_data/"
file_list = os.listdir(data_path)  # 原始数据文件列表
data_list = []                     # 存储每个文件处理后的dataframe数据

# 依次处理每个数据文件
for file in file_list:
    file_path = os.path.join(data_path, file)
    df = pd.read_csv(file_path, index_col=None)
    df.replace([np.inf, -np.inf, "Infinity", "NaN"], np.nan, inplace=True)   # 将无穷值替换为缺失值
    df.dropna(axis="rows", how="any", inplace=True)                          # 删除缺失值所在行
    data_list.append(df)

frame = pd.concat(data_list, axis=0, ignore_index=True)   # 数据合并

In [6]:
frame.head(3)

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Min,Bwd Packet Length Mean,Bwd Packet Length Std,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Total,Fwd IAT Mean,Fwd IAT Std,Fwd IAT Max,Fwd IAT Min,Bwd IAT Total,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Bwd PSH Flags,Fwd URG Flags,Bwd URG Flags,Fwd Header Length,Bwd Header Length,Fwd Packets/s,Bwd Packets/s,Min Packet Length,Max Packet Length,Packet Length Mean,Packet Length Std,Packet Length Variance,FIN Flag Count,SYN Flag Count,RST Flag Count,PSH Flag Count,ACK Flag Count,URG Flag Count,CWE Flag Count,ECE Flag Count,Down/Up Ratio,Average Packet Size,Avg Fwd Segment Size,Avg Bwd Segment Size,Fwd Header Length.1,Fwd Avg Bytes/Bulk,Fwd Avg Packets/Bulk,Fwd Avg Bulk Rate,Bwd Avg Bytes/Bulk,Bwd Avg Packets/Bulk,Bwd Avg Bulk Rate,Subflow Fwd Packets,Subflow Fwd Bytes,Subflow Bwd Packets,Subflow Bwd Bytes,Init_Win_bytes_forward,Init_Win_bytes_backward,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,54865,3,2,0,12,0,6,6,6.0,0.0,0,0,0.0,0.0,4000000.0,666666.6667,3.0,0.0,3,3,3,3.0,0.0,3,3,0,0.0,0.0,0,0,0,0,0,0,40,0,666666.6667,0.0,6,6,6.0,0.0,0.0,0,0,0,0,1,0,0,0,0,9.0,6.0,0.0,40,0,0,0,0,0,0,2,12,0,0,33,-1,1,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,55054,109,1,1,6,6,6,6,6.0,0.0,6,6,6.0,0.0,110091.7,18348.62385,109.0,0.0,109,109,0,0.0,0.0,0,0,0,0.0,0.0,0,0,0,0,0,0,20,20,9174.311927,9174.311927,6,6,6.0,0.0,0.0,0,0,0,0,1,1,0,0,1,9.0,6.0,6.0,20,0,0,0,0,0,0,1,6,1,6,29,256,0,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,55055,52,1,1,6,6,6,6,6.0,0.0,6,6,6.0,0.0,230769.2,38461.53846,52.0,0.0,52,52,0,0.0,0.0,0,0,0,0.0,0.0,0,0,0,0,0,0,20,20,19230.76923,19230.76923,6,6,6.0,0.0,0.0,0,0,0,0,1,1,0,0,1,9.0,6.0,6.0,20,0,0,0,0,0,0,1,6,1,6,29,256,0,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


In [18]:
frame[" Label"].value_counts()

BENIGN                        1741839
DoS Hulk                       230124
PortScan                       158804
DDoS                           128025
DoS GoldenEye                   10293
FTP-Patator                      7935
SSH-Patator                      5897
DoS slowloris                    5796
DoS Slowhttptest                 5499
Bot                              1956
Web Attack � Brute Force         1507
Web Attack � XSS                  652
Infiltration                       36
Web Attack � Sql Injection         21
Heartbleed                         11
Name:  Label, dtype: int64

可以发现，数据集存在严重的的样本不均衡问题，正常流量（BENIGN）在数据集中的占比很大，而其他一些异常流量则占比很小，这就容易导致训练出的模型更偏向于将输入样本预测为占比大的那一类，实际应用场景中并不能达到我们预期的效果。为此我们对数据集进行平衡样本处理，仅选择前六个类别进行训练预测。

In [20]:
"""平衡数据集样本"""
BENIGN = frame[frame[' Label']=='BENIGN'][:250000]
DosHulk = frame[frame[' Label']=='DoS Hulk']
PortScan = frame[frame[' Label']=='PortScan']
DDoS = frame[frame[' Label']=='DDoS']
DoSGoldenEye = frame[frame[' Label']=='DoS GoldenEye']
FTPPatator = frame[frame[' Label']=='FTP-Patator']

new_dataset = pd.concat([BENIGN, DosHulk, PortScan, DDos, DoSGoldenEye, FTPPatator])

In [21]:
# 选出11个典型特征进行训练
final_columns = [' Bwd Packet Length Min',' Subflow Fwd Bytes','Total Length of Fwd Packets',
                 ' Fwd Packet Length Mean',' Bwd Packet Length Std',' Flow Duration',' Flow IAT Std',
                 'Init_Win_bytes_forward',' Bwd Packets/s', ' PSH Flag Count',' Average Packet Size',' Label' ]
dataset = new_dataset[final_columns]


In [22]:
from sklearn.preprocessing import LabelEncoder
"""特征编码"""
# 将类别特征转换为离散性数值
y = dataset[' Label']                      # 经过对数据的进一步观察，发现只有Label列为类别特征
le = LabelEncoder()                         # 编码器实例化
le = le.fit(y)                              # 导入数据
label = le.transform(y)                     # 接口调取结果
dataset[' Label'] = label                 # 更新对应属性列的值

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [23]:
# 查看编码后的Label属性列
dataset[" Label"].value_counts()

0    250000
2    230124
4    158804
1     10293
3      7935
Name:  Label, dtype: int64

In [25]:
# 编码前后的对应关系
match = le.inverse_transform([0, 1, 2, 3, 4])
match

array(['BENIGN', 'DoS GoldenEye', 'DoS Hulk', 'FTP-Patator', 'PortScan'],
      dtype=object)

In [27]:
# 保存清洗过的数据为csv文件
dataset.to_csv('./data/processed_data/dataset6classes.csv')