# 数据加载和数据处理

In [2]:
import numpy as np
import pandas as pd

# Location of the labelled domain sample (CSV).
filename = 'dataset/dga_domains_sample.csv'
# Names given to the columns of the loaded frame (these label columns, not rows).
column_names = ['Label', 'Source', 'Domain']

# Read the CSV, then overwrite whatever header pandas picked up.
# NOTE(review): read_csv treats the first line as a header by default; if the
# file has no header row, that first record is discarded here — confirm.
df = pd.read_csv(filename, encoding="UTF-8")
df.columns = column_names

# Show the distinct values found in the Source column.
df['Source'].unique()

array(['corebot', 'alexa', 'ranbyus', 'symmi', 'emotet', 'dircrypt',
       'matsnu', 'simda', 'fobber', 'pushdo', 'qadars', 'kraken',
       'ramnit', 'nymaim', 'pykspa', 'tinba', 'murofet', 'cryptolocker',
       'ramdo', 'vawtrak', 'conficker', 'padcrypt', 'rovnix', 'suppobox',
       'necurs', 'gozi'], dtype=object)

In [3]:
# Encode every source name as an integer code.
# pd.factorize returns a (codes, uniques) pair: source_dict[0] holds the integer
# code for each row, source_dict[1] holds the source names those codes refer to.
source_dict = pd.factorize(df['Source'])
# A bare assignment displays nothing; end the cell with the value so the
# codes and their names are actually shown as the cell output.
source_dict

(array([ 0,  1,  1, ...,  8,  1, 16], dtype=int64),
 Index(['corebot', 'alexa', 'ranbyus', 'symmi', 'emotet', 'dircrypt', 'matsnu',
        'simda', 'fobber', 'pushdo', 'qadars', 'kraken', 'ramnit', 'nymaim',
        'pykspa', 'tinba', 'murofet', 'cryptolocker', 'ramdo', 'vawtrak',
        'conficker', 'padcrypt', 'rovnix', 'suppobox', 'necurs', 'gozi'],
       dtype='object'))

In [4]:
# Replace the source names with the integer codes produced by pd.factorize.
df['Source'] = source_dict[0]
# Encode the label: legitimate domains become 0, DGA domains become 1.
df['Label'] = df['Label'].replace({'legit': 0, 'dga': 1})
# Keep only the part of each domain before the first "." — this is the text
# that is later cut into n-grams.
domain = np.array(df['Domain'])
D = [url.split(".", 1)[0] for url in domain]
# Assign the list positionally instead of going through a temporary DataFrame,
# which would be matched on index labels rather than on row order.
df['Domain'] = D
# In Label, 1 marks a DGA domain; in Source, code 1 stands for 'alexa'.
df.head()

Unnamed: 0,Label,Source,Domain
0,1,0,cvyh1po636avyrsxebwbkn7
1,0,1,plasticbags
2,0,1,mzltrack
3,0,1,miss-slim
4,1,2,txumyqrubwutbb


# 数据转化为N-gram向量

In [6]:

from keras_preprocessing import sequence
import nltk

# Plain Python lists of the trimmed domain strings and of the source codes.
list_Domain = df['Domain'].tolist()
list_s = df['Source'].tolist()

# Bigrams per domain: every pair of adjacent characters, in order.
Bigram_Data = [
    [name[pos] + name[pos + 1] for pos in range(len(name) - 1)]
    for name in list_Domain
]

# The same bigrams flattened into a single list across all domains.
Bigram_D = [bigram for bigrams in Bigram_Data for bigram in bigrams]

# Count how often each bigram occurs over the whole data set (via nltk).
freq_dist_Data = nltk.FreqDist(Bigram_D)



# 用bigram出现的频率替换为数字
def data_to_vector(freq_dist, Bigram_list, list_Label):
    """Encode bigram sequences as fixed-length vectors of scaled bigram counts.

    Args:
        freq_dist: Lookup indexed as ``freq_dist[bigram]`` that yields the
            count recorded for that bigram.
        Bigram_list: One sequence of bigrams per domain.
        list_Label: Labels; returned converted to a NumPy array.

    Returns:
        Tuple ``(data_vector, list_label, max_data_len)``: the padded and
        scaled feature array, the labels as an ndarray, and the length of the
        longest bigram sequence (0 when ``Bigram_list`` is empty).
    """
    # Swap every bigram for its count in the frequency table.
    list_data = [[freq_dist[y] for y in x] for x in Bigram_list]
    # Every vector is as long as the longest bigram sequence.
    max_data_len = len(max(Bigram_list, key=len, default=''))
    # Shorter sequences are filled up with a tiny placeholder value.
    PAD_VALUE = 1e-10
    # Use the builtin float: the np.float alias was deprecated in NumPy 1.20 and
    # removed in 1.24, where referencing it raises AttributeError.
    data_vector = sequence.pad_sequences(list_data, maxlen=max_data_len, dtype=float, value=PAD_VALUE)
    # Square-root scaling keeps the values small enough to feed to the network;
    # the +60 offset keeps low-count bigrams from vanishing. Subtracting the
    # scaled pad value maps every padded position to exactly 0.
    data_vector = np.sqrt(data_vector + 60) - np.sqrt(PAD_VALUE + 60)
    # Labels are handed back as an ndarray.
    list_label = np.array(list_Label)
    return data_vector, list_label, max_data_len

# No visible cell defines list_Label, so build it here to keep the notebook
# runnable from a fresh kernel (Restart & Run All).
# NOTE(review): assumes the intended labels are the 0/1 Label column rather
# than the Source codes held in list_s — confirm.
list_Label = df['Label'].tolist()
data_vector, list_label, max_data_len = data_to_vector(freq_dist_Data, Bigram_Data, list_Label)
# Show the resulting feature array.
data_vector

array([[ 0.        ,  0.        ,  0.        , ...,  3.87298335,
         4.54223904,  0.68018308],
       [ 0.        ,  0.        ,  0.        , ..., 12.12864022,
        10.22623406,  5.67044117],
       [ 0.        ,  0.        ,  0.        , ..., 18.54091216,
        14.94764474,  9.37127608],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  3.65578756,
         8.89735028,  4.04385943],
       [ 0.        ,  0.        ,  0.        , ..., 13.34905642,
        13.51432493, 17.43338993],
       [ 0.        ,  0.        ,  0.        , ...,  4.04385943,
         8.31641171, 13.44365341]])

# 神经网络的设计

In [7]:
import keras
from tensorflow_core.python.keras.layers import Embedding, LSTM, Dropout, Flatten, Dense
from tensorflow_core.python.keras import Input, Model


# 按照参考论文设计
def example_LSTM(max_features, max_data_len, class_num):
    # 输入层
    input_layer = Input(shape=(max_data_len,), dtype='int32')
    # 词嵌入
    embed_layer = Embedding(input_dim=max_features, output_dim=128, input_length=max_data_len)(input_layer)
    # LSTM层
    lstm = LSTM(128)(embed_layer)
    # 丢弃50%
    dropout = Dropout(0.5)(lstm)
    # 全连接层
    flat = Flatten()(dropout)
    # 使用softmax做多分类问题
    out = Dense(class_num, activation='softmax')(flat)
    # 整合神经网络
    model = Model(input_layer, out)

    # 评价函数后得到所有的参数
    METRICS = [
        keras.metrics.BinaryAccuracy(name='accuracy'),
        keras.metrics.Precision(name='precision'),
        keras.metrics.Recall(name='recall'),
        keras.metrics.AUC(name='auc')
    ]

    # 模型使用多元交叉熵损失函数（二分类问题），优化器使用Adam优化器，评价函数参照以上
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=METRICS)