<a href="https://colab.research.google.com/github/pbenito1/TFM/blob/main/RedesNeuronales.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Detección de dominios DGA utilizando redes neuronales
En este notebook se realizarán diversos experimentos de detección de dominios generados por DGA utilizando distintas arquitecturas de redes neuronales.

Una vez entrenadas las distintas arquitecturas se compararán los resultados en términos de *accuracy*, número de parámetros y tiempo de entrenamiento.

In [None]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras import layers, callbacks
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, recall_score, classification_report, confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
from timeit import default_timer as timer

In [None]:
# Seaborn / matplotlib styling shared by every figure in the notebook
sns.set(rc={"figure.dpi":150, 'savefig.dpi':150})
sns.set_context('notebook')
sns.set_style("ticks")

# Render inline figures as PNG.
# IPython.display.set_matplotlib_formats is deprecated in favour of the
# matplotlib_inline package, so prefer the new location and fall back to the
# old one on environments that do not ship matplotlib_inline.
try:
  from matplotlib_inline.backend_inline import set_matplotlib_formats
except ImportError:
  from IPython.display import set_matplotlib_formats
set_matplotlib_formats('png')

In [None]:
# Load the data
# Base project folder; the dataset, image and checkpoint paths are derived from it.
# NOTE(review): the path points inside /content/drive, so this presumably needs
# Google Drive to be mounted beforehand — no mount step is visible in this notebook.
GDRIVE_PATH='/content/drive/MyDrive/TFM'
DATASET_PATH=os.path.join(GDRIVE_PATH, 'data_processed/dga.csv')
IMAGES_PATH=os.path.join(GDRIVE_PATH, 'img')
CHECKPOINTS_PATH=os.path.join(GDRIVE_PATH, 'checkpoints')

# The first CSV column is used as the DataFrame index
df = pd.read_csv(DATASET_PATH,index_col=0)

In [None]:
# Show 10 random rows as a quick sanity check of the loaded columns
df.sample(10)

Unnamed: 0,domain,family,category,tld,sld,subdomains_number,digits,length,num_dashes,entropy,ratio_vc,3gram,4gram,5gram,6gram
894784,casino-vavada.quest,benign,benign,quest,casino-vavada,2,0,13,1,2.931209,1.0,11,10,9,8
82733,objectunitedforabothe.ru,gozi,malign,ru,objectunitedforabothe,2,0,21,0,3.61781,0.75,19,18,16,12
819081,enkcf.com,benign,benign,com,enkcf,2,0,5,0,2.321928,0.25,3,2,1,0
995161,lokalclassified.com,benign,benign,com,lokalclassified,2,0,15,0,3.189898,0.666667,13,12,11,10
1034654,peymanapp.ir,benign,benign,ir,peymanapp,2,0,9,0,2.419382,0.5,7,6,5,4
793988,ningyangseo.com,benign,benign,com,ningyangseo,2,0,11,0,2.845351,0.571429,9,8,7,6
1131177,fixmestick.com,benign,benign,com,fixmestick,2,0,10,0,3.121928,0.428571,8,7,6,5
1096602,legislazionetecnica.it,benign,benign,it,legislazionetecnica,2,0,19,0,3.32636,0.9,17,16,15,14
25738,ykydnvbaxtu.com,conficker,malign,com,ykydnvbaxtu,2,0,11,0,3.277613,0.222222,9,4,0,0
560740,nqvylsr.com,vawtrak,malign,com,nqvylsr,2,0,7,0,2.807355,0.0,5,2,0,0


In [None]:
# Optional down-sampling to try the code on a smaller dataset: a stratified
# sample per family, so that every family stays represented.
# It is currently disabled (the line below is commented out), so the full dataset is used.
# NOTE(review): the original note mentioned a 10% sample, but frac=0.5 would keep 50%.

#df=df.groupby('family', group_keys=False).apply(lambda x: x.sample(frac=0.5))
df.shape

(1142536, 15)

In [None]:
# Raw domain strings: the only model input
domains = np.array(df['domain'])

# Labels: 0 for benign domains, 1 for malign ones
Y = np.where(df['category'] == 'benign', 0, 1)

# Character vocabulary: every distinct character found in the dataset gets an
# integer index starting at 1.
# The characters are sorted before numbering so the mapping is deterministic.
# Enumerating a bare set of strings depends on Python's per-process hash
# randomisation, so the same character could get a different index in every
# kernel session, which makes runs and saved weights impossible to compare.
valid_chars = {char: idx for idx, char in enumerate(sorted(set(''.join(domains))), start=1)}

# +1 because index 0 is not assigned to any character: it is the value
# pad_sequences uses for padding (the original note reserves it for UNKNOWN).
max_features = len(valid_chars) + 1

# max_len is the length of the longest domain in the dataset.
# It could instead be fixed to the limit defined by the RFC: a full domain name
# is at most 255 characters long (RFC 1035, section 2.3.4).
max_len = max(len(domain) for domain in domains)

# Replace every character of every domain with its integer index
encoded_domains = [[valid_chars[char] for char in domain] for domain in domains]

# Pad every sequence with zeros up to max_len
X = tf.keras.preprocessing.sequence.pad_sequences(encoded_domains, maxlen=max_len)

# Split into training (80%) and test (20%).
# A fixed seed keeps the split reproducible, and the split is stratified by
# the 'family' column.
x_train, x_test, y_train, y_test = train_test_split(
  X, Y, test_size=0.2, random_state=42, stratify=df['family'])


In [None]:
#@title Hiperparámetros
# Loss function and optimizer passed to every model builder
LOSS_FUNCTION='binary_crossentropy'
OPTIMIZER='nadam'
# NOTE(review): METRICS is not referenced by the model builders visible in this
# notebook; they hard-code metrics=['accuracy'] in their compile() calls.
METRICS=['accuracy']
# Early-stopping patience handed to the EarlyStopping callback
PATIENCE=10
# Upper bound on the number of training epochs
MAX_EPOCHS=100

In [None]:
def _plot_history_panel(ax, history, metric, title):
  '''Draw the training and validation curves of one metric on the given axes.'''
  ax.plot(history.history[metric])
  ax.plot(history.history['val_' + metric])
  ax.legend(['train', 'validate'])
  ax.set_title(title)
  ax.set_xlabel('epoch')
  ax.set_ylabel(metric)


def plot_model(history, title):
  '''
  Plot the training curves of a fitted model as two side-by-side panels.

  The left panel shows accuracy and the right panel shows loss, each with the
  training and the validation series.

  Args:
    history: object returned by model.fit; its `history` dict must contain the
      keys 'accuracy', 'val_accuracy', 'loss' and 'val_loss'.
    title: text appended to the panel titles (e.g. the model name).

  Returns:
    None. The figure is left as the current matplotlib figure.
  '''
  # Explicit axes instead of relying on pyplot's implicit "current axes" state
  _, (ax_accuracy, ax_loss) = plt.subplots(1, 2, figsize=(12, 5))
  _plot_history_panel(ax_accuracy, history, 'accuracy', 'Accuracy ' + title)
  _plot_history_panel(ax_loss, history, 'loss', 'Loss ' + title)

'\n  sns.histplot(data=full_df, ax=ax, stat="count", multiple="stack",\n              x="3gram", kde=False,\n              palette="pastel", hue="category",\n              element="bars", legend=True,discrete=True)\n  ax.set_title("3-Gram")\n  ax.set_xlabel("3Gram")\n  ax.set_ylabel("Dominios")\n'

In [None]:
def get_cm_metrics(model, x_test, y_test, threshold=0.5):
  '''
  Evaluate a binary classifier: confusion matrix, classification report and ROC.

  Displays the confusion matrix, prints the classification report and computes
  the ROC curve from the scores returned by model.predict.
  ROC code based on https://github.com/Tony607/ROC-Keras/blob/master/ROC-Keras.ipynb

  Args:
    model: fitted model exposing predict(x).
    x_test: inputs to score.
    y_test: true binary labels for x_test.
    threshold: decision threshold applied to the scores to obtain hard labels
      for the confusion matrix and the report (default 0.5, the value that was
      previously hard-coded).

  Returns:
    Tuple (fpr, tpr, roc_thresholds, roc_auc) as produced by roc_curve and auc.
  '''
  # Flatten to 1-D so every sklearn helper receives one score per sample
  y_scores = np.asarray(model.predict(x_test)).ravel()
  # Hard labels are computed once and reused by both reports
  y_pred = y_scores > threshold

  # Confusion matrix
  cm = confusion_matrix(y_test, y_pred)
  disp = ConfusionMatrixDisplay(confusion_matrix=cm)
  disp.plot(cmap=plt.cm.Blues)
  plt.show()

  # Classification report
  print(classification_report(y_test, y_pred))

  # ROC curve uses the raw scores, not the thresholded labels
  fpr, tpr, roc_thresholds = roc_curve(y_test, y_scores)
  roc_auc = auc(fpr, tpr)
  return (fpr, tpr, roc_thresholds, roc_auc)


In [None]:
def get_ANN(max_features, max_len, loss_function, optimizer):
  '''
  Build and compile a shallow network with a single hidden dense layer of 128 units.

  Layer stack: Embedding (128 dimensions) -> Dense(128, relu) applied to the
  embedded sequence -> Flatten -> Dropout(0.5) -> one sigmoid output unit.
  The model summary is printed before compiling.

  Args:
    max_features: size of the embedding vocabulary (input_dim).
    max_len: length of the input sequences.
    loss_function: loss passed to compile.
    optimizer: optimizer passed to compile.

  Returns:
    The compiled model, tracking the 'accuracy' metric.
  '''
  model = Sequential()
  model.add(layers.Embedding(input_dim=max_features, output_dim=128, input_length=max_len))
  model.add(layers.Dense(128, activation='relu'))
  model.add(layers.Flatten())
  model.add(layers.Dropout(0.5))
  model.add(layers.Dense(1, activation='sigmoid'))

  model.summary()

  model.compile(loss=loss_function, optimizer=optimizer, metrics=['accuracy'])
  return model

In [None]:
def get_DNN(max_features, max_len, loss_function, optimizer):
  '''
  Build and compile a dense network with two hidden layers of 64 units each.

  Layer stack: Embedding (128 dimensions) -> Dense(64, relu) -> Flatten ->
  Dropout(0.5) -> Dense(64, relu) -> Flatten -> Dropout(0.5) -> one sigmoid
  output unit. The model summary is printed before compiling.

  Args:
    max_features: size of the embedding vocabulary (input_dim).
    max_len: length of the input sequences.
    loss_function: loss passed to compile.
    optimizer: optimizer passed to compile.

  Returns:
    The compiled model, tracking the 'accuracy' metric.
  '''
  stack = [
    layers.Embedding(input_dim=max_features, output_dim=128, input_length=max_len),
    # First hidden block
    layers.Dense(64, activation='relu'),
    layers.Flatten(),
    layers.Dropout(0.5),
    # Second hidden block; its input is already flat, so this Flatten changes nothing
    layers.Dense(64, activation='relu'),
    layers.Flatten(),
    layers.Dropout(0.5),
    # Binary output
    layers.Dense(1, activation='sigmoid'),
  ]
  model = Sequential(stack)

  model.summary()

  model.compile(loss=loss_function, optimizer=optimizer, metrics=['accuracy'])
  return model

In [None]:
def get_LSTM(max_features, max_len, loss_function, optimizer):
  '''
  Build and compile a recurrent model with two stacked LSTM layers.

  Layer stack: Embedding (128 dimensions) -> LSTM(128) returning the full
  sequence -> Dropout(0.5) -> Dense(64, relu) -> Dropout(0.5) -> LSTM(64) ->
  Dropout(0.5) -> one sigmoid output unit. The model summary is printed before
  compiling.

  Args:
    max_features: size of the embedding vocabulary (input_dim).
    max_len: length of the input sequences.
    loss_function: loss passed to compile.
    optimizer: optimizer passed to compile.

  Returns:
    The compiled model, tracking the 'accuracy' metric.
  '''
  model = Sequential()
  model.add(layers.Embedding(input_dim=max_features, output_dim=128, input_length=max_len))
  # The first LSTM keeps the whole sequence so the second LSTM can consume it
  model.add(layers.LSTM(128, return_sequences=True))
  model.add(layers.Dropout(0.5))
  model.add(layers.Dense(64, activation='relu'))
  model.add(layers.Dropout(0.5))
  model.add(layers.LSTM(64))
  model.add(layers.Dropout(0.5))
  model.add(layers.Dense(1, activation='sigmoid'))
  model.summary()

  model.compile(loss=loss_function, optimizer=optimizer, metrics=['accuracy'])
  return model

In [None]:
def get_Conv1D(max_features, max_len, loss_function, optimizer):
  '''
  Build and compile a 1D convolutional model with a single convolutional layer
  of 256 filters and kernel size 5.

  Layer stack: Embedding (128 dimensions) -> Conv1D(256 filters, kernel 5,
  'same' padding, relu) -> GlobalMaxPooling1D -> Dropout(0.5) -> Dense(1) ->
  sigmoid Activation. The model summary is printed before compiling.

  Args:
    max_features: size of the embedding vocabulary (input_dim).
    max_len: length of the input sequences.
    loss_function: loss passed to compile.
    optimizer: optimizer passed to compile.

  Returns:
    The compiled model, tracking the 'accuracy' metric.
  '''
  model = Sequential([
    layers.Embedding(input_dim=max_features, output_dim=128, input_length=max_len),
    layers.Conv1D(filters=256, kernel_size=5, padding='same', activation='relu'),
    layers.GlobalMaxPooling1D(),
    layers.Dropout(0.5),
    # The output unit and its sigmoid are kept as two separate layers
    layers.Dense(1),
    layers.Activation('sigmoid'),
  ])
  model.summary()

  model.compile(loss=loss_function, optimizer=optimizer, metrics=['accuracy'])

  return model

In [None]:
def get_LSTM_Conv1D(max_features, max_len, loss_function, optimizer):
  '''
  Build and compile a two-branch model (LSTM + parallel Conv1D) over one embedding.

  Both branches read the same 128-dimensional character embedding:
    * LSTM branch: LSTM(128) -> Dropout(0.5) -> Dense(1) -> sigmoid.
    * CNN branch: three parallel Conv1D layers of 15 filters with kernel sizes
      2, 4 and 6 (similar in spirit to n-grams), each followed by global max
      pooling; the pooled vectors are summed element-wise, then
      Dropout(0.5) -> Dense(1, sigmoid).

  The model output is the AVERAGE of the two branch probabilities. The branches
  were previously summed, which produces values in (0, 2): that is not a valid
  probability for binary cross-entropy nor for a 0.5 decision threshold.
  The model summary is printed before compiling.

  Args:
    max_features: size of the embedding vocabulary.
    max_len: length of the input sequences.
    loss_function: loss passed to compile.
    optimizer: optimizer passed to compile.

  Returns:
    The compiled functional model, tracking the 'accuracy' metric.
  '''
  domain_input = tf.keras.Input(shape = (max_len,), name='text_input')
  embedded = layers.Embedding(max_features, 128, input_length=max_len)(domain_input)

  # --- LSTM branch ---
  lstm = layers.LSTM(128)(embedded)
  lstm = layers.Dropout(0.5)(lstm)
  lstm = layers.Dense(1)(lstm)
  block_lstm_output = layers.Activation('sigmoid')(lstm)

  # --- CNN branch ---
  # Chollet: a 1D convnet processing sequences of characters using convolution
  # windows of size 5 should be able to learn words or word fragments of
  # length 5 or less.
  pooled_branches = []
  for kernel_size in (2, 4, 6):
    conv = layers.Conv1D(15, kernel_size, activation='relu')(embedded)
    pooled_branches.append(layers.GlobalMaxPooling1D()(conv))

  # Element-wise sum of the three pooled feature vectors (15 values each)
  merged = layers.add(pooled_branches)
  drop = layers.Dropout(0.5)(merged)

  block_cnn_output = layers.Dense(1, activation='sigmoid')(drop)

  # Average (not sum) so the combined output stays within (0, 1)
  output = layers.average([block_lstm_output, block_cnn_output])

  model = tf.keras.Model(domain_input, output)
  model.summary()

  model.compile(loss=loss_function, optimizer=optimizer, metrics=['accuracy'])

  return model

In [None]:
def getLSTM_Attention(max_features, max_len, loss_function, optimizer):
  '''
  Build and compile a Conv1D + dot-product attention model.

  Ref: https://keras.io/api/layers/attention_layers/attention/

  Layer stack: Embedding (128 dimensions) -> Conv1D (kernel 3, 'same' padding)
  used as attention query over the embedded sequence (value) -> global average
  pooling over time -> one sigmoid output unit. The model summary is printed
  before compiling.

  Two defects of the previous version are fixed here:
    * The Conv1D produced 32 channels while the embedding has 128. Dot-product
      attention multiplies query and value/key along their last dimension, so
      both must have the same size; the convolution now outputs as many
      channels as the embedding.
    * The final Dense layer was applied to the whole attended sequence, giving
      one prediction per character position instead of one per domain. The
      sequence is now pooled over time before the output layer.

  NOTE(review): despite its name the function contains no LSTM layer; the name
  is kept so existing references keep working. The chosen fix (matching channel
  count, average pooling) is one reasonable design — confirm it matches the
  intended experiment.

  Args:
    max_features: size of the embedding vocabulary.
    max_len: length of the input sequences.
    loss_function: loss passed to compile.
    optimizer: optimizer passed to compile.

  Returns:
    The compiled functional model, tracking the 'accuracy' metric.
  '''
  embedding_dim = 128

  # Input and embedding
  domain_input = tf.keras.Input(shape = (max_len,), name='text_input')
  embedded = layers.Embedding(max_features, embedding_dim, input_length=max_len)(domain_input)

  # Convolution over the sequence; output channels match the embedding size
  # so the result can be used as the attention query
  conv = layers.Conv1D(filters=embedding_dim, kernel_size=3, strides=1, padding='same')(embedded)

  # Attention: query = convolved sequence, value (and key) = embedded sequence
  attention = layers.Attention()([conv, embedded])

  # Collapse the time axis so the model emits a single prediction per domain
  pooled = layers.GlobalAveragePooling1D()(attention)

  # Binary output
  outputs = layers.Dense(1, activation='sigmoid')(pooled)

  model = tf.keras.Model(inputs=domain_input, outputs=outputs)

  model.summary()

  model.compile(loss=loss_function, optimizer=optimizer, metrics=['accuracy'])

  return model

# Hiperparámetros, funciones de activación...

¿Por qué en ANN/DNN se usan potencias de 2 (2, 8, 16, 64, ...) como número de neuronas? 
En teoría convergen más rápido, pero eso sólo debería aplicarse a las CNN:
https://ai.stackexchange.com/questions/5399/why-should-the-number-of-neurons-in-a-hidden-layer-be-a-power-of-2

¿Por qué 128  en la capa de embedding?

¿Por qué batch size de 128?

¿Por qué 100 epochs?

¿Por qué 0.2 o 0.5 en la capa Dropout? La capa de dropout sirve para evitar el overfitting

¿Por qué 30 filtros en Conv1D? 

¿Por qué kernel size de 4?

¿En Conv1D por qué capas paralelas en lugar de consecutivas?

Utilizar grid search en Keras: https://towardsai.net/p/l/stop-using-grid-search-the-complete-practical-tutorial-on-keras-tuner



In [None]:
# List collecting the metrics and timings of every trained model, for later comparison
metrics=[]
def train_model_get_metrics(model_name, model, validation_split=0.1):
  '''
  Train a compiled model, evaluate it on the test set and record its metrics.

  Steps:
    1. Save a diagram of the architecture under IMAGES_PATH.
    2. Fit on (x_train, y_train) with early stopping and a best-weights
       checkpoint, both monitoring val_loss.
    3. Reload the best checkpoint and evaluate on (x_test, y_test).
    4. Plot the training curves, the confusion matrix and the report.
    5. Append a dict with the results to the module-level `metrics` list.

  Validation data is held out from the TRAINING set. The test set was
  previously passed as validation data, which let early stopping and
  checkpoint selection see the test samples and so biased the reported test
  metrics.

  Args:
    model_name: label used for the image/checkpoint file names and the tables.
    model: compiled Keras model to train.
    validation_split: fraction of the training data held out for validation
      (Keras takes it from the end of the arrays, before shuffling).

  Returns:
    None. Results are appended to `metrics`.
  '''
  # Make sure the output folders exist before anything is written to them
  os.makedirs(IMAGES_PATH, exist_ok=True)
  os.makedirs(CHECKPOINTS_PATH, exist_ok=True)

  tf.keras.utils.plot_model(model, to_file=os.path.join(IMAGES_PATH,'model_'+model_name+'.png'), show_shapes=True)
  cp_path=os.path.join(CHECKPOINTS_PATH,model_name)

  # Callback: keep only the weights of the epoch with the lowest val_loss
  checkpoint = tf.keras.callbacks.ModelCheckpoint(cp_path,
                                                  save_weights_only=True,
                                                  monitor='val_loss',
                                                  mode='min',
                                                  save_best_only=True)
  # Callback: stop training when val_loss has not improved for PATIENCE epochs
  earlyStopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=PATIENCE)

  # Train the model and time the fit
  start=timer()
  history = model.fit(x_train, y_train, batch_size=128, epochs=MAX_EPOCHS,
                      validation_split=validation_split,
                      callbacks=[earlyStopping,checkpoint])
  end = timer()
  t=end-start

  # Load the best checkpoint and evaluate it on the untouched test set
  model.load_weights(cp_path)
  loss, acc = model.evaluate(x_test, y_test, verbose=2)

  # Plot training curves, confusion matrix and classification report
  plot_model(history, model_name)
  (fpr,tpr,threshold,roc_auc)=get_cm_metrics(model, x_test, y_test)

  # Collect metrics, number of network parameters and training time
  metrics.append({'model':model_name,
                  'acc':acc,
                  'loss':loss,
                  'params':model.count_params(),
                  'time':t,
                  'fpr':fpr,
                  'tpr':tpr,
                  'threshold':threshold,
                  'roc_auc':roc_auc
                  })


In [None]:
#@title Modelo ANN (monocapa)
# Train and evaluate the single-hidden-layer dense model; train_model_get_metrics appends the results to `metrics`
train_model_get_metrics(model_name='ANN',model=get_ANN(max_features,max_len,LOSS_FUNCTION,OPTIMIZER))

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 67, 128)           4992      
                                                                 
 dense (Dense)               (None, 67, 128)           16512     
                                                                 
 flatten (Flatten)           (None, 8576)              0         
                                                                 
 dropout (Dropout)           (None, 8576)              0         
                                                                 
 dense_1 (Dense)             (None, 1)                 8577      
                                                                 
Total params: 30,081
Trainable params: 30,081
Non-trainable params: 0
_________________________________________________________________
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoc

In [None]:
#@title Modelo DNN (multicapa)
# Train and evaluate the two-hidden-layer dense model; train_model_get_metrics appends the results to `metrics`
# NOTE(review): this model name is lower-case ('dnn') while the other runs use upper-case names.
train_model_get_metrics(model_name='dnn',model=get_DNN(max_features,max_len,LOSS_FUNCTION,OPTIMIZER))

In [None]:
#@title Modelo RNN (LSTM)
# Train and evaluate the stacked-LSTM model; train_model_get_metrics appends the results to `metrics`
train_model_get_metrics(model_name='LSTM',model=get_LSTM(max_features,max_len,LOSS_FUNCTION,OPTIMIZER))

In [None]:
#@title Modelo CNN (Conv1D)
# Train and evaluate the 1D convolutional model; train_model_get_metrics appends the results to `metrics`
train_model_get_metrics(model_name='CONV1D',model=get_Conv1D(max_features,max_len,LOSS_FUNCTION,OPTIMIZER))

In [None]:
#@title Modelo mixto LSTM-Conv1D
# Train and evaluate the combined LSTM + Conv1D model; train_model_get_metrics appends the results to `metrics`
train_model_get_metrics(model_name='CONV1D_LSTM',model=get_LSTM_Conv1D(max_features,max_len,LOSS_FUNCTION,OPTIMIZER))

# Tabla resumen de resultados
La siguiente tabla resume los resultados obtenidos. 
Se puede observar que los mejores resultados son los ofrecidos por el modelo `LSTM` y por el modelo combinado `LSTM+Conv1D`.


In [None]:
# Markdown is not imported anywhere else in the notebook, so on a fresh kernel
# this cell failed with a NameError; import it (and display) explicitly here.
from IPython.display import Markdown, display

# Header of the Markdown summary table (the leading newline is intentional)
table_header = """
| Modelo | _Accuracy_ | _Loss_ | AUC | Núm. Parámetros | Tiempo entrenamiento |
|--------|------------|--------|-----|-----------------|----------------------|
"""

# One table row per trained model, built from the dicts collected in `metrics`
table_content = ''.join(
  '|' + m['model']
  + '|' + "{:10.2f}".format(m['acc'])
  + '|' + "{:10.2f}".format(m['loss'])
  + '|' + "{:10.3f}".format(m['roc_auc'])
  + '|' + str(m['params'])
  + '|' + "{:.0f} s.".format(m['time']) + '|\n'
  for m in metrics
)

display(Markdown(table_header + table_content))

Comparamos a continuación los resultados por medio de las curvas ROC:



In [None]:
# ROC comparison of every trained model.
# Ref: https://github.com/Tony607/ROC-Keras/blob/master/ROC-Keras.ipynb
# Two views are drawn: the full curve, and a zoom on the top-left corner.
# Each entry is (figure number, title, x limits, y limits); None keeps the default limits.
roc_views = [
  (1, 'Curva ROC', None, None),
  (2, 'Curva ROC (zoom en la parte superior izquierda)', (0, 0.2), (0.8, 1)),
]

for figure_number, figure_title, x_limits, y_limits in roc_views:
  plt.figure(figure_number)
  # Axis limits (when given) are fixed before anything is plotted
  if x_limits is not None:
    plt.xlim(*x_limits)
  if y_limits is not None:
    plt.ylim(*y_limits)
  # Diagonal reference line
  plt.plot([0, 1], [0, 1], 'k--')
  # One curve per model, labelled with its AUC
  for entry in metrics:
    curve_label = entry['model']+' (AUC = {:.3f})'.format(entry['roc_auc'])
    plt.plot(entry['fpr'], entry['tpr'], label=curve_label)
  plt.xlabel('False positive rate')
  plt.ylabel('True positive rate')
  plt.title(figure_title)
  plt.legend(loc='best')
  plt.show()

In [None]:
#@title Modelo CNN + Attention
#train_model_get_metrics(model_name='LSTM_Attention',model=getLSTM_Attention(max_features,max_len,LOSS_FUNCTION,OPTIMIZER))