In [None]:
# Mount Google Drive at /content/drive; the notebook's data and output paths
# (see `path` in the parameters cell) live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

# Install

In [None]:
%pip install 'h5py<3.0.0'

In [None]:
%pip install tensorflow==1.15.0

In [None]:
# NOTE(review): the visible cells only import `tensorflow.keras`; confirm that
# this standalone (release-candidate) keras pin is still required.
%pip install keras==2.5.0rc0

In [None]:
%pip install wordninja

# Imports

In [None]:
import csv
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sn
import tensorflow_hub as hub
import tensorflow as tf
import tensorflow.keras
import wordninja as wn

from sklearn import metrics
from sklearn import preprocessing
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Dense,Lambda,Input
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Parameters

In [None]:
# Root folder on the mounted Drive. Fold CSVs are read from path + "Dataset/"
# and checkpoints/metrics are written to path + "Saved/".
path = "/content/drive/MyDrive/Cyber Security/Elmo/"
# Location handed to hub.Module to load the ELMo embedding module.
elmo_path = "/content/drive/MyDrive/Cyber Security/Elmo/3"

# Mini-batch size used by both model.fit and model.predict.
batch_size = 32
# Maximum number of training epochs per fold (early stopping may end sooner).
numEpochs = 10

# The training loop iterates over range(start_fold, end_fold), so end_fold is
# exclusive: with these values folds 1..10 are processed.
start_fold = 1
end_fold = 11

# Number of splits used by StratifiedKFold when the fold files are generated.
nfolds = 10

# Name of the semicolon-delimited dataset CSV, located directly under `path`.
nome_file = "Dataset_Completo.csv"

# Definitions

In [None]:
def arrayToSentence(x):
  """Concatenate the tokens of ``x`` into one string, adding a space after each.

  Because the space follows every token, the result ends with a trailing
  space whenever ``x`` is non-empty; an empty ``x`` gives the empty string.
  """
  pieces = [token + ' ' for token in x]
  return ''.join(pieces)

def buildDataset():
  """Read the dataset CSV and build the model inputs and both label encodings.

  The file at ``path + nome_file`` is parsed as a semicolon-delimited CSV.
  For every non-empty row, column 0 is taken as the class label and column 3
  as the text, which is split into words with ``wordninja`` and re-joined
  into a space-separated sentence by ``arrayToSentence``.

  Returns:
    dataset_x: list of space-separated word strings, one per row.
    dataset_y: list of the raw string labels found in column 0.
    temp_y: list of ints, 1 where the label equals 'dga' and 0 otherwise.

  Raises:
    IndexError: if a non-empty row has fewer than four columns.
  """
  dataset_x = []
  dataset_y = []
  temp_y = []

  # The context manager closes the file even when a row fails to parse.
  with open(path + nome_file, newline="") as filecsv:
    for row in csv.reader(filecsv, delimiter=";"):
      if not row:
        # csv.reader yields [] for blank lines; indexing them would raise.
        continue
      label = row[0]
      dataset_y.append(label)
      temp_y.append(1 if label == 'dga' else 0)
      dataset_x.append(arrayToSentence(wn.split(row[3])))

  return dataset_x, dataset_y, temp_y

In [None]:
def kfold(x, y, temp_y):
  """Split the dataset into stratified folds and write every fold to CSV.

  ``StratifiedKFold(n_splits=nfolds)`` is applied to ``x`` using ``temp_y`` as
  the stratification target. For each fold ``k`` (numbered from 1) six files
  are written under ``path + "Dataset/"``: ``x_train{k}.csv``,
  ``x_test{k}.csv``, ``y_train{k}.csv``, ``y_test{k}.csv``,
  ``temp_y_train{k}.csv`` and ``temp_y_test{k}.csv``, all ';'-delimited.

  Args:
    x: samples; must support indexing with an index array (e.g. a numpy array).
    y: labels aligned with ``x``, indexable the same way.
    temp_y: integer labels aligned with ``x``, used for stratification and
      written with an integer format.
  """
  import os

  out_dir = path + "Dataset/"
  # np.savetxt does not create missing folders, so make sure the target exists.
  os.makedirs(out_dir, exist_ok=True)

  splitter = StratifiedKFold(n_splits=nfolds)

  for fold, (train, test) in enumerate(splitter.split(x, temp_y), start=1):
    print("Writing fold " + str(fold) + " to csv...")
    suffix = str(fold) + ".csv"
    np.savetxt(out_dir + "x_train" + suffix, x[train], fmt='%s', delimiter=';')
    np.savetxt(out_dir + "x_test" + suffix, x[test], fmt='%s', delimiter=';')
    np.savetxt(out_dir + "y_train" + suffix, y[train], fmt='%s', delimiter=';')
    np.savetxt(out_dir + "y_test" + suffix, y[test], fmt='%s', delimiter=';')
    np.savetxt(out_dir + "temp_y_train" + suffix, temp_y[train], fmt='%i', delimiter=';')
    np.savetxt(out_dir + "temp_y_test" + suffix, temp_y[test], fmt='%i', delimiter=';')
  print("Files created")

In [None]:
def encode(le, labels):
    """One-hot encode ``labels`` using an already fitted label encoder.

    Args:
        le: fitted encoder exposing ``transform`` and ``classes_``
            (e.g. ``sklearn.preprocessing.LabelEncoder``).
        labels: sequence of labels known to ``le``.

    Returns:
        A 2-D one-hot array with one row per label and one column per class
        known to ``le``.
    """
    enc = le.transform(labels)
    # Fix the width to the number of known classes. Without num_classes,
    # to_categorical infers it as max(enc) + 1, so a subset of labels that
    # lacks the highest class index would produce too few columns.
    return tf.keras.utils.to_categorical(enc, num_classes=len(le.classes_))

def decode(le, one_hot):
    """Turn rows of per-class scores back into the encoder's original labels.

    The column with the highest value in each row of ``one_hot`` is taken as
    the class index and passed to ``le.inverse_transform``.
    """
    return le.inverse_transform(np.argmax(one_hot, axis=1))

# Preprocessing

In [None]:
# Load the segmented text, the raw string labels and the 0/1 'dga' flags.
dataset_x, dataset_y, temp_y = buildDataset()
dataset_x, temp_y = np.array(dataset_x), np.array(temp_y)

# Fit the encoder on the raw labels, then replace them with their one-hot form.
le = preprocessing.LabelEncoder().fit(dataset_y)
dataset_y_encode = encode(le, dataset_y)
dataset_y = np.array(dataset_y_encode)

# Run only once, the first time, to generate the per-fold CSV files:
# kfold(dataset_x, dataset_y, temp_y)

# Model

In [None]:
# Train and evaluate one ELMo-based classifier per pre-generated fold.
# For every fold in range(start_fold, end_fold) the loop:
#   1. loads the fold's train/test CSV files from path + "Dataset/",
#   2. builds an ELMo -> Dense(128) -> Dense(n_classes) Keras model,
#   3. trains it, checkpointing the best model (by val_loss) to path + "Saved/",
#   4. reloads that checkpoint, predicts on the test split, and
#   5. saves/prints the confusion matrix and classification report.
fold = 0
for fold in range(start_fold, end_fold):
  print('Fold: ', fold, 'Epochs: ', numEpochs)
  # Load this fold's splits from the CSV files written by kfold()
  x_train = np.genfromtxt(path + "Dataset/x_train" + str(fold) + ".csv", delimiter=';', dtype=None)
  x_test = np.genfromtxt(path + "Dataset/x_test" + str(fold) + ".csv", delimiter=';', dtype=None)
  y_train = np.genfromtxt(path + "Dataset/y_train" + str(fold) + ".csv", delimiter=';', dtype=None)
  y_test = np.genfromtxt(path + "Dataset/y_test" + str(fold) + ".csv", delimiter=';', dtype=None)
  y_train_temp = np.genfromtxt(path + "Dataset/temp_y_train" + str(fold) + ".csv", delimiter=';', dtype=None)
  y_test_temp = np.genfromtxt(path + "Dataset/temp_y_test" + str(fold) + ".csv", delimiter=';', dtype=None)

  print('Model Construction...')
  model = None

  # Model construction
  # Load the module that provides the ELMo embedding function.
  # NOTE(review): a new hub.Module and Keras model are created on every
  # iteration without resetting the graph; check whether the default graph
  # should be cleared between folds to limit memory growth.
  elmo = hub.Module(elmo_path)

  # Embedding function wrapped by the Lambda layer below
  def ELMoEmbedding(x):
    return elmo(tf.squeeze(tf.cast(x, tf.string)), signature="default", as_dict=True)["default"]

  input_text = Input(shape=(1,), dtype=tf.string)
  embedding = Lambda(ELMoEmbedding, output_shape=(1024, ))(input_text)
  dense = Dense(128, activation='relu')(embedding)
  # One output unit per column of the one-hot encoded training labels
  pred = Dense(len(y_train[0]), activation='sigmoid')(dense)
  model = Model(inputs=[input_text], outputs=pred)

  model.compile('adam', 'binary_crossentropy', metrics=['accuracy',
      tf.keras.metrics.AUC(),
      tf.keras.metrics.Precision(),
      tf.keras.metrics.Recall()
      ])

  print('...done!')

  print('Train...')
  # Training
  Type = 'binary-'  # NOTE(review): not used in the visible cells; kept in case a later cell reads it
  with tf.compat.v1.Session() as session:
    # NOTE(review): early stopping watches the training loss while the
    # checkpoint watches val_loss; confirm the mismatch is intentional.
    earlystop = EarlyStopping(monitor='loss', patience=3)
    best_save = ModelCheckpoint(path + 'Saved/bestmodel' + str(fold) + '.hdf5', save_best_only=True,
                                save_weights_only=False,
                                monitor='val_loss',
                                mode='min')
    tf.compat.v1.keras.backend.set_session(session)
    session.run(tf.compat.v1.global_variables_initializer())
    session.run(tf.compat.v1.tables_initializer())
    history = model.fit(x_train,y_train,
              batch_size=batch_size,
              epochs=numEpochs,
              callbacks=[earlystop, best_save],
              validation_split=0.1
              )
    print('\nhistory dict:', history.history)

  print('...done!')

  print('Test...')
  # Testing: open a fresh session and predict with the best checkpoint of this fold
  with tf.compat.v1.Session() as session:
    tf.compat.v1.keras.backend.set_session(session)
    session.run(tf.compat.v1.global_variables_initializer())
    session.run(tf.compat.v1.tables_initializer())
    best_model = load_model(path + 'Saved/bestmodel' + str(fold) + '.hdf5')
    predicts = best_model.predict(x_test, batch_size=batch_size, verbose= 1)

  # Convert predictions and the 0/1 ground truth to string labels
  y_preds = decode(le, predicts)
  y_test_temp = np.where(y_test_temp == 1, 'dga', 'legit')
  print('...done!')

  print('Results:')
  # Use the encoder's class order everywhere so that the confusion-matrix
  # rows/columns, the report keys and the heatmap axes refer to the same
  # classes. The previous hard-coded target_names=['legit', 'dga'] was applied
  # to the alphabetically sorted labels and therefore swapped the two classes.
  class_names = list(le.classes_)
  cm = metrics.confusion_matrix(y_test_temp, y_preds, labels=class_names)
  np.savetxt(path + 'Saved/confusion_matrix' + str(fold) + '.csv', cm, delimiter=',',  fmt='%i')
  metrics1 = metrics.classification_report(y_test_temp, y_preds, labels=class_names, output_dict=True)
  print('Confusion_matrix:')
  print(cm)
  print('Classification_report:')
  print(metrics1)

  # Append this fold's report to the cumulative metrics file. Only a missing
  # file is treated as "first fold"; any other error is allowed to surface
  # instead of silently overwriting the metrics collected so far.
  metrics_file = path + "Saved/metrics.csv"
  fold_metrics = pd.DataFrame(metrics1)
  try:
    previous_metrics = pd.read_csv(metrics_file, index_col=[0])
  except FileNotFoundError:
    all_metrics = fold_metrics
  else:
    all_metrics = pd.concat([previous_metrics, fold_metrics])
  all_metrics.to_csv(metrics_file)

  # Plot the confusion matrix as a heatmap
  df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)
  plt.figure(1, figsize = (10,7))
  sn.heatmap(df_cm, annot=True, fmt="d")
  plt.title('Confusion matrix - fold ' + str(fold))
  plt.show()