In [None]:
# What 3rd party models can be loaded into elasticsearch ML for inference?
# https://www.elastic.co/guide/en/machine-learning/current/ml-inference.html
# https://www.elastic.co/guide/en/machine-learning/current/appendix-third-party-models.html
# https://www.elastic.co/guide/en/machine-learning/current/appendix-third-party-models.html#appendix-third-party-models

In [39]:
#%pip install xgboost eland elasticsearch elasticsearch_serverless pandas gzip tensorflow tqdm


In [8]:
import pandas as pd
import gzip
import json

# Load gzipped JSON data
with open('.\dga-training-data-encoded.json\dga-training-data-fixed.json', 'rt', encoding='utf-8') as f:
    sourcedata = [json.loads(line) for line in f]

# Convert to DataFrame
df = pd.DataFrame(sourcedata)
df.head()

In [31]:
import random
import string
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Sample data in the specified format
data = [
    {"domain": "google", "threat": "benign"},
    {"domain": "example", "threat": "benign"},
    {"domain": "random1234", "threat": "malicious"},
    {"domain": "abcd1234", "threat": "benign"},
    {"domain": "malicious", "threat": "malicious"},
    {"domain": "dangerous123", "threat": "malicious"}
]

# All possible characters in a domain name
characters = list(string.ascii_letters + string.digits)
random.shuffle(characters)

# Creating a lookup table
lookup_table = {char: idx + 1 for idx, char in enumerate(characters)}

# Function to convert domain names into encoded format
def encode_domain(domain):
    return [lookup_table.get(char, 0) for char in domain]

# Process test data into features and labels
domains = [item['domain'] for item in data]
threats = [1 if item['threat'] == 'dga' else 0 for item in data]  # 1 for malicious, 0 for benign

# Encoding test domains
encoded_domains = pad_sequences([encode_domain(domain) for domain in domains], maxlen=20)

encoded_domains, threats


(array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 15, 45,
         45, 15, 30, 43],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 43,  6, 14,
         32, 17, 30, 43],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 57, 14, 18,  3, 45, 32,
         22, 31, 58, 53],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 14, 40, 51,  3,
         22, 31, 58, 53],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 32, 14, 30, 35, 51,
         35, 45, 25, 56],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  3, 14, 18, 15, 43, 57, 45, 25,
         56, 22, 31, 58]]),
 [0, 0, 0, 0, 0, 0])

In [1]:
lookup_table

NameError: name 'lookup_table' is not defined

In [32]:
from tqdm import tqdm
# Process test data into features and labels
domains = [item['domain'] for item in sourcedata]
threats = [1 if item['threat'] == 'dga' else 0 for item in sourcedata]  # 1 for malicious, 0 for benign

# Encoding test domains
encoded_domains = pad_sequences([encode_domain(domain) for domain in tqdm(domains, desc='Encoding Domains')], maxlen=40)

# Splitting data
X_train, X_test, y_train, y_test = train_test_split(encoded_domains, threats, test_size=0.3, random_state=42)


Encoding Domains: 100%|██████████| 16246006/16246006 [00:42<00:00, 378685.52it/s] 


In [64]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV


# Initialize the model - CPU only
model = XGBClassifier()

param_grid = {
    'max_depth': [3, 4, 5],
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# Fit the model
grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy', verbose=2, n_jobs=8)

grid_search.fit(X_train, y_train)


# Best parameters and best score
print("Best parameters found: ", grid_search.best_params_)
print("Best accuracy found: ", grid_search.best_score_)

# Evaluate on the test set
best_model = grid_search.best_estimator_
predictions = best_model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print("Accuracy on test set: {:.2f}%".format(accuracy * 100))

Fitting 3 folds for each of 243 candidates, totalling 729 fits
Best parameters found:  {'colsample_bytree': 1.0, 'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 200, 'subsample': 0.8}
Best accuracy found:  0.9365585597266213
Accuracy on test set: 93.70%


In [None]:
GPU Section

from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV


# Initialize the model - w/ GPU Support
model = XGBClassifier(tree_method='device')

param_grid = {
    'max_depth': [3, 4, 5],
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# Fit the model
grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy', verbose=2, n_jobs=1)

grid_search.fit(X_train, y_train)


# Best parameters and best score
print("Best parameters found: ", grid_search.best_params_)
print("Best accuracy found: ", grid_search.best_score_)

# Evaluate on the test set
best_model = grid_search.best_estimator_
predictions = best_model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print("Accuracy on test set: {:.2f}%".format(accuracy * 100))

# Test model locally

In [None]:
import random

# Grab 20 random samples from sourcedata
samples = random.sample(sourcedata, 20)

# Encode the domains
encoded_samples = pad_sequences([encode_domain(sample['domain']) for sample in samples], maxlen=40)

# Run the encoded samples against the model
predictions2 = best_model.predict(encoded_samples)

# Print the predictions interactively
for i, sample in enumerate(samples):
    print("Sample", i+1)
    print("Domain:", sample['domain'])
    print("Threat:", sample['threat'])
    print("Prediction:", ['dga' if predictions2[i] == 1 else 'benign'])
    print()


# Load model into elasticsearch and use for inference ther

In [69]:
from eland.ml import MLModel
from elasticsearch import Elasticsearch

url = "" 
api_key = ""
es = Elasticsearch(
    url,
    api_key="",  # API key for your project
)
print(es.info())


{'name': 'serverless', 'cluster_name': 'ca9a9d90a7e44baab2629c85cc6ba7ea', 'cluster_uuid': 'uoMZYkwSR9auvQUhIRxtXQ', 'version': {'number': '8.11.0', 'build_flavor': 'serverless', 'build_type': 'docker', 'build_hash': '00000000', 'build_date': '2023-10-31', 'build_snapshot': False, 'lucene_version': '9.7.0', 'minimum_wire_compatibility_version': '8.11.0', 'minimum_index_compatibility_version': '8.11.0'}, 'tagline': 'You Know, for Search'}


In [72]:
feature_names = ["f0", "f1", "f2", "f3", "f4", "f5", "f6", "f7", "f8", "f9", "f10", "f11", "f12", "f13", "f14", "f15", "f16", "f17", "f18", "f19", "f20", "f21", "f22", "f23", "f24", "f25", "f26", "f27", "f28", "f29", "f30", "f31", "f32", "f33", "f34", "f35", "f36", "f37", "f38", "f39"]
model_id = "dga_xgboost_classifier_v1"
es_model = MLModel.import_model(
  es,
  model_id=model_id,
  model=best_model,
  feature_names=feature_names,
  es_if_exists='replace'
)


In [74]:
# run samples up against elasticsearch

import random

# Grab 20 random samples from sourcedata
samples = random.sample(sourcedata, 20)

# Encode the domains
encoded_samples = pad_sequences([encode_domain(sample['domain']) for sample in samples], maxlen=40)

# Run the encoded samples against the model
predictions3 = es_model.predict(encoded_samples)

# Print the predictions interactively
for i, sample in enumerate(samples):
    print("Sample", i+1)
    print("Domain:", sample['domain'])
    print("Threat:", sample['threat'])
    print("Prediction:", ['dga' if predictions3[i] == 1 else 'benign'])
    print()


Sample 1
Domain: a67l527ly2c4o27
Threat: dga
Prediction: ['dga']

Sample 2
Domain: vendercomprardolares
Threat: benign
Prediction: ['benign']

Sample 3
Domain: bhyqyjrnxawn
Threat: dga
Prediction: ['dga']

Sample 4
Domain: gripqlul3del1lcjutm
Threat: dga
Prediction: ['dga']

Sample 5
Domain: wisconsinyes
Threat: benign
Prediction: ['benign']

Sample 6
Domain: yvxxik
Threat: dga
Prediction: ['dga']

Sample 7
Domain: khdqgecohmrnukfqr
Threat: dga
Prediction: ['dga']

Sample 8
Domain: e61fax5v1n7b12u8cr1pen1
Threat: dga
Prediction: ['dga']

Sample 9
Domain: celtmyth
Threat: benign
Prediction: ['dga']

Sample 10
Domain: mlfjeniwsffqogvtusmd
Threat: dga
Prediction: ['dga']

Sample 11
Domain: ezwsvxrqsylzjpow
Threat: dga
Prediction: ['dga']

Sample 12
Domain: wsguigqiuamgiueyimocwoye
Threat: dga
Prediction: ['dga']

Sample 13
Domain: 3xc0uvkp1lsrw0g0gh5vk6a
Threat: dga
Prediction: ['dga']

Sample 14
Domain: tiwshjx
Threat: dga
Prediction: ['dga']

Sample 15
Domain: xuwwvhfcxlqwnot
Threat: dg