# Threat Hunting Hugging Face Dataset for Gretel Processing


In [None]:
%pip install wandb #comet_ml
%pip install pyarrow python-dotenv datasets gretel-client
%pip install numpy
%pip install pandas


%pip install -q torch


In [None]:
RUNS=0
PROJECT_NAME="Cognitive Synthesis"

In [None]:
if RUNS == 0:
  !echo "let's set up your project"
  DEFAULT_GH_PROJECT="Synavate Labs"
  USER = input('user?') #this is your git user name
  GH_NAME=input('What is your name?')
  PROJECT_NAME == PROJECT_NAME
  RUNS+=1
else:
  pass

In [None]:
import os
import random
from dotenv import load_dotenv
import numpy as np
import torch
from tqdm.auto import tqdm

COLAB = False

# Ensure deterministic behavior
torch.backends.cudnn.deterministic = True
random.seed(hash("setting random seeds") % 2**32 - 1)
np.random.seed(hash("improves reproducibility") % 2**32 - 1)
torch.manual_seed(hash("by removing stochasticity") % 2**32 - 1)
torch.cuda.manual_seed_all(hash("so runs are repeatable") % 2**32 - 1)

# Device configuration
device = torch.device("mps" if COLAB == False & torch.backends.mps.is_available() else "cpu")
print(f"Your device is {device}")



# Gretel Synthetics

In [59]:
#Hugging Face Data
from datasets import load_dataset
import pandas as pd
load_dotenv()

DATASET = "Olec/cyber-threat-intelligence_v2"
ds = load_dataset(DATASET)
#ds = ds.with_format("torch", device=device)





In [71]:
df = pd.DataFrame({
    "id": ds['train']["id"],
    "text": ds["train"]["text"],
    "entities": ds["train"]["entities"]
})
df

Unnamed: 0,id,text,entities
0,249,A cybersquatting domain save-russia[.]today is...,"[{'end_offset': 16, 'id': 44656, 'label': 'att..."
1,14309,"Like the Android Maikspy, it first sends a not...","[{'end_offset': 17, 'id': 48530, 'label': 'SOF..."
2,13996,While analyzing the technical details of this ...,"[{'end_offset': 194, 'id': 48781, 'label': 'th..."
3,13600,(Note that Flash has been declared end-of-life...,"[{'end_offset': 79, 'id': 51687, 'label': 'TIM..."
4,14364,Figure 21. Connection of Maikspy variants to 1...,"[{'end_offset': 191, 'id': 51779, 'label': 'UR..."
...,...,...,...
327,1852,We also observed the execution of a passwo...,"[{'end_offset': 71, 'id': 46982, 'label': 'too..."
328,535,We also observed that Mailto (AKA NetWalker) t...,"[{'end_offset': 28, 'id': 1765, 'label': 'malw..."
329,2633,While several top-tier RaaS affiliate program...,"[{'end_offset': 98, 'id': 47813, 'label': 'thr..."
330,3281,The malware downloads OBS Studio files if the...,"[{'end_offset': 12, 'id': 48378, 'label': 'mal..."


In [66]:
# Remove N/A and duplicate text, id
print(f"NA: {df.isna().sum()}\n\n")
print(f"NULL: {df.isnull().sum()}\n\n")
df['text'] = df['text'].drop_duplicates()
df['id'] = df['id'].drop_duplicates()
print(df['text'].head())
print(df.shape)



NA: id           0
text        11
entities     0
dtype: int64


NULL: id           0
text        11
entities     0
dtype: int64


0    A cybersquatting domain save-russia[.]today is...
1    Like the Android Maikspy, it first sends a not...
2    While analyzing the technical details of this ...
3    (Note that Flash has been declared end-of-life...
4    Figure 21. Connection of Maikspy variants to 1...
Name: text, dtype: object
(332, 3)


dict_keys(['end_offset', 'id', 'label', 'start_offset'])

In [67]:
%pip install tqdm ipywidgets IProgress
import IProgress
from tqdm.notebook import tqdm
tqdm.pandas()


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [83]:
#Process Text NLP
%pip install nltk
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import pos_tag


def reduce_text(text):
  lemmatizer = WordNetLemmatizer()
  words = [lemmatizer.lemmatize(word) for word in text if word not in stopwords.words('english')]
  return words
  
def tokenize_text(text):  
  tokens = word_tokenize(text)
  tokens = [word for word in tokens if word.isalpha()]
  return ' '.join(tokens)




[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


[nltk_data] Downloading package punkt to /Users/nullzero/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nullzero/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nullzero/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/nullzero/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/nullzero/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [113]:
#Feature extraction function
def extract_features(entities):
    features = {
        "attack-pattern": 0,
        "malware": 0,
        "ip": 0,
        "threat-actor": 0,
        "campaign": 0
    }
    feature = 0
    if df[0]['label'] in features:
        print(df[0].get('entities').get('labels'))
        df['features'] == df[0].get('entities').get('labels')
        feature += 1
    return df['features'].head()




In [90]:
# Isolate attack patterns
def isolate_attack_patterns(entities):
    attack_patterns = [label for label in entities if entity[0]['label'] == 'attack-pattern']
    return ', '.join([f"{entity['start_offset']}-{entity['end_offset']}" for entity in attack_patterns])





In [47]:
#Classify
def classify_text(text):
    return nltk.classify(text)

In [84]:
#Reduce text down using NLTK and apply from Pandas
df["text"] = df["text"].progress_apply(reduce_text)
print("Text reduced")

  0%|          | 0/332 [00:00<?, ?it/s]

Text reduced


In [111]:
df['features'] = df['entities'].apply(extract_features)

KeyError: 0

In [112]:



#Feature extraction


#classify_text
#df["classification"] = df['text'].progress_apply(classify_text)

#Isolate entities 
# Apply the function to the 'entities' column and create a new column with the results
df['attack_patterns'] = df.progress_apply(isolate_attack_patterns)
print("Attack patterns done")



  0%|          | 0/3 [00:00<?, ?it/s]

NameError: name 'entity' is not defined

In [55]:
from sklearn.preprocessing import OneHotEncoder

def one_hot(labels):
    return df["oneHotFeatures"] == df["features"].OneHotEncoder(
        sparse=False
    )
    
df.head()


Unnamed: 0,id,text,entities
0,249,A cybersquatting domain save-russia[.]today is...,"[{'end_offset': 16, 'id': 44656, 'label': 'att..."
1,14309,"Like the Android Maikspy, it first sends a not...","[{'end_offset': 17, 'id': 48530, 'label': 'SOF..."
2,13996,While analyzing the technical details of this ...,"[{'end_offset': 194, 'id': 48781, 'label': 'th..."
3,13600,(Note that Flash has been declared end-of-life...,"[{'end_offset': 79, 'id': 51687, 'label': 'TIM..."
4,14364,Figure 21. Connection of Maikspy variants to 1...,"[{'end_offset': 191, 'id': 51779, 'label': 'UR..."


In [None]:
#Tokenize text
df["text"] = df["text"].progress_apply(tokenize_text)
print("Text tokenized")

In [54]:
#OneHot Encoding
df['entities'][0]["labels"].progress_apply(one_hot)

KeyError: 'labels'

In [None]:
# import the WandB library
import wandb

# start a new experiment
wandb.init(key=os.getenv(os.getenv("WANDB_API_KEY"), project="threathunting model")

# capture a dictionary of hyperparameters with config
wandb.config = {"learning_rate": 0.001, "epochs": 100, "batch_size": 128}

# set up model and data
model, dataloader = get_model(), get_data()

# optional: track gradients
wandb.watch(model)

for batch in dataloader:
  metrics = model.training_step()
  # log metrics inside your training loop to visualize model performance
  wandb.log(metrics)

# optional: save model at the end
model.to_onnx()
wandb.save("model.pt")