In [101]:
#!pip install google-cloud-secret-manager
#!pip install tensorflow_decision_forests
#!pip install -q wandb
#######THIS NOTEBOOK USES ONLY SIX UNINTERESTING FEATURES USED MERELY AS A BASELINE#####

In [1]:
import json
import dask.dataframe as dd
from google.cloud import secretmanager
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
import tensorflow_decision_forests as tfdf
from datetime import datetime
import wandb
from wandb.keras import WandbCallback, WandbMetricsLogger

In [2]:
#authenticate self as google user that is connected to GCP cloud account with secret manager access
import sys
if "google.colab" in sys.modules:
    from google.colab import auth
    auth.authenticate_user()

#key value of my secret
secret_name = "harvardmlops_json"

#name of the GCP project
project_id = 'harvardmlops'

#name of GCP bucket
bucket_name = "harvardmlops"

#name of bucket folder to read
gold_folder = "gold"

In [3]:
# Create a local secrets manager client:
client = secretmanager.SecretManagerServiceClient()

# resource F string
resource_name = f"projects/{project_id}/secrets/{secret_name}/versions/latest"

# ask the client to get my secret
response = client.access_secret_version(request={"name": resource_name})

#decode the response
secret_string = response.payload.data.decode('UTF-8')

#token access
token = json.loads(secret_string)

In [4]:
def read(files):

    #storage option paramenter
    storage_options={'token': token}

    #begin time
    start = datetime.now()

    #read files as parquest
    df = dd.read_parquet(files, storage_options=storage_options)

    #stop timing
    end = datetime.now()

    #give time result
    print(f"Read data from in GCP bucket in: {end-start}")

    #ensure domain is str
    df['domains'] = df['domains'].astype(str)

    return df

In [106]:
#location of GCP bucket/folder
files = f"gs://{bucket_name}/{gold_folder}/*/*/*.gzip"

#read files into a dask dataframe
df = read(files)

#convert to pandas DF for first transformation
pandas_df = df.compute()

Read data from in GCP bucket in: 0:00:02.930732


In [107]:
def split_data(data, test_size=0.05, validation_size=0.05):

    if test_size + validation_size >= 1.0:
        raise ValueError("The sum of test_size and validation_size must be less than 1.0.")

    # Calculate the remaining percentage for training data
    train_size = 1.0 - test_size - validation_size

    # Split the data into training and remaining
    train_data, remaining_data = train_test_split(data, train_size=train_size, random_state=42)

    # Split the remaining data into validation and test
    validation_data, test_data = train_test_split(remaining_data, test_size=test_size / (test_size + validation_size), random_state=42)

    return train_data, validation_data, test_data

In [108]:
# get training, valiation, and test data
train_data, validation_data, test_data = split_data(pandas_df)

#attempt to save on memory constraints
del pandas_df

In [109]:
def select_features(from_columns, elements_to_remove):
    features = list(filter(lambda x: x not in elements_to_remove, from_columns))
    return features

def Xy(data, elements_to_remove=["domains", "actor"]):
    features = select_features(data.columns, elements_to_remove)
    X_train = data[features]
    y_train = [label2index[actor] for actor in data['actor']]
    return X_train, y_train

#TRAINING DATA
X_train, y_train = Xy(train_data, ["domains"])
del train_data
train_data = tfdf.keras.pd_dataframe_to_tf_dataset(X_train, label="actor")

#VALIDATION DATA
X_val, y_val = Xy(validation_data, ["domains"])
del validation_data
validation_data = tfdf.keras.pd_dataframe_to_tf_dataset(X_val, label="actor")

#TEST DATA
X_test, y_test = Xy(test_data, ["domains"])
del test_data
test_data = tfdf.keras.pd_dataframe_to_tf_dataset(X_test, label="actor")

In [None]:
#code pulled directly from lecture notes
#def tf_data(X, y, batch_size=10000):
#  # Create TF Dataset
#  tf_dataset = tf.data.Dataset.from_tensor_slices((X, y))
#  #tf_dataset = tf_dataset.shuffle(buffer_size=len(X))
#  tf_dataset = tf_dataset.batch(batch_size)
#  tf_dataset = tf_dataset.prefetch(tf.data.AUTOTUNE)
#  return tf_dataset
#train_data = tf_data(X_train, y_train)
#validation_data = tf_data(X_val, y_val)

In [110]:
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
model_name = "randomforest_6_param"
max_depth = 16
num_trees = 10

In [None]:
model = tfdf.keras.RandomForestModel(num_threads=4,
                                     max_depth = max_depth,
                                     num_trees=num_trees,
                                     allow_na_conditions=True,
                                     verbose=2,
                                     name=model_name)
model.compile(metrics=["accuracy"])

# Initialize a W&B run
wandb.init(
    project = 'harvardmlops',
    config = {
      "num_trees": num_trees,
      "max_depth": max_depth,
      "model_name": model.name
    },
    name = model.name
)

Use /tmp/tmpebkk2aek as temporary training directory


[34m[1mwandb[0m: Currently logged in as: [33mrob-chavez[0m ([33mharvardmlops[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# Train model
import time
start_time = time.time()
training_results = model.fit(train_data,
                             callbacks=[WandbCallback()],
                             validation_data=validation_data)
execution_time = (time.time() - start_time)/60.0
print("Training execution time (mins)",execution_time)

# Update W&B
wandb.config.update({"execution_time": execution_time})
# Close the W&B run
wandb.run.finish()



Reading training dataset...
Training tensor examples:
Features: {'length': <tf.Tensor 'data:0' shape=(None,) dtype=int64>, 'entropy': <tf.Tensor 'data_1:0' shape=(None,) dtype=float64>, 'number_of_vowels': <tf.Tensor 'data_2:0' shape=(None,) dtype=int64>, 'number_of_consonants': <tf.Tensor 'data_3:0' shape=(None,) dtype=int64>, 'number_of_numbers': <tf.Tensor 'data_4:0' shape=(None,) dtype=int64>, 'number_of_specials': <tf.Tensor 'data_5:0' shape=(None,) dtype=int64>}
Label: Tensor("data_6:0", shape=(None,), dtype=int64)
Weights: None
Normalized tensor features:
 {'length': SemanticTensor(semantic=<Semantic.NUMERICAL: 1>, tensor=<tf.Tensor 'Cast:0' shape=(None,) dtype=float32>), 'entropy': SemanticTensor(semantic=<Semantic.NUMERICAL: 1>, tensor=<tf.Tensor 'Cast_1:0' shape=(None,) dtype=float32>), 'number_of_vowels': SemanticTensor(semantic=<Semantic.NUMERICAL: 1>, tensor=<tf.Tensor 'Cast_2:0' shape=(None,) dtype=float32>), 'number_of_consonants': SemanticTensor(semantic=<Semantic.NUMER

[INFO 23-10-03 04:08:06.3315 UTC kernel.cc:773] Start Yggdrasil model training
[INFO 23-10-03 04:08:06.3315 UTC kernel.cc:774] Collect training examples
[INFO 23-10-03 04:08:06.3316 UTC kernel.cc:787] Dataspec guide:
column_guides {
  column_name_pattern: "^__LABEL$"
  type: CATEGORICAL
  categorial {
    min_vocab_frequency: 0
    max_vocab_count: -1
  }
}
default_column_guide {
  categorial {
    max_vocab_count: 2000
  }
  discretized_numerical {
    maximum_num_bins: 255
  }
}
ignore_columns_without_guides: false
detect_numerical_as_discretized_numerical: false

[INFO 23-10-03 04:08:06.3322 UTC kernel.cc:393] Number of batches: 27899
[INFO 23-10-03 04:08:06.3322 UTC kernel.cc:394] Number of examples: 27898208
[INFO 23-10-03 04:08:08.3149 UTC kernel.cc:794] Training dataset:
Number of records: 27898208
Number of columns: 7

Number of columns by type:
	NUMERICAL: 6 (85.7143%)
	CATEGORICAL: 1 (14.2857%)

Columns:

NUMERICAL: 6 (85.7143%)
	1: "entropy" NUMERICAL mean:3.45912 min:0 max:

Model trained in 0:12:12.119821
Compiling model...
Model compiled.
Training execution time (mins) 15.02772464354833


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁
val_accuracy,▁
val_loss,▁

0,1
epoch,0.0
val_accuracy,0.66411
val_loss,0.0


In [None]:
evaluation = model.evaluate(test_data, return_dict=True)
print()

for name, value in evaluation.items():
  print(f"{name}: {value:.4f}")


loss: 0.0000
accuracy: 0.6633


In [None]:
model_name = "gradient_boosted_trees_6_param"
max_depth = 5
num_trees = 20
model_gbt = tfdf.keras.GradientBoostedTreesModel(num_threads=4,
                                                 max_depth = max_depth,
                                                 num_trees=num_trees,
                                                 allow_na_conditions=True,
                                                 verbose=2,
                                                 name=model_name)
model_gbt.compile(metrics=["accuracy"])

# Initialize a W&B run
wandb.init(
    project = 'harvardmlops',
    config = {
      "num_trees": num_trees,
      "max_depth": max_depth,
      "model_name": model_gbt.name
    },
    name = model_gbt.name
)

# Train model
import time
start_time = time.time()
training_results = model_gbt.fit(train_data,
                                 callbacks=[WandbCallback()],
                                 validation_data=validation_data)
execution_time = (time.time() - start_time)/60.0
print("Training execution time (mins)",execution_time)

# Update W&B
wandb.config.update({"execution_time": execution_time})
# Close the W&B run
wandb.run.finish()
evaluation = model_gbt.evaluate(test_data, return_dict=True)
print()

for name, value in evaluation.items():
  print(f"{name}: {value:.4f}")

Use /tmp/tmpi_zt3_8_ as temporary training directory


[34m[1mwandb[0m: Currently logged in as: [33mrob-chavez[0m ([33mharvardmlops[0m). Use [1m`wandb login --relogin`[0m to force relogin




Reading training dataset...
Training tensor examples:
Features: {'length': <tf.Tensor 'data:0' shape=(None,) dtype=int64>, 'entropy': <tf.Tensor 'data_1:0' shape=(None,) dtype=float64>, 'number_of_vowels': <tf.Tensor 'data_2:0' shape=(None,) dtype=int64>, 'number_of_consonants': <tf.Tensor 'data_3:0' shape=(None,) dtype=int64>, 'number_of_numbers': <tf.Tensor 'data_4:0' shape=(None,) dtype=int64>, 'number_of_specials': <tf.Tensor 'data_5:0' shape=(None,) dtype=int64>}
Label: Tensor("data_6:0", shape=(None,), dtype=int64)
Weights: None
Normalized tensor features:
 {'length': SemanticTensor(semantic=<Semantic.NUMERICAL: 1>, tensor=<tf.Tensor 'Cast:0' shape=(None,) dtype=float32>), 'entropy': SemanticTensor(semantic=<Semantic.NUMERICAL: 1>, tensor=<tf.Tensor 'Cast_1:0' shape=(None,) dtype=float32>), 'number_of_vowels': SemanticTensor(semantic=<Semantic.NUMERICAL: 1>, tensor=<tf.Tensor 'Cast_2:0' shape=(None,) dtype=float32>), 'number_of_consonants': SemanticTensor(semantic=<Semantic.NUMER

[INFO 23-10-05 01:09:28.0361 UTC kernel.cc:773] Start Yggdrasil model training
[INFO 23-10-05 01:09:28.0361 UTC kernel.cc:774] Collect training examples
[INFO 23-10-05 01:09:28.0361 UTC kernel.cc:787] Dataspec guide:
column_guides {
  column_name_pattern: "^__LABEL$"
  type: CATEGORICAL
  categorial {
    min_vocab_frequency: 0
    max_vocab_count: -1
  }
}
default_column_guide {
  categorial {
    max_vocab_count: 2000
  }
  discretized_numerical {
    maximum_num_bins: 255
  }
}
ignore_columns_without_guides: false
detect_numerical_as_discretized_numerical: false

[INFO 23-10-05 01:09:28.0373 UTC kernel.cc:393] Number of batches: 27899
[INFO 23-10-05 01:09:28.0373 UTC kernel.cc:394] Number of examples: 27898208
[INFO 23-10-05 01:09:29.8123 UTC kernel.cc:794] Training dataset:
Number of records: 27898208
Number of columns: 7

Number of columns by type:
	NUMERICAL: 6 (85.7143%)
	CATEGORICAL: 1 (14.2857%)

Columns:

NUMERICAL: 6 (85.7143%)
	1: "entropy" NUMERICAL mean:3.45912 min:0 max:

In [97]:
#!pip install transformers
#!pip install torch torchvision torchaudio -f https://download.pytorch.org/whl/cu111/torch_stable.html


In [5]:
import random
import torch
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset, random_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from torch.nn.utils.rnn import pad_sequence

if torch.cuda.is_available():
    print("GPU is available.")
else:
    print("GPU is not available.")

GPU is available.


In [6]:
model_name = "bert_based_model"
# Set a seed for reproducibility
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

#location of GCP bucket/folder
files = f"gs://{bucket_name}/{gold_folder}/*/*/*.gzip"

#read files into a dask dataframe
df = read(files)

#convert to pandas DF for first transformation
pandas_df = df.compute()

# Get a random sample of 10% of the records
sample_percentage = 0.1  # 10%
sampled_df = pandas_df.sample(frac=sample_percentage, random_state=42)

Read data from in GCP bucket in: 0:00:03.264632


In [None]:
label2index =

In [9]:
# Check if a GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define the BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)

# Tokenize the text and convert labels to integers
tokenized_text = [tokenizer.encode(domain, truncation=True, add_special_tokens=True, max_length=20, pad_to_max_length=True) for domain in sampled_df['domains']]
labels = sampled_df['actor'].map({'symmi':0, 'legit':1, 'ranbyus_v1':2, 'kraken_v1':3, 'not_dga':4, 'pushdo':5,
                                  'ranbyus_v2':6, 'zeus-newgoz':7, 'locky':8, 'corebot':9, 'dyre':10, 'shiotob':11,
                                  'proslikefan':12, 'nymaim':13, 'ramdo':14, 'necurs':15, 'tinba':16, 'vawtrak_v1':17,
                                  'qadars':18, 'matsnu':19, 'fobber_v2':20, 'alureon':21, 'bedep':22, 'dircrypt':23,
                                  'rovnix':24, 'sisron':25, 'cryptolocker':26, 'fobber_v1':27, 'chinad':28,
                                  'padcrypt':29, 'simda':30})

# Split the data into train, validation, and test sets
X_train, X_test, y_train, y_test = train_test_split(tokenized_text, labels, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

X_train = torch.LongTensor(X_train)
y_train = torch.LongTensor(np.array(y_train))
X_val = torch.LongTensor(X_val)
y_val = torch.LongTensor(np.array(y_val))
X_test = torch.LongTensor(X_test)
y_test = torch.LongTensor(np.array(y_test))

# Create data loaders
batch_size = 64
lr = 1e-5
epochs = 3
wandb.init(
    project = 'harvardmlops',
    config = {
      "batch_size": batch_size,
      "lr": lr,
      "epochs":epochs,
      "model_name": "bert_based"
    },
    name = "bert_based"
)
train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=wandb.config.batch_size, shuffle=True)
val_loader = DataLoader(TensorDataset(X_val, y_val), batch_size=wandb.config.batch_size, shuffle=False)
test_loader = DataLoader(TensorDataset(X_test, y_test), batch_size=wandb.config.batch_size, shuffle=False)

# Set up training parameters
# Initialize a W&B run
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(sampled_df['actor'].unique()))
model = model.to(device)  # Move the model to the GPU

optimizer = torch.optim.AdamW(model.parameters(), lr=wandb.config.lr)
num_epochs = wandb.config.epochs


# Training loop
for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        input_ids, labels = [data.to(device) for data in batch]
        optimizer.zero_grad()
        output = model(input_ids, labels=labels)
        loss = output.loss
        wandb.log({'train_batch_loss': loss})
        loss.backward()
        optimizer.step()

# Evaluation
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for batch in test_loader:
        input_ids, labels = [data.to(device) for data in batch]
        output = model(input_ids)
        predictions = torch.argmax(output.logits, dim=1)
        correct += (predictions == labels).sum().item()
        total += labels.size(0)

accuracy = correct / total
wandb.log({'test_accuracy': accuracy})
print(f"Test Accuracy: {accuracy:.2%}")

[34m[1mwandb[0m: Currently logged in as: [33mrob-chavez[0m ([33mharvardmlops[0m). Use [1m`wandb login --relogin`[0m to force relogin


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Test Accuracy: 90.47%


In [10]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [11]:
model_save_name = 'bert_dga_classifier.pt'
path = f"/content/gdrive/MyDrive/{model_save_name}"
torch.save(model.state_dict(), path)

In [None]:
#model.load_state_dict(torch.load(path))