In [None]:
#!pip install google-cloud-secret-manager
#!pip install -q wandb
#!pip install transformers
#!pip install torch torchvision torchaudio -f https://download.pytorch.org/whl/cu111/torch_stable.html

In [None]:
import random
import json
from datetime import datetime
import os

#PANDAS
import pandas as pd

#DASK
import dask.dataframe as dd

#NUMPY
import numpy as np

#TORCH
import torch
from torch.utils.data import DataLoader, TensorDataset
# Seed Python's `random`, NumPy and PyTorch (CPU generator plus every CUDA
# device) with one value so sampling and splitting are repeatable across runs.
seed = 42
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_rng(seed)

#SKLEARN
from sklearn.model_selection import train_test_split

#TRANSFORMERS
from transformers import BertTokenizer, DistilBertTokenizer, AutoTokenizer
from transformers import BertForSequenceClassification, AutoModel, DistilBertForSequenceClassification, DebertaForSequenceClassification
from transformers import AdamW

#GOOGLE.CLOUD
from google.cloud import secretmanager
from google.cloud import storage
# Report whether PyTorch can see a CUDA device. The notebook pins `device` to
# the CPU further down, so this message is purely informational here.
print("GPU is available." if torch.cuda.is_available() else "GPU is not available.")

CPU is available.


In [None]:
############################
######## CONSTANTS #########
############################

# GCP project identifier (a numeric project number); get_token() interpolates
# it into the Secret Manager resource path.
project_id = '222163854742'

# Secret Manager secret names, keyed by the alias that callers pass to
# get_token(). Each secret holds JSON credentials (GOOGLE_APPLICATION_CREDENTIALS).
secrets = {
    "project_secret": "harvardmlops_json",
    "trainer_secret": "newtrainerjson",
}

# DATA CONSTANTS
# name of the GCP DATA bucket
bucket_name = "harvardmlops"

# folder inside the data bucket to read from
gold_folder = "gold"

# glob matching every data file three directory levels below the gold folder
files = f"gs://{bucket_name}/{gold_folder}/*/*/*.gzip"


# MODEL CONSTANTS
# name of the GCP MODELS bucket
models_bucket = "dga-models"

# bucket URI under which models are stored
save_path = f"gs://{models_bucket}"

# name of the best GENERAL (pre-trained) model
best_model = "bert-base-uncased"

# file name of the best FINE-TUNED model
best_model_fine_tuned = "bert_dga_classifier.pt"

# full GCS path to the best fine-tuned model
best_fine_tuned_model_path = f"{save_path}/{best_model_fine_tuned}"

# file name under which the quantized model is saved
best_model_quantized = "quantized_bert_dga_classifier_0.pt"

# full GCS path where the quantized model is expected
quantized_path = f"{save_path}/{best_model_quantized}"
# **** MODELS ****


# LABELS
# DGA actor name -> integer class id used as the classification target
actors = {'symmi':0, 'legit':1, 'ranbyus_v1':2, 'kraken_v1':3, 'not_dga':4, 'pushdo':5,
          'ranbyus_v2':6, 'zeus-newgoz':7, 'locky':8, 'corebot':9, 'dyre':10, 'shiotob':11,
          'proslikefan':12, 'nymaim':13, 'ramdo':14, 'necurs':15, 'tinba':16, 'vawtrak_v1':17,
          'qadars':18, 'matsnu':19, 'fobber_v2':20, 'alureon':21, 'bedep':22, 'dircrypt':23,
          'rovnix':24, 'sisron':25, 'cryptolocker':26, 'fobber_v1':27, 'chinad':28,
          'padcrypt':29, 'simda':30}

# Number of classes, derived from the mapping so the two can never drift apart.
num_labels = len(actors)

# The class ids must be exactly 0..num_labels-1 for the classification head.
assert sorted(actors.values()) == list(range(num_labels)), "actor ids must be contiguous from 0"
# *** LABELS ***

# device needs to be cpu
device = torch.device("cpu")

In [None]:
############################
######### FUNCTIONS ########
############################

def get_token(type):
    """Fetch a JSON secret from Google Secret Manager and return it parsed.

    Parameters
    ----------
    type : str
        Alias of the secret to fetch; must be a key of the module-level
        ``secrets`` mapping (e.g. ``"trainer_secret"``). The name shadows the
        ``type`` builtin but is kept so existing callers keep working.

    Returns
    -------
    The object produced by ``json.loads`` on the UTF-8 decoded payload of the
    secret's ``latest`` version.

    Raises
    ------
    KeyError
        If ``type`` is not a key of ``secrets``.
    """
    # Resolve the alias before touching the network: an unknown alias used to
    # turn into the literal secret name "None" and fail with an opaque API error.
    if type not in secrets:
        raise KeyError(f"Unknown secret alias {type!r}; expected one of {sorted(secrets)}")
    secret_name = secrets[type]

    # Create a local secrets manager client
    client = secretmanager.SecretManagerServiceClient()

    # Always read the most recent version of the secret
    resource_name = f"projects/{project_id}/secrets/{secret_name}/versions/latest"
    response = client.access_secret_version(request={"name": resource_name})

    # The payload is raw bytes holding a JSON document
    return json.loads(response.payload.data.decode('UTF-8'))


def read(files:str, token):
    """Open the parquet files matched by ``files`` and cast ``domains`` to str.

    Parameters
    ----------
    files : str
        Path or glob handed to ``dd.read_parquet``.
    token :
        Credentials forwarded to the storage backend as
        ``storage_options={'token': token}``.

    Returns
    -------
    The dataframe returned by ``dd.read_parquet`` with its ``domains`` column
    converted via ``astype(str)``.
    """
    # NOTE(review): dd.read_parquet is normally lazy, so the elapsed time printed
    # below likely covers only the setup of the read, not the download itself -
    # confirm before relying on this number.
    started_at = datetime.now()
    frame = dd.read_parquet(files, storage_options={'token': token})
    elapsed = datetime.now() - started_at

    # report how long the read call took
    print(f"Read data from in GCP bucket in: {elapsed}")

    # make sure every domain is handled as text downstream
    frame['domains'] = frame['domains'].astype(str)

    return frame


def get_data(kind, frac=0.001, random_state=42):
    """Load the dataset and return a reproducible random sample as pandas.

    Parameters
    ----------
    kind : str
        Secret alias passed to ``get_token`` to obtain storage credentials.
    frac : float, default 0.001
        Fraction of rows to keep, forwarded to ``DataFrame.sample``.
    random_state : int, default 42
        Seed forwarded to ``DataFrame.sample``.

    Returns
    -------
    The sampled rows of the computed dataframe.
    """
    # credentials for the storage backend come from the secrets manager
    credentials = get_token(kind)

    # the module-level `files` glob selects what gets read
    lazy_frame = read(files, credentials)

    # the whole dataset is computed first; the sample is drawn afterwards
    return lazy_frame.compute().sample(frac=frac, random_state=random_state)


def tokenize_data(model_name, data, predict=False, max_length=20):
    """Encode the ``domains`` column with the tokenizer that matches ``model_name``.

    Parameters
    ----------
    model_name : str
        One of ``'bert-base-uncased'``, ``'distilbert-base-uncased'`` or
        ``'microsoft/deberta-base'``.
    data :
        Table with a ``domains`` column and, unless ``predict`` is true, an
        ``actor`` column of actor names.
    predict : bool, default False
        When true only the encoded domains are returned (no labels needed).
    max_length : int, default 20
        Length every encoded domain is truncated or padded to.

    Returns
    -------
    ``tokenized_text`` when ``predict`` is true, otherwise the tuple
    ``(tokenized_text, labels)`` where ``labels`` is ``data['actor']`` mapped
    through the module-level ``actors`` dictionary.

    Raises
    ------
    ValueError
        If ``model_name`` is not supported, or if ``data['actor']`` contains a
        name missing from ``actors``.
    """
    # Supported checkpoints and the tokenizer class used to load each of them.
    tokenizer_classes = {
        'bert-base-uncased': BertTokenizer,
        'distilbert-base-uncased': DistilBertTokenizer,
        'microsoft/deberta-base': AutoTokenizer,
    }
    tokenizer_cls = tokenizer_classes.get(model_name)
    if tokenizer_cls is None:
        # ValueError is still an Exception, so existing handlers keep working.
        raise ValueError("Model not found")
    tokenizer = tokenizer_cls.from_pretrained(model_name)

    # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
    tokenized_text = [
        tokenizer.encode(
            domain,
            truncation=True,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
        )
        for domain in data['domains']
    ]

    if predict:
        return tokenized_text

    # map str labels to int
    labels = data['actor'].map(actors)

    # Names missing from `actors` map to NaN; fail here with a clear message
    # instead of letting NaN labels reach tensor construction.
    unmapped = labels.isna()
    if unmapped.any():
        unknown = sorted(set(data.loc[unmapped, 'actor'].astype(str)))
        raise ValueError(f"Unknown actor label(s): {unknown}")

    return tokenized_text, labels


def split_data(tokenized_text, labels, batch_size, random_state=42):
    """Split encoded domains and labels into train / validation / test loaders.

    20 percent of the rows are held out as the test set; 10 percent of the
    remainder becomes the validation set (about 72 / 8 / 20 overall).

    Parameters
    ----------
    tokenized_text :
        Encoded domains, one equal-length sequence per row.
    labels :
        Integer class id for each row.
    batch_size : int
        Batch size used by all three loaders.
    random_state : int, default 42
        Seed passed to both ``train_test_split`` calls.

    Returns
    -------
    ``(train_loader, val_loader, test_loader)``; only the training loader shuffles.
    """
    def _as_loader(features, targets, shuffle):
        # both tensors are int64: token ids for the model, class ids for the target
        dataset = TensorDataset(torch.LongTensor(features), torch.LongTensor(np.array(targets)))
        return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

    # first cut: hold out the test set
    X_rest, X_test, y_rest, y_test = train_test_split(
        tokenized_text, labels, test_size=0.2, random_state=random_state
    )

    # second cut: carve the validation set out of what is left
    X_train, X_val, y_train, y_val = train_test_split(
        X_rest, y_rest, test_size=0.1, random_state=random_state
    )

    return (
        _as_loader(X_train, y_train, shuffle=True),
        _as_loader(X_val, y_val, shuffle=False),
        _as_loader(X_test, y_test, shuffle=False),
    )



def get_model(model_name):
    """Build the classifier for ``model_name`` and load the fine-tuned weights.

    The architecture is created with ``from_pretrained(model_name,
    num_labels=num_labels)``. The fine-tuned checkpoint at
    ``best_fine_tuned_model_path`` is then downloaded from GCS and loaded over
    it with ``load_state_dict``.

    Parameters
    ----------
    model_name : str
        One of ``'bert-base-uncased'``, ``'distilbert-base-uncased'`` or
        ``'microsoft/deberta-base'``.

    Returns
    -------
    The model, moved to the module-level ``device``, carrying the fine-tuned
    weights.

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported checkpoints.

    Side effects
    ------------
    Writes the downloaded checkpoint to ``best_model_fine_tuned`` (a relative
    path, so it lands in the current working directory).
    """
    # Supported checkpoints and the classification class used for each of them.
    model_classes = {
        'bert-base-uncased': BertForSequenceClassification,
        'distilbert-base-uncased': DistilBertForSequenceClassification,
        'microsoft/deberta-base': DebertaForSequenceClassification,
    }
    model_cls = model_classes.get(model_name)
    if model_cls is None:
        # ValueError is still an Exception, so existing handlers keep working.
        raise ValueError("Model Not Found")

    model = model_cls.from_pretrained(model_name, num_labels=num_labels)

    # local file the checkpoint is downloaded to
    local_model_path = f"{best_model_fine_tuned}"

    # Split "gs://<bucket>/<blob>" into its two parts. The local names differ
    # from the module-level `bucket_name` so the data-bucket constant is not shadowed.
    gcs_bucket, blob_name = best_fine_tuned_model_path.replace('gs://', '').split('/', 1)

    # Download the model from GCS
    client = storage.Client()
    client.get_bucket(gcs_bucket).blob(blob_name).download_to_filename(local_model_path)

    # NOTE(security): torch.load unpickles the file; only load checkpoints from
    # a bucket you control.
    state_dict = torch.load(local_model_path, map_location=device)
    model.load_state_dict(state_dict)

    # send to cpu (a single move, after the weights are in place)
    model = model.to(device)

    return model

In [None]:
############################
######### PROCESS ##########
############################

# Tunables for this run: share of the dataset to sample (1 percent) and the
# batch size used by every data loader.
sample_frac = 0.01
batch_size = 64

# RETRIEVE DATA - sample 1 percent of data
data = get_data("trainer_secret", frac=sample_frac)

# GET FINE-TUNED MODEL (get_model returns only the model, no optimizer)
model = get_model(best_model)

# TOKENIZE DATA AND RETRIEVE LABELS
domains, labels = tokenize_data(best_model, data)

# SPLIT DATA
train_loader, val_loader, test_loader = split_data(domains, labels, batch_size)

In [None]:
# Evaluate the FINE-TUNED model on test data
model.eval()
correct = 0
total = 0
with torch.no_grad():
    # Batch-local names are used so the notebook-level `labels` produced by the
    # tokenization cell is not overwritten (re-running the split cell relies on it).
    for batch_input_ids, batch_labels in test_loader:
        batch_input_ids = batch_input_ids.to(device)
        batch_labels = batch_labels.to(device)
        output = model(batch_input_ids)
        # predicted class = index of the largest logit in each row
        predictions = torch.argmax(output.logits, dim=1)
        correct += (predictions == batch_labels).sum().item()
        total += batch_labels.size(0)
# An empty loader would otherwise raise ZeroDivisionError.
accuracy = correct / total if total else float("nan")
print(f"Test Accuracy: {accuracy:.2%}")

Test Accuracy: 90.49%


In [None]:
# Dynamically quantize the fine-tuned model: only torch.nn.Linear layers are
# targeted and qint8 is requested as the quantized dtype.
linear_layers_only = {torch.nn.Linear}
quantized_model = torch.quantization.quantize_dynamic(
    model,
    qconfig_spec=linear_layers_only,
    dtype=torch.qint8,
)
# Print the module tree of the quantized model for inspection.
print(quantized_model)

In [None]:
import os
def print_size_of_model(model):
    """Print the on-disk size, in megabytes, of ``model``'s serialized state_dict.

    The state_dict is written with ``torch.save`` to a scratch file inside a
    private temporary directory. The directory is removed on exit even when
    saving fails, so nothing is left behind and no ``temp.p`` in the working
    directory is created or overwritten.

    Parameters
    ----------
    model :
        Any object exposing ``state_dict()`` (e.g. a ``torch.nn.Module``).
    """
    import tempfile  # local import: only this helper needs it

    with tempfile.TemporaryDirectory() as scratch_dir:
        # keep the original file name inside the private directory
        scratch_file = os.path.join(scratch_dir, "temp.p")
        torch.save(model.state_dict(), scratch_file)
        print('Size (MB):', os.path.getsize(scratch_file)/1e6)

# Report the serialized size of the original model first, then the quantized one.
for _sized_model in (model, quantized_model):
    print_size_of_model(_sized_model)

Size (MB): 438.08923
Size (MB): 181.501674


In [None]:
# Quantized Evaluation
# Scored on test_loader: the result is reported as "Test Accuracy" and is
# compared with the fine-tuned model's test accuracy, so both must use the
# same split (this loop previously iterated val_loader).
quantized_model.to(torch.device('cpu'))
quantized_model.eval()
correct = 0
total = 0
with torch.no_grad():
    # Batch-local names keep the notebook-level `labels` from being overwritten.
    for batch_input_ids, batch_labels in test_loader:
        batch_input_ids = batch_input_ids.to(torch.device('cpu'))
        batch_labels = batch_labels.to(torch.device('cpu'))
        output = quantized_model(batch_input_ids)
        # predicted class = index of the largest logit in each row
        predictions = torch.argmax(output.logits, dim=1)
        correct += (predictions == batch_labels).sum().item()
        total += batch_labels.size(0)

In [None]:
# `named_modules` is a method: it has to be called, otherwise the cell only
# shows the bound-method repr. Display the first few (name, class) pairs to
# keep the output short.
[(name, type(module).__name__) for name, module in quantized_model.named_modules()][:10]

In [None]:
# Accuracy of the quantized model, from the `correct` / `total` counters
# accumulated by the evaluation loop two cells above.
# NOTE(review): the label is fixed text - make sure that loop iterated the
# split named here.
accuracy = correct / total
print(f"Test Accuracy: {accuracy:.2%}")

Test Accuracy: 87.11%


In [None]:
#SAVE THE MODEL
torch.save({
    'model_state_dict': quantized_model.state_dict(),
    'quantization_config': quantized_model.config,
}, best_model_quantized)

In [None]:
# Rebuild a dynamically quantized copy of the model first, so its state_dict
# layout matches the quantized checkpoint that is loaded into it below.
loaded_quantized_model = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8
)

# Target device for the reloaded model
device = torch.device("cpu")  # Change to your desired device

# Load the checkpoint written by the save cell. It lives in the local file
# `best_model_quantized`; the previously used name `path` was never defined.
# NOTE(security): torch.load unpickles the file; only load trusted checkpoints.
checkpoint = torch.load(best_model_quantized, map_location=device)
loaded_quantized_model.load_state_dict(checkpoint['model_state_dict'])

# The 'quantization_config' entry was saved from `quantized_model.config`, so
# it is restored to `.config` rather than to the unrelated `.qconfig` attribute.
loaded_quantized_model.config = checkpoint['quantization_config']

# Ensure the model is in evaluation mode
loaded_quantized_model.eval()

# Move the loaded quantized model to the desired device
loaded_quantized_model = loaded_quantized_model.to(device)

In [None]:
# Quantized Evaluation (model reloaded from the saved checkpoint)
# Scored on test_loader: the result is reported as "Test Accuracy" and is
# compared with the fine-tuned model's test accuracy, so both must use the
# same split (this loop previously iterated val_loader).
loaded_quantized_model.to(torch.device('cpu'))
loaded_quantized_model.eval()
correct = 0
total = 0
with torch.no_grad():
    # Batch-local names keep the notebook-level `labels` from being overwritten.
    for batch_input_ids, batch_labels in test_loader:
        batch_input_ids = batch_input_ids.to(torch.device('cpu'))
        batch_labels = batch_labels.to(torch.device('cpu'))
        output = loaded_quantized_model(batch_input_ids)
        # predicted class = index of the largest logit in each row
        predictions = torch.argmax(output.logits, dim=1)
        correct += (predictions == batch_labels).sum().item()
        total += batch_labels.size(0)

In [None]:
# Accuracy of the reloaded quantized model, from the `correct` / `total`
# counters accumulated by the evaluation loop in the previous cell.
# NOTE(review): the label is fixed text - make sure that loop iterated the
# split named here.
accuracy = correct / total
print(f"Test Accuracy: {accuracy:.2%}")

Test Accuracy: 87.11%


In [None]:
# EXAMPLE OF HOW TO ENCODE AND RUN FOR A LIST OF DOMAINS


# EXAMPLE DOMAINS
# Kept under their own names so the notebook-level `domains` (the tokenized
# dataset consumed by the split cell) is not overwritten by this demo.
example_domains = ["cnn.com", "espn.com", "dfjdfjdf.ru", "doraku.com", "kidfhkhjgareuspde.it"]
example_df = pd.DataFrame(example_domains, columns=['domains'])

# Tokenize the DOMAINS with the tokenizer of the model that was fine-tuned
tokenized_text = tokenize_data(best_model, example_df, predict=True)

# Setup the Inputs (named `input_ids` so the builtin `input` is not shadowed)
input_ids = torch.LongTensor(tokenized_text)

# Eval the model
loaded_quantized_model.eval()

# Retrieve the outputs; gradients are not needed for inference
with torch.no_grad():
    output = loaded_quantized_model(input_ids)

# Get the predictions (index of the largest logit per domain)
predictions = torch.argmax(output.logits, dim=1)

# Show the predicted class ids; they correspond to the values of `actors`
predictions.tolist()