In [None]:
!pip install transformers
!pip install datasets

In [None]:
import collections
import itertools
import math
import random
import time,sys,os

import datasets
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset,Value, Sequence, Features
from datasets import Dataset
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModel, DataCollatorWithPadding, AutoModelForMaskedLM
from transformers import default_data_collator
from transformers import DataCollatorForLanguageModeling
from transformers import DistilBertTokenizer, DistilBertModel
from transformers import AutoConfig, AutoModelForMaskedLM, BertForMaskedLM
from transformers import pipeline


**Load the trained DistilBERT model (domain-adapted to HTTP REST API requests to Alteon)**

- This trained model can be found on Hugging Face:

https://huggingface.co/bridge4


In [None]:
# Checkpoint name of the domain-adapted DistilBERT encoder used for feature extraction.
model_checkpoint = "4epochdistilbert_uri_domainadapt"
modelhiddenextract = DistilBertModel.from_pretrained(model_checkpoint)

# Print the module tree as a quick check of what was loaded.
print(modelhiddenextract)

**Feature extraction using trained distilbert model**

- Extract embeddings of each HTTP request URI by passing the URI through the trained model and then mean pooling the hidden states.

In [None]:
# Prefer the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Move the feature-extraction model onto the selected device.
modelhiddenextract.to(device)

def extract_meanpooled_hidden_states(batch):
    """Embed a batch of tokenized URIs by attention-mask-aware mean pooling.

    Runs the notebook-level ``modelhiddenextract`` on the batch and averages
    its last-layer hidden states over the sequence axis. Only positions whose
    ``attention_mask`` is non-zero contribute to the average, so padding
    tokens do not distort the embedding.

    Args:
        batch: Mapping of column name to tensor. Only the keys listed in
            ``tokenizer.model_input_names`` are used; ``input_ids`` and
            ``attention_mask`` must be among them.

    Returns:
        dict with the single key ``"hidden_state"`` holding a NumPy array
        with one pooled vector per row of the batch.
    """
    # Relies on the notebook-level `tokenizer`, `device` and `modelhiddenextract`.
    inputs = {
        name: tensor.to(device)
        for name, tensor in batch.items()
        if name in tokenizer.model_input_names
    }

    with torch.no_grad():
        # The model output exposes last_hidden_state (plus optional hidden_states / attentions).
        last_hidden_state = modelhiddenextract(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
        ).last_hidden_state

    # Broadcast the mask to the hidden-state shape so it can weight every feature.
    input_mask_expanded = (
        inputs["attention_mask"].unsqueeze(-1).expand(last_hidden_state.size()).float()
    )
    # Zero out padded positions BEFORE summing; the denominator already counts
    # only real tokens, so an unmasked numerator would bias the mean.
    summed = torch.sum(last_hidden_state * input_mask_expanded, 1)
    # Clamp guards against division by zero for a row with an all-zero mask.
    token_counts = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    meanpooled = summed / token_counts

    return {"hidden_state": meanpooled.cpu().numpy()}

# Columns that are fed to the model and dropped again once embeddings exist.
model_columns = ["input_ids", "attention_mask"]

# NOTE(review): `tokenized_datasets` is not created in the cells visible here;
# confirm that an earlier tokenization step defines it before this cell runs.
tokenized_datasets.set_format("torch", columns=model_columns)

# Embed in batches of 100 rows (the map() default batch_size is 1000).
tokenized_datasets_hidden = tokenized_datasets.map(
    extract_meanpooled_hidden_states,
    batched=True,
    batch_size=100,
)

# Persist the dataset including the new "hidden_state" column.
tokenized_datasets_hidden.save_to_disk("hiddenextractsavedds_meanpool")

# Working copy without the token-level model inputs.
dataset_hidden = tokenized_datasets_hidden.remove_columns(model_columns)


**Create legitimate and illegitimate HTTP sequences from the dataset**

In [None]:
# Switch the dataset's output format to pandas (in place) so that indexing it
# returns pandas objects instead of Python dicts.
dataset_hidden.set_format(type="pandas")
# Full slice: pull every row into `df`, which the following cells index by column and position.
df = dataset_hidden[:]


In [None]:
########## Get indexes to create sequences ###################
# Each HTTP request sequence is a run of consecutive requests whose length is
# drawn uniformly from 5..15 (inclusive). The last run may be shorter because
# it simply takes whatever rows remain.

# Seed the chunk lengths so that re-running this cell yields the same sequences.
random.seed(42)

indices = list(range(len(df["URI"])))
batches = []
start = 0
while start < len(indices):
    seq_len = random.randint(5, 15)
    batches.append(indices[start:start + seq_len])
    start += seq_len

# Sanity check: number of sequences, the first five, and the last two.
# Slicing (instead of indexing) keeps this safe when fewer than two sequences exist.
print(len(batches), batches[:5], batches[-2:])

In [None]:
##### Using the index batches from the previous cell, build sequences of HTTP requests and wrap them in a new HF dataset #####
############## Label 0 represents a legitimate HTTP request sequence #################
newuricol = []  # one array of URIs per sequence
newhidcol = []  # one 2-D array (requests x hidden size) of embeddings per sequence
for batch in batches:
    newuricol.append(df["URI"].iloc[batch].values)
    # Stack the per-request embedding vectors row-wise; np.stack infers the
    # hidden size from the data instead of relying on a hard-coded 768.
    newhidcol.append(np.stack(df["hidden_state"].iloc[batch].to_list()))

labells = [0] * len(newuricol)

newds = Dataset.from_dict({"URI": newuricol, "hidden_state": newhidcol, "label": labells})


In [None]:
########### New dataframe used for the classification task ##############
GOOD_LABEL = 0  # label for legitimate ("good") sequences

dfclass = pd.DataFrame(columns=["URI", "hidden_state"])
# One row per sequence: the URIs of its requests and the matching embeddings.
dfclass["URI"] = newuricol
dfclass["hidden_state"] = newhidcol
dfclass["label"] = [GOOD_LABEL] * len(dfclass)



**This dataset will be used for two different downstream modeling tasks**

- **Classification with transformer encoder**

For this, create a new dataset in which the HTTP requests within a sequence are permuted, and assign these sequences the label 1 (anomalous).

- **Unsupervised learning using LSTM VAE**

Here we continue to use the original dataset of sequences; labels are not required.

In [None]:
######## For the classification task, create anomalous sequences #############
# Draw 75% of the individual requests in random order, then cut the shuffled
# rows into consecutive runs whose length is drawn uniformly from 5..15.
# NOTE(review): the accompanying text describes this as permuting requests
# within a sequence, but the rows are shuffled across the whole frame before
# chunking. Confirm that this is the intended notion of "anomalous".

ANOM_SEED = 72  # one seed for both the row sampling and the chunk lengths

dfanom = df.sample(frac=0.75, random_state=ANOM_SEED).reset_index(drop=True)

# Seed the chunk lengths as well, so the whole cell is reproducible on re-run.
random.seed(ANOM_SEED)

indices = list(range(len(dfanom["URI"])))
batches = []
start = 0
while start < len(indices):
    seq_len = random.randint(5, 15)
    batches.append(indices[start:start + seq_len])
    start += seq_len

# NOTE: this reuses (and overwrites) the names newuricol / newhidcol / labells.
newuricol = []  # one array of URIs per anomalous sequence
newhidcol = []  # one 2-D array (requests x hidden size) of embeddings per sequence

for batch in batches:
    newuricol.append(dfanom["URI"].iloc[batch].values)
    # Stack the per-request embedding vectors row-wise; np.stack infers the
    # hidden size from the data instead of relying on a hard-coded 768.
    newhidcol.append(np.stack(dfanom["hidden_state"].iloc[batch].to_list()))

# Label 1 marks every sequence built from the shuffled frame as anomalous.
labells = [1] * len(newuricol)

newdsanom = Dataset.from_dict({"URI": newuricol, "hidden_state": newhidcol, "label": labells})

In [None]:
######### Concatenate the legitimate and anomalous datasets for training in the classification task ####################

# Join both label groups, then shuffle with a fixed seed so the two classes are interleaved.
ds_concat = datasets.concatenate_datasets([newds, newdsanom]).shuffle(seed=42)

# Write the combined dataset to disk.
ds_concat.save_to_disk("ds_concat_classif_meanpooled")