# Section 4 – Language Models exploration

In [1]:
import pandas as pd
import numpy as np
import ast
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel
from transformers import get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
import matplotlib.pyplot as plt
import seaborn as sns
import time

## 4.1 Pretrained BERT: data preparation and tokenization

In [2]:
# Read only the two columns this section uses: the raw session text and its label set.
df_o = pd.read_parquet(
    "ssh_attacks.parquet",
    columns=["full_session", "Set_Fingerprint"],
)
# Work on a 40k-row subsample; the fixed seed makes the draw repeatable.
df = df_o.sample(n=40_000, random_state=42)
df

Unnamed: 0,full_session,Set_Fingerprint
201435,cat /proc/cpuinfo | grep name | wc -l ; echo -...,"[Discovery, Persistence]"
8270,"cd /tmp || /var/tmp || /dev/shm ; echo ""ZXZhbC...","[Discovery, Execution, Other, Persistence]"
95809,"cat /proc/cpuinfo | grep name | wc -l ; echo ""...","[Discovery, Execution, Persistence]"
221229,cat /proc/cpuinfo | grep name | wc -l ; echo -...,"[Discovery, Persistence]"
5810,enable ; system ; shell ; sh ; cat /proc/mount...,"[Defense Evasion, Discovery]"
...,...,...
221455,cat /proc/cpuinfo | grep name | wc -l ; echo -...,"[Discovery, Persistence]"
112153,"cat /proc/cpuinfo | grep name | wc -l ; echo ""...","[Discovery, Execution, Persistence]"
47895,"cat /proc/cpuinfo | grep name | wc -l ; echo ""...","[Discovery, Execution, Persistence]"
16530,enable ; system ; shell ; sh ; cat /proc/mount...,"[Defense Evasion, Discovery]"


In [3]:
# Fixed class order: position k of every encoded vector corresponds to all_labels[k].
all_labels = ['Persistence', 'Discovery', 'Defense Evasion', 'Execution', 'Impact', 'Other', 'Harmless']


def _as_label_list(raw):
    """Parse a stringified label list with ast.literal_eval; pass any other value through unchanged."""
    if isinstance(raw, str):
        return ast.literal_eval(raw)
    return raw


# Binarize the multi-label column into one multi-hot vector per session.
mlb = MultiLabelBinarizer(classes=all_labels)
label_sets = df['Set_Fingerprint'].apply(_as_label_list)
df['encoded_labels'] = list(mlb.fit_transform(label_sets))

df

Unnamed: 0,full_session,Set_Fingerprint,encoded_labels
201435,cat /proc/cpuinfo | grep name | wc -l ; echo -...,"[Discovery, Persistence]","[1, 1, 0, 0, 0, 0, 0]"
8270,"cd /tmp || /var/tmp || /dev/shm ; echo ""ZXZhbC...","[Discovery, Execution, Other, Persistence]","[1, 1, 0, 1, 0, 1, 0]"
95809,"cat /proc/cpuinfo | grep name | wc -l ; echo ""...","[Discovery, Execution, Persistence]","[1, 1, 0, 1, 0, 0, 0]"
221229,cat /proc/cpuinfo | grep name | wc -l ; echo -...,"[Discovery, Persistence]","[1, 1, 0, 0, 0, 0, 0]"
5810,enable ; system ; shell ; sh ; cat /proc/mount...,"[Defense Evasion, Discovery]","[0, 1, 1, 0, 0, 0, 0]"
...,...,...,...
221455,cat /proc/cpuinfo | grep name | wc -l ; echo -...,"[Discovery, Persistence]","[1, 1, 0, 0, 0, 0, 0]"
112153,"cat /proc/cpuinfo | grep name | wc -l ; echo ""...","[Discovery, Execution, Persistence]","[1, 1, 0, 1, 0, 0, 0]"
47895,"cat /proc/cpuinfo | grep name | wc -l ; echo ""...","[Discovery, Execution, Persistence]","[1, 1, 0, 1, 0, 0, 0]"
16530,enable ; system ; shell ; sh ; cat /proc/mount...,"[Defense Evasion, Discovery]","[0, 1, 1, 0, 0, 0, 0]"


In [4]:
# Load the tokenizer that belongs to the 'bert-base-uncased' checkpoint.
# It converts raw session text into the input_ids / attention_mask tensors
# that the training and validation loops feed to the model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class MultiLabelTextDataset(Dataset):
    """Pairs raw session strings with their multi-hot label vectors.

    Tokenization is lazy: a session is encoded only when it is indexed, i.e.
    once per sample each time a DataLoader assembles a batch.

    Args:
        sessions: indexable collection of session strings.
        labels: one fixed-length multi-hot vector per session (lists or NumPy arrays).
        tokenizer: optional callable used to encode a session. When omitted, the
            module-level ``tokenizer`` is looked up at indexing time.
        max_length: length every encoded session is padded / truncated to.
    """

    def __init__(self, sessions, labels, tokenizer=None, max_length=512):
        self.sessions = sessions
        # Stack into a single ndarray first: calling torch.tensor() directly on a
        # list of ndarrays is slow and emits a UserWarning.
        self.labels = torch.tensor(np.array(labels), dtype=torch.float32)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(encoded, label)`` for the session at position ``idx``.

        ``encoded`` is the tokenizer output for that single session, padded or
        truncated to ``max_length`` and requested as PyTorch tensors; ``label``
        is the matching float32 multi-hot vector.
        """
        # Fall back to the notebook-level tokenizer when none was injected.
        encode = self.tokenizer if self.tokenizer is not None else tokenizer
        encoded = encode(
            self.sessions[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        return encoded, self.labels[idx]

In [5]:
# 70/30 train/validation split; the fixed seed keeps the partition repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    df['full_session'],
    df['encoded_labels'],
    train_size=0.7,
    random_state=42,
    shuffle=True,
)

# Build both datasets the same way: plain Python lists of sessions and label vectors.
train_dataset, val_dataset = (
    MultiLabelTextDataset(sessions.tolist(), targets.tolist())
    for sessions, targets in ((X_train, y_train), (X_val, y_val))
)

# Training batches are reshuffled every epoch; validation keeps a fixed order.
train_loader = DataLoader(train_dataset, batch_size=30, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)
train_dataset[0]

  self.labels = torch.tensor(labels, dtype=torch.float32)


({'input_ids': tensor([[  101,  4937,  1013,  4013,  2278,  1013, 17368,  2378, 14876,  1064,
          24665, 13699,  2171,  1064, 15868,  1011,  1048,  1025,  9052,  1011,
           1041,  1000,  1059, 27798,  3070,  1032,  1050,  2549,  9818, 28505,
           6169,  4801, 10376,  1032,  1050,  2549,  9818, 28505,  6169,  4801,
          10376,  1000,  1064,  3413, 21724,  1064, 24234,  1025,  9052,  1000,
           1059, 27798,  3070,  1032,  1050,  2549,  9818, 28505,  6169,  4801,
          10376,  1032,  1050,  2549,  9818, 28505,  6169,  4801, 10376,  1032,
           1050,  1000,  1064,  3413, 21724,  1025,  9052,  1000, 24030,  1000,
           1028,  1013, 13075,  1013,  1056,  8737,  1013,  1012, 13075,  2692,
          19481, 19317, 12521,  2509,  1025, 28549,  1011, 21792,  1013, 13075,
           1013,  1056,  8737,  1013,  1012, 13075,  2692, 19481, 19317, 12521,
           2509,  1025,  4937,  1013, 13075,  1013,  1056,  8737,  1013,  1012,
          13075,  2692, 19

## 4.2 Add a Dense layer to the Model

In [6]:
class BertMultiLabelClassifier(nn.Module):
    """BERT encoder followed by dropout and one linear layer that emits a logit per label.

    Args:
        bert_model_name: checkpoint name handed to ``BertModel.from_pretrained``.
        num_labels: width of the output layer (defaults to the number of entries
            in ``all_labels``).
    """

    def __init__(self, bert_model_name='bert-base-uncased', num_labels=len(all_labels)):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.3)
        # The dense head is sized from the encoder's own hidden dimension.
        hidden_size = self.bert.config.hidden_size
        self.linear = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return raw (pre-sigmoid) logits for a batch of tokenized sessions."""
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # pooler_output is BERT's pooled, sequence-level representation
        # (derived from the [CLS] position).
        return self.linear(self.dropout(encoder_out.pooler_output))


In [7]:
# Run on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# nn.Module.to() returns the module itself, so construction and placement can be chained.
model = BertMultiLabelClassifier().to(device)

# Multi-label loss that takes the model's raw logits.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8)


## Training Loop

In [8]:
N = 10  # Number of epochs
# The scheduler is stepped once per training batch, so the schedule spans
# (batches per epoch) x (epochs) steps.
total_steps = N * len(train_loader)
print("total steps: ", total_steps)

# Learning-rate schedule with no warmup phase.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

total steps:  9340


In [9]:
# N (number of epochs) comes from the scheduler cell above. It is deliberately
# not redefined here: the scheduler's total step count was derived from it, and a
# second definition could silently drift out of sync with the learning-rate schedule.
training_stats = []
# Created once, outside the epoch loop, so every epoch's entry is kept for plotting.
validation_stats = []
log_every = 50  # number of batches between progress lines


def _unpack_batch(batch):
    """Split a DataLoader batch into (input_ids, attention_mask, labels) on `device`.

    The encoded tensors carry a singleton dimension at position 1, which is
    squeezed away before they are handed to the model.
    """
    encoded_inputs, labels = batch  # dictionary of tensors and a label tensor
    input_ids = encoded_inputs['input_ids'].squeeze(1).to(device)
    attention_mask = encoded_inputs['attention_mask'].squeeze(1).to(device)
    return input_ids, attention_mask, labels.to(device)


# Measure the total training time for the whole run.
total_t0 = time.time()
for epoch in range(N):
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch + 1, N))
    print('Training...')
    model.train()
    total_loss = 0.0

    for i, batch in enumerate(train_loader):
        input_ids, attention_mask, labels = _unpack_batch(batch)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)

        total_loss += loss.item()
        loss.backward()
        # Clip the gradient norm to 1.0 to guard against exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Apply the parameter update, then advance the learning-rate schedule.
        optimizer.step()
        scheduler.step()

        # Report progress after the step so the printed loss belongs to batch i+1
        # rather than to the previous batch.
        if (i + 1) % log_every == 0:
            elapsed_time = time.time() - total_t0
            print(f"Epoch [{epoch+1}/{N}], Batch [{i+1}/{len(train_loader)}], Loss: {loss.item():.4f}, Time: {elapsed_time:.2f}s")

    avg_train_loss = total_loss / len(train_loader)  # Average training loss for this epoch

    training_stats.append(
        {
            'epoch': epoch + 1,
            'Training Loss': avg_train_loss,
        }
    )

    # Validation step
    model.eval()
    total_val_loss = 0.0
    # Predictions / targets of the current epoch only (reset every epoch).
    all_preds, all_lab = [], []

    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = _unpack_batch(batch)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs, labels)
            total_val_loss += loss.item()

            # Sigmoid turns logits into per-label probabilities; threshold at 0.5.
            preds = (torch.sigmoid(outputs) > 0.5).int()

            all_preds.append(preds.cpu())
            all_lab.append(labels.cpu())

    avg_val_loss = total_val_loss / len(val_loader)

    validation_stats.append(
        {
            'epoch': epoch + 1,
            'Validation Loss': avg_val_loss,
        }
    )

    print(f"Epoch {epoch+1}/{N}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")


end_time = time.time()  # Record the end time of the training
train_duration = end_time - total_t0  # Total wall-clock training duration
print(f"Training completed in {train_duration:.2f} seconds.")



Training...


KeyboardInterrupt: 

In [None]:
# Passing the column names explicitly keeps the 'epoch' key present even when a
# stats list is empty (e.g. training was interrupted before the first epoch
# finished), so the merge below does not fail.
training_df = pd.DataFrame(training_stats, columns=['epoch', 'Training Loss'])
validation_df = pd.DataFrame(validation_stats, columns=['epoch', 'Validation Loss'])
df_train_val = pd.merge(training_df, validation_df, on='epoch')

# Plot style, font size and figure size.
sns.set(style='darkgrid', font_scale=1.5)
plt.rcParams["figure.figsize"] = (12,6)

# Plot the learning curve against the recorded epoch numbers, not the 0-based
# DataFrame index, so the points line up with the 1..N ticks set below.
epochs = df_train_val['epoch']
plt.plot(epochs, df_train_val['Training Loss'], 'b-o', label="Training")
plt.plot(epochs, df_train_val['Validation Loss'], 'g-o', label="Validation")

# Label the plot.
plt.title("Training & Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.xticks(range(1, N + 1))

plt.show()