In [1]:
!git clone https://github.com/huggingface/transformers
!cd transformers && pip install .

fatal: destination path 'transformers' already exists and is not an empty directory.
Processing /data/notebooks/transformers
Building wheels for collected packages: transformers
  Building wheel for transformers (setup.py) ... [?25ldone
[?25h  Created wheel for transformers: filename=transformers-2.5.1-py3-none-any.whl size=528611 sha256=edd5bf6ebd45ce3365dfecb2e4f6857cbae694d001e0d370cf89f13c53bdfe89
  Stored in directory: /tmp/pip-ephem-wheel-cache-lb61b4rp/wheels/5c/1c/47/7aca7c86ce98d3f7beb792bd7f926ef4d3cc45abd4f8daaa44
Successfully built transformers
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 2.5.1
    Uninstalling transformers-2.5.1:
      Successfully uninstalled transformers-2.5.1
Successfully installed transformers-2.5.1


In [2]:
import torch
from transformers import *
import pandas as pd
import numpy as np
import tensorflow as tf

# If there's a GPU available...

if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

# Get the GPU device name.
device_name = tf.test.gpu_device_name()

# The device name should look like the following:
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

There are 1 GPU(s) available.
We will use the GPU: Tesla V100-SXM2-16GB
Found GPU at: /device:GPU:0


In [3]:
data_dir = "../datasets/TrainingV2/Train_test_validation/"

all_df = pd.read_csv(data_dir + "all_data.csv")
train_df = pd.read_csv(data_dir + "train_cwe_nlp.csv")
validation_df = pd.read_csv(data_dir + "validation_cwe_nlp.csv")
test_df = pd.read_csv(data_dir + "test_cwe_nlp.csv")
underrep_df = pd.read_csv(data_dir + "underrep_cwe_nlp.csv")

# Drop datapoints with less than 5 examples
counts = all_df["CWE-ID"].value_counts()
underrepresented_cwes = counts.where(counts < 6).dropna().index.tolist()
underrep_frame = all_df[all_df['CWE-ID'].isin(underrepresented_cwes)]
all_df = all_df[~all_df['CWE-ID'].isin(counts[counts < 6].index)]
counts = all_df["CWE-ID"].value_counts()

In [4]:
all_labels = all_df["CWE-ID"].unique()
train_labels = train_df['CWE-ID'].unique()
validation_labels = validation_df['CWE-ID'].unique()
test_labels = test_df['CWE-ID'].unique()
merged_labels = np.concatenate((train_labels, validation_labels, test_labels))
merged_labels = np.unique(merged_labels)
print(len(all_labels))
print(len(train_labels))
print(len(validation_labels))
print(len(test_labels))
print("Unique: {}".format(len(merged_labels)))
#print(np.logical_and( (train_labels==validation_labels).all(), (validation_labels==test_labels).all() ))
#print(np.sort(merged_labels))

377
377
348
344
Unique: 377


In [5]:
lookup_table = dict(zip(list(merged_labels), range(0, len(merged_labels))))
print(lookup_table)

{16: 0, 17: 1, 18: 2, 19: 3, 20: 4, 22: 5, 23: 6, 28: 7, 29: 8, 32: 9, 33: 10, 35: 11, 36: 12, 37: 13, 38: 14, 39: 15, 41: 16, 42: 17, 46: 18, 49: 19, 50: 20, 55: 21, 57: 22, 58: 23, 59: 24, 61: 25, 62: 26, 64: 27, 65: 28, 67: 29, 69: 30, 73: 31, 74: 32, 77: 33, 78: 34, 79: 35, 80: 36, 81: 37, 82: 38, 83: 39, 84: 40, 85: 41, 88: 42, 89: 43, 90: 44, 91: 45, 93: 46, 94: 47, 95: 48, 96: 49, 98: 50, 99: 51, 113: 52, 116: 53, 117: 54, 118: 55, 119: 56, 120: 57, 122: 58, 123: 59, 124: 60, 125: 61, 129: 62, 130: 63, 131: 64, 134: 65, 138: 66, 143: 67, 147: 68, 150: 69, 151: 70, 154: 71, 155: 72, 156: 73, 157: 74, 158: 75, 166: 76, 167: 77, 170: 78, 172: 79, 174: 80, 176: 81, 177: 82, 178: 83, 179: 84, 180: 85, 181: 86, 182: 87, 184: 88, 185: 89, 187: 90, 189: 91, 190: 92, 191: 93, 193: 94, 194: 95, 195: 96, 197: 97, 199: 98, 200: 99, 203: 100, 204: 101, 205: 102, 206: 103, 207: 104, 208: 105, 209: 106, 211: 107, 212: 108, 213: 109, 214: 110, 215: 111, 219: 112, 222: 113, 223: 114, 226: 115, 2

In [6]:
all_df['CWE-ID'] = all_df['CWE-ID'].apply(lambda x:lookup_table[x])
train_df['CWE-ID'] = train_df['CWE-ID'].apply(lambda x:lookup_table[x])
validation_df['CWE-ID'] = validation_df['CWE-ID'].apply(lambda x:lookup_table[x])
test_df['CWE-ID'] = test_df['CWE-ID'].apply(lambda x:lookup_table[x])

In [7]:
train_df['CWE-ID'].value_counts()

35     8672
56     8331
4      5683
99     4794
124    3829
       ... 
373       4
280       4
97        4
368       4
154       4
Name: CWE-ID, Length: 377, dtype: int64

In [8]:
# Configuration Section

In [9]:
MAX_LEN = 512
BATCH_SIZE = 16
EPOCHS = 4
NUM_LABELS = len(lookup_table.keys())

In [10]:
MODELS = (BertForSequenceClassification,       BertTokenizer,       'bert-base-uncased')
model_class, tokenizer_class, pretrained_weights = MODELS
tokenizer = tokenizer_class.from_pretrained(pretrained_weights, do_lower_case=True)

In [11]:

def get_tokens(tokenizer, dataframe):
  return dataframe['Description'].apply(lambda desc: 
                              tokenizer.encode(
                                  desc, 
                                  add_special_tokens=True, 
                                  max_length=MAX_LEN, 
                                  pad_to_max_length=True
                                  #return_tensors = 'pt'
                                  )
                              ).to_list()

def get_attention_masks(input_ids):
  attention_masks = []
  for sent in input_ids:
    att_mask = [int(token_id > 0) for token_id in sent]
    attention_masks.append(att_mask)
  return attention_masks


train_input_tokens = get_tokens(tokenizer, train_df)
validation_input_tokens = get_tokens(tokenizer, validation_df)
test_input_tokens = get_tokens(tokenizer, test_df)

train_masks = get_attention_masks(train_input_tokens)
validation_masks = get_attention_masks(validation_input_tokens)
test_masks = get_attention_masks(test_input_tokens)

In [12]:
print(' Original: ', train_df["Description"][0])

# Print the sentence split into tokens.
print('Tokenized: ', tokenizer.tokenize(train_df["Description"][0]))

# Print the sentence mapped to token ids.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(train_df["Description"][0])))

 Original:  The Spotfire Library component of TIBCO Software Inc.'s TIBCO Spotfire Analytics Platform for AWS Marketplace, and TIBCO Spotfire Server contains a vulnerability that might theoretically fail to restrict users with read-only access from modifying files stored in the Spotfire Library, only when the Spotfire Library is configured to use external storage. Affected releases are TIBCO Software Inc.'s TIBCO Spotfire Analytics Platform for AWS Marketplace versions up to and including 10.0.0, and TIBCO Spotfire Server versions up to and including 7.10.1; 7.11.0; 7.11.1; 7.12.0; 7.13.0; 7.14.0; 10.0.0.
Tokenized:  ['the', 'spot', '##fire', 'library', 'component', 'of', 'ti', '##bc', '##o', 'software', 'inc', '.', "'", 's', 'ti', '##bc', '##o', 'spot', '##fire', 'analytics', 'platform', 'for', 'aw', '##s', 'marketplace', ',', 'and', 'ti', '##bc', '##o', 'spot', '##fire', 'server', 'contains', 'a', 'vulnerability', 'that', 'might', 'theoretically', 'fail', 'to', 'restrict', 'users', '

In [13]:
train_labels = train_df['CWE-ID'].values
validation_labels = validation_df['CWE-ID'].values
test_labels = test_df['CWE-ID'].values

In [14]:
# Model expects PyTorch tensors, not numpy arrays

train_inputs = torch.tensor(train_input_tokens)
validation_inputs = torch.tensor(validation_input_tokens)
test_inputs = torch.tensor(test_input_tokens)

train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
test_labels = torch.tensor(test_labels)

train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)
test_masks = torch.tensor(test_masks)

In [15]:
# Calculate weights


In [16]:

from sklearn.utils import class_weight
import statistics

class_weights = class_weight.compute_class_weight('balanced',
                                                 np.unique(all_df["CWE-ID"]),
                                                 all_df["CWE-ID"])
#values = all_df["CWE-ID"].value_counts().values
#mean = round(sum(values) / len(values))
#print(values)
#print("Mean: {}".format(mean))
#weights = [round(float(mean) / float(value), 2) for value in values]
#print(weights)

In [17]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, WeightedRandomSampler

# `BATCH_SIZE` specified elsewhere
# For fine-tuning BERT on a specific task, the authors recommend a batch size of
# 16 or 32. Either must be run on 16GB GPU, will crash runtime on anything less

# Create the DataLoader for our training set.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = WeightedRandomSampler(class_weights, num_samples=len(class_weights)) #RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)

# Create the DataLoader for our validation set.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=BATCH_SIZE)

# Create the DataLoader for our test set.
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=BATCH_SIZE)

In [18]:
model = model_class.from_pretrained(
    pretrained_weights, # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = NUM_LABELS, # The number of output labels--2 for binary classification.
                    # You can increase this for multi-class tasks.   
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

# Tell pytorch to run this model on the GPU.
model.cuda()

optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )


# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * EPOCHS

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

In [19]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [None]:
import random

# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128

# Set the seed value all over the place to make this reproducible.
seed_val = random.getrandbits(32) #42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Store the average loss after each epoch so we can plot them.
loss_values = []

for epoch_i in range(0, EPOCHS):
    
    # ========================================
    #               Training
    # ========================================
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, EPOCHS))
    print('Training...')

    t0 = time.time()
    total_loss = 0 # Reset the total loss for this epoch.
    model.train() # Just sets the `mode`, does not perform training

    for step, batch in enumerate(train_dataloader):

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # `batch` contains three pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: labels 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad() # Always clear any previously calculated gradients before performing a backward pass. Not automatic with PyTorch

        # Perform a forward pass (evaluate the model on this training batch).
        # This will return the loss (rather than the model output) because we
        outputs = model(b_input_ids, 
                    token_type_ids=None, 
                    attention_mask=b_input_mask, 
                    labels=b_labels)
        
        loss = outputs[0] # Pull loss value out of the tuple

        total_loss += loss.item() # For average loss calculation at the end

        
        loss.backward() # Perform a backward pass to calculate the gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) # Prevent exploding gradients

        optimizer.step() # Take a step with the new gradients

        scheduler.step() # Update the learning rate

    avg_train_loss = total_loss / len(train_dataloader) # Calculate the average loss over the training data.         
    loss_values.append(avg_train_loss)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(format_time(time.time() - t0)))
        
    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    t0 = time.time()
    model.eval() # Evaluation mode. Dropout layers behave differently

    # Tracking variables 
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        
        b_input_ids, b_input_mask, b_labels = batch
        
        with torch.no_grad():        
            outputs = model(b_input_ids, 
                            token_type_ids=None, 
                            attention_mask=b_input_mask)
        
        # Get the "logits" output by the model. The "logits" are the output
        # values prior to applying an activation function like the softmax.
        logits = outputs[0]
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        tmp_eval_accuracy = flat_accuracy(logits, label_ids) # Calculate the accuracy for this batch of test sentences.
        
        eval_accuracy += tmp_eval_accuracy  # Accumulate the total accuracy.
        nb_eval_steps += 1 # Track the number of batches

    # Report the final accuracy for this validation run.
    print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")


Training...


add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
add_(Tensor other, Number alpha)
addcmul_(Number value, Tensor tensor1, Tensor tensor2)
Consider using one of the following signatures instead:
addcmul_(Tensor tensor1, Tensor tensor2, Number value)
addcdiv_(Number value, Tensor tensor1, Tensor tensor2)
Consider using one of the following signatures instead:
addcdiv_(Tensor tensor1, Tensor tensor2, Number value)



  Average training loss: 5.62
  Training epoch took: 0:00:12

Running Validation...
  Accuracy: 0.10
  Validation took: 0:02:23

Training...

  Average training loss: 5.03
  Training epoch took: 0:00:12

Running Validation...
  Accuracy: 0.10
  Validation took: 0:02:24

Training...

  Average training loss: 4.74
  Training epoch took: 0:00:12

Running Validation...
  Accuracy: 0.10
  Validation took: 0:02:23

Training...

  Average training loss: 4.63
  Training epoch took: 0:00:12

Running Validation...


In [87]:
import os

# Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()

output_dir = '../Models/ModelV2/ModelBase/Weighted_Sampling/'

# Create output directory if needed
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

print("Saving model to %s" % output_dir)

# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

Saving model to ../Models/ModelV2/ModelBase/Weighted_Sampling/


('../Models/ModelV2/ModelBase/Weighted_Sampling/vocab.txt',
 '../Models/ModelV2/ModelBase/Weighted_Sampling/special_tokens_map.json',
 '../Models/ModelV2/ModelBase/Weighted_Sampling/added_tokens.json')

In [88]:
# Testing

In [89]:
# Prediction on test set

print('Predicting labels for {:,} test sentences...'.format(len(test_inputs)))

# Put model in evaluation mode
model.eval()

# Tracking variables 
predictions , true_labels = [], []

# Predict 
for batch in test_dataloader:
  # Add batch to GPU
  batch = tuple(t.to(device) for t in batch)
  
  # Unpack the inputs from our dataloader
  b_input_ids, b_input_mask, b_labels = batch
  
  # Telling the model not to compute or store gradients, saving memory and 
  # speeding up prediction
  with torch.no_grad():
      # Forward pass, calculate logit predictions
      outputs = model(b_input_ids, token_type_ids=None, 
                      attention_mask=b_input_mask)

  logits = outputs[0]

  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()
  
  # Store predictions and true labels
  predictions.append(logits)
  true_labels.append(label_ids)

Predicting labels for 14,367 test sentences...


In [90]:
prediction_values = [prediction.argmax() for prediction in predictions]

In [91]:
prediction_values, true_values = [], []
for batch in predictions:
    for prediction in batch:
        prediction_values.append(prediction.argmax())

for batch in true_labels:
    for true_value in batch:
        true_values.append(true_value)

count = 0
for prediction, true_label in zip(prediction_values, true_values):
    if prediction == true_label:
        count+=1

print("Predicted {} / {} {} correctly".format(count, len(prediction_values), float(count) / float(len(prediction_values))))

Predicted 6663 / 14367 0.463771142200877 correctly


# Experimentation

In [92]:
model_dir = '../Models/ModelV2/ModelBase/Weighted_Sampling/'
model = model_class.from_pretrained(model_dir)
tokenizer = tokenizer_class.from_pretrained(model_dir)

In [93]:
cwe_lookup_table = pd.read_csv("../datasets/cwe-lookup-table.csv",index_col=False)

cwe_lookup_table.loc[cwe_lookup_table['CWE-ID'] == 255]["Description"].values[0]

'Credentials Management Errors'

In [94]:
def predict(model, sentence, lookup_table):
  # Drop into evaluation mode
  model.eval()

  # Send model to the CPU
  model.cpu()

  # Tokenize the provided text
  # TODO: Tokenizer global at the moment, change that
  encoded_sent = tokenizer.encode(sentence, add_special_tokens=True, max_length=MAX_LEN, pad_to_max_length=True)
  attention_mask = [int(token_id > 0) for token_id in encoded_sent]

  # 1x512 tensor
  sent_tensor = torch.tensor([encoded_sent])
  attn_tensor = torch.tensor([attention_mask])

  # Speeds up calc when gradients don't need to be calculated
  with torch.no_grad():
      # Forward pass, calculate logit predictions
      # TODO: Why no softmax output??
      outputs = model(sent_tensor, token_type_ids=None, 
                      attention_mask=attn_tensor)
      logits = outputs[0]

  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  #print(logits)
  prediction = logits.argmax()
  # `lookup_table` holds mapping of labels to CWEs.
  # TODO: Save the lookup table alongside model weights
  for k,v in lookup_table.items():
    if v == prediction:
      cwe = k
      break
  try:
    # Not all CWEs present in the lookup table (mainly 'CWE CATEGORIES' which are missing)
    # TODO: Extract CWEs for master XML spec, not CSVs - DONE
    # TODO: Generate additional training data from CWE spec. At least 3 cases for each CWE can be created with 'Description',
    #         'Extended Description', and 'Background Detail', possibly more - DONE
    # 
    description = cwe_lookup_table.loc[cwe_lookup_table['CWE-ID'] == cwe]["Description"].values[0]
  except:
    description = "💩"

  print("Input: {}\n\tPredicted CWE: {}: {}, Prob: {}".format(sentence, cwe, description, prediction))

In [95]:
predict(model, "The app accepts a user controlled serialized object", lookup_table)
predict(model, "Site deserializes client side object", lookup_table)
predict(model, "There is no database parameterization", lookup_table)
predict(model, "There is insufficient validation of the 'id' parameter when used with SQL statement", lookup_table)
predict(model, "One user can access another's messages", lookup_table)
predict(model, "An attacker can spoof someone else's access token. Ruh roh. ", lookup_table)
predict(model, "There's no bounds checking on the input buffer", lookup_table)
predict(model, "Passwords are stored clear text", lookup_table)

Input: The app accepts a user controlled serialized object
	Predicted CWE: 200: Exposure of Sensitive Information to an Unauthorized Actor, Prob: 99
Input: Site deserializes client side object
	Predicted CWE: 204: Observable Response Discrepancy, Prob: 101
Input: There is no database parameterization
	Predicted CWE: 22: Improper Limitation of a Pathname to a Restricted Directory ('Path Traversal'), Prob: 5
Input: There is insufficient validation of the 'id' parameter when used with SQL statement
	Predicted CWE: 89: Improper Neutralization of Special Elements used in an SQL Command ('SQL Injection'), Prob: 43
Input: One user can access another's messages
	Predicted CWE: 200: Exposure of Sensitive Information to an Unauthorized Actor, Prob: 99
Input: An attacker can spoof someone else's access token. Ruh roh. 
	Predicted CWE: 327: Use of a Broken or Risky Cryptographic Algorithm, Prob: 169
Input: There's no bounds checking on the input buffer
	Predicted CWE: 119: Improper Restriction of 

In [96]:
# Model fails hard
predict(model, "ip whitelisting is not in place", lookup_table)
predict(model, "No https", lookup_table)
predict(model, "No tls", lookup_table)
predict(model, "no hardware R/X protections", lookup_table)
predict(model, "httponly flag isnt set", lookup_table)
predict(model, "There is no ASLR IN PLACE", lookup_table)
predict(model, "site is getting DDoSd", lookup_table)

Input: ip whitelisting is not in place
	Predicted CWE: 459: Incomplete Cleanup, Prob: 235
Input: No https
	Predicted CWE: 187: Partial String Comparison, Prob: 90
Input: No tls
	Predicted CWE: 911: Improper Update of Reference Count, Prob: 349
Input: no hardware R/X protections
	Predicted CWE: 20: Improper Input Validation, Prob: 4
Input: httponly flag isnt set
	Predicted CWE: 772: Missing Release of Resource after Effective Lifetime, Prob: 309
Input: There is no ASLR IN PLACE
	Predicted CWE: 434: Unrestricted Upload of File with Dangerous Type, Prob: 223
Input: site is getting DDoSd
	Predicted CWE: 434: Unrestricted Upload of File with Dangerous Type, Prob: 223


In [97]:
predict(model, "function protection inadequate", lookup_table)

Input: function protection inadequate
	Predicted CWE: 327: Use of a Broken or Risky Cryptographic Algorithm, Prob: 169


In [98]:
predict(model, "Router admin interface has default credentials", lookup_table)
predict(model, "Our stored passwords will live for all eternity", lookup_table)
predict(model, "full disk encryption is not present", lookup_table)
predict(model, "seed is not random", lookup_table)
predict(model, "Credentials are just flying around unencrypted", lookup_table)
predict(model, "Credentials are just flying?", lookup_table)
predict(model, "passwords are just flying around unencrypted", lookup_table)



# Illustrate semantic variety
print("\nSemantics")
predict(model, "personal info exposed by misconfigured S3 bucket", lookup_table)
predict(model, "info (PII) leaked through improperly configured object storage", lookup_table)
predict(model, "bad guy stole stuff", lookup_table)
predict(model, "data breach of user personal data", lookup_table) # Maps incorrectly

Input: Router admin interface has default credentials
	Predicted CWE: 74: Improper Neutralization of Special Elements in Output Used by a Downstream Component ('Injection'), Prob: 32
Input: Our stored passwords will live for all eternity
	Predicted CWE: 255: Credentials Management Errors, Prob: 122
Input: full disk encryption is not present
	Predicted CWE: 200: Exposure of Sensitive Information to an Unauthorized Actor, Prob: 99
Input: seed is not random
	Predicted CWE: 187: Partial String Comparison, Prob: 90
Input: Credentials are just flying around unencrypted
	Predicted CWE: 295: Improper Certificate Validation, Prob: 147
Input: Credentials are just flying?
	Predicted CWE: 312: Cleartext Storage of Sensitive Information, Prob: 160
Input: passwords are just flying around unencrypted
	Predicted CWE: 255: Credentials Management Errors, Prob: 122

Semantics
Input: personal info exposed by misconfigured S3 bucket
	Predicted CWE: 200: Exposure of Sensitive Information to an Unauthorized 

In [99]:
predict(model, "SSRF in Sofort's merchant portal, via notification webhook", lookup_table)
predict(model, "IDOR in POST /api/settings/PN01964/authentication-email/596/generate-token Allows Attacker to Generate Support Tokens for Other Accounts, Expose Email", lookup_table)
predict(model, "[MyDevelopment] XXE processing in SCORM files", lookup_table)
predict(model, "Reflected XSS vulnerability on www.sofort.com, via multipay/wait", lookup_table)

Input: SSRF in Sofort's merchant portal, via notification webhook
	Predicted CWE: 200: Exposure of Sensitive Information to an Unauthorized Actor, Prob: 99
Input: IDOR in POST /api/settings/PN01964/authentication-email/596/generate-token Allows Attacker to Generate Support Tokens for Other Accounts, Expose Email
	Predicted CWE: 200: Exposure of Sensitive Information to an Unauthorized Actor, Prob: 99
Input: [MyDevelopment] XXE processing in SCORM files
	Predicted CWE: 611: Improper Restriction of XML External Entity Reference, Prob: 257
Input: Reflected XSS vulnerability on www.sofort.com, via multipay/wait
	Predicted CWE: 79: Improper Neutralization of Input During Web Page Generation ('Cross-site Scripting'), Prob: 35


In [100]:
predict(model, "an attacker can execute remote shell commands", lookup_table)

Input: an attacker can execute remote shell commands
	Predicted CWE: 787: Out-of-bounds Write, Prob: 319


In [101]:
predict(model, "AWS Metadata is exposed", lookup_table)
predict(model, "User supplied request sent to arbitrary destination", lookup_table)
predict(model, "JWT is not verified", lookup_table)
predict(model, "CBC mode ciphers are enabled", lookup_table)
predict(model, "MD5 is used to hash passwords", lookup_table)
predict(model, "Insecure initialization vector usage", lookup_table)

Input: AWS Metadata is exposed
	Predicted CWE: 200: Exposure of Sensitive Information to an Unauthorized Actor, Prob: 99
Input: User supplied request sent to arbitrary destination
	Predicted CWE: 187: Partial String Comparison, Prob: 90
Input: JWT is not verified
	Predicted CWE: 327: Use of a Broken or Risky Cryptographic Algorithm, Prob: 169
Input: CBC mode ciphers are enabled
	Predicted CWE: 772: Missing Release of Resource after Effective Lifetime, Prob: 309
Input: MD5 is used to hash passwords
	Predicted CWE: 200: Exposure of Sensitive Information to an Unauthorized Actor, Prob: 99
Input: Insecure initialization vector usage
	Predicted CWE: 254: 7PK - Security Features, Prob: 121


In [90]:
lookup_table[158]

75

In [48]:
words = ["xss", "sqli", "httponly", "http", "csrf", "crlf", "html","exec", "eval", "null", "nul", "user", "tls", "https","aslr", "jwt", "aws", "idor", "ddos", "overflow", "md5", "aes", "pkcs", "json","dtd", "dns", "ecc", "ftp", "sftp", "ssh",
"udp", "tcp", "firewall", "fde", "dep", "dep", "hmac", "xhtml", "xmp", "icmp", "imap", "pop3", "auth", "ipsec", "kerberos",
"ldap", "l2tp", "l2f", "lkm", "ospf", "pap", "psn", "ppp", "pptp", "perl","c++","c#", "php", "pgp", "pki", "pfs", "arp",
"rarp", "rsa", "rsbac", "rbac", "rpc", "smime", "s/mime", "ssl", "acl", "sso", "soc", "steganography", "subnet", "syn",
"ack", "saml", "snmp", "telnet", "uri", "url", "vpn", "whois", "wep", "wap", "wpa", "wpa2", "x500", "X.500", "gcm", "spoof", "asn",
"blacklist", "whitelist", "ecb", "ccm", "crl", "checksum", "dmz", "dtls", "dmarc", "spf", "dkim", "dn", "egress", 
"ingress", "txt", "csv", ".tsv", ".doc", ".docx", "fips", "honeypot", "webdav", "oauth", "smb", "powershell", "chroot",
"selinux", "ipsec", "kdc", "kek", "mitm", "phish", "rfid", "rng", "rijndael", "scada", "openid", "dmz", "ssm", "sdlc",
"tpm", "vm", "wap", "wlan", "x509", "x.509", "ack", "bgp", "cgi", "chap", "mschap", "ttls", "crc", "dsa", "dss",
"underflow", "duplex", "csrf", "cswsh", "sqli", "sql", "nosql", "iframe", "div", "hyperlink", "xinetd", "systemd", "l2f",
"lkm", "loopback","multicast","multihome","multiplex", "nat", "%00", "octet", "ipv4", "ipv6", "osi", "x400","x.400", "overload",
"polymorphism", "polyinstantiation", "pfs","rfc","skey","s/key", "tcpdump", "wireshark", "jquery", "f5", "redirect",
"aead", "ctr", "ansi", "asic", "apu", "amd", "nvidia", "gpu", "asn", "asn.1", "ext", "ext3", "ext4", "fat32", "macos",
 "bsd", "openbsd", "freebsd", "axfr", "bios", "b2b", "baseline", "u2f", "2fa", "gigabyte", "megabyte", "terabyte", "bruteforce",
"bytewise", "xor", "captcha", "s3", "aws", "gcp", "tld", "homoglyph","webex", "ascii","cdrom","bootrom","bootloader", "cert",
"dacl", "r/o", "dhcp", "cms", "microsoft", "powerpoint", "vbscript", "dll", "dylib", "osx", "eapol", "eap", "ecdsa", 
"ecdhe", "eeprom", "efi", "efs", "zfs", "gsm", "cdma", "ss6", "bitmap","ss7", "ext2fs", "ext3fs", "fqdn", "whitespace", "gdi", "bitmap",
"gigaherz", "megaherz", "ghz", "mhz", "gps", "gpg", "gnupg", "gnu", "cdma", "guid", "uuid", "rpc", "golang", "hfs", "apfs",
"rootkit", "bootkit", "hklm", "ntlm", "ntlmv2", "mimikatz", "i2c", "ics", "siemens", "bosch", "idp", "isp", "asn", "caml",
"ocaml", "haskell", "javascript", "ecma", "ecmascript", "ecmav6", "ppt", "pptx", "xls", "xpath", "srng", "wsl", "imsi",
"paas", "iaas", "saas", "irc", "isatap", "iso", "iot", "ios", "javaee", "springboot", "ror", "jsp", "tomcat", "solr", 
"kafka", "nginx", "openssh", "openssl", "libressl", "jit", "jose", "jwe", "jwks", "jtag", "jvm", "scala", "erlang", 
"kdc", "ami", "reiserfs", "ibm", "coldfusion", "citrix", "xfs", "fat16", "usb", "sd", "sdcard", "hfs", "udf", "mysql", 
"postgres", "postgresql", "mariadb", "mongodb", "rds", "elasticsearch", "nss", "squashfs", "exfat", "ntfs", "minix",
"led", "lua", "nmap", "emacs", "vim", ".so", "asm", "x86", "x64", "x86-64", "x86_64", "mips", "arm64", "lxc", "samsung", "sandisk",
"seagate", "mta", "mua", "user-agent", "hsts", "mbps", "mitma", "mmc", "mms", "sms",
"ms-chap", "ms-chapv2", "msdos", "ms-dos", "mimo", "lifo", "fifo", "dereference", "nas", "nat", "netbios", "nic", "nfs",
"nist", "nsa", "oid", "ole", "ocsp", "oem", "ofb", "otp", "openpgp", "xmpp", "cisco", "huawei", "opm", "osi", "otp", 
"owasp", "dsl", "vlan", "voip", "telephony", "mpls", "appletalk", "applescript", "xerox", "iscsi", "socks4", "socks5", 
"localhost", "zip", "7zip", "pzip", "lzma", "tgz", "tftp", "ampq", "upnp", "ibm", "toshiba", "ssdp", "p2p", "afp", 
"pac", "paravirutalization", "parameterization", "parity", "pdf", "pdf", "piv", "phar", "php6", "nodejs", "jose", 
"expressjs", "plist", "plist", 'pseudorandom', "pub", "pki", "mobi", "scorm", "ptt", "puk", "qemu", "qos", "vmware",
"hyper-v", "r/w", "rc4", "rc5", "rdms", "rdbms", "sqlserver", "redis", "memcached", "hbase", "olap", "malware", 
"adware", "scp", "sctp", "sla", "siem", "soc", "spam", "ssd", "nvme", "firmware", "smm", "tacacs", "tftp", 'uart',
"uefi", "grub", "timestamp", "datetime", "vdi", "virtualbox", "emulator", "lte", "4g", "5g", "3g", "vxlan", "vrrp", "vnic",
"wsdl", "wifi", "wimax", "win2k", "www", "xquery", "xsl", "xslt", "css", "stylesheet", "api", "adobe", "firefox", "esr",
"chromium", "ssi", "ssrf", "ssri", "clickjacking", "ubuntu", "centos", "gentoo", "xxe", "dtd", "gdi", "wsus", "npm", 
"github", "git", "svn", "iphone", "vma", "vga", "hdmi", "dvi", "dma", "qualcomm", "broadcomm", "mmc", "chipset",
"xbox", "playstation", "tomcat", "servlet", "asp", "aspx", "asp.net", "oop", "activedirectory", "pth", "winxp", "xp", "sid",
"krbtgt", "msvcrt", "lsa", "punycode", "utf", "utf7", "utf8", "utf-7", "utf-8", "bitwise", ".jsp", "regex", "redos", "float32",
"float64", "typecast", "typecasting", "printf", "sprintf", "fprintf", "filename", "off-by-one", "0x00", "0xff", "16bit",
"32bit", "64bit", "16-bit", "32-bit", "64-bit", "endian", "dereference", "dereferencing", "ioctl", "ttl", "tcp/ip", "xwindow",
"infoleak", "fingerprint", "pwd", "sidechannel", "hardening","regexp", "debug", "upload", "coldfusion",
 "netscaler", "lifecycle", "joomla","wordpress","libxml","tiff","jpg","jpeg","svg","gif","fortinet", "esxi", "uaf", "use-after-free",
 "sp1", "sp2","lnk", "samsung", "nokia","motorola", "gnutls","cors","crm","drupal","lfi","rfi", "malloc", "rhel", "sharepoint",
 "uxss","symlink", "hardlink", "sdk","jre","openstack", "rabbitmq", "docker", "exif", "quicktime", "vlc", "webex",
 "phantomjs", "ide", "apparmor", "realnetworks", "lxd", "netflow", "jscript", "jira", "gitlab", "github", "openshift",
 "mkd", "openoffice","x-frame-options", "x-xss-protection", "x-content-type-options", "csp", "content-security-policy",
 "referrer-policy", "expect-ct", "x-permitted-cross-domain-policy", "hpkp", "strict-transport-security", "nosniff",
 "cdn","cache-control", "access-control-allow-origin", "public-key-pins", "public-key-pins-report-only", "report-uri",
 "includesubdomains", "applet", "out-of-bounds", "oob", "heap-based", "yammer", "jabber", "bluetooth", "stack-based",
 "webgl", "wasm", "localstorage", "sqlite", "config", "glibc", "pci", "man-in-the-middle", "authenticator", "same-origin",
 "keyword","homograph", "idn", "redirect" ,"der", "dir", "win32k", "metasploit", "meterpreter", "mifare","owa", "phpmyadmin",
 "blowfish", "juniper", "jws", "libtiff", "eof", "p7", "pem", "mediatek", "mikrotik","ike", "screenos", "atlassian","cpanel",
 "junos", "side-loading", "teamviewer", "dnsmasq", "vnc","lib", "cad", "autodesk", "hpe", "swf", "crossdomain.xml",
 "xmlrpc", "xml-rpc", "chakra", "sigsegv", "rmi","wav","mp3","mp4","mpeg","mkv", "cpp","imagemagick", "gcc","g++", "llvm",
 "gdb", "groovy", "groovyscript", "netgear", "libssh", "okta","xen", "zdi", "cvss", "starttls","babel","geforce", "asus",
 "salesforce", ".net", "rebind", "snapdragon", "exynos", "\0", "mcafee","kaspersky", "dlp", "bytecode", "rapid7", "nexpose",
 "nessus", "zte", "xiaomi","smtpd","fedora","httpd","adb", "ld_library_path","xst","%s","%n", "%x","man-in-the-browser",
 "mitb","non-final","repudiation", "ssri","ads","xsrf","qrljacking","tapjacking","tabnabbing","fido","tmpfs","kubernetes",
 "dpapi", "mvc","xaml","xacml","webroot","sri","html5","websockets","postmessage","wss","includesubdomains","iat","exp",
 "nist","django","grails","gson","angularjs","vue","reactjs","totp","hotp","caa","plugin","nbf","aud","idp","ocsp","laravel",
 "iis","netscape","robots.txt","cakephp","802.1X","802.11","gz","rar","rtf","wget","lsass","gets()","stdin","stdout","stderr",
"spotfire","parser"
 
]

In [72]:
tokenizer.add_tokens(["ad"])
print('Tokenized: ', tokenizer.tokenize("Android"))
list(set(["a","a"]))

Tokenized:  ['and', '##r', 'oid']


['a']

In [49]:
phrases = ['environment variable', "buffer overflow", "stack overflow", "heap overflow", "regular expression",
"denial of service", "cross site request forgery", "cross site scripting", "cross-site scripting", "cross-site request forgery",
"elevation of privilege", "escalation of privilege", "adobe flash player", "adobe acrobat reader", "acrobat reader", "microsoft exchange",
"race condition", "google chrome", "mozilla firefox", "internet explorer", "microsoft windows", "microsoft server", "palo alto",
 "command injection", "directory traversal", "path traversal", "windows rt", "windows 8", "windows 10", "windows xp",
 "windows 8.1", "microsoft edge", "atlassian jira", "sap netweaver", "remote file inclusion", "local file inclusion",
 "os x", "integer overflow", "integer underflow", "use after free", "side loading", "microsoft office", "windows vista",
 "pointer defererence","dll hijacking","open redirect", "openid connect", "memory corruption", "same origin policy",
 "nvidia geforce",".net framework", "dns spoofing", "dns rebinding", "outlook web access","stack trace","android debug bridge",
 "content security policy", "cross site tracing", "cross-site tracing", "forced browsing","format string","server side request forgery",
 "server-side request forgery", "server side includes", "server-side includes", "session fixation","session hijacking",
 'alternate data stream', "os injection","parameter pollution","verb tampering","response splitting","request smuggling",
 "padding oracle", "css injection","microsoft iis","active directory","data execution prevention","server side include","oob read",
           "oob write","local security authority","windows ole","microsoft office ole","forma string","dot dot", "brute force",
"out-of-bounds read","out of bounds read","out-of-bounds write","out of bounds write","apple safari"

]

In [50]:
tokenizer.add_tokens(words)
tokenizer.add_tokens(phrases)
tokenizer.add_tokens([".doc"])
to_add = ["inject","exec()","eval()","xorg","big-ip"]
to_remove = ["ad", "exec","eval","oid","bit","rar","ack","ext","mobi","dep","ssi","iat","aud","exp","auth","api","der",
            "nsa","oob","efs","wav","sla","pap","isp","ole","lib","config","ide","dir","esr","xor"]
base_tokenizer = tokenizer_class.from_pretrained(pretrained_weights, do_lower_case=True)

sent =  train_df['Description'][18]
print('Tokenized: ', tokenizer.tokenize(sent))
print('Tokenized: ', base_tokenizer.tokenize(sent))

Tokenized:  ['multiple', 'cross-site scripting', '(', 'xss', ')', 'vu', '##ln', '##era', '##bilities', 'in', 'pro', '-', 'search', '0', '.', '17', 'and', 'earlier', 'allow', 'remote', 'at', '##t', 'ack', 'er', '##s', 'to', 'in', '##ject', 'ar', '##bit', 'rar', 'y', 'web', 'script', 'or', 'html', 'via', 'the', '(', '1', ')', 'pro', '##t', ',', '(', '2', ')', 'host', ',', '(', '3', ')', 'path', ',', '(', '4', ')', 'name', ',', '(', '5', ')', 'ext', ',', '(', '6', ')', 'size', ',', '(', '7', ')', 'search', '_', 'days', ',', 'or', '(', '8', ')', 'show', '_', 'page', 'parameter', 'to', 'the', 'default', 'uri', '.']
Tokenized:  ['multiple', 'cross', '-', 'site', 'script', '##ing', '(', 'x', '##ss', ')', 'vu', '##ln', '##era', '##bilities', 'in', 'pro', '-', 'search', '0', '.', '17', 'and', 'earlier', 'allow', 'remote', 'attackers', 'to', 'in', '##ject', 'arbitrary', 'web', 'script', 'or', 'html', 'via', 'the', '(', '1', ')', 'pro', '##t', ',', '(', '2', ')', 'host', ',', '(', '3', ')', 'path

In [51]:
words = [word for word in words if word not in to_remove]
words += to_add

In [52]:
words = sorted(list(set(words)))
phrases = sorted(list(set(phrases)))
extra_tokens = {"words" : list(set(words)), "phrases":phrases}
import json
with open('../datasets/vocab/extra_vocab.json', 'w') as outfile:
    json.dump(extra_tokens, outfile)

In [55]:
if "ad" in words:
    print(1)

In [None]:
# https://nvlpubs.nist.gov/nistpubs/ir/2013/NIST.IR.7298r2.pdf
# https://www.sans.org/security-resources/glossary-of-terms/
# https://www.owasp.org/images/1/19/OTGv4.pdf