In [1]:
# This notebook contains training code for the base model + extra security vocabulary

In [1]:
!git clone https://github.com/huggingface/transformers
!cd transformers && pip install .

fatal: destination path 'transformers' already exists and is not an empty directory.
Processing /data/notebooks/transformers
Building wheels for collected packages: transformers
  Building wheel for transformers (setup.py) ... [?25ldone
[?25h  Created wheel for transformers: filename=transformers-2.5.1-py3-none-any.whl size=528611 sha256=fa05b4197415c1fdd51ac1b4aa90ae7372e76434a4beb5bdfc389c4f08a49b2a
  Stored in directory: /tmp/pip-ephem-wheel-cache-cs7p5sv5/wheels/5c/1c/47/7aca7c86ce98d3f7beb792bd7f926ef4d3cc45abd4f8daaa44
Successfully built transformers
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 2.5.1
    Uninstalling transformers-2.5.1:
      Successfully uninstalled transformers-2.5.1
Successfully installed transformers-2.5.1


In [2]:
import torch
from transformers import *
import pandas as pd
import numpy as np
import tensorflow as tf

# If there's a GPU available...

if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

# Get the GPU device name.
device_name = tf.test.gpu_device_name()

# The device name should look like the following:
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

There are 1 GPU(s) available.
We will use the GPU: Tesla V100-SXM2-16GB
Found GPU at: /device:GPU:0


In [3]:
data_dir = "../datasets/TrainingV2/Train_test_validation/"

train_df = pd.read_csv(data_dir + "train_cwe_nlp.csv")
validation_df = pd.read_csv(data_dir + "validation_cwe_nlp.csv")
test_df = pd.read_csv(data_dir + "test_cwe_nlp.csv")
underrep_df = pd.read_csv(data_dir + "underrep_cwe_nlp.csv")



In [4]:
print(len(train_df))

67047


In [5]:
train_labels = train_df['CWE-ID'].unique()
validation_labels = validation_df['CWE-ID'].unique()
test_labels = test_df['CWE-ID'].unique()
merged_labels = np.concatenate((train_labels, validation_labels, test_labels))
merged_labels = np.unique(merged_labels)
print(len(train_labels))
print(len(validation_labels))
print(len(test_labels))
print("Unique: {}".format(len(merged_labels)))
#print(np.logical_and( (train_labels==validation_labels).all(), (validation_labels==test_labels).all() ))
#print(np.sort(merged_labels))

377
348
344
Unique: 377


In [6]:
lookup_table = dict(zip(list(merged_labels), range(0, len(merged_labels))))

In [7]:
train_df['CWE-ID'] = train_df['CWE-ID'].apply(lambda x:lookup_table[x])
validation_df['CWE-ID'] = validation_df['CWE-ID'].apply(lambda x:lookup_table[x])
test_df['CWE-ID'] = test_df['CWE-ID'].apply(lambda x:lookup_table[x])

In [8]:
# Configuration Section

In [9]:
MAX_LEN = 512
BATCH_SIZE = 16
EPOCHS = 4
NUM_LABELS = len(lookup_table.keys())

In [10]:
MODELS = (BertForSequenceClassification,       BertTokenizer,       'bert-base-uncased')
model_class, tokenizer_class, pretrained_weights = MODELS
tokenizer = tokenizer_class.from_pretrained(pretrained_weights, do_lower_case=True)

In [11]:
# Extra vocabulary taken from the dataset and the following:
# https://nvlpubs.nist.gov/nistpubs/ir/2013/NIST.IR.7298r2.pdf
# https://www.sans.org/security-resources/glossary-of-terms/
# https://www.owasp.org/images/1/19/OTGv4.pdf

In [12]:
import json
with open('../datasets/vocab/extra_vocab.json', 'r') as infile:
    extra_vocab = json.load(infile)
extra_tokens = extra_vocab["phrases"] #extra_vocab["words"] + 
tokenizer.add_tokens(extra_tokens)

92

In [13]:

def get_tokens(tokenizer, dataframe):
  return dataframe['Description'].apply(lambda desc: 
                              tokenizer.encode(
                                  desc, 
                                  add_special_tokens=True, 
                                  max_length=MAX_LEN, 
                                  pad_to_max_length=True
                                  #return_tensors = 'pt'
                                  )
                              ).to_list()

def get_attention_masks(input_ids):
  attention_masks = []
  for sent in input_ids:
    att_mask = [int(token_id > 0) for token_id in sent]
    attention_masks.append(att_mask)
  return attention_masks


train_input_tokens = get_tokens(tokenizer, train_df)
validation_input_tokens = get_tokens(tokenizer, validation_df)
test_input_tokens = get_tokens(tokenizer, test_df)

train_masks = get_attention_masks(train_input_tokens)
validation_masks = get_attention_masks(validation_input_tokens)
test_masks = get_attention_masks(test_input_tokens)

In [14]:
print(' Original: ', train_df['Description'][0])

# Print the sentence split into tokens.
print('Tokenized: ', tokenizer.tokenize(train_df['Description'][0]))

# Print the sentence mapped to token ids.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(train_df['Description'][0])))

 Original:  The Spotfire Library component of TIBCO Software Inc.'s TIBCO Spotfire Analytics Platform for AWS Marketplace, and TIBCO Spotfire Server contains a vulnerability that might theoretically fail to restrict users with read-only access from modifying files stored in the Spotfire Library, only when the Spotfire Library is configured to use external storage. Affected releases are TIBCO Software Inc.'s TIBCO Spotfire Analytics Platform for AWS Marketplace versions up to and including 10.0.0, and TIBCO Spotfire Server versions up to and including 7.10.1; 7.11.0; 7.11.1; 7.12.0; 7.13.0; 7.14.0; 10.0.0.
Tokenized:  ['the', 'spot', '##fire', 'library', 'component', 'of', 'ti', '##bc', '##o', 'software', 'inc', '.', "'", 's', 'ti', '##bc', '##o', 'spot', '##fire', 'analytics', 'platform', 'for', 'aw', '##s', 'marketplace', ',', 'and', 'ti', '##bc', '##o', 'spot', '##fire', 'server', 'contains', 'a', 'vulnerability', 'that', 'might', 'theoretically', 'fail', 'to', 'restrict', 'users', '

In [15]:
train_labels = train_df['CWE-ID'].values
validation_labels = validation_df['CWE-ID'].values
test_labels = test_df['CWE-ID'].values

In [16]:
# Model expects PyTorch tensors, not numpy arrays

train_inputs = torch.tensor(train_input_tokens)
validation_inputs = torch.tensor(validation_input_tokens)
test_inputs = torch.tensor(test_input_tokens)

train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
test_labels = torch.tensor(test_labels)

train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)
test_masks = torch.tensor(test_masks)

In [17]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# `BATCH_SIZE` specified elsewhere
# For fine-tuning BERT on a specific task, the authors recommend a batch size of
# 16 or 32. Either must be run on 16GB GPU, will crash runtime on anything less

# Create the DataLoader for our training set.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)

# Create the DataLoader for our validation set.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=BATCH_SIZE)

# Create the DataLoader for our test set.
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=BATCH_SIZE)

In [18]:
model = model_class.from_pretrained(
    pretrained_weights, # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = NUM_LABELS, # The number of output labels--2 for binary classification.
                    # You can increase this for multi-class tasks.   
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

# Tell pytorch to run this model on the GPU.
model.cuda()

optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )


# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * EPOCHS

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

In [19]:
# Must resize the embedding layer after adding new vocabulary
model.resize_token_embeddings(len(tokenizer))

Embedding(30614, 768)

In [20]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [21]:
import random

# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128

# Set the seed value all over the place to make this reproducible.
seed_val = random.getrandbits(32) #42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Store the average loss after each epoch so we can plot them.
loss_values = []

for epoch_i in range(0, EPOCHS):
    
    # ========================================
    #               Training
    # ========================================
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, EPOCHS))
    print('Training...')

    t0 = time.time()
    total_loss = 0 # Reset the total loss for this epoch.
    model.train() # Just sets the `mode`, does not perform training

    for step, batch in enumerate(train_dataloader):

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # `batch` contains three pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: labels 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad() # Always clear any previously calculated gradients before performing a backward pass. Not automatic with PyTorch

        # Perform a forward pass (evaluate the model on this training batch).
        # This will return the loss (rather than the model output) because we
        outputs = model(b_input_ids, 
                    token_type_ids=None, 
                    attention_mask=b_input_mask, 
                    labels=b_labels)
        
        loss = outputs[0] # Pull loss value out of the tuple

        total_loss += loss.item() # For average loss calculation at the end

        
        loss.backward() # Perform a backward pass to calculate the gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) # Prevent exploding gradients

        optimizer.step() # Take a step with the new gradients

        scheduler.step() # Update the learning rate

    avg_train_loss = total_loss / len(train_dataloader) # Calculate the average loss over the training data.         
    loss_values.append(avg_train_loss)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(format_time(time.time() - t0)))
        
    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    t0 = time.time()
    model.eval() # Evaluation mode. Dropout layers behave differently

    # Tracking variables 
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        
        b_input_ids, b_input_mask, b_labels = batch
        
        with torch.no_grad():        
            outputs = model(b_input_ids, 
                            token_type_ids=None, 
                            attention_mask=b_input_mask)
        
        # Get the "logits" output by the model. The "logits" are the output
        # values prior to applying an activation function like the softmax.
        logits = outputs[0]
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        tmp_eval_accuracy = flat_accuracy(logits, label_ids) # Calculate the accuracy for this batch of test sentences.
        
        eval_accuracy += tmp_eval_accuracy  # Accumulate the total accuracy.
        nb_eval_steps += 1 # Track the number of batches

    # Report the final accuracy for this validation run.
    print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")


Training...


add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
add_(Tensor other, Number alpha)
addcmul_(Number value, Tensor tensor1, Tensor tensor2)
Consider using one of the following signatures instead:
addcmul_(Tensor tensor1, Tensor tensor2, Number value)
addcdiv_(Number value, Tensor tensor1, Tensor tensor2)
Consider using one of the following signatures instead:
addcdiv_(Tensor tensor1, Tensor tensor2, Number value)


  Batch    40  of  4,191.    Elapsed: 0:00:20.
  Batch    80  of  4,191.    Elapsed: 0:00:40.
  Batch   120  of  4,191.    Elapsed: 0:01:00.
  Batch   160  of  4,191.    Elapsed: 0:01:20.
  Batch   200  of  4,191.    Elapsed: 0:01:40.
  Batch   240  of  4,191.    Elapsed: 0:02:00.
  Batch   280  of  4,191.    Elapsed: 0:02:20.
  Batch   320  of  4,191.    Elapsed: 0:02:40.
  Batch   360  of  4,191.    Elapsed: 0:03:00.
  Batch   400  of  4,191.    Elapsed: 0:03:21.
  Batch   440  of  4,191.    Elapsed: 0:03:41.
  Batch   480  of  4,191.    Elapsed: 0:04:01.
  Batch   520  of  4,191.    Elapsed: 0:04:21.
  Batch   560  of  4,191.    Elapsed: 0:04:41.
  Batch   600  of  4,191.    Elapsed: 0:05:01.
  Batch   640  of  4,191.    Elapsed: 0:05:22.
  Batch   680  of  4,191.    Elapsed: 0:05:42.
  Batch   720  of  4,191.    Elapsed: 0:06:02.
  Batch   760  of  4,191.    Elapsed: 0:06:22.
  Batch   800  of  4,191.    Elapsed: 0:06:42.
  Batch   840  of  4,191.    Elapsed: 0:07:03.
  Batch   880

  Batch 2,720  of  4,191.    Elapsed: 0:22:51.
  Batch 2,760  of  4,191.    Elapsed: 0:23:11.
  Batch 2,800  of  4,191.    Elapsed: 0:23:31.
  Batch 2,840  of  4,191.    Elapsed: 0:23:51.
  Batch 2,880  of  4,191.    Elapsed: 0:24:11.
  Batch 2,920  of  4,191.    Elapsed: 0:24:31.
  Batch 2,960  of  4,191.    Elapsed: 0:24:51.
  Batch 3,000  of  4,191.    Elapsed: 0:25:12.
  Batch 3,040  of  4,191.    Elapsed: 0:25:32.
  Batch 3,080  of  4,191.    Elapsed: 0:25:52.
  Batch 3,120  of  4,191.    Elapsed: 0:26:12.
  Batch 3,160  of  4,191.    Elapsed: 0:26:32.
  Batch 3,200  of  4,191.    Elapsed: 0:26:52.
  Batch 3,240  of  4,191.    Elapsed: 0:27:12.
  Batch 3,280  of  4,191.    Elapsed: 0:27:32.
  Batch 3,320  of  4,191.    Elapsed: 0:27:53.
  Batch 3,360  of  4,191.    Elapsed: 0:28:13.
  Batch 3,400  of  4,191.    Elapsed: 0:28:33.
  Batch 3,440  of  4,191.    Elapsed: 0:28:53.
  Batch 3,480  of  4,191.    Elapsed: 0:29:14.
  Batch 3,520  of  4,191.    Elapsed: 0:29:34.
  Batch 3,560

  Batch 1,080  of  4,191.    Elapsed: 0:09:03.
  Batch 1,120  of  4,191.    Elapsed: 0:09:24.
  Batch 1,160  of  4,191.    Elapsed: 0:09:44.
  Batch 1,200  of  4,191.    Elapsed: 0:10:04.
  Batch 1,240  of  4,191.    Elapsed: 0:10:24.
  Batch 1,280  of  4,191.    Elapsed: 0:10:44.
  Batch 1,320  of  4,191.    Elapsed: 0:11:04.
  Batch 1,360  of  4,191.    Elapsed: 0:11:24.
  Batch 1,400  of  4,191.    Elapsed: 0:11:44.
  Batch 1,440  of  4,191.    Elapsed: 0:12:05.
  Batch 1,480  of  4,191.    Elapsed: 0:12:25.
  Batch 1,520  of  4,191.    Elapsed: 0:12:45.
  Batch 1,560  of  4,191.    Elapsed: 0:13:05.
  Batch 1,600  of  4,191.    Elapsed: 0:13:25.
  Batch 1,640  of  4,191.    Elapsed: 0:13:45.
  Batch 1,680  of  4,191.    Elapsed: 0:14:05.
  Batch 1,720  of  4,191.    Elapsed: 0:14:25.
  Batch 1,760  of  4,191.    Elapsed: 0:14:46.
  Batch 1,800  of  4,191.    Elapsed: 0:15:06.
  Batch 1,840  of  4,191.    Elapsed: 0:15:26.
  Batch 1,880  of  4,191.    Elapsed: 0:15:46.
  Batch 1,920

In [22]:
import os
# Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()

output_dir = '../Models/ModelV2/Model_ExtraVocab/Phrases_Only/'

# Create output directory if needed
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

print("Saving model to %s" % output_dir)

# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

Saving model to ../Models/ModelV2/Model_ExtraVocab/Phrases_Only/


('../Models/ModelV2/Model_ExtraVocab/Phrases_Only/vocab.txt',
 '../Models/ModelV2/Model_ExtraVocab/Phrases_Only/special_tokens_map.json',
 '../Models/ModelV2/Model_ExtraVocab/Phrases_Only/added_tokens.json')

In [None]:
# Testing

In [None]:
model.cuda()


In [None]:
# Prediction on test set

print('Predicting labels for {:,} test sentences...'.format(len(test_inputs)))

# Put model in evaluation mode
model.eval()

# Tracking variables 
predictions , true_labels = [], []

# Predict 
for batch in test_dataloader:
  # Add batch to GPU
  batch = tuple(t.to(device) for t in batch)
  
  # Unpack the inputs from our dataloader
  b_input_ids, b_input_mask, b_labels = batch
  
  # Telling the model not to compute or store gradients, saving memory and 
  # speeding up prediction
  with torch.no_grad():
      # Forward pass, calculate logit predictions
      outputs = model(b_input_ids, token_type_ids=None, 
                      attention_mask=b_input_mask)

  logits = outputs[0]

  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()
  
  # Store predictions and true labels
  predictions.append(logits)
  true_labels.append(label_ids)

In [None]:
prediction_values = [prediction.argmax() for prediction in predictions]

In [None]:
prediction_values, true_values = [], []
for batch in predictions:
    for prediction in batch:
        prediction_values.append(prediction.argmax())

for batch in true_labels:
    for true_value in batch:
        true_values.append(true_value)

count = 0
for prediction, true_label in zip(prediction_values, true_values):
    if prediction == true_label:
        count+=1
        
# Base model Predicted 10388 / 14367 0.7230458690053595 correctly
print("Predicted {} / {} {} correctly".format(count, len(prediction_values), float(count) / float(len(prediction_values))))

In [31]:
cwe_lookup_table = pd.read_csv("../datasets/cwe-lookup-table.csv",index_col=False)

cwe_lookup_table.loc[cwe_lookup_table['CWE-ID'] == 255]["Description"].values[0]

'Credentials Management Errors'

In [32]:
def predict(model, sentence, lookup_table):
  # Drop into evaluation mode
  model.eval()

  # Send model to the CPU
  model.cpu()

  # Tokenize the provided text
  # TODO: Tokenizer global at the moment, change that
  encoded_sent = tokenizer.encode(sentence, add_special_tokens=True, max_length=MAX_LEN, pad_to_max_length=True)
  attention_mask = [int(token_id > 0) for token_id in encoded_sent]

  # 1x512 tensor
  sent_tensor = torch.tensor([encoded_sent])
  attn_tensor = torch.tensor([attention_mask])

  # Speeds up calc when gradients don't need to be calculated
  with torch.no_grad():
      # Forward pass, calculate logit predictions
      # TODO: Why no softmax output??
      outputs = model(sent_tensor, token_type_ids=None, 
                      attention_mask=attn_tensor)
      logits = outputs[0]

  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  #print(logits)
  prediction = logits.argmax()
  # `lookup_table` holds mapping of labels to CWEs.
  # TODO: Save the lookup table alongside model weights
  for k,v in lookup_table.items():
    if v == prediction:
      cwe = k
      break
  try:
    # Not all CWEs present in the lookup table (mainly 'CWE CATEGORIES' which are missing)
    # TODO: Extract CWEs for master XML spec, not CSVs - DONE
    # TODO: Generate additional training data from CWE spec. At least 3 cases for each CWE can be created with 'Description',
    #         'Extended Description', and 'Background Detail', possibly more - DONE
    # 
    description = cwe_lookup_table.loc[cwe_lookup_table['CWE-ID'] == cwe]["Description"].values[0]
  except:
    description = "💩"

  print("Input: {}\n\tPredicted CWE: {}: {}, Prob: {}".format(sentence, cwe, description, prediction))

In [33]:
predict(model, "The app accepts a user controlled serialized object", lookup_table)
predict(model, "Site deserializes client side object", lookup_table)
predict(model, "There is no database parameterization", lookup_table)
predict(model, "There is insufficient validation of the 'id' parameter when used with SQL statement", lookup_table)
predict(model, "One user can access another's messages", lookup_table)
predict(model, "An attacker can spoof someone else's access token. Ruh roh. ", lookup_table)
predict(model, "There's no bounds checking on the input buffer", lookup_table)
predict(model, "Passwords are stored clear text", lookup_table)

Input: The app accepts a user controlled serialized object
	Predicted CWE: 502: Deserialization of Untrusted Data, Prob: 244
Input: Site deserializes client side object
	Predicted CWE: 502: Deserialization of Untrusted Data, Prob: 244
Input: There is no database parameterization
	Predicted CWE: 200: Exposure of Sensitive Information to an Unauthorized Actor, Prob: 99
Input: There is insufficient validation of the 'id' parameter when used with SQL statement
	Predicted CWE: 89: Improper Neutralization of Special Elements used in an SQL Command ('SQL Injection'), Prob: 43
Input: One user can access another's messages
	Predicted CWE: 532: Insertion of Sensitive Information into Log File, Prob: 247
Input: An attacker can spoof someone else's access token. Ruh roh. 
	Predicted CWE: 451: User Interface (UI) Misrepresentation of Critical Information, Prob: 231
Input: There's no bounds checking on the input buffer
	Predicted CWE: 119: Improper Restriction of Operations within the Bounds of a Me

In [34]:
# Model fails hard
predict(model, "ip whitelisting is not in place", lookup_table)
predict(model, "No https", lookup_table)
predict(model, "No tls", lookup_table)
predict(model, "no hardware R/X protections", lookup_table)
predict(model, "httponly flag isnt set", lookup_table)
predict(model, "There is no ASLR IN PLACE", lookup_table)
predict(model, "site is getting DDoSd", lookup_table)

Input: ip whitelisting is not in place
	Predicted CWE: 269: Improper Privilege Management, Prob: 128
Input: No https
	Predicted CWE: 254: 7PK - Security Features, Prob: 121
Input: No tls
	Predicted CWE: 611: Improper Restriction of XML External Entity Reference, Prob: 257
Input: no hardware R/X protections
	Predicted CWE: 862: Missing Authorization, Prob: 345
Input: httponly flag isnt set
	Predicted CWE: 352: Cross-Site Request Forgery (CSRF), Prob: 183
Input: There is no ASLR IN PLACE
	Predicted CWE: 362: Concurrent Execution using Shared Resource with Improper Synchronization ('Race Condition'), Prob: 189
Input: site is getting DDoSd
	Predicted CWE: 190: Integer Overflow or Wraparound, Prob: 92


In [46]:
predict(model, "../", lookup_table)

Input: ../
	Predicted CWE: 23: Relative Path Traversal, Prob: 6


In [36]:
predict(model, "Router admin interface has default credentials", lookup_table)
predict(model, "Our stored passwords will live for all eternity", lookup_table)
predict(model, "full disk encryption is not present", lookup_table)
predict(model, "seed is not random", lookup_table)
predict(model, "Credentials are just flying around unencrypted", lookup_table)
predict(model, "Credentials are just flying?", lookup_table)
predict(model, "passwords are just flying around unencrypted", lookup_table)



# Illustrate semantic variety
print("\nSemantics")
predict(model, "personal info exposed by misconfigured S3 bucket", lookup_table)
predict(model, "info (PII) leaked through improperly configured object storage", lookup_table)
predict(model, "bad guy stole stuff", lookup_table)
predict(model, "data breach of user personal data", lookup_table) # Maps incorrectly

Input: Router admin interface has default credentials
	Predicted CWE: 798: Use of Hard-coded Credentials, Prob: 322
Input: Our stored passwords will live for all eternity
	Predicted CWE: 522: Insufficiently Protected Credentials, Prob: 246
Input: full disk encryption is not present
	Predicted CWE: 326: Inadequate Encryption Strength, Prob: 168
Input: seed is not random
	Predicted CWE: 330: Use of Insufficiently Random Values, Prob: 170
Input: Credentials are just flying around unencrypted
	Predicted CWE: 522: Insufficiently Protected Credentials, Prob: 246
Input: Credentials are just flying?
	Predicted CWE: 522: Insufficiently Protected Credentials, Prob: 246
Input: passwords are just flying around unencrypted
	Predicted CWE: 311: Missing Encryption of Sensitive Data, Prob: 159

Semantics
Input: personal info exposed by misconfigured S3 bucket
	Predicted CWE: 200: Exposure of Sensitive Information to an Unauthorized Actor, Prob: 99
Input: info (PII) leaked through improperly configured

In [37]:
predict(model, "SSRF in Sofort's merchant portal, via notification webhook", lookup_table)
predict(model, "IDOR in POST /api/settings/PN01964/authentication-email/596/generate-token Allows Attacker to Generate Support Tokens for Other Accounts, Expose Email", lookup_table)
predict(model, "[MyDevelopment] XXE processing in SCORM files", lookup_table)
predict(model, "Reflected XSS vulnerability on www.sofort.com, via multipay/wait", lookup_table)

Input: SSRF in Sofort's merchant portal, via notification webhook
	Predicted CWE: 918: Server-Side Request Forgery (SSRF), Prob: 354
Input: IDOR in POST /api/settings/PN01964/authentication-email/596/generate-token Allows Attacker to Generate Support Tokens for Other Accounts, Expose Email
	Predicted CWE: 200: Exposure of Sensitive Information to an Unauthorized Actor, Prob: 99
Input: [MyDevelopment] XXE processing in SCORM files
	Predicted CWE: 611: Improper Restriction of XML External Entity Reference, Prob: 257
Input: Reflected XSS vulnerability on www.sofort.com, via multipay/wait
	Predicted CWE: 79: Improper Neutralization of Input During Web Page Generation ('Cross-site Scripting'), Prob: 35


In [38]:
predict(model, "an attacker can execute remote shell commands", lookup_table)

Input: an attacker can execute remote shell commands
	Predicted CWE: 78: Improper Neutralization of Special Elements used in an OS Command ('OS Command Injection'), Prob: 34


In [50]:
predict(model, "AWS Metadata is exposed", lookup_table)
predict(model, "User supplied request sent to arbitrary destination", lookup_table)
predict(model, "JWT is not verified", lookup_table)
predict(model, "CBC mode ciphers are enabled", lookup_table)
predict(model, "MD5 is used to hash passwords", lookup_table)
predict(model, "Insecure initialization vector usage", lookup_table)

Input: AWS Metadata is exposed
	Predicted CWE: 200: Exposure of Sensitive Information to an Unauthorized Actor, Prob: 99
Input: User supplied request sent to arbitrary destination
	Predicted CWE: 41: Improper Resolution of Path Equivalence, Prob: 16
Input: JWT is not verified
	Predicted CWE: 295: Improper Certificate Validation, Prob: 147
Input: CBC mode ciphers are enabled
	Predicted CWE: 330: Use of Insufficiently Random Values, Prob: 170
Input: MD5 is used to hash passwords
	Predicted CWE: 311: Missing Encryption of Sensitive Data, Prob: 159
Input: Insecure initialization vector usage
	Predicted CWE: 400: Uncontrolled Resource Consumption, Prob: 202


In [49]:
print(4191*16)

67056
