In [1]:
import pandas as pd
import numpy as np

X_train = pd.read_csv('../../data/cve_2018-2020_X_train.csv')
y_train = pd.read_csv('../../data/cve_2018-2020_y_train.csv')

X_test = pd.read_csv('../../data/cve_2018-2020_X_test.csv')
y_test = pd.read_csv('../../data/cve_2018-2020_y_test.csv')

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

print(y_train['cvssV3_integrityImpact'].value_counts(dropna=False) / y_train.shape[0])
print(y_test['cvssV3_integrityImpact'].value_counts(dropna=False) / y_test.shape[0])
print(X_train.info())
print(y_train.info())

(22963, 2)
(22963, 2)
(22963, 16)
(22963, 16)
HIGH    0.510691
NONE    0.312633
LOW     0.176676
Name: cvssV3_integrityImpact, dtype: float64
HIGH    0.513435
NONE    0.304490
LOW     0.182076
Name: cvssV3_integrityImpact, dtype: float64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22963 entries, 0 to 22962
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   CVE_ID       22963 non-null  object
 1   Description  22963 non-null  object
dtypes: object(2)
memory usage: 358.9+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22963 entries, 0 to 22962
Data columns (total 16 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   cvssV3_vectorString           22963 non-null  object 
 1   cvssV3_attackVector           22963 non-null  object 
 2   cvssV3_attackComplexity       22963 non-null  object 
 3   cvssV3_privilegesRequired     2296

In [2]:
label_column_name = "cvssV3_integrityImpact"
train_labels = y_train.loc[:, label_column_name]
test_labels = y_test.loc[:, label_column_name]


from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(train_labels)
print(le.classes_)

NUM_CLASSES = len(le.classes_)
print(NUM_CLASSES)

encoded_train_labels = le.transform(train_labels)
encoded_test_labels = le.transform(test_labels)

print(train_labels[:10], encoded_train_labels[:10])
print(len(X_train), len(train_labels), len(X_test), len(test_labels))

['HIGH' 'LOW' 'NONE']
3
0    HIGH
1    HIGH
2    HIGH
3     LOW
4    NONE
5    NONE
6    NONE
7    HIGH
8    NONE
9    HIGH
Name: cvssV3_integrityImpact, dtype: object [0 0 0 1 2 2 2 0 2 0]
22963 22963 22963 22963


In [3]:
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained('prajjwal1/bert-small')

#train_encodings = tokenizer(X_train.loc[:,"Description"].tolist(), truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(X_test.loc[:,"Description"].tolist(), truncation=True, padding=True, max_length=128)

In [4]:
import torch

class CVEDataset(torch.utils.data.Dataset):
    def __init__(self, X, encodings, labels, encoded_labels):
        self.cve_id = X.loc[:,"CVE_ID"].tolist()
        self.texts = X.loc[:,"Description"].tolist()
        self.encodings = encodings
        self.labels = labels.tolist()
        self.encoded_labels = encoded_labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['text_labels'] = self.labels[idx]
        item['encoded_labels'] = torch.tensor(self.encoded_labels[idx])
        item['CVE_ID'] = self.cve_id[idx]
        item['vulnerability_description'] = self.texts[idx]
        
        return item

    def __len__(self):
        return len(self.labels)

In [5]:
#train_dataset = CVEDataset(X_train, train_encodings, train_labels, encoded_train_labels)
test_dataset = CVEDataset(X_test, test_encodings, test_labels, encoded_test_labels)

test_dataset[0]

{'input_ids': tensor([  101,  9980,  4303,  2279,  4245,  1006,  1040,  3070,  1013, 25269,
          2278,  1007,  1020,  1012,  1014,  1012,  1016,  1012,  1020,  1012,
          1014,  1012,  1020,  1010,  1998,  1020,  1012,  1014,  1012,  6079,
          2003,  8211,  2000,  2892,  1011,  2609,  5896,  2075,  1012,  2023,
         18130,  4473,  5198,  2000,  7861,  8270, 15275,  9262, 22483,  3642,
          1999,  1996,  4773, 21318,  2947, 22552,  1996,  3832, 15380,  9280,
          2877,  2000, 22496, 19380,  2306,  1037,  9480,  5219,  1012,  9980,
          1060,  1011,  2486,  8909,  1024, 26833,  2692,  2620,  1012,   102,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

In [6]:
import torch
from torch.autograd import Variable
import torch.nn.functional as F



def connectivity_tensor_calculation(input_ids, attention_mask, label, model):

    input_embedding = model.get_input_embeddings()
    vocab_size = input_embedding.weight.shape[0]
    input_ids_one_hot = torch.nn.functional.one_hot(input_ids, num_classes=vocab_size)
    input_ids_one_hot = input_ids_one_hot.type(torch.float)
    input_ids_one_hot = Variable(input_ids_one_hot, requires_grad=True) #to allow the computation of the gradients with respect to the input 
    #print(input_ids_one_hot.grad)


    #Calculate the input embeddings manually and pass them to the model through the inputs_embeds argument
    inputs_embeds = torch.matmul(input_ids_one_hot, input_embedding.weight)
    embedding_dim = input_embedding.weight.shape[1]
    inputs_embeds = torch.mul(inputs_embeds, torch.cat([attention_mask.unsqueeze(1)]*embedding_dim, dim=1))


    outputs = model(inputs_embeds=inputs_embeds.unsqueeze(0), attention_mask=attention_mask.unsqueeze(0), labels=label.unsqueeze(0))
    #print("loss:", outputs.loss)
    #print("logits:", outputs.logits)
    predicted_label = torch.max(F.softmax(outputs.logits, dim=1), dim=1)[1].item()
    #print("predicted label (after softmax):", predicted_label)
    #print("score for predicted label (after softmax):", torch.max(F.softmax(outputs.logits, dim=1), dim=1)[0].item())
    #print("sample true label:", labels[selected_idx])

    outputs.logits[0][predicted_label].backward() #compute the gradient of the logit (predicted, the one with the highest score)
    #print(input_ids_one_hot.grad)                  #with respect to the input
   
    connectivity_tensor = torch.linalg.norm(input_ids_one_hot.grad, dim=1)
    connectivity_tensor = connectivity_tensor/torch.max(connectivity_tensor)
    return connectivity_tensor
    
def top_influential_tokens(connectivity_tensor, input_ids, top_k=5):
    
    indices_sorted_by_connectivity = torch.argsort(connectivity_tensor)
    input_tokens = tokenizer.convert_ids_to_tokens(list(input_ids))
    
    top_indices_sorted = indices_sorted_by_connectivity[-top_k:].flip(dims=(0,))
    top_tokens = [input_tokens[position.item()] for position in top_indices_sorted]
    top_tokens_connectivity = connectivity_tensor[top_indices_sorted].tolist()
    
    
    return top_tokens, top_indices_sorted.tolist(), top_tokens_connectivity


In [7]:
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained('./bert-small-vulnerability_integrity_impact-classification/')
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 512, padding_idx=0)
      (position_embeddings): Embedding(512, 512)
      (token_type_embeddings): Embedding(2, 512)
      (LayerNorm): LayerNorm((512,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=512, out_features=512, bias=True)
              (key): Linear(in_features=512, out_features=512, bias=True)
              (value): Linear(in_features=512, out_features=512, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=512, out_features=512, bias=True)
              (LayerNorm): LayerNorm((512,), eps=1e-12, element

In [8]:
for i in range(0, 10):
    input_ids = test_dataset[i]['input_ids']
    attention_mask = test_dataset[i]['attention_mask']
    label = test_dataset[i]['encoded_labels']
    
    connectivity_tensor = connectivity_tensor_calculation(input_ids, attention_mask, label, model)
    top_tokens, positions, _ = top_influential_tokens(connectivity_tensor, input_ids)
    print(top_tokens, positions)

['script', '##script', 'disclosure', '##ing', 'java'] [36, 48, 63, 37, 47]
['password', '##in', 'log', '##code', 'via'] [42, 50, 49, 38, 51]
['disclosure', 'script', 'java', 'site', '##script'] [41, 14, 25, 13, 26]
['##ographic', 'crypt', 'constant', 'and', 'leaks'] [21, 20, 6, 17, 15]
['authentication', 'bypass', 'publicly', 'access', 'share'] [32, 27, 19, 30, 20]
['code', 'execution', 'code', 'remote', 'execution'] [3, 4, 25, 2, 26]
['execute', '##load', 'files', 'those', 'arbitrary'] [41, 24, 43, 42, 25]
['wear', 'read', 'behavior', '##efined', 'mobile'] [43, 23, 4, 3, 38]
['control', 'terminal', 'v8', 'unauthorized', 'due'] [46, 49, 5, 37, 32]
['nm', '##ete', 'files', 'del', 'read'] [23, 12, 15, 11, 14]


In [9]:
top_tokens_list = []
top_tokens_positions_list = []
top_tokens_connectivity_list = []

for i in range(0, len(test_dataset)):
    if i%500==0:
        print((i*100)/len(test_dataset), '%')
    
    input_ids = test_dataset[i]['input_ids']
    attention_mask = test_dataset[i]['attention_mask']
    label = test_dataset[i]['encoded_labels']
    
    connectivity_tensor = connectivity_tensor_calculation(input_ids, attention_mask, label, model)
    top_tokens, positions, connectivities = top_influential_tokens(connectivity_tensor, input_ids)
    
    top_tokens_list.append(top_tokens)
    top_tokens_positions_list.append(positions)
    top_tokens_connectivity_list.append(connectivities)
        

0.0 %
2.177415842877673 %
4.354831685755346 %
6.532247528633018 %
8.709663371510691 %
10.887079214388363 %
13.064495057266036 %
15.24191090014371 %
17.419326743021383 %
19.596742585899054 %
21.774158428776726 %
23.9515742716544 %
26.128990114532073 %
28.306405957409748 %
30.48382180028742 %
32.661237643165094 %
34.838653486042766 %
37.01606932892044 %
39.19348517179811 %
41.37090101467578 %
43.54831685755345 %
45.72573270043113 %
47.9031485433088 %
50.080564386186474 %
52.257980229064145 %
54.43539607194182 %
56.612811914819495 %
58.79022775769717 %
60.96764360057484 %
63.14505944345251 %
65.32247528633019 %
67.49989112920785 %
69.67730697208553 %
71.8547228149632 %
74.03213865784087 %
76.20955450071855 %
78.38697034359622 %
80.5643861864739 %
82.74180202935156 %
84.91921787222924 %
87.0966337151069 %
89.27404955798458 %
91.45146540086226 %
93.62888124373993 %
95.8062970866176 %
97.98371292949527 %


In [10]:
top_tokens_list

[['script', '##script', 'disclosure', '##ing', 'java'],
 ['password', '##in', 'log', '##code', 'via'],
 ['disclosure', 'script', 'java', 'site', '##script'],
 ['##ographic', 'crypt', 'constant', 'and', 'leaks'],
 ['authentication', 'bypass', 'publicly', 'access', 'share'],
 ['code', 'execution', 'code', 'remote', 'execution'],
 ['execute', '##load', 'files', 'those', 'arbitrary'],
 ['wear', 'read', 'behavior', '##efined', 'mobile'],
 ['control', 'terminal', 'v8', 'unauthorized', 'due'],
 ['nm', '##ete', 'files', 'del', 'read'],
 ['control', 'race', 'condition', 'flaw', 'gain'],
 ['denial', 'attack', 'organize', 'service', 'service'],
 ['execution', 'arbitrary', 'code', 'write', '[SEP]'],
 ['##box', 'sand', 'escape', 'html', 'use'],
 ['code', 'code', 'execute', 'execute', 'page'],
 ['code', 'execution', 'arbitrary', 'remote', 'code'],
 ['unspecified', 'denial', 'impact', 'other', '##virus'],
 ['leak', 'and', 'denial', 'denial', 'memory'],
 ['blue', 'privilege', '##ware', 'write', 'es'],

In [11]:
import pickle

with open("top_tokens_list.txt", "wb") as f:
    pickle.dump(top_tokens_list, f)
    
with open("top_tokens_positions_list.txt", "wb") as f:
    pickle.dump(top_tokens_positions_list, f)
    
with open("top_tokens_connectivity_list.txt", "wb") as f:
    pickle.dump(top_tokens_connectivity_list, f)

In [12]:
with open("top_tokens_list.txt", "rb") as f:
    top_tokens_list = pickle.load(f)
    
with open("top_tokens_positions_list.txt", "rb") as f:
    top_tokens_positions_list = pickle.load(f)
    
with open("top_tokens_connectivity_list.txt", "rb") as f:
    top_tokens_connectivity_list = pickle.load(f)


In [13]:
from collections import Counter

flattened_top_tokens_list = [tokens for elements in top_tokens_list for tokens in elements]
    
occurence_count = Counter(flattened_top_tokens_list)
occurence_count.most_common(20)

[('code', 3922),
 ('execution', 2590),
 ('arbitrary', 2499),
 ('##ss', 2107),
 ('denial', 2003),
 ('execute', 1830),
 ('x', 1626),
 ('vulnerability', 1594),
 ('disclosure', 1587),
 ('script', 1554),
 ('privilege', 1431),
 ('information', 1277),
 ('read', 1248),
 ('access', 1112),
 ('injection', 1062),
 ('site', 859),
 ('remote', 826),
 ('privileges', 810),
 ('password', 803),
 ('service', 780)]

In [14]:
def bigrams_finder(top_words, positions):
    bigrams_list = []
    
    np_positions = np.array(positions)

    ordered_positions = [positions[i] for i in np.argsort(np_positions)]
    ordered_words = [top_words[i] for i in np.argsort(np_positions)]
    position_of_the_next_word = [i+1 for i in ordered_positions]

    for i, j, k in zip(ordered_positions[1:], position_of_the_next_word[:-1], range(0,len(ordered_words)-1)):
        if i==j:
            bigram = str(ordered_words[k])+' '+str(ordered_words[k+1])
            bigrams_list.append(bigram)
            
    return bigrams_list


In [15]:
i = 1
print(top_tokens_list[i], top_tokens_positions_list[i])

['password', '##in', 'log', '##code', 'via'] [42, 50, 49, 38, 51]


In [16]:
bigrams_finder(top_tokens_list[i], top_tokens_positions_list[i])

['log ##in', '##in via']

In [17]:
bigrams_list = []

for i in range(0, len(test_dataset)):
    
    bigrams = bigrams_finder(top_tokens_list[i], top_tokens_positions_list[i])
    bigrams_list.append(bigrams)

In [18]:
bigrams_list

[['script ##ing', 'java ##script'],
 ['log ##in', '##in via'],
 ['site script', 'java ##script'],
 ['crypt ##ographic'],
 ['publicly share'],
 ['remote code', 'code execution', 'code execution'],
 ['##load arbitrary', 'execute those', 'those files'],
 ['##efined behavior'],
 [],
 ['del ##ete', 'read files'],
 ['race condition', 'condition flaw', 'gain control'],
 ['organize denial', 'service attack'],
 ['arbitrary code', 'code execution'],
 ['sand ##box', '##box escape'],
 ['execute code'],
 ['remote code', 'code execution'],
 ['unspecified other', 'other impact'],
 ['memory leak'],
 [],
 ['read files', 'up ##load'],
 ['contents of'],
 ['code execution', 'arbitrary commands'],
 ['information exposure', 'exposure vulnerability'],
 [],
 ['execute malicious', 'malicious code'],
 ['path travers', 'travers ##al', '##al vulnerability'],
 ['exploit ##able', '##able crash'],
 ['information disclosure'],
 ['create website', 'website settings'],
 ['have x', 'x ##ss'],
 [],
 ['wolf ##ss'],
 [],
 

In [19]:
from collections import Counter

flattened_bigrams_list = [bigram for elements in bigrams_list for bigram in elements]
    
occurence_count = Counter(flattened_bigrams_list)
occurence_count.most_common(20)

[('code execution', 2047),
 ('x ##ss', 1559),
 ('arbitrary code', 1414),
 ('execute arbitrary', 979),
 ('site script', 679),
 ('remote code', 648),
 ('information disclosure', 535),
 ('script ##ing', 506),
 ('sql injection', 476),
 ('privilege vulnerability', 385),
 ('unauthorized update', 298),
 ('sensitive information', 287),
 ('cs ##rf', 260),
 ('java ##script', 251),
 ('command injection', 246),
 ('stored x', 239),
 ('buffer over', 219),
 ('bounds read', 202),
 ('over ##flow', 190),
 ('update ,', 184)]

In [20]:
[bigram for elements in bigrams_list for bigram in elements]

['script ##ing',
 'java ##script',
 'log ##in',
 '##in via',
 'site script',
 'java ##script',
 'crypt ##ographic',
 'publicly share',
 'remote code',
 'code execution',
 'code execution',
 '##load arbitrary',
 'execute those',
 'those files',
 '##efined behavior',
 'del ##ete',
 'read files',
 'race condition',
 'condition flaw',
 'gain control',
 'organize denial',
 'service attack',
 'arbitrary code',
 'code execution',
 'sand ##box',
 '##box escape',
 'execute code',
 'remote code',
 'code execution',
 'unspecified other',
 'other impact',
 'memory leak',
 'read files',
 'up ##load',
 'contents of',
 'code execution',
 'arbitrary commands',
 'information exposure',
 'exposure vulnerability',
 'execute malicious',
 'malicious code',
 'path travers',
 'travers ##al',
 '##al vulnerability',
 'exploit ##able',
 '##able crash',
 'information disclosure',
 'create website',
 'website settings',
 'have x',
 'x ##ss',
 'wolf ##ss',
 'unauthorized update',
 'update ,',
 ', insert',
 'may le

In [59]:
i=9

input_ids = test_dataset[i]['input_ids']
attention_mask = test_dataset[i]['attention_mask']
label = test_dataset[i]['encoded_labels']
    
connectivity_tensor = connectivity_tensor_calculation(input_ids, attention_mask, label, model)
top_words, positions, _ = top_influential_tokens(connectivity_tensor, input_ids)
print(top_words, positions)
    

['privilege', '##cala', '##tion', 'via', 'potentially'] [34, 31, 32, 35, 28]


In [60]:
np_positions = np.array(positions)
np_positions

array([34, 31, 32, 35, 28])

In [61]:
ordered_positions = [positions[i] for i in np.argsort(np_positions)]

In [62]:
ordered_words = [top_words[i] for i in np.argsort(np_positions)]

In [63]:
print(ordered_positions, ordered_words)

[28, 31, 32, 34, 35] ['potentially', '##cala', '##tion', 'privilege', 'via']


In [64]:
position_of_the_next_word = [i+1 for i in ordered_positions]
position_of_the_next_word

[29, 32, 33, 35, 36]

In [65]:
ordered_positions[1:]

[31, 32, 34, 35]

In [66]:
position_of_the_next_word[:-1]

[29, 32, 33, 35]

In [69]:
for i, j, k in zip(ordered_positions[1:], position_of_the_next_word[:-1], range(0,len(ordered_words)-1)):
    print(i, j, k)
    if i==j:
        bigram = str(ordered_words[k])+' '+str(ordered_words[k+1])
        print(bigram)

31 29 0
32 32 1
##cala ##tion
34 33 2
35 35 3
privilege via


In [None]:
def bigrams_finder(top_words, positions):
    bigrams_list = []
    
    np_positions = np.array(positions)

    ordered_positions = [positions[i] for i in np.argsort(np_positions)]
    ordered_words = [top_words[i] for i in np.argsort(np_positions)]
    position_of_the_next_word = [i+1 for i in ordered_positions]

    for i, j, k in zip(ordered_positions[1:], position_of_the_next_word[:-1], range(0,len(ordered_words)-1)):
        print(i, j, k)
        if i==j:
            bigram = str(ordered_words[k])+' '+str(ordered_words[k+1])
            bigrams_list.append(bigram)
            
    return bigrams_list
