# Find vulnerabilities using Keyword based classifier

In [2]:
import vulns_common
from sklearn.utils import shuffle

vulns_common.download_nvd_vulns_json()
nvd_vulns = vulns_common.load_nvd_vulns_json('data/nvdcve-1.0-*.json')
print('Vulnerability descriptions: ' + str(len(nvd_vulns)))
cpe_names = vulns_common.compile_cpe_names(nvd_vulns)
print('Vulnerable product names:'+str(len(cpe_names)))
vuln_data = shuffle(nvd_vulns, n_samples=4000)
print('Vulnerability learning data: ' + str(len(vuln_data)))

Vulnerability descriptions: 32963
Vulnerable product names:9176
Vulnerability learning data: 4000


In [3]:
import pandas as pd

vuln_descs = []
for d in vuln_data:
    if not d[1].startswith('** REJECT'):#a few descriptions are rejected by nvd
        vuln_descs.append(d[1])

reports = pd.read_excel('data/sec_issues_edit.xlsx')
reports = reports.loc[(reports['security']==1)]
reports['security'] = reports['security'].values.astype(bool)
reports['report'] = reports['report'].values.astype(str)
print('Manually labeled vulnerability dataset: '+ str(len(reports)))
print('Mixed with non-security issues')
mixed = vulns_common.get_mixed_dataset(reports['report'], 1000)
print(mixed.shape)
mixed.head(10)

Manually labeled vulnerability dataset: 148
Mixed with non-security issues
(1148, 2)


Unnamed: 0,report,security
23298,Issue 24548 : Downloads Page is Blank (NOT usi...,-1
8,dblook displays message keys instead of their ...,-1
118,Issue 12027 : Memory corruption on dragging fi...,1
972,SJMS component throws class cast error when us...,-1
387,o.a.wicket.ng.** Resource related classes don'...,-1
309,Implement SSL/TLS communication between client...,1
787,ComponentFeedbackPanel broken under Wicket 6.0...,-1
172,Pipeline could change the MEP unintended Whe...,-1
7360,Issue 7712 : Make FileStream use IOBuffer &lsa...,-1
598,Convert altertable.sql to JUnit Converting a...,-1


In [4]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from nltk import ngrams
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction import text
import time

class LemmaCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        self.stemmer = WordNetLemmatizer()
        analyzer = super(CountVectorizer, self).build_analyzer()
        return lambda doc: (analyzer(' '.join([self.stemmer.lemmatize(word) for word in doc.split(' ')])))  

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        self.stemmer = SnowballStemmer("english")
        analyzer = super(CountVectorizer, self).build_analyzer()

        return lambda doc: (analyzer(' '.join([self.stemmer.stem(word) for word in doc.split(' ')])))

class KeywordStemClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, min_ngrams=2, max_ngrams=2):
        self.min_ngrams = min_ngrams
        self.max_ngrams = max_ngrams
        unwanted_words = ['issue','defect','bug','fault','flaw','mistake','error','version','system','because','before','disputed']
        stop_words = text.ENGLISH_STOP_WORDS#.union(cpe_names)
        stop_words = stop_words.union(unwanted_words)
        self.vectorizer = StemmedCountVectorizer(stop_words=stop_words,
                                                 lowercase=True,
                                                 ngram_range=(min_ngrams, max_ngrams), 
                                                 min_df=1,
                                                 token_pattern=r'(?u)\b\w*[a-zA-Z]{3,}\w*\b')

    def fit(self, raw_documents, y=None):
        self.vectorizer.fit(raw_documents)

        return self
    
    def predict(self, raw_documents, y=None):
        assert (len(self.vectorizer.vocabulary_) > 0), "You must call fit() before predicting data!"
        scores = self.score(raw_documents)
        
        predictions = []
        for count in scores:
            if count >= 2:#at least two vulnerability n-grams classified as security related
                predictions.append(1)
            else:
                predictions.append(-1)
        
        return np.array(predictions)   
    
    def word_grams(self, words, min, max):
        s = []
        for n in range(min, max+1):
            for ngram in ngrams(words, n):
                s.append(' '.join(str(i) for i in ngram))
        return s
    
    def _score_single(self, tokens):
        count= 0
        for index, token in enumerate(tokens):
            if token in self.vectorizer.vocabulary_:
                count = count + 1
        return count
        
    def score(self, raw_documents, y=None):
        assert (len(self.vectorizer.vocabulary_) > 0), "You must call fit() before scoring data!"
        stemmer = SnowballStemmer("english")
        
        scores = []
        for index, row in enumerate(raw_documents):
            stems = []
            for word in row.split():
                stems.append(stemmer.stem(word))
            tokens = self.word_grams(stems, self.min_ngrams, self.max_ngrams)
            count = self._score_single(tokens)
            scores.append(count)
        
        return np.array(scores)        
    
nfold = 10

print('Mixed dataset: '+ str(len(mixed)))
c = KeywordStemClassifier()
#get a new learning data before the fit method
vuln_data = shuffle(nvd_vulns, n_samples=4000)
vuln_descs = []
for d in vuln_data:
    if not d[1].startswith('** REJECT'):#some descriptions are rejected by NVD
        vuln_descs.append(d[1])
c.fit(vuln_descs)
scores = []
t = time.time()
for i in range(nfold): #n-fold cross val score
    #get a new testing set
    mixed = vulns_common.get_mixed_dataset(reports['report'], 1000)  
    predicted = c.predict(mixed['report'])
    #score = f1_score(y_true=mixed['security'], y_pred=predicted, average='micro')
    score = roc_auc_score(y_true=mixed['security'], y_score=predicted, average='micro')
    scores.append(score)
    
scores = np.array(scores)
print(str(nfold)+'-fold cross validated score:' + str(scores.mean()))
print('Time taken: ' + str(round(time.time() - t, 1)) + 's')
print("Vocabulary items:" + str(len(c.vectorizer.vocabulary_)))

Mixed dataset: 1148
10-fold cross validated score:0.7520364864864865
Time taken: 144.0s
Vocabulary items:42035


In [5]:
# some score and predict tests
p = shuffle(vuln_descs, n_samples=1)
print(p[0])
print(c.score(p))
print(c.predict(p))

p = reports.sample(n=1)
print(p['report'].iloc[0])
print(c.score(p['report']))
print(c.predict(p['report']))

p = mixed.sample(n=1)
print(p['report'].iloc[0])
print(p['security'].iloc[0])
print(c.score(p['report']))
print(c.predict(p['report']))

SecurEnvoy SecurMail before 9.2.501 allows remote authenticated users to read arbitrary e-mail messages via the option1 parameter in a reply action to secmail/getmessage.exe.
[8]
[1]
Blind SQL Injection SQL injection (SQLi) refers to an injection attack wherein an attacker can execute malicious SQL statements that control a web application's database server. URL encoded POST input enddate was set to f14QC2St'; waitfor delay '0:0:0' --
[10]
[1]
HDFS Service Check failed (as designed) but took 10 mins to fail   After I had a cluster up and running successfully  but manually turned HDFS Safe Mode ON by running:hdfs dfsadmin -safemode enter.HDFS Execute Check failed due to Puppet timeout after 10 minutes.It should have failed quicker since task timeout is 10 minutes on server.   
-1
[1]
[-1]


In [6]:
## how it finds manually labeled vulnerability issues

In [7]:
import pandas as pd
reports = pd.read_excel('data/sec_issues_edit.xlsx')
reports = reports.loc[(reports['security']==1)]
reports['security'] = reports['security'].values.astype(bool)
reports['report'] = reports['report'].values.astype(str)
print('Manually labeled vulnerability dataset: '+ str(len(reports)))
predicted = c.predict(reports['report'])
vulns_common.print_vulns_metrics(reports['security'], predicted)

Manually labeled vulnerability dataset: 148
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         0
           1       1.00      0.72      0.84       148

   micro avg       0.72      0.72      0.72       148
   macro avg       0.50      0.36      0.42       148
weighted avg       1.00      0.72      0.84       148

TN=0, FP=0, FN=41, TP=107


  'recall', 'true', average, warn_for)


In [8]:
## How it finds from the learning set

In [9]:
print('Vulnerability learning dataset:'+str(len(vuln_descs)))
predicted = c.predict(vuln_descs)
vulns_common.print_vulns_metrics([1]*len(vuln_descs), predicted)

Vulnerability learning dataset:3884
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         0
           1       1.00      0.95      0.97      3884

   micro avg       0.95      0.95      0.95      3884
   macro avg       0.50      0.47      0.49      3884
weighted avg       1.00      0.95      0.97      3884

TN=0, FP=0, FN=195, TP=3689


In [10]:
# some vulnerability words in vocabulary
c.vectorizer.vocabulary_

{'minttoken function': 23674,
 'function smart': 15980,
 'smart contract': 34710,
 'contract implement': 8357,
 'implement eststoken': 18113,
 'eststoken ethereum': 13241,
 'ethereum token': 13259,
 'token integ': 37065,
 'integ overflow': 19274,
 'overflow allow': 26076,
 'allow owner': 1818,
 'owner contract': 26231,
 'contract set': 8362,
 'set balanc': 34017,
 'balanc arbitrari': 3949,
 'arbitrari user': 2749,
 'user ani': 38843,
 'ani value': 2132,
 'fork cms': 15457,
 'cms allow': 6748,
 'allow store': 1868,
 'store xss': 35733,
 'xss private': 41833,
 'private set': 28953,
 'set facebook_admin_id': 34030,
 'facebook_admin_id paramet': 14298,
 'paramet aka': 26547,
 'aka admin': 1553,
 'admin ids': 1035,
 'ids input': 17822,
 'input facebook': 18928,
 'facebook section': 14296,
 'vulner allow': 40184,
 'allow remot': 1841,
 'remot attack': 30992,
 'attack disclos': 3178,
 'disclos sensit': 11183,
 'sensit inform': 33225,
 'inform vulner': 18670,
 'vulner instal': 40382,
 'instal 

In [11]:
## How much the method produces false positives if a security specialist needs to proritize them

In [12]:
vulns_common.print_classified_dataset('data/Ambari.csv', c)
vulns_common.print_classified_dataset('data/Camel.csv', c)
vulns_common.print_classified_dataset('data/Wicket.csv', c)
vulns_common.print_classified_dataset('data/Chromium.csv', c)
vulns_common.print_classified_dataset('data/Derby.csv', c)

Test on data/Ambari.csv. Rows: 1000
              precision    recall  f1-score   support

       False       0.98      0.82      0.89       971
        True       0.07      0.45      0.12        29

   micro avg       0.81      0.81      0.81      1000
   macro avg       0.52      0.63      0.50      1000
weighted avg       0.95      0.81      0.87      1000

TN=793, FP=178, FN=16, TP=13
Test on data/Camel.csv. Rows: 1000
              precision    recall  f1-score   support

       False       0.97      0.77      0.86       967
        True       0.05      0.34      0.08        32

   micro avg       0.75      0.75      0.75       999
   macro avg       0.51      0.56      0.47       999
weighted avg       0.94      0.75      0.83       999

TN=743, FP=224, FN=21, TP=11
Test on data/Wicket.csv. Rows: 1000
              precision    recall  f1-score   support

       False       0.99      0.81      0.89       990
        True       0.03      0.50      0.05        10

   micro avg     

In [17]:
## Find optimal dataset size to fit the classifier

In [18]:
sample_sizes = [100, 500, 1000, 2000, 4000, 6000, 8000, 10000, 12000, 14000, 16000, 18000]

final_results = []
for size in sample_sizes:
    results = 0

    for n in range(10):#cross validation
        vuln_data = shuffle(nvd_vulns, n_samples=size)
        vuln_descs = []
        for d in vuln_data:
            vuln_descs.append(d[1])
        mixed = vulns_common.get_mixed_dataset(reports['report'], 1000)
    
        c.fit(vuln_descs)
        predicted = c.predict(mixed['report'])
        #score = f1_score(y_true=mixed['security'], y_pred=predicted, average='micro)
        score = roc_auc_score(y_true=mixed['security'], y_score=predicted, average='micro')
        results= results + score

    final_results.append([str(size), results/10])
    print(str(results/10))

print('done')

0.6180364864864865
0.6898175675675675
0.7238081081081083
0.7338702702702703
0.7327972972972974
0.7317837837837838
0.7305135135135136
0.7193581081081081
0.7183324324324325
0.7030702702702704
0.7080554054054053
0.6971216216216217
done
