## Find vulnerabilities using OneClass Classifiers

In [1]:
from sklearn.utils import shuffle
import vulns_common

vulns_common.download_nvd_vulns_json()
nvd_vulns = vulns_common.load_nvd_vulns_json('data/nvdcve-1.0*.json')
print('Vulnerability descriptions: ' + str(len(nvd_vulns)))
cpe_names = vulns_common.compile_cpe_names(nvd_vulns)
print('Vulnerable product names:'+str(len(cpe_names)))
vuln_data = shuffle(nvd_vulns, n_samples=6000)
print('Vulnerability learning data: ' + str(len(vuln_data)))

Vulnerability descriptions: 32963
Vulnerable product names:9176
Vulnerability learning data: 6000


In [2]:
vuln_data[0]

['CVE-2018-1000401',
 'Jenkins project Jenkins AWS CodePipeline Plugin version 0.36 and earlier contains a Insufficiently Protected Credentials vulnerability in AWSCodePipelineSCM.java that can result in Credentials Disclosure. This attack appear to be exploitable via local file access. This vulnerability appears to have been fixed in 0.37 and later.',
 'CWE-522',
 'AV:L/AC:L/Au:N/C:P/I:N/A:N',
 2.1,
 'CVSS:3.0/AV:L/AC:L/PR:L/UI:N/S:U/C:H/I:H/A:H',
 7.8,
 ['jenkins', 'aws_codepipeline']]

In [3]:
import pandas as pd
vuln_descs = []
for d in vuln_data:
    if not d[1].startswith('** REJECT'):#a few descriptions are rejected by NVD
        vuln_descs.append(d[1])

reports = pd.read_excel('data/sec_issues_edit.xlsx')
reports = reports.loc[(reports['security']==1)]
reports['security'] = reports['security'].values.astype(bool)
reports['report'] = reports['report'].values.astype(str)
print('Manually labeled vulnerability dataset: '+ str(len(reports)))
print('Mixed with non-security issues')
mixed = vulns_common.get_mixed_dataset(reports['report'], 1000)
print(mixed.shape)
mixed.head(10)

Manually labeled vulnerability dataset: 148
Mixed with non-security issues
(1148, 2)


Unnamed: 0,report,security
807,Mail component does not work as expected (Emai...,-1
261,Password field submitted using GET method This...,1
775,Issue 783 : Customisation of new tab page 3 ...,-1
76,[PATCH] remove XX in firefox and palette I k...,-1
276,LangScripts JUnit test fails in views.sql La...,-1
540,tools/ide is not included in the source tarbal...,-1
703,ClassCastException when calling boolean meta-d...,-1
364,Incorrect privileges may be required for INSER...,1
23341,Issue 24594 : Crash - v8::internal::MarkingVis...,-1
860,Component.continueToOriginalDestination() can ...,-1


## Sklearn Vectorizers

In [4]:
from sklearn.feature_extraction.text import CountVectorizer #Term Frequencies
from sklearn.feature_extraction.text import TfidfVectorizer #Term Frequency times inverse document frequency
from sklearn.pipeline import Pipeline
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from nltk.stem.snowball import SnowballStemmer
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction import text 
import numpy as np
import time

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        self.stemmer = SnowballStemmer("english")
        analyzer = super(CountVectorizer, self).build_analyzer()
        return lambda doc: (analyzer(' '.join([self.stemmer.stem(word) for word in doc.split(' ')])))

class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        self.stemmer = SnowballStemmer("english")
        analyzer = super(TfidfVectorizer, self).build_analyzer()
        return lambda doc: (analyzer(' '.join([self.stemmer.stem(word) for word in doc.split(' ')])))

class LemmaCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        self.stemmer = WordNetLemmatizer()
        analyzer = super(CountVectorizer, self).build_analyzer()
        return lambda doc: (analyzer(' '.join([self.stemmer.lemmatize(word) for word in doc.split(' ')])))    

class LemmaTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        self.stemmer = WordNetLemmatizer()
        analyzer = super(TfidfVectorizer, self).build_analyzer()
        return lambda doc: (analyzer(' '.join([self.stemmer.lemmatize(word) for word in doc.split(' ')])))   
    
nfold = 10

def run_test(vzr, cls):
    print(str(cls)[0:str(cls).find('(')] + ' ' + str(vzr)[0:str(vzr).find('(')])

    #get a new testing set
    mixed = vulns_common.get_mixed_dataset(reports['report'], 1000)    
    
    scores = []
    t = time.time()
    for i in range(nfold): #n-fold cross val score
        predicted = pipe.predict(mixed['report'])
        #score = f1_score(y_true=mixed['security'], y_pred=predicted, average='micro')
        score = roc_auc_score(y_true=mixed['security'], y_score=predicted, average='micro')
        scores.append(score)
    
    scores = np.array(scores)
    print(str(nfold)+'-fold cross validated roc-auc-score:' + str(scores.mean()) + '\n')
    print('Time taken: ' + str(round(time.time() - t, 1)) + 's')

classifiers = [
    OneClassSVM(gamma='scale', kernel='linear', nu=0.2, shrinking=True, tol=1e-05),
    #OneClassSVM(gamma='scale'),
    #IsolationForest(contamination='auto', behaviour='new'),
    #LocalOutlierFactor(novelty=True, contamination='auto')
]

print('Mixed dataset for predict: '+ str(len(mixed)))

df = 1
l = True
t = r'\b\w*[a-zA-Z]{3,}\w*\b'#default: r'\b\w+\b'
ngram_s = 1
ngram_e = 2

unwanted_words = ['issue','defect','bug','fault','flaw','mistake','error','version','system','because','before','disputed']
stop_words = text.ENGLISH_STOP_WORDS#.union(cpe_names)
stop_words = stop_words.union(unwanted_words)

vectorizers = [
               #CountVectorizer(stop_words=stop_words, lowercase=l, ngram_range=(ngram_s, ngram_e), min_df=df, token_pattern=t),
               #StemmedCountVectorizer(stop_words=stop_words, lowercase=l, ngram_range=(ngram_s, ngram_e), min_df=df, token_pattern=t),
               #TfidfVectorizer(stop_words=stop_words, lowercase=l, ngram_range=(ngram_s, ngram_e), min_df=df, token_pattern=t),
               #StemmedTfidfVectorizer(stop_words=stop_words, lowercase=l, ngram_range=(ngram_s, ngram_e), min_df=df, token_pattern=t),
               LemmaTfidfVectorizer(stop_words=stop_words, lowercase=l, ngram_range=(ngram_s, ngram_e), min_df=df, token_pattern=t),
               #LemmaCountVectorizer(stop_words=stop_words, lowercase=l, ngram_range=(ngram_s, ngram_e), min_df=df, token_pattern=t),
              ]

for c in classifiers:
    for v in vectorizers:
        pipe = Pipeline([('vect', v), ('clf', c)])
        #get a new learning data before the fit method
        vuln_data = shuffle(nvd_vulns, n_samples=6000)
        vuln_descs = []
        for d in vuln_data:
            if not d[1].startswith('** REJECT'):#some descriptions are rejected by NVD
                vuln_descs.append(d[1])
        print('Dataset to fit:'+str(len(vuln_descs)))
        pipe = pipe.fit(vuln_descs)
        run_test(v, c)
    


Mixed dataset for predict: 1148
Dataset to fit:5823
OneClassSVM LemmaTfidfVectorizer
10-fold cross validated roc-auc-score:0.7296216216216216

Time taken: 16.8s


In [5]:
vectorizer = LemmaTfidfVectorizer(stop_words=stop_words, lowercase=l, ngram_range=(ngram_s, ngram_e), min_df=df, token_pattern=t)

vectors = vectorizer.fit_transform(vuln_descs)
print("Vectors shape:" + str(vectors.shape))

Vectors shape:(5823, 69609)


In [6]:
p = shuffle(vuln_descs, n_samples=1)
print(p[0])
vectors = vectorizer.transform(p)
print(c.score_samples(vectors)-c.offset_)
print(c.predict(vectors))

p = reports.sample(n=1)
print(p['report'].iloc[0])
vectors = vectorizer.transform(p['report'])
print(c.score_samples(vectors)-c.offset_)
print(c.predict(vectors))

p = mixed.sample(n=1)
print(p['report'].iloc[0])
print(p['security'].iloc[0])
vectors = vectorizer.transform(p['report'])
print(c.score_samples(vectors)-c.offset_)
print(c.predict(vectors))

A vulnerability in unit_deserialize of systemd allows an attacker to supply arbitrary state across systemd re-execution via NotifyAccess. This can be used to improperly influence systemd execution and possibly lead to root privilege escalation. Affected releases are systemd versions up to and including 239.
[2.86537603e-06]
[1]
Add Support for network Server USRIDONL security   Currently Network Server supports only two security mechanisms.User ID and password (usridpwd)User ID and encrypted password (eusridpwd)It would be good to add support for User ID Only security so that it could more closely match the embedded driver and not always require a password.See details of security mechanism implementations in section 4.4.2.1 of DRDA V3 Vol. 1: Distributed Relational Database Architecture http://www.opengroup.org/dbiop/   
[3.1373414]
[1]
BlueprintCamelContext should not get started in the init() method but later when the blueprint container is fully initialized   The init() method in Bl

In [7]:
## how it finds manually labeled vulnerability issues

In [8]:
pipe = Pipeline([('vect', vectorizer), ('clf', c)])
pipe = pipe.fit(vuln_descs)

In [9]:
vectorizer.vocabulary_

{'vulnerability': 66403,
 'identified': 28839,
 'simatic': 56671,
 'open': 42054,
 'controller': 13406,
 'cpu': 14039,
 'versions': 65683,
 'family': 23191,
 'software': 57413,
 'plcsim': 46070,
 'advanced': 2062,
 'attacker': 5127,
 'man': 37078,
 'middle': 38602,
 'position': 46636,
 'potentially': 46890,
 'modify': 39242,
 'network': 40443,
 'traffic': 61567,
 'exchanged': 21609,
 'port': 46468,
 'tcp': 60333,
 'certain': 9134,
 'property': 48615,
 'calculation': 8263,
 'used': 63808,
 'integrity': 31527,
 'protection': 48693,
 'order': 42637,
 'exploit': 22441,
 'able': 338,
 'perform': 44883,
 'attack': 4971,
 'impact': 29383,
 'communication': 11471,
 'public': 49073,
 'exploitation': 22562,
 'known': 33816,
 'time': 61063,
 'advisory': 2110,
 'publication': 49097,
 'vulnerability identified': 66663,
 'identified simatic': 28867,
 'simatic open': 56680,
 'open controller': 42073,
 'controller cpu': 13424,
 'cpu versions': 14059,
 'versions simatic': 65745,
 'simatic cpu': 56676,


In [10]:
import pandas as pd
reports = pd.read_excel('data/sec_issues_edit.xlsx')
reports = reports.loc[(reports['security']==1)]
reports['security'] = reports['security'].values.astype(bool)
reports['report'] = reports['report'].values.astype(str)
print('Manually labeled vulnerability dataset: '+ str(len(reports)))
predicted = pipe.predict(reports['report'])
vulns_common.print_vulns_metrics(reports['security'], predicted)

Manually labeled vulnerability dataset: 148
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         0
           1       1.00      0.74      0.85       148

   micro avg       0.74      0.74      0.74       148
   macro avg       0.50      0.37      0.43       148
weighted avg       1.00      0.74      0.85       148

TN=0, FP=0, FN=38, TP=110


  'recall', 'true', average, warn_for)


In [11]:
## How it finds from the learning set

In [12]:
print('Vulnerability learning dataset:'+str(len(vuln_descs)))
predicted = pipe.predict(vuln_descs)
vulns_common.print_vulns_metrics([1]*len(vuln_descs), predicted)

Vulnerability learning dataset:5823
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         0
           1       1.00      0.79      0.88      5823

   micro avg       0.79      0.79      0.79      5823
   macro avg       0.50      0.39      0.44      5823
weighted avg       1.00      0.79      0.88      5823

TN=0, FP=0, FN=1251, TP=4572


In [13]:
print('Mixed dataset: '+ str(len(mixed)))
predicted = pipe.predict(mixed['report'])
vulns_common.print_vulns_metrics(mixed['security'], predicted)

Mixed dataset: 1148
              precision    recall  f1-score   support

          -1       0.95      0.69      0.80      1000
           1       0.26      0.74      0.39       148

   micro avg       0.70      0.70      0.70      1148
   macro avg       0.61      0.72      0.59      1148
weighted avg       0.86      0.70      0.75      1148

TN=691, FP=309, FN=38, TP=110


In [14]:
## How much the method produces false alarms

In [15]:
vulns_common.print_classified_dataset('data/Ambari.csv', pipe)
vulns_common.print_classified_dataset('data/Camel.csv', pipe)
vulns_common.print_classified_dataset('data/Wicket.csv', pipe)
vulns_common.print_classified_dataset('data/Chromium.csv', pipe)
vulns_common.print_classified_dataset('data/Derby.csv', pipe)

Test on data/Ambari.csv. Rows: 1000
              precision    recall  f1-score   support

       False       0.98      0.81      0.88       971
        True       0.05      0.34      0.09        29

   micro avg       0.79      0.79      0.79      1000
   macro avg       0.51      0.58      0.49      1000
weighted avg       0.95      0.79      0.86      1000

TN=783, FP=188, FN=19, TP=10
Test on data/Camel.csv. Rows: 1000
              precision    recall  f1-score   support

       False       0.98      0.66      0.79       967
        True       0.05      0.59      0.10        32

   micro avg       0.66      0.66      0.66       999
   macro avg       0.52      0.63      0.44       999
weighted avg       0.95      0.66      0.77       999

TN=637, FP=330, FN=13, TP=19
Test on data/Wicket.csv. Rows: 1000
              precision    recall  f1-score   support

       False       0.99      0.62      0.76       990
        True       0.01      0.50      0.03        10

   micro avg     

In [16]:
import pandas as pd

reports = pd.read_csv('data/Wicket.csv')
reports = vulns_common.preprocess_csv_data(reports)
sec_issues = reports.loc[(reports['security']==1)]
normal_bugs = reports.loc[(reports['security']==0)]
print('Security related issue:\n' + sec_issues.iloc[0]['report'] + '\n')
print('Normal bug report:\n' + normal_bugs.iloc[0]['report'])


Security related issue:
IDataProvider-Overflow with size()   Hi I get an Integer-overflow with my Dataprovider (yeah  there are a couple of entries in the database). Is there a reason why size() and iterator( first  count ) are limited to Integer?Regards  &#8212; Jan.   

Normal bug report:
open Modal Window without AjaxRequestTarget   Wicket 1.2.2 included a new Modal Window component. However  this component can only be used with a valid AjaxRequestTarget. It would be useful if Modal Windows could be opened programmatically at any time without an AjaxRequestTarget.   


In [17]:
## find optimal oneclass svm hyperparameters

In [11]:
#from sklearn.model_selection import GridSearchCV
#cannot use GridSearchCv as fitting and predicting with a separate data
from sklearn.model_selection import ParameterGrid

#OneClassSVM hyperparameters in the pipeline
parameters = {'clf__kernel': ('linear', 'rbf'),#default rbf
              'clf__shrinking': (True, False),
              'clf__tol': (1e-5, 1e-6, 1e-7),
              'clf__nu': (0.1, 0.2, 0.5),#default 0.5
             }

for z in ParameterGrid(parameters):
    pipe.set_params(**z)
    pipe.fit(vuln_descs)
    predicted = pipe.predict(mixed['report'])
    score = roc_auc_score(y_true=mixed['security'], y_score=predicted, average='micro')
    print(str(score)+': '+str(z))


0.7334054054054056: {'clf__kernel': 'linear', 'clf__nu': 0.1, 'clf__shrinking': True, 'clf__tol': 1e-05}
0.7334054054054056: {'clf__kernel': 'linear', 'clf__nu': 0.1, 'clf__shrinking': True, 'clf__tol': 1e-06}
0.7334054054054056: {'clf__kernel': 'linear', 'clf__nu': 0.1, 'clf__shrinking': True, 'clf__tol': 1e-07}
0.7334054054054056: {'clf__kernel': 'linear', 'clf__nu': 0.1, 'clf__shrinking': False, 'clf__tol': 1e-05}
0.7334054054054056: {'clf__kernel': 'linear', 'clf__nu': 0.1, 'clf__shrinking': False, 'clf__tol': 1e-06}
0.7334054054054056: {'clf__kernel': 'linear', 'clf__nu': 0.1, 'clf__shrinking': False, 'clf__tol': 1e-07}
0.7319864864864865: {'clf__kernel': 'linear', 'clf__nu': 0.2, 'clf__shrinking': True, 'clf__tol': 1e-05}
0.7319864864864865: {'clf__kernel': 'linear', 'clf__nu': 0.2, 'clf__shrinking': True, 'clf__tol': 1e-06}
0.7319864864864865: {'clf__kernel': 'linear', 'clf__nu': 0.2, 'clf__shrinking': True, 'clf__tol': 1e-07}
0.7319864864864865: {'clf__kernel': 'linear', 'clf__

In [None]:
## Find optimal dataset size to fit the classifier

In [69]:
sample_sizes = [100, 500, 1000, 2000, 4000, 6000, 8000, 10000, 12000, 14000, 16000, 18000]

classifier =

final_results = []
for size in sample_sizes:
    print(str(size), end='')
    results = dict()
    for v in vectorizers:
        results[v.__class__.__name__] = 0.0 

    for n in range(10):#10-fold cross validation
        vuln_data = shuffle(nvd_vulns, n_samples=size)
        vuln_descs = []
        for d in vuln_data:
            vuln_descs.append(d[1])
        mixed = vulns_common.get_mixed_dataset(reports['report'], 1000)
    
        for v in vectorizers:
            pipe = Pipeline([('vect', v), ('clf', classifier)])
            pipe = pipe.fit(vuln_descs)
            predicted = pipe.predict(mixed['report'])
            #score = f1_score(y_true=mixed['security'], y_pred=predicted, average='micro')
            score = roc_auc_score(y_true=mixed['security'], y_score=predicted, average='micro')
            prev_score = results[v.__class__.__name__]
            results[v.__class__.__name__] = score + prev_score
            print('.', end='')

    for i, (key, value) in enumerate(results.items()):
        final_results.append([str(size), key, value/10])
        print('\n'+key + str(value/10))
    print('\n')

print('done')

100............................................................
CountVectorizer0.692558108108108

StemmedCountVectorizer0.6974378378378379

TfidfVectorizer0.5870418918918918

StemmedTfidfVectorizer0.6090405405405404

LemmaTfidfVectorizer0.5967527027027026

LemmaCountVectorizer0.6865500000000001


500............................................................
CountVectorizer0.6872054054054054

StemmedCountVectorizer0.6837837837837838

TfidfVectorizer0.6679527027027027

StemmedTfidfVectorizer0.6827378378378378

LemmaTfidfVectorizer0.6795202702702703

LemmaCountVectorizer0.6817986486486488


1000............................................................
CountVectorizer0.6774202702702702

StemmedCountVectorizer0.6738837837837838

TfidfVectorizer0.7076445945945947

StemmedTfidfVectorizer0.6965148648648649

LemmaTfidfVectorizer0.7103945945945945

LemmaCountVectorizer0.6719013513513513


2000............................................................
CountVectorizer0.6734500000000001

Ste