In [48]:
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression, mutual_info_regression
from sklearn.linear_model import LinearRegression


# create a function to split the data into training and test sets
def simple_split(data, y, length, split_mark=0.7):
    """Split features and targets into a leading and a trailing portion.

    The rows are taken in their existing order (no shuffling): the first
    ``n`` items become the training portion and the rest the test portion.

    Parameters
    ----------
    data : sliceable with ``.copy()`` on its slices
        Feature container.
    y : sliceable with ``.copy()`` on its slices
        Target container, sliced at the same position as ``data``.
    length : int
        Number of rows used to turn a fractional ``split_mark`` into a count.
    split_mark : float or int, default 0.7
        If strictly between 0 and 1, the fraction of ``length`` that goes
        into the training portion. Any other value is truncated with
        ``int()`` and used directly as the training row count.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``, each an independent copy
        of the corresponding slice.
    """
    is_fraction = 0. < split_mark < 1.0
    cut = int(length*split_mark) if is_fraction else int(split_mark)

    # Reuse the same two slice objects for features and targets so both
    # are always cut at exactly the same position.
    head, tail = slice(None, cut), slice(cut, None)

    return data[head].copy(), data[tail].copy(), y[head].copy(), y[tail].copy()


# Read the CVE records from the CSV file in the working directory,
# then preview the first five rows.
data = pd.read_csv(filepath_or_buffer="cve.csv")
data.head(n=5)


Unnamed: 0.1,Unnamed: 0,mod_date,pub_date,cvss,cwe_code,cwe_name,Unnamed: 6
0,CVE-2019-16548,21/11/2019 15:15,21/11/2019 15:15,6.8,352,Cross-Site Request Forgery (CSRF),
1,CVE-2019-16547,21/11/2019 15:15,21/11/2019 15:15,4.0,732,Incorrect Permission Assignment for Critical ...,
2,CVE-2019-16546,21/11/2019 15:15,21/11/2019 15:15,4.3,639,Authorization Bypass Through User-Controlled Key,
3,CVE-2013-2092,20/11/2019 21:22,20/11/2019 21:15,4.3,79,Improper Neutralization of Input During Web P...,
4,CVE-2013-2091,20/11/2019 20:15,20/11/2019 20:15,7.5,89,Improper Neutralization of Special Elements u...,


In [49]:
# Split the CWE names (features) and CVSS scores (target) into training and
# test sets. simple_split slices in row order, without shuffling.
X_train, X_test, y_train, y_test = simple_split(data.cwe_name,data.cvss,len(data))


# Extend scikit-learn's built-in English stop words with the token '22',
# which was judged to be noise from the CWE-name analysis.
# CountVectorizer accepts `stop_words` as 'english', a list, or None;
# scikit-learn >= 1.2 validates this and rejects the frozenset returned by
# `.union(...)`, so convert it to a (sorted, deterministic) list.
stop_words = sorted(text.ENGLISH_STOP_WORDS.union(['22']))

# Bag-of-words encoder: one column per remaining token, values are counts.
vectorizer = CountVectorizer(stop_words=stop_words)

# Learn the vocabulary from the training text only, then encode the test text
# with that same vocabulary so both matrices share the same columns.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(62761, 326) (26899, 326) (62761,) (26899,)


In [50]:
# Print some of the learned vocabulary for illustration purposes.

# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; get_feature_names_out() is its replacement. It returns an
# ndarray, so convert to a plain list to keep the printed format unchanged.
# Fall back to the old method on scikit-learn versions that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

mid = len(feature_names)//2
print("Number of features: {}".format(len(feature_names)))
print("First 20 features: {}\n".format(feature_names[:20]))
print("Middle 20 features: {}\n".format(feature_names[mid - 20:mid]))
print("Last 20 features: {}\n".format(feature_names[len(feature_names) - 20:]))

Number of features: 326
First 20 features: ['7pk', 'access', 'accessible', 'aka', 'algorithm', 'algorithmic', 'allocation', 'amplification', 'argument', 'array', 'assertion', 'assignment', 'assumed', 'asymmetric', 'attempts', 'attributes', 'authentication', 'authenticity', 'authorization', 'blacklist']

Middle 20 features: ['implemented', 'improper', 'improperly', 'inadequate', 'inappropriate', 'inclusion', 'incompatible', 'incomplete', 'inconsistent', 'incorrect', 'incorrectly', 'index', 'indexable', 'inefficient', 'infinite', 'information', 'initialization', 'injection', 'input', 'insecure']

Last 20 features: ['upload', 'url', 'usage', 'use', 'used', 'user', 'using', 'validation', 'value', 'values', 'verification', 'weak', 'web', 'wrap', 'wraparound', 'write', 'wrong', 'xml', 'xpath', 'zero']



In [51]:
# Fit an ordinary least-squares linear regression on the training word counts.
# fit() returns the estimator itself, so the fitted model can be bound directly;
# the trailing expression displays it as the cell output.
lr_model = LinearRegression().fit(X_train, y_train)
lr_model


LinearRegression()

In [53]:
# Predict the CVSS score for every row of the test set and show the result.

y_pred = lr_model.predict(X=X_test)
print(y_pred)

[4.06855332 4.06855332 4.06855332 ... 7.64624583 7.64624583 7.64624583]


In [54]:
# Mean squared error between the true and predicted test-set scores
# (lower is better).

mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(mse)

2.465222095235125


In [55]:
# Calculate the R-squared (coefficient of determination) on the test set.
# 1.0 is the best possible score; on held-out data the value can be negative
# when the model does worse than always predicting the mean of y.

rsq = lr_model.score(X=X_test, y=y_test)
print(rsq)

0.3556904480669183


In [44]:
# Use SelectKBest from sklearn to keep the k highest-scoring word features,
# with f_regression (univariate F-test) as the score function, then refit a
# linear regression on the reduced feature set and record its test metrics.
mse_lst = []
rsq_lst = []
features_total = format(len(feature_names))  # feature count as a string; not used in this cell

# Loop through feature-set sizes k = 50, 100, 150, 200, 250.
for k in range(50, 300, 50):
    fs = SelectKBest(score_func=f_regression, k=k)

    # Score and select features on the training data only, then apply the
    # same column selection to the test data.
    X_train_fs = fs.fit_transform(X_train, y_train)
    X_test_fs = fs.transform(X_test)

    # Loop-local names: do not rebind `lr_model` / `y_pred`, which hold the
    # full-feature baseline model and its predictions from the earlier cells.
    fs_model = LinearRegression()
    fs_model.fit(X_train_fs, y_train)
    y_pred_fs = fs_model.predict(X_test_fs)

    mse_lst.append(mean_squared_error(y_test, y_pred_fs))
    rsq_lst.append(fs_model.score(X_test_fs, y_test))

In [56]:
# Test-set mean squared error, one entry per feature-set size tried in the
# SelectKBest loop (in loop order).
print(mse_lst)

[2.4563229431579736, 2.4519627564300723, 2.4624344869529415, 2.4640228058907554, 2.4650056380166103]


In [57]:
# Test-set R-squared, one entry per feature-set size tried in the
# SelectKBest loop (in loop order).
print(rsq_lst)

[0.3580163272234034, 0.35915590404385633, 0.35641901635564566, 0.3560038938946197, 0.3557470212469751]
