## Prerequisites

In [3]:
# Basic imports
import numpy as np
import pandas as pd
import seaborn as sns
from numpy import mean, std
from matplotlib import pyplot as plt

# Feature selection
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.feature_selection import RFE

# Data preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from imblearn.over_sampling  import RandomOverSampler
from sklearn.model_selection import cross_val_score

#ML models
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import GaussianNB

In [10]:
df = pd.read_csv("./datasets/cleaned_tomcat_vulnerabilities.csv")
df = df.drop(columns=['Unnamed: 0'])
np.set_printoptions(threshold=np.inf)
pd.set_option('max_columns',None)

In [11]:
df

Unnamed: 0,file,class,cbo,wmc,dit,rfc,lcom,tcc,lcc,totalMethodsQty,staticMethodsQty,publicMethodsQty,privateMethodsQty,protectedMethodsQty,defaultMethodsQty,visibleMethodsQty,abstractMethodsQty,finalMethodsQty,synchronizedMethodsQty,totalFieldsQty,staticFieldsQty,publicFieldsQty,privateFieldsQty,protectedFieldsQty,defaultFieldsQty,finalFieldsQty,synchronizedFieldsQty,nosi,loc,returnQty,loopQty,comparisonsQty,tryCatchQty,parenthesizedExpsQty,stringLiteralsQty,numbersQty,assignmentsQty,mathOperationsQty,variablesQty,maxNestedBlocksQty,anonymousClassesQty,innerClassesQty,lambdasQty,uniqueWordsQty,modifiers,logStatementsQty,version,title,severity,vulnerable
0,apache-tomcat-10.0.4-src/java/jakarta/el/Array...,java/jakarta/el/ArrayELResolver.java,6,35,2,19,33,0.214286,0.214286,10,2,8,2,0,0,8,0,2,0,1,0,0,1,0,0,1,0,19,92,13,0,7,2,3,3,4,6,0,5,2,0,0,0,43,1,0,10,0,0,0.0
1,apache-tomcat-10.0.4-src/java/jakarta/el/BeanE...,java/jakarta/el/BeanELResolver.java,11,35,2,34,33,0.133333,0.133333,10,0,9,1,0,0,10,0,1,0,4,2,0,4,0,0,4,0,21,253,15,1,15,4,0,6,0,24,0,20,2,0,3,1,70,1,0,10,0,0,0.0
2,apache-tomcat-10.0.4-src/java/jakarta/el/BeanN...,java/jakarta/el/BeanNameELResolver.java,7,26,2,13,1,0.476190,0.476190,7,0,7,0,0,0,7,0,0,0,1,0,0,1,0,0,1,0,10,102,12,0,4,5,4,1,0,10,0,10,2,0,0,0,32,1,0,10,0,0,0.0
3,apache-tomcat-10.0.4-src/java/jakarta/el/BeanN...,java/jakarta/el/BeanNameResolver.java,1,5,1,0,10,0.000000,0.000000,5,0,5,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0,0,17,4,0,0,0,0,0,0,0,0,0,0,0,0,0,68,1025,0,10,0,0,0.0
4,apache-tomcat-10.0.4-src/java/jakarta/el/Compo...,java/jakarta/el/CompositeELResolver.java,3,32,2,14,0,0.818182,0.818182,10,0,10,0,0,0,11,0,0,0,3,1,0,3,0,0,1,0,3,162,14,7,4,1,1,1,12,31,1,27,4,0,1,0,58,1,0,10,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10199,apache-tomcat-4.1.40-src/servletapi/src/share/...,share/javax/servlet/ServletInputStream.java,1,7,2,1,1,0.000000,0.000000,2,0,1,0,1,0,2,0,0,0,0,0,0,0,0,0,0,0,0,18,2,1,3,0,1,0,6,3,0,2,2,0,0,0,92,1025,0,4,0,0,0.0
10200,apache-tomcat-4.1.40-src/servletapi/src/share/...,share/javax/servlet/ServletOutputStream.java,0,21,2,19,118,0.008333,0.008333,16,0,15,0,1,0,16,0,0,0,2,2,0,2,0,0,1,0,7,77,0,1,2,0,1,6,5,12,0,8,2,0,0,0,87,1025,0,4,0,0,0.0
10201,apache-tomcat-4.1.40-src/servletapi/src/share/...,share/javax/servlet/ServletRequestWrapper.java,0,30,1,25,0,1.000000,1.000000,28,0,28,0,0,0,28,0,0,0,1,0,0,1,0,0,0,0,0,93,23,0,2,0,0,2,0,2,0,1,1,0,0,0,87,1,0,4,0,0,0.0
10202,apache-tomcat-4.1.40-src/servletapi/src/share/...,share/javax/servlet/ServletResponseWrapper.java,0,18,1,13,0,1.000000,1.000000,16,0,16,0,0,0,16,0,0,0,1,0,0,1,0,0,0,0,0,57,7,0,2,0,0,2,0,2,0,1,1,0,0,0,70,1,0,4,0,0,0.0


### Data preprocessing and cleanup

In [13]:
df.loc[df['title'].str.contains('Denial', case=False),'title'] = 'DoS'
df.loc[df['title'].str.contains('CSRF', case=False),'title'] = 'CSRF'
df.loc[df['title'].str.contains('information disclosure', case=False),'title'] = 'Information Disclosure'
df.loc[df['title'].str.contains('cve-2020-9484', case=False),'title'] = 'Remote Code execution'
df.loc[df['title'].str.contains('session hijacking', case=False),'title'] = 'Remote Code execution'
df = df.drop_duplicates(keep='first').reset_index(drop=True)

Unique severities (Debugging)

In [14]:
df.severity.unique()

array(['0', 'Low', 'Important', 'Moderate', 'High'], dtype=object)

### K-fold cross validation

In [15]:
train = df.loc[~(df.version==8)]
test = df.loc[df.version==8]
kfold = KFold(10, shuffle=True, random_state=1)
feature_columns =df.columns[2:46] 
print("Shape of dataframe", df.shape)

Shape of dataframe (7380, 50)


### Label encoding the string values in the columns

In [18]:
s = (train.dtypes == 'object')
object_cols = ['severity', 'title']

label_X_train = train.copy()
label_X_valid = test.copy()
le = LabelEncoder()
label_X_train[object_cols] = train[object_cols].apply(le.fit_transform)
label_X_valid[object_cols] = test[object_cols].apply(le.fit_transform)

X = train[feature_columns]
test_X = test[feature_columns]
y = train.vulnerable
severity_y = label_X_train.severity
title_y = label_X_train.title
test_Y = test.vulnerable
severity_test_Y = label_X_valid.severity
title_test_Y = label_X_valid.title
print('Encoded...')


Encoded...


### Sampling

In [19]:
ros = RandomOverSampler()
X_ros, y_ros = ros.fit_resample(X, y)

### Feature selection

#### Sequential Feature Selection

In [23]:
sfs = SFS(LogisticRegression(), 
           k_features='best', 
           forward=True, # if forward = True then SFS otherwise SBS
           floating=False, 
           scoring='r2',
          cv=5)
sfs.fit(X,y)
feature_set_1 = list(sfs.k_feature_names_)
feature_set_1

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

['rfc',
 'lcc',
 'protectedFieldsQty',
 'nosi',
 'comparisonsQty',
 'mathOperationsQty',
 'innerClassesQty',
 'modifiers']

#### Random Feature Elimination

In [25]:
rfe = RFE(estimator=DecisionTreeClassifier())
rfe.fit(X_ros, y_ros)
feature_set_2= rfe.fit_transform(X_ros,y_ros)
final_features = []
for i in range(X.shape[1]):
    if(rfe.support_[i] == 1):
        final_features.append(X.columns[i])

fit_feature_set() - A simple function to fit a feature set.

In [26]:
def fit_feature_set(feature_set=final_features):
    X =train[feature_set]
    test_X = test[feature_set]
    y = train.vulnerable
    severity_y = train.severity
    title_y = train.title
    test_Y = test.vulnerable
    severity_test_Y = test.severity
    title_test_Y= test.title

### Machine learning

In [27]:
fit_feature_set(final_features)

#### Decision tree classifier

In [28]:
model = DecisionTreeClassifier(random_state=0,ccp_alpha=0.06)
model.fit(X_ros, y_ros)
scores = cross_val_score(model, X_ros, y_ros, scoring='accuracy', cv=kfold, n_jobs=-1)
print('Precision: %.3f' % mean(cross_val_score(model, X_ros, y_ros, scoring='precision', cv=kfold, n_jobs=-1)))
print('Recall: %.3f' % mean(cross_val_score(model, X_ros, y_ros, scoring='recall', cv=kfold, n_jobs=-1)))
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))
y_train_pred = model.predict(X_ros)
dt_y_test_pred = model.predict(test_X)

Precision: 0.777
Recall: 0.768
Accuracy: 0.774 (0.016)


#### KNN-classifier

In [29]:
model = KNN()
model.fit(X_ros, y_ros)
scores = cross_val_score(model, X_ros, y_ros, scoring='precision', cv=kfold, n_jobs=-1)
print('Precision: %.3f (%.3f)' % (mean(scores), std(scores)))
y_train_pred = model.predict(X_ros)
y_test_pred = model.predict(test_X)

Precision: 0.933 (0.008)


#### Logistic regression

In [30]:
lr_model = LogisticRegression(solver='sag', class_weight='balanced')
lr_model.fit(X_ros, y_ros)
predictions2 = lr_model.predict(test_X)
scores = cross_val_score(lr_model, X_ros, y_ros, scoring='accuracy', cv=kfold, n_jobs=-1)
print('Precision: %.3f' % mean(cross_val_score(lr_model, X_ros, y_ros, scoring='precision', cv=kfold, n_jobs=-1)))
print('Recall: %.3f' % mean(cross_val_score(lr_model, X_ros, y_ros, scoring='recall', cv=kfold, n_jobs=-1)))
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))




Precision: 0.610




Recall: 0.889
Accuracy: 0.660 (0.018)


#### Naive Bayes classifier

In [31]:
nb_model = GaussianNB()
nb_model.fit(X_ros,y_ros)
scores = cross_val_score(nb_model, X_ros, y_ros, scoring='accuracy', cv=kfold, n_jobs=-1)
print('Precision: %.3f' % mean(cross_val_score(nb_model, X_ros, y_ros, scoring='precision', cv=kfold, n_jobs=-1)))
print('Recall: %.3f' % mean(cross_val_score(nb_model, X_ros, y_ros, scoring='recall', cv=kfold, n_jobs=-1)))
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

predictions3 = nb_model.predict_proba(test_X)

Precision: 0.852
Recall: 0.466
Accuracy: 0.693 (0.010)
