## Prerequisites

In [1]:
# Basic imports
import warnings
import numpy as np
import pandas as pd
from numpy import mean, std
from matplotlib import pyplot as plt
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import accuracy_score
from sklearn.utils._testing import ignore_warnings

# Feature selection
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.feature_selection import RFE

# Data preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from imblearn.over_sampling  import RandomOverSampler
from sklearn.model_selection import cross_val_score
from sklearn.metrics import log_loss

#ML models
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.naive_bayes import GaussianNB

# Ensemble techniques
from sklearn.ensemble import StackingClassifier

### Selecting the dataset

In [2]:
ds = ["./datasets/tomcat-final.csv","./datasets/struts-final.csv"]

Choose 1 for struts2-core dataset, 0 for tomcat dataset.

In [3]:
df = pd.read_csv(ds[1])
np.set_printoptions(threshold=np.inf)
pd.set_option('max_columns',None)

In [4]:
df

Unnamed: 0,file,type,cbo,cboModified,fanin,fanout,wmc,dit,noc,rfc,lcom,lcom*,tcc,lcc,totalMethodsQty,staticMethodsQty,publicMethodsQty,privateMethodsQty,protectedMethodsQty,defaultMethodsQty,visibleMethodsQty,abstractMethodsQty,finalMethodsQty,synchronizedMethodsQty,totalFieldsQty,staticFieldsQty,publicFieldsQty,privateFieldsQty,protectedFieldsQty,defaultFieldsQty,finalFieldsQty,synchronizedFieldsQty,nosi,loc,returnQty,loopQty,comparisonsQty,tryCatchQty,parenthesizedExpsQty,stringLiteralsQty,numbersQty,assignmentsQty,mathOperationsQty,variablesQty,maxNestedBlocksQty,anonymousClassesQty,innerClassesQty,lambdasQty,uniqueWordsQty,modifiers,logStatementsQty,title,severity,vulnerable,version
0,core/src/main/java/org/apache/struts2/views/fr...,class,5,7,2,5,14,1,0,16,0,0.583333,0.607143,1.0,8,0,8,0,0,0,8,0,0,0,3,0,0,3,0,0,0,0,0,50,8,1,3,0,1,5,7,10,4,10,2,0,0,0,50,1,0,0,0,0.0,2.3.20
1,core/src/test/java/org/apache/struts2/views/js...,class,3,3,0,3,4,4,0,14,6,0.000000,0.000000,0.0,4,0,4,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,2,53,0,0,0,0,0,11,6,12,0,12,0,0,0,0,26,1,0,0,0,0.0,2.3.20
2,plugins/cdi/src/test/java/org/apache/struts2/c...,class,8,8,0,8,4,1,0,12,6,0.000000,0.000000,0.0,4,0,4,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,25,0,0,0,0,0,0,1,7,0,7,0,0,0,0,35,1,0,0,0,0.0,2.3.20
3,apps/portlet/src/test/java/JettyPlutoLauncher....,class,4,4,0,4,1,1,0,7,0,0.000000,0.000000,0.0,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,14,0,0,0,0,0,8,2,3,0,3,0,0,0,0,26,1,0,0,0,0.0,2.3.20
4,plugins/codebehind/src/test/java/org/apache/st...,class,1,1,0,1,0,1,0,0,0,0.000000,-1.000000,-1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,4,1,0,0,0,0.0,2.3.20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19367,plugins/convention/src/test/java/org/apache/st...,class,0,1,1,0,1,1,0,0,0,0.000000,0.000000,0.0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,5,1,0,0,0,0,0,0,0,0,0,0,0,0,0,16,1,0,0,0,0.0,2.5.26
19368,core/src/main/java/com/opensymphony/xwork2/Act...,class,14,33,19,14,49,1,19,56,585,0.883721,0.000000,0.0,43,0,40,0,3,0,43,0,0,0,5,1,0,4,1,0,2,0,5,168,30,0,4,0,0,3,1,15,0,14,3,0,0,0,194,1,3,0,0,0.0,2.5.26
19369,apps/showcase/src/test/java/it/org/apache/stru...,class,2,13,11,2,1,2,11,3,0,0.000000,0.000000,0.0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,11,1025,0,0,0,0.0,2.5.26
19370,core/src/main/java/org/apache/struts2/views/xs...,class,2,3,1,2,4,1,0,8,0,0.166667,0.000000,0.0,2,0,2,0,0,0,2,0,0,0,3,1,0,2,0,1,1,0,0,22,1,0,1,0,0,7,0,5,2,5,2,0,0,0,58,1,3,0,0,0.0,2.5.26


### Data preprocessing and cleanup

In [5]:
df.loc[df['title'].str.contains('Denial', case=False),'title'] = 'DoS'
df.loc[df['title'].str.contains('CSRF', case=False),'title'] = 'CSRF'
df.loc[df['title'].str.contains('information disclosure', case=False),'title'] = 'Information Disclosure'
df.loc[df['title'].str.contains('cve-2020-9484', case=False),'title'] = 'Remote Code execution'
df.loc[df['title'].str.contains('session hijacking', case=False),'title'] = 'Remote Code execution'
df = df.drop_duplicates(keep='first').reset_index(drop=True)

Unique severities (Debugging)

In [6]:
df.severity.unique()

array(['0', 'high', 'critical', 'medium'], dtype=object)

### K-fold cross validation

In [7]:
train = df.loc[~(df.version=="2.3.20")]
test = df.loc[df.version=="2.3.20"]
kfold = KFold(10, shuffle=True, random_state=1)
feature_columns =df.columns[2:-4] 
print("Shape of dataframe", df.shape)

Shape of dataframe (19365, 55)


### Label encoding the string values in the columns

In [8]:
s = (train.dtypes == 'object')
object_cols = ['severity', 'title']

label_X_train = train.copy()
label_X_valid = test.copy()
le = LabelEncoder()
label_X_train[object_cols] = train[object_cols].apply(le.fit_transform)
label_X_valid[object_cols] = test[object_cols].apply(le.fit_transform)

X = train[feature_columns]
test_X = test[feature_columns]
y = train.vulnerable
severity_y = label_X_train.severity
title_y = label_X_train.title
test_Y = test.vulnerable
severity_test_Y = label_X_valid.severity
title_test_Y = label_X_valid.title
print('Encoded...')


Encoded...


### Random over-sampling

In [9]:
ros = RandomOverSampler()
X_ros, y_ros = ros.fit_resample(X, y)

### Feature selection

#### Sequential Feature Selection

In [11]:
sfs = SFS(LogisticRegression(), 
           k_features='best', 
           forward=True, # if forward = True then SFS otherwise SBS
           floating=False, 
           scoring='r2',
          cv=5)
sfs.fit(X,y)
sfs_features = list(sfs.k_feature_names_)
sfs_features

ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

STOPPING EARLY DUE TO KEYBOARD INTERRUPT...

['cbo']

#### Random Feature Elimination

In [10]:
rfe = RFE(estimator=DecisionTreeClassifier())
rfe.fit(X_ros, y_ros)
rfe_features = []
for i in range(X.shape[1]):
    if(rfe.support_[i] == 1):
        rfe_features.append(X.columns[i])

fit_feature_set() - A simple function to fit a feature set.

In [11]:
def fit_feature_set(feature_set):
    X =train[feature_set]
    test_X = test[feature_set]
    y = train.vulnerable
    severity_y = train.severity
    title_y = train.title
    test_Y = test.vulnerable
    severity_test_Y = test.severity
    title_test_Y= test.title

### Machine learning

In [12]:
fit_feature_set(rfe_features)

#### Decision tree classifier

In [13]:
len(test_Y)

1782

In [99]:
model = DecisionTreeClassifier(ccp_alpha=0.036)
model.fit(X_ros, y_ros)
scores = cross_val_score(model, X_ros, y_ros, scoring='accuracy', cv=kfold, n_jobs=-1)
print('Precision: %.3f' % mean(cross_val_score(model, X_ros, y_ros, scoring='precision', cv=kfold, n_jobs=-1)))
print('Recall: %.3f' % mean(cross_val_score(model, X_ros, y_ros, scoring='recall', cv=kfold, n_jobs=-1)))
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))    
y_train_pred = model.predict(X_ros)
dt_y_test_pred = model.predict(test_X)
print(f'Train score {accuracy_score(y_train_pred,y_ros)}')
print(f'Test score {accuracy_score(dt_y_test_pred,test_Y)}')

Precision: 0.830
Recall: 0.869
Accuracy: 0.845 (0.005)
Train score 0.8454022333637192
Test score 0.8277216610549943


In [100]:
dt_df = pd.DataFrame([dt_y_test_pred, test_Y]).T
dt_df.columns = ['prediction', 'actual']
dt_df['prediction'] = dt_df['prediction'].astype(int)
dt_df['actual'] = dt_df['actual'].astype(int)
dt_df.loc[(dt_df['prediction'] == 1)&(dt_df['actual'] == 1)]

Unnamed: 0,prediction,actual
810,1,1
811,1,1
920,1,1
921,1,1
1261,1,1
1262,1,1


In [102]:
print(f'Log Loss: {log_loss(dt_df.actual, dt_df.prediction)}')

Log Loss: 5.95042077982528


#### KNN-classifier

In [14]:
knn_model = KNN(n_neighbors=200)
knn_model.fit(X_ros, y_ros)
scores = cross_val_score(knn_model, X_ros, y_ros, scoring='accuracy', cv=kfold, n_jobs=-1)
print('Precision: %.3f' % mean(cross_val_score(knn_model, X_ros, y_ros, scoring='precision', cv=kfold, n_jobs=-1)))
print('Recall: %.3f' % mean(cross_val_score(knn_model, X_ros, y_ros, scoring='recall', cv=kfold, n_jobs=-1)))
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))    
y_train_pred = knn_model.predict(X_ros)
y_test_pred = knn_model.predict(test_X)
print(f'Train score {accuracy_score(y_train_pred,y_ros)}')
print(f'Test score {accuracy_score(y_test_pred,test_Y)}')

KeyboardInterrupt: 

In [93]:
knn_df = pd.DataFrame([y_test_pred, test_Y]).T
knn_df.columns = ['prediction', 'actual']
knn_df['prediction'] = knn_df['prediction'].astype(int)
knn_df['actual'] = knn_df['actual'].astype(int)
knn_df.loc[(knn_df['prediction']==1)&(knn_df['actual']==1)]

Unnamed: 0,prediction,actual
810,1,1
811,1,1
1261,1,1
1262,1,1


In [95]:
knn_df.loc[knn_df.actual==1].shape[0]

6

#### Logistic regression

In [None]:
lr_model = LogisticRegression(solver='sag', class_weight='balanced')
lr_model.fit(X_ros, y_ros)
predictions2 = lr_model.predict(test_X)
scores = cross_val_score(lr_model, X_ros, y_ros,
                         scoring='accuracy', cv=kfold, n_jobs=-1)
print('Precision: %.3f' % mean(cross_val_score(
    lr_model, X_ros, y_ros, scoring='precision', cv=kfold, n_jobs=-1)))
print('Recall: %.3f' % mean(cross_val_score(
    lr_model, X_ros, y_ros, scoring='recall', cv=kfold, n_jobs=-1)))
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))


#### Naive Bayes classifier

In [34]:
nb_model = GaussianNB()
nb_model.fit(X_ros,y_ros)
scores = cross_val_score(nb_model, X_ros, y_ros, scoring='accuracy', cv=kfold, n_jobs=-1)
print('Precision: %.3f' % mean(cross_val_score(nb_model, X_ros, y_ros, scoring='precision', cv=kfold, n_jobs=-1)))
print('Recall: %.3f' % mean(cross_val_score(nb_model, X_ros, y_ros, scoring='recall', cv=kfold, n_jobs=-1)))
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

predictions3 = nb_model.predict_proba(test_X)

Precision: 0.638
Recall: 0.950
Accuracy: 0.705 (0.019)


#### Ensemble methods
Stacking classifier

In [17]:
def evaluate(model, X, y):
    cv = KFold(10, shuffle=True, random_state=1)
    return cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1), cross_val_score(model, X, y, scoring='precision', cv=cv, n_jobs=-1), cross_val_score(model, X, y, scoring='recall', cv=cv, n_jobs=-1)


### Stacking implementation

In [103]:
def stacking():
    level0 = list()
    level0.append(('lr', LogisticRegression(solver='saga',class_weight='balanced', max_iter=4000)))
    level0.append(('knn', KNN(n_neighbors=11)))
    level0.append(('dt', DecisionTreeClassifier(ccp_alpha=0.036)))
    level0.append(('nb', GaussianNB()))
	
    level1 = DecisionTreeClassifier(ccp_alpha=0.036)
	
    model = StackingClassifier(estimators=level0, final_estimator=level1, cv=5)
    return model

In [104]:
def get_models():
    models = dict()
    models['Decision Tree'] = DecisionTreeClassifier(ccp_alpha=0.036)
    models['Logistic Regression'] = LogisticRegression(solver='saga', class_weight='balanced', max_iter=4000)
    models['KNN'] = KNN()
    models['Naive Bayes'] = GaussianNB()
    models['Stacking'] = stacking()
    return models

### Stacking debug

In [107]:
stacking_clf = stacking()
stacking_clf.fit(X_ros, y_ros)
scores = cross_val_score(stacking_clf, X_ros, y_ros, scoring='accuracy', cv=kfold, n_jobs=-1)
print('Precision: %.3f' % mean(cross_val_score(stacking_clf, X_ros, y_ros, scoring='precision', cv=kfold, n_jobs=-1)))
print('Recall: %.3f' % mean(cross_val_score(stacking_clf, X_ros, y_ros, scoring='recall', cv=kfold, n_jobs=-1)))
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))    
st_y_train_pred = knn_model.predict(X_ros)
st_y_test_pred = knn_model.predict(test_X)
print(f'Train score {accuracy_score(st_y_train_pred,y_ros)}')
print(f'Test score {accuracy_score(st_y_test_pred,test_Y)}')

Precision: 0.997
Recall: 1.000
Accuracy: 0.999 (0.001)
Train score 0.9977495442114859
Test score 0.9966329966329966


In [108]:
st_df = pd.DataFrame([st_y_test_pred, test_Y]).T
st_df.columns = ['prediction', 'actual']
st_df['prediction'] = st_df['prediction'].astype(int)
st_df['actual'] = st_df['actual'].astype(int)
st_df.loc[(st_df['prediction']==1)&(st_df['actual']==1)]

Unnamed: 0,prediction,actual
810,1,1
811,1,1
1261,1,1
1262,1,1


In [111]:
st_df.loc[st_df.actual==1].shape[0]

6

### Random over-sampling

In [None]:
ros = RandomOverSampler()
X_ros, y_ros = ros.fit_resample(X, y)

In [105]:
models = get_models()
results, names = list(), list()
for name, model in models.items():
    acc, pre, rec = evaluate(model, X_ros, y_ros)
    names.append(names)
    print('Model: %s' % (name))
    print('Accuracy: %.3f (%.3f)' % (mean(acc), std(acc)))
    print('Recall: %.3f (%.3f)' % (mean(rec), std(rec)))
    print('Precision: %.3f (%.3f) \n' % (mean(pre), std(pre)))

Model: Decision Tree
Accuracy: 0.845 (0.005)
Recall: 0.869 (0.005)
Precision: 0.830 (0.009) 

Model: Logistic Regression
Accuracy: 0.838 (0.004)
Recall: 0.835 (0.005)
Precision: 0.841 (0.006) 

Model: KNN
Accuracy: 0.998 (0.000)
Recall: 1.000 (0.000)
Precision: 0.996 (0.001) 

Model: Naive Bayes
Accuracy: 0.700 (0.009)
Recall: 0.969 (0.004)
Precision: 0.630 (0.010) 

Model: Stacking
Accuracy: 0.999 (0.001)
Recall: 1.000 (0.000)
Precision: 0.997 (0.001) 

