### Prerequisites

In [1]:
# Basic
import pandas as pd
from numpy import mean, std

# Misc
from collections import Counter

# Random sampling and cross-validation
from imblearn.over_sampling  import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

# Feature selection
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.feature_selection import RFE

# ML
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import GaussianNB

In [2]:
df = pd.read_csv('./datasets/struts-final.csv')

In [3]:
df

Unnamed: 0,file,type,cbo,cboModified,fanin,fanout,wmc,dit,noc,rfc,...,anonymousClassesQty,innerClassesQty,lambdasQty,uniqueWordsQty,modifiers,logStatementsQty,title,severity,vulnerable,version
0,core/src/main/java/org/apache/struts2/views/fr...,class,5,7,2,5,14,1,0,16,...,0,0,0,50,1,0,0,0,0.0,2.3.20
1,core/src/test/java/org/apache/struts2/views/js...,class,3,3,0,3,4,4,0,14,...,0,0,0,26,1,0,0,0,0.0,2.3.20
2,plugins/cdi/src/test/java/org/apache/struts2/c...,class,8,8,0,8,4,1,0,12,...,0,0,0,35,1,0,0,0,0.0,2.3.20
3,apps/portlet/src/test/java/JettyPlutoLauncher....,class,4,4,0,4,1,1,0,7,...,0,0,0,26,1,0,0,0,0.0,2.3.20
4,plugins/codebehind/src/test/java/org/apache/st...,class,1,1,0,1,0,1,0,0,...,0,0,0,4,1,0,0,0,0.0,2.3.20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19367,plugins/convention/src/test/java/org/apache/st...,class,0,1,1,0,1,1,0,0,...,0,0,0,16,1,0,0,0,0.0,2.5.26
19368,core/src/main/java/com/opensymphony/xwork2/Act...,class,14,33,19,14,49,1,19,56,...,0,0,0,194,1,3,0,0,0.0,2.5.26
19369,apps/showcase/src/test/java/it/org/apache/stru...,class,2,13,11,2,1,2,11,3,...,0,0,0,11,1025,0,0,0,0.0,2.5.26
19370,core/src/main/java/org/apache/struts2/views/xs...,class,2,3,1,2,4,1,0,8,...,0,0,0,58,1,3,0,0,0.0,2.5.26


### Testing
Considering one version of struts as the test version while the other versions are used for training.

In [4]:
pd.DataFrame(df.version.unique(),columns=['version'])

Unnamed: 0,version
0,2.3.20
1,2.3.24
2,2.3.28
3,2.3.28.1
4,2.3.29
5,2.3.35
6,2.5
7,2.5.1
8,2.5.13
9,2.5.17


In [5]:
kfold = KFold(10, shuffle=True, random_state=0.5)
train = df.loc[~(df.version=='2.3.20')]
test = df.loc[df.version=='2.3.20']
print("Shape of dataframe", df.shape)

Shape of dataframe (19372, 55)


## Handling imbalanced dataset
Sampling is done using both ROS + RUS, for optimal performance.

In [6]:
ros = RandomOverSampler(sampling_strategy=0.5)
rus = RandomUnderSampler(sampling_strategy=0.5)
X_ros, y_ros = ros.fit_resample(df.drop(['vulnerable'], axis=1), df['vulnerable'])
X, y = rus.fit_resample(X_ros, y_ros)

In [7]:
Counter(y)

Counter({0.0: 19328, 1.0: 9664})

## Feature extraction
Now that we have the cleaned + imbalance handled dataset, it is time for feature extraction.

In [8]:
df.columns
feature_columns = df.columns[2:-4]
X = X[feature_columns]

### Sequential forward selector (SFS)

In [9]:
sfs = SFS(DecisionTreeClassifier(), 
           k_features='best', 
           forward=True, # if forward = True then SFS otherwise SBS
           floating=False, 
           scoring='r2',
          cv=5)
sfs.fit(X,y)
sfs_features = sfs.k_feature_names_


STOPPING EARLY DUE TO KEYBOARD INTERRUPT...

In [None]:
print('\nSequential Forward Selection (SFS): '+str(len(sfs_features))+' features selected')
print('\nTotal columns: '+str(len(feature_columns)))


Sequential Forward Selection (SFS): 14 features selected

Total columns: 49


### Recursive feature elimination (RFE)

In [None]:
rfe = RFE(estimator=DecisionTreeClassifier())
rfe.fit(X, y)
rfe.fit_transform(X,y)
rfe_features = []
for i in range(X.shape[1]):
    if(rfe.support_[i] == 1):
        rfe_features.append(X.columns[i])

In [None]:
print('\nRecursive Feature Elimination (RFE): '+str(len(rfe_features))+' features selected')
print('\nTotal columns: '+str(len(feature_columns)))


Recursive Feature Elimination (RFE): 24 features selected

Total columns: 49


### Machine Learning

In [None]:
X = train[rfe_features]
test_X = test[rfe_features]
y = train.vulnerable
test_Y = test.vulnerable

In [78]:
model = DecisionTreeClassifier(random_state=0,ccp_alpha=0.06)
model.fit(X, y)
y_test_predictions = model.predict(test_X)

In [75]:
pd.DataFrame(y_test_predictions, columns=['predictions']).predictions.value_counts()

0.0    1785
Name: predictions, dtype: int64

In [76]:
test_Y.value_counts()

0.0    1776
1.0       9
Name: vulnerable, dtype: int64

In [79]:
scores = cross_val_score(model, X, y, scoring='accuracy', cv=kfold, n_jobs=-1)
print('Precision: %.3f' % mean(cross_val_score(model, X, y, scoring='precision', cv=kfold, n_jobs=-1)))
print('Recall: %.3f' % mean(cross_val_score(model, X, y, scoring='recall', cv=kfold, n_jobs=-1)))
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

ValueError: 0.5 cannot be used to seed a numpy.random.RandomState instance