In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from feature_extractor import extract_features_from_raw_code


In [2]:
df_secure = pd.read_csv('data/BCCC-VolSCs-2023_Secure.csv')

df_vulnerable = pd.read_csv('data/BCCC-VolSCs-2023_Vulnerable.csv')

df_combined = pd.concat([df_secure, df_vulnerable], ignore_index=True)

print("Class Distribution in Entire Dataset:")
print(df_combined['label'].value_counts())


Class Distribution in Entire Dataset:
label
0    26915
1     9756
Name: count, dtype: int64


In [3]:
df_combined.head()

Unnamed: 0.1,Unnamed: 0,hash_id,label,bytecode_len,Weight bytecode_character_6,Weight bytecode_character_0,Weight bytecode_character_8,Weight bytecode_character_4,Weight bytecode_character_5,Weight bytecode_character_2,...,bytecode_character_k,bytecode_character_P,Weight bytecode_character_g,bytecode_character_g,Weight bytecode_character_I,Weight bytecode_character_m,bytecode_character_I,bytecode_character_m,Weight bytecode_character_x,bytecode_character_x
0,0,39fcd43b0f0aaa2cf8f084307d15e259d203843ba89845...,0,14044,0.094702,0.284677,0.048206,0.028411,0.079963,0.041014,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3,9c68294047c47e46f7808778f54e175e7d0f7437c9752f...,0,5606,0.060649,0.158758,0.037817,0.013914,0.058866,0.039422,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4,d4a4062767a37041cbe7c4433e06e3dbe0b5e4f89f1035...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5,880072718af126b7e38972393ea0cdbfb324c7c27277d6...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,6,a54f70b383cfc153e3cb2405c885e1a34d919ef8237b91...,0,12444,0.067261,0.17864,0.041707,0.018161,0.065011,0.042189,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Drop unnecessary columns
X = df_combined.drop(columns=['Unnamed: 0', 'hash_id', 'label'])
X = X.select_dtypes(include=['float64', 'int64'])  # Keep only numeric features
y = df_combined['label']

# Split the data (Stratified)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [5]:
param_grid = {
    'C': [0.1, 1, 10],
    'penalty': ['l2'],
    'solver': ['lbfgs', 'liblinear']
}
grid_search = GridSearchCV(LogisticRegression(max_iter=1000), param_grid, cv=5, scoring='accuracy', verbose = 3)
grid_search.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV 1/5] END ...C=0.1, penalty=l2, solver=lbfgs;, score=0.754 total time=   1.6s
[CV 2/5] END ...C=0.1, penalty=l2, solver=lbfgs;, score=0.752 total time=   1.4s
[CV 3/5] END ...C=0.1, penalty=l2, solver=lbfgs;, score=0.753 total time=   1.5s
[CV 4/5] END ...C=0.1, penalty=l2, solver=lbfgs;, score=0.758 total time=   1.3s
[CV 5/5] END ...C=0.1, penalty=l2, solver=lbfgs;, score=0.758 total time=   1.3s
[CV 1/5] END C=0.1, penalty=l2, solver=liblinear;, score=0.754 total time=   3.8s
[CV 2/5] END C=0.1, penalty=l2, solver=liblinear;, score=0.752 total time=   3.4s
[CV 3/5] END C=0.1, penalty=l2, solver=liblinear;, score=0.753 total time=   3.3s
[CV 4/5] END C=0.1, penalty=l2, solver=liblinear;, score=0.757 total time=   3.6s
[CV 5/5] END C=0.1, penalty=l2, solver=liblinear;, score=0.757 total time=   3.4s
[CV 1/5] END .....C=1, penalty=l2, solver=lbfgs;, score=0.754 total time=   2.0s
[CV 2/5] END .....C=1, penalty=l2, solver=lb

In [6]:
# Best Model
best_model = grid_search.best_estimator_

In [7]:
# Predictions
y_pred = best_model.predict(X_test_scaled)

print("Best Hyperparameters:", grid_search.best_params_)
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Best Hyperparameters: {'C': 1, 'penalty': 'l2', 'solver': 'lbfgs'}

Accuracy: 0.7506475800954329

Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.95      0.85      5384
           1       0.60      0.19      0.29      1951

    accuracy                           0.75      7335
   macro avg       0.68      0.57      0.57      7335
weighted avg       0.72      0.75      0.70      7335


Confusion Matrix:
[[5131  253]
 [1576  375]]
