In [33]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from feature_extractor import extract_features_from_raw_code
import solcx


In [35]:
# Select solc 0.8.0 (the manually installed version) as the active compiler for
# solcx, then print the version solcx reports back as a sanity check.
solcx.set_solc_version("0.8.0")
print("Active Solidity version:", solcx.get_solc_version())

Active Solidity version: 0.8.0


In [37]:
# Read the two labelled CSV files and stack them row-wise into a single frame
# with a fresh, continuous index.
df_secure = pd.read_csv('data/BCCC-VolSCs-2023_Secure.csv')
df_vulnerable = pd.read_csv('data/BCCC-VolSCs-2023_Vulnerable.csv')
df_combined = pd.concat((df_secure, df_vulnerable), ignore_index=True)

# Row count per value of the 'label' column; sep="\n" puts the heading on its
# own line above the counts.
print("Class Distribution in Entire Dataset:", df_combined['label'].value_counts(), sep="\n")


Class Distribution in Entire Dataset:
label
0    26915
1     9756
Name: count, dtype: int64


In [38]:
# Preview the first rows of the combined frame to inspect the available columns.
df_combined.head()

Unnamed: 0.1,Unnamed: 0,hash_id,label,bytecode_len,Weight bytecode_character_6,Weight bytecode_character_0,Weight bytecode_character_8,Weight bytecode_character_4,Weight bytecode_character_5,Weight bytecode_character_2,...,bytecode_character_k,bytecode_character_P,Weight bytecode_character_g,bytecode_character_g,Weight bytecode_character_I,Weight bytecode_character_m,bytecode_character_I,bytecode_character_m,Weight bytecode_character_x,bytecode_character_x
0,0,39fcd43b0f0aaa2cf8f084307d15e259d203843ba89845...,0,14044,0.094702,0.284677,0.048206,0.028411,0.079963,0.041014,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3,9c68294047c47e46f7808778f54e175e7d0f7437c9752f...,0,5606,0.060649,0.158758,0.037817,0.013914,0.058866,0.039422,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4,d4a4062767a37041cbe7c4433e06e3dbe0b5e4f89f1035...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5,880072718af126b7e38972393ea0cdbfb324c7c27277d6...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,6,a54f70b383cfc153e3cb2405c885e1a34d919ef8237b91...,0,12444,0.067261,0.17864,0.041707,0.018161,0.065011,0.042189,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
# Bind a shorter name to the combined frame. Plain assignment: `df` and
# `df_combined` refer to the same DataFrame object, not a copy.
df = df_combined

In [40]:
# Model inputs: three bytecode/AST summary features, followed by every column
# whose name contains the per-character weight prefix.
relevant_columns = ['bytecode_len', 'bytecode_entropy', 'ast_len_nodes']
relevant_columns.extend(
    name for name in df.columns if 'Weight bytecode_character_' in name
)

X = df[relevant_columns]  # feature matrix
y = df['label']           # target labels

# Hold out 20% of the rows for testing; stratify on the label so both splits
# keep the same class ratio, and fix the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [41]:
# Fit a 100-tree random forest on the training split (seeded for repeatability).
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
model.fit(X_train, y_train)

In [45]:
# Score the fitted model on the held-out split.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Report accuracy, the per-class precision/recall table and the confusion
# matrix; sep="\n" places each heading on its own line above its value.
print("Accuracy:", accuracy)
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

Accuracy: 0.7607361963190185

Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.95      0.85      5384
           1       0.64      0.23      0.34      1951

    accuracy                           0.76      7335
   macro avg       0.71      0.59      0.60      7335
weighted avg       0.74      0.76      0.72      7335


Confusion Matrix:
[[5122  262]
 [1493  458]]


In [47]:
def predict_solidity_code(raw_code):
    """Classify one piece of Solidity source with the trained model.

    Parameters
    ----------
    raw_code
        Solidity source text (the notebook passes a ``str``). It is handed
        unchanged to ``extract_features_from_raw_code``.

    Returns
    -------
    tuple
        ``(prediction, probabilities)``: the first element of
        ``model.predict(...)`` and the first row of
        ``model.predict_proba(...)`` for the single extracted feature row.

    Notes
    -----
    Uses the notebook-level ``model`` and ``X_train`` objects, so the
    training cells must have been executed before this function is called.
    Features the extractor does not provide are filled with 0.
    """
    # Presumably a mapping of feature name -> value (verify against
    # feature_extractor); wrapping it in a list yields a one-row frame.
    features = extract_features_from_raw_code(raw_code)
    features_df = pd.DataFrame([features])

    # Align with the training schema in a single step: columns missing from
    # the extractor output are added with 0, unexpected columns are dropped,
    # and the column order matches X_train. One reindex avoids inserting
    # columns one at a time, which fragments the frame when many are missing.
    features_df = features_df.reindex(columns=X_train.columns, fill_value=0)

    prediction = model.predict(features_df)
    prediction_proba = model.predict_proba(features_df)
    return prediction[0], prediction_proba[0]


In [48]:
# Minimal Solidity contract (one private uint256 with a setter and a getter),
# used as the sample input passed to predict_solidity_code.
raw_code_example = """
pragma solidity ^0.8.0;

contract Example {
    uint256 private value;

    function setValue(uint256 _value) public {
        value = _value;
    }

    function getValue() public view returns (uint256) {
        return value;
    }
}
"""

In [49]:
# Classify the sample contract, then show the predicted label and the
# probability values returned alongside it.
pred, proba = predict_solidity_code(raw_code_example)
print("Prediction (0 = Secure, 1 = Vulnerable):", pred)
print("Prediction Probability:", proba)

Prediction (0 = Secure, 1 = Vulnerable): 0
Prediction Probability: [0.63 0.37]
