<a href="https://colab.research.google.com/github/owlmt/QuantumCybersecurityAnalytics/blob/main/GWU_CyberSecurity_Analytics_Course_8414_Unsupervised_Learning_II.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#  Step 1: Install Dependencies
!pip install scikit-learn pandas numpy clang tqdm

#  Step 2: Download and Unzip Juliet Test Suite
import os
import zipfile
import urllib.request

# Define the dataset URL
dataset_url = "https://samate.nist.gov/SARD/downloads/test-suites/2017-10-01-juliet-test-suite-for-c-cplusplus-v1-3.zip"
dataset_zip = "juliet_dataset.zip"
dataset_folder = "juliet_dataset"

# Download the dataset if not already downloaded.
# The archive is fetched under a temporary name and renamed only once the
# transfer has finished, so an interrupted download is never mistaken for a
# complete archive by the os.path.exists() check on a later run.
if not os.path.exists(dataset_zip):
    print("Downloading Juliet Test Suite...")
    partial_zip = dataset_zip + ".part"
    urllib.request.urlretrieve(dataset_url, partial_zip)
    os.replace(partial_zip, dataset_zip)

# Unzip the dataset.
# Extraction goes to a staging folder first for the same reason: a
# half-extracted tree must not satisfy the "already extracted" check.
if not os.path.exists(dataset_folder):
    print("Extracting dataset...")
    partial_folder = dataset_folder + ".partial"
    with zipfile.ZipFile(dataset_zip, 'r') as zip_ref:
        zip_ref.extractall(partial_folder)
    os.replace(partial_folder, dataset_folder)

print("Dataset is ready!")

#  Step 3: Load Dataset
def load_dataset(dataset_path):
    """
    Collect the text of every C/C++ source file found under a directory.

    The directory tree is walked recursively; only files whose names end in
    ".c" or ".cpp" are read. Files are decoded as UTF-8 and undecodable
    bytes are dropped rather than raising.

    Parameters:
        dataset_path (str): Root folder to search (e.g. the CWE-121 folder
            inside the Juliet Test Suite).

    Returns:
        data (list): Source text of each matching file.
        filenames (list): Base filename of each entry in ``data``, in the
            same order.
    """
    sources = []
    names = []

    for folder, _subdirs, entries in os.walk(dataset_path):
        for entry in entries:
            # Skip anything that is not a C or C++ translation unit
            if not entry.endswith((".c", ".cpp")):
                continue

            full_path = os.path.join(folder, entry)
            with open(full_path, 'r', encoding='utf-8', errors='ignore') as handle:
                sources.append(handle.read())
            names.append(entry)

    return sources, names

# Define the dataset path for CWE-121 Stack-Based Buffer Overflow
dataset_path = os.path.join(dataset_folder, "C/testcases/CWE121_Stack_Based_Buffer_Overflow")

# Load the dataset
data, filenames = load_dataset(dataset_path)

# A missing or misplaced folder yields an empty dataset without any error;
# stop here with a clear message instead of failing later during scaling.
if not data:
    raise FileNotFoundError(
        f"No .c/.cpp files found under {dataset_path!r}; "
        "check that the Juliet Test Suite was downloaded and extracted correctly."
    )

# Print dataset stats
print(f"Total code samples: {len(data)}")

#  Step 4: Feature Extraction
import re
import pandas as pd
from tqdm import tqdm

# List of unsafe C functions
UNSAFE_FUNCTIONS = ["gets", "strcpy", "strcat", "sprintf", "vsprintf", "scanf", "sscanf", "fscanf", "strncpy", "memcpy", "memmove"]

# C/C++ keywords that are commonly followed by "(" but are not function calls
_NON_CALL_KEYWORDS = frozenset({"if", "for", "while", "switch", "return", "sizeof", "catch"})

# Fixed patterns are compiled once instead of on every call
_CALL_PATTERN = re.compile(r'\b(\w+)\s*\(')
_LOOP_PATTERN = re.compile(r'\b(?:for|while|do)\b')
_POINTER_PATTERN = re.compile(r'\*')
_ARRAY_PATTERN = re.compile(r'\[.*?\]')


# Extract features from C/C++ code
def extract_features(code):
    """
    Compute simple regex-based counts describing a C/C++ source string.

    These are textual heuristics, not a parse of the code: matches inside
    comments and string literals are counted as well.

    Parameters:
        code (str): Source text of one C/C++ file.

    Returns:
        dict: ``count_<name>`` for every entry of UNSAFE_FUNCTIONS (whole-word
        occurrences of that name), followed by ``num_function_calls``,
        ``num_loops``, ``num_pointers`` and ``num_array_access``.
    """
    features = {}

    # Count unsafe functions (whole-word matches; the name is escaped so it is
    # always treated as literal text)
    for func in UNSAFE_FUNCTIONS:
        features[f'count_{func}'] = len(re.findall(r'\b' + re.escape(func) + r'\b', code))

    # Count total function calls: identifiers followed by "(", excluding
    # control-flow keywords such as `if (` or `while (`, which are not calls.
    # Function definitions and declarations still match this pattern.
    features['num_function_calls'] = sum(
        1 for name in _CALL_PATTERN.findall(code) if name not in _NON_CALL_KEYWORDS
    )

    # Count loop constructs (each for/while/do keyword counts once)
    features['num_loops'] = len(_LOOP_PATTERN.findall(code))

    # Count pointer usage (every "*" character, so multiplication counts too)
    features['num_pointers'] = len(_POINTER_PATTERN.findall(code))

    # Count array indexing (every "[...]" on a single line, declarations included)
    features['num_array_access'] = len(_ARRAY_PATTERN.findall(code))

    return features

# Build one feature dictionary per source file (tqdm displays progress),
# then assemble them into a DataFrame with one row per file
print("Extracting features...")
feature_vectors = list(map(extract_features, tqdm(data)))
df = pd.DataFrame(feature_vectors)

#  Step 5: Apply DBSCAN for Unsupervised Anomaly Detection
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN

# Standardize every feature column before clustering (important for DBSCAN)
scaler = StandardScaler()
scaler.fit(df)
df_scaled = scaler.transform(df)

# Cluster the standardized feature vectors with DBSCAN
dbscan = DBSCAN(eps=0.5, min_samples=5, metric='euclidean')
dbscan.fit(df_scaled)
clusters = dbscan.labels_

# Attach the cluster label and the originating filename to each row
df["Cluster"] = clusters
df["Filename"] = filenames

#  Step 6: Detect Anomalies (Potential Buffer Overflow)
# DBSCAN labels points that belong to no cluster with -1
is_noise = df["Cluster"].eq(-1)
anomalies = df.loc[is_noise]

# Report the files flagged as anomalies
print("\n🔍 Detected Buffer Overflow Anomalies:")
print(anomalies[["Filename", "Cluster"]])

#  Step 7: Test on a Sample Code
import joblib

import numpy as np

# Save the DBSCAN model
model_filename = "dbscan_buffer_overflow.pkl"
joblib.dump(dbscan, model_filename)

print(f"DBSCAN model saved as {model_filename}")

# Sample vulnerable C code
c_code = r'''
#include <stdio.h>
#include <string.h>

void vulnerableFunction(char *input) {
    char buffer[10]; // Fixed-size buffer
    strcpy(buffer, input); // No bounds checking (unsafe!)
    printf("You entered: %s\n", buffer);
}

int main() {
    char userInput[50];
    printf("Enter input: ");
    gets(userInput); // Unsafe function (deprecated)
    vulnerableFunction(userInput);
    return 0;
}
'''

# Extract features from sample code
sample_features = extract_features(c_code)
df_sample = pd.DataFrame([sample_features])

# Normalize the sample with the scaler fitted on the dataset
df_sample_scaled = scaler.transform(df_sample)

# Predict cluster assignment.
# DBSCAN has no predict(); calling fit_predict() on the single sample would
# refit the model on one point, which can never reach min_samples and is
# therefore always labelled noise (-1). Instead, reuse the fitted clustering:
# the sample joins the cluster of its nearest core sample when that core
# sample lies within eps (Euclidean distance, matching the metric configured
# for the model); otherwise it is noise.
core_points = dbscan.components_
core_labels = dbscan.labels_[dbscan.core_sample_indices_]
sample_cluster = -1
if len(core_points) > 0:
    distances = np.linalg.norm(core_points - df_sample_scaled[0], axis=1)
    nearest = int(np.argmin(distances))
    if distances[nearest] <= dbscan.eps:
        sample_cluster = int(core_labels[nearest])
sample_result = "VULNERABLE" if sample_cluster == -1 else "SAFE"

# Print result
print("\n Prediction for the sample C code:")
print(f"Model Prediction: {sample_result}")


Dataset is ready!
Total code samples: 7889
Extracting features...


100%|██████████| 7889/7889 [00:07<00:00, 1070.26it/s]



🔍 Detected Buffer Overflow Anomalies:
                                               Filename  Cluster
2     CWE121_Stack_Based_Buffer_Overflow__CWE806_cha...       -1
111   CWE121_Stack_Based_Buffer_Overflow__CWE806_cha...       -1
124   CWE121_Stack_Based_Buffer_Overflow__CWE806_cha...       -1
176   CWE121_Stack_Based_Buffer_Overflow__CWE806_cha...       -1
249   CWE121_Stack_Based_Buffer_Overflow__CWE806_cha...       -1
...                                                 ...      ...
7722  CWE121_Stack_Based_Buffer_Overflow__CWE805_str...       -1
7736  CWE121_Stack_Based_Buffer_Overflow__CWE805_wch...       -1
7778  CWE121_Stack_Based_Buffer_Overflow__CWE805_wch...       -1
7825  CWE121_Stack_Based_Buffer_Overflow__CWE805_wch...       -1
7833  CWE121_Stack_Based_Buffer_Overflow__CWE805_wch...       -1

[251 rows x 2 columns]
DBSCAN model saved as dbscan_buffer_overflow.pkl

 Prediction for the sample C code:
Model Prediction: VULNERABLE
