In [9]:
import gensim
import numpy as np
import pandas as pd
import tensorflow as tf
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Step 1 Load the Dataset

In [11]:
# Load the CSV and pull out the `source_code` column (model input) and the
# `vulnerable` column (prediction target).
data = pd.read_csv("Security_Vulnerabilities.csv")
texts = data["source_code"].values

# Map the raw target values onto consecutive integer class ids; the fitted
# encoder is kept so the ids can be mapped back later via `label_encoder.classes_`.
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(data["vulnerable"].values)

# Step 2 Tokenize the dataset

In [12]:
# Build the word -> id vocabulary from the corpus. Only ids below `num_words`
# are emitted when encoding; any other word is replaced by the "<OOV>" id.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=5000)
tokenizer.fit_on_texts(texts)

# Encode every text as a list of ids, then force a fixed length of 100:
# shorter sequences are zero-padded at the end, longer ones are truncated
# (from the front, which is the pad_sequences default).
sequences = tokenizer.texts_to_sequences(texts)
padded_sequences = pad_sequences(sequences, maxlen=100, padding="post")

# Step 3 Word2Vec Embedding

In [13]:
word2vec_model = Word2Vec([text.split() for text in texts], vector_size=100,
window=5, min_count=1)
embedding_matrix = word2vec_model.wv.vectors
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 100


# Step 4 Build the Deep Neural Networks

In [16]:
print(embedding_matrix.shape)  # This should print (vocab_size, embedding_dim)


(5169, 100)


In [20]:
model = Sequential()
model.add(Embedding(input_dim= 5169,
                    output_dim= 100,  
                    weights=[embedding_matrix],
                    input_length=100,
                    trainable=False))

In [21]:
# Stack the classifier head on top of the embedding: an LSTM that returns only
# its final state, a ReLU hidden layer, and a single sigmoid unit for binary
# classification (vulnerable or not).
head_layers = (
    LSTM(128, return_sequences=False),
    Dense(64, activation="relu"),
    Dense(1, activation="sigmoid"),
)
for head_layer in head_layers:
    model.add(head_layer)

In [22]:
# Binary cross-entropy matches the single sigmoid output unit; accuracy is
# tracked as the reported metric.
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

# Step 5 Train the model 

In [23]:
# Hold out 20% of the samples; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences, labels, test_size=0.2, random_state=42
)

# Fit for 10 epochs. Note that the held-out split doubles as validation data,
# so the per-epoch validation metrics are computed on the test samples.
model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=10,
    batch_size=32,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x160ca2ba250>

# Step 6 Evaluate the model

In [24]:
# Score the trained model on the held-out split.
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

# Example of predicting vulnerability
new_code_sample = "int main() { int a = 0; while(a != 10) { a++; } return 0; }"
new_sequence = tokenizer.texts_to_sequences([new_code_sample])
# Inference input must be shaped exactly like the training input: same length
# and the same post-padding. (pad_sequences pre-pads by default, which would
# put the tokens at the opposite end of the sequence from what the model saw.)
new_padded_sequence = pad_sequences(new_sequence, padding='post',
                                    maxlen=padded_sequences.shape[1])
prediction = model.predict(new_padded_sequence)
# NOTE(review): assumes encoded class 1 (label_encoder.classes_[1]) is the
# "vulnerable" class - confirm against the dataset's label values.
vulnerable = prediction[0][0] > 0.5 # If > 0.5, we classify it as vulnerable
print(f"Vulnerability Prediction: {'Yes' if vulnerable else 'No'}")

Test Accuracy: 51.64%
Vulnerability Prediction: No
