In [None]:
!pip install javalang


Collecting javalang
  Downloading javalang-0.13.0-py3-none-any.whl.metadata (805 bytes)
Downloading javalang-0.13.0-py3-none-any.whl (22 kB)
Installing collected packages: javalang
Successfully installed javalang-0.13.0


In [None]:
import numpy as np
import pandas as pd
import javalang
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.utils import class_weight

# Mount Google Drive before touching the workbook: the path below lives on Drive,
# and on a fresh kernel nothing earlier in the notebook has mounted it.
# Mounting an already-mounted Drive only prints a notice.
from google.colab import drive
drive.mount('/content/drive')

# Load dataset from Excel file
df = pd.read_excel('/content/drive/MyDrive/meg2.xlsx')

# Fail fast, with a readable message, if the columns used further down are absent.
_missing_columns = {'func_before', 'is_vul'} - set(df.columns)
if _missing_columns:
    raise KeyError(f"Dataset is missing required column(s): {sorted(_missing_columns)}")

# Function to preprocess code and handle common issues (like unmatched quotes)
def preprocess_code(code):
    """Normalise quote characters in a raw source snippet before tokenization.

    Every single quote that is not immediately preceded by a backslash is
    rewritten as a double quote. This applies to all such quotes, whether or
    not they are paired; backslash-escaped single quotes are left untouched.

    Args:
        code: Source text. Any value that is not a ``str`` (for example the
            float NaN pandas typically uses for an empty cell) is treated as
            missing.

    Returns:
        str: The rewritten text, or ``""`` when ``code`` is not a string.
    """
    if isinstance(code, str):
        # Negative look-behind keeps already-escaped quotes out of the match
        unescaped_single_quote = re.compile(r"(?<!\\)\'")
        return unescaped_single_quote.sub('"', code)
    return ""

# Tokenize Java code using javalang and ignore errors
def tokenize_java_code(code):
    """Lex a Java snippet into a flat list of token texts.

    Each entry is the token's ``value`` (e.g. ``public``, ``(``, ``"str"``).
    ``str(token)`` is deliberately not used: javalang renders a token as
    ``<Type> "<value>" line <n>, position <m>``, which would leak token-class
    names and source coordinates into whatever vocabulary is built from the
    result.

    Args:
        code: Java source text. Non-string input is treated as having no
            tokens.

    Returns:
        list[str]: Token values in source order; ``[]`` when ``code`` is not a
        string, contains no tokens, or the lexer raises ``LexerError``.
    """
    if not isinstance(code, str):
        return []
    try:
        # ignore_errors=True asks javalang to tolerate lexing problems where it
        # can; the generator is consumed inside the try so a LexerError raised
        # mid-stream is still caught below.
        return [
            token.value
            for token in javalang.tokenizer.tokenize(code, ignore_errors=True)
        ]
    except javalang.tokenizer.LexerError as e:
        # Print the error and return an empty list for this row
        print(f"LexerError: {e} for code: {code}")
        return []

# Clean the raw snippets in `func_before`, then lex the cleaned text into token lists
df['func_before_cleaned'] = df['func_before'].map(preprocess_code)
df['tokenized_func_before'] = df['func_before_cleaned'].map(tokenize_java_code)

# Keep only the rows that produced at least one token
has_tokens = df['tokenized_func_before'].map(len) > 0
df = df[has_tokens]

# Re-join each token list into one space-separated string for the Keras Tokenizer
X = df['tokenized_func_before'].map(' '.join)

# Binary labels: 1 when `is_vul` reads as "TRUE" (case/whitespace-insensitive), else 0
y = df['is_vul'].map(lambda label: int(str(label).strip().upper() == 'TRUE'))

# Calculate 'balanced' class weights over the entire dataset
classes = np.unique(y)
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=classes,
    y=y
)

# Map every label that is actually present to its weight. Pairing label and
# weight (instead of indexing [0] / [1]) avoids an IndexError -- or a weight
# attached to the wrong label -- when only one class is present in `y`.
class_weight_dict = {
    int(label): float(weight) for label, weight in zip(classes, class_weights)
}

# Fixed sequence length fed to the network; adjust based on typical code length
maxlen = 200

# Fit a Keras word-level Tokenizer on the space-joined token strings and encode
# every sample as a list of integer word ids.
# NOTE(review): the Tokenizer is created with its defaults, which lowercase the
# text and filter out punctuation characters -- confirm that dropping operators
# and brackets is intended for source code.
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(X)
X_seq = tokenizer.texts_to_sequences(X)

# +1 because word ids start at 1; id 0 is reserved and is what padding uses
vocab_size = 1 + len(tokenizer.word_index)

# Right-pad short sequences with zeros; longer ones are cut to `maxlen`
# (pad_sequences truncates from the front unless told otherwise)
X_pad = pad_sequences(X_seq, maxlen=maxlen, padding='post')

# Seed Python, NumPy and TensorFlow so repeated runs of this cell train the same model
tf.keras.utils.set_random_seed(42)

# Build and train the model on the full dataset
model = tf.keras.Sequential([
    # mask_zero=True: id 0 is the padding value, and the sequences are padded at
    # the end, so without a mask the LSTM's final state would be computed after
    # stepping through the padding instead of stopping at the last real token.
    # `input_length` is intentionally not passed: it is deprecated in Keras 3
    # and the layer does not need it.
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=128, mask_zero=True),
    tf.keras.layers.LSTM(128, return_sequences=False),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid')  # Binary classification
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# NOTE(review): there is no validation split, so the accuracy reported during
# training is measured on the training data only.
model.fit(X_pad, y, epochs=5, batch_size=64, class_weight=class_weight_dict)

# Save the model (HDF5 file, selected by the .h5 extension)
model.save('vul_detection_model.h5')

# Function to predict if code is vulnerable
def predict_vulnerability(code_snippet, threshold=0.5):
    """Classify a Java snippet with the trained model.

    The snippet is run through ``preprocess_code``, ``tokenize_java_code``, the
    fitted ``tokenizer`` and ``pad_sequences`` (post-padding to ``maxlen``)
    before being handed to ``model``.

    Args:
        code_snippet: Java source text to score.
        threshold: Score above which the snippet is reported as vulnerable.
            Defaults to 0.5, the cut-off that was previously hard-coded.

    Returns:
        str: ``'Vulnerable'`` when the model output is greater than
        ``threshold``, otherwise ``'Not Vulnerable'``.

    Side effects:
        Prints the raw model output.
    """
    cleaned = preprocess_code(code_snippet)  # Preprocess the user input
    token_text = ' '.join(tokenize_java_code(cleaned))  # Tokenize input code
    encoded = tokenizer.texts_to_sequences([token_text])
    padded = pad_sequences(encoded, padding='post', maxlen=maxlen)
    # One input row, one output unit -> the score sits at [0][0]
    prediction = model.predict(padded)[0][0]
    print(prediction)
    return 'Vulnerable' if prediction > threshold else 'Not Vulnerable'

# Interactive check: read one snippet from standard input, classify it, and
# print the verdict. input() blocks until the user responds, so an unattended
# "Run All" stops here.
user_code = input("Enter a Java code snippet: ")
result = predict_vulnerability(user_code)
print(f'The entered code is: {result}')


Epoch 1/5




656/656 ━━━━━━━━━━━━━━━━━━━━ 16s 21ms/step - accuracy: 0.5455 - loss: 0.6794
Epoch 2/5
656/656 ━━━━━━━━━━━━━━━━━━━━ 20s 20ms/step - accuracy: 0.6770 - loss: 0.5973
Epoch 3/5
656/656 ━━━━━━━━━━━━━━━━━━━━ 20s 20ms/step - accuracy: 0.6020 - loss: 0.6177
Epoch 4/5
656/656 ━━━━━━━━━━━━━━━━━━━━ 20s 20ms/step - accuracy: 0.6223 - loss: 0.6088
Epoch 5/5
656/656 ━━━━━━━━━━━━━━━━━━━━ 20s 20ms/step - accuracy: 0.8774 - loss: 0.4589




Enter a Java code snippet: public void saveAsAuthor(String comment, boolean minorEdit) throws XWikiException     {         XWikiContext xcontext = getXWikiContext();          getAuthors()             .setOriginalMetadataAuthor(getCurrentUserReferenceResolver().resolve(CurrentUserReference.INSTANCE));         DocumentReference author = getEffectiveAuthorReference();         if (hasAccess(Right.EDIT, author)) {             DocumentReference currentUser = xcontext.getUserReference();             try {                 xcontext.setUserReference(author);                  saveDocument(comment, minorEdit);             } finally {                 xcontext.setUserReference(currentUser);             }         } else {             java.lang.Object[] args = { author, xcontext.getDoc(), getFullName() };             throw new XWikiException(XWikiException.MODULE_XWIKI_ACCESS, XWikiException.ERROR_XWIKI_ACCESS_DENIED,                 "Access denied; user {0}, acting through script in document {1} cann

In [None]:
# NOTE(review): this re-defines predict_vulnerability, which is already defined
# in the training cell earlier in this notebook; once this cell runs, this
# later definition is the one in effect.
def predict_vulnerability(code_snippet, threshold=0.5):
    """Classify a Java snippet with the trained model.

    The snippet is run through ``preprocess_code``, ``tokenize_java_code``, the
    fitted ``tokenizer`` and ``pad_sequences`` (post-padding to ``maxlen``)
    before being handed to ``model``.

    Args:
        code_snippet: Java source text to score.
        threshold: Score above which the snippet is reported as vulnerable.
            Defaults to 0.5, the cut-off that was previously hard-coded.

    Returns:
        str: ``'Vulnerable'`` when the model output is greater than
        ``threshold``, otherwise ``'Not Vulnerable'``.

    Side effects:
        Prints the raw model output.
    """
    cleaned = preprocess_code(code_snippet)  # Preprocess the user input
    token_text = ' '.join(tokenize_java_code(cleaned))  # Tokenize input code
    encoded = tokenizer.texts_to_sequences([token_text])
    padded = pad_sequences(encoded, padding='post', maxlen=maxlen)
    # One input row, one output unit -> the score sits at [0][0]
    prediction = model.predict(padded)[0][0]
    print(prediction)
    return 'Vulnerable' if prediction > threshold else 'Not Vulnerable'

# Interactive check: read one snippet from standard input, classify it, and
# print the verdict. input() blocks until the user responds, so an unattended
# "Run All" stops here.
user_code = input("Enter a Java code snippet: ")
result = predict_vulnerability(user_code)
print(f'The entered code is: {result}')

Enter a Java code snippet: public void saveAsAuthor(String comment, boolean minorEdit) throws XWikiException     {         XWikiContext xcontext = getXWikiContext();          getAuthors()             .setOriginalMetadataAuthor(getCurrentUserReferenceResolver().resolve(CurrentUserReference.INSTANCE));         DocumentReference author = getEffectiveAuthorReference();         if (hasAccess(Right.EDIT, author)) {             DocumentReference currentUser = xcontext.getUserReference();             try {                 xcontext.setUserReference(author);                  saveDocument(comment, minorEdit);             } finally {                 xcontext.setUserReference(currentUser);             }         } else {             java.lang.Object[] args = { author, xcontext.getDoc(), getFullName() };             throw new XWikiException(XWikiException.MODULE_XWIKI_ACCESS, XWikiException.ERROR_XWIKI_ACCESS_DENIED,                 "Access denied; user {0}, acting through script in document {1} cann

In [None]:
# NOTE(review): the training cell earlier in this notebook reads its dataset
# from /content/drive/..., so Drive has to be mounted before that cell runs on
# a fresh kernel -- consider moving this cell to the top of the notebook.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
