In [5]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout,Embedding
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.stats import mode
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
df = pd.read_csv(r"C:\Users\moham\OneDrive - The British University in Egypt\Desktop\Year 3\Semester1\GP\datasets\clean_sql_dataset.csv")

In [3]:

# Tokenize the text data
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(df['Query'])

# Convert text to sequences
X = tokenizer.texts_to_sequences(df['Query'])

# Pad sequences to ensure uniform length
X = pad_sequences(X, maxlen=100)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, df['Label'], test_size=0.2, random_state=42)

In [6]:


# Define the LSTM model
model_lstm = Sequential()
model_lstm.add(Embedding(input_dim=10000, output_dim=128, input_length=100))
model_lstm.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model_lstm.add(Dense(1, activation='sigmoid'))

# Compile the model
model_lstm.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Print the model summary
model_lstm.summary()



In [7]:
# Train the model
history = model_lstm.fit(X_train, y_train, epochs=5, batch_size=64, validation_split=0.2)

Epoch 1/5
[1m1484/1484[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 83ms/step - accuracy: 0.9492 - loss: 0.1577 - val_accuracy: 0.9803 - val_loss: 0.0691
Epoch 2/5
[1m1484/1484[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m134s[0m 90ms/step - accuracy: 0.9819 - loss: 0.0613 - val_accuracy: 0.9807 - val_loss: 0.0647
Epoch 3/5
[1m1484/1484[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m134s[0m 90ms/step - accuracy: 0.9839 - loss: 0.0531 - val_accuracy: 0.9812 - val_loss: 0.0689
Epoch 4/5
[1m1484/1484[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 93ms/step - accuracy: 0.9879 - loss: 0.0373 - val_accuracy: 0.9780 - val_loss: 0.0779
Epoch 5/5
[1m1484/1484[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 94ms/step - accuracy: 0.9893 - loss: 0.0320 - val_accuracy: 0.9803 - val_loss: 0.0765


In [8]:
# Evaluate the model on the test data
loss, accuracy = model_lstm.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy:.4f}')

# Predict on the test data
y_pred_lstm = model_lstm.predict(X_test)
y_pred_lstm = (y_pred_lstm > 0.5).astype(int)

# Compute evaluation metrics
accuracy_lstm = accuracy_score(y_test, y_pred_lstm)
precision_lstm = precision_score(y_test, y_pred_lstm)
recall_lstm = recall_score(y_test, y_pred_lstm)
f1_lstm = f1_score(y_test, y_pred_lstm)

# Print results
print(f"\n📌 LSTM Model Evaluation:")
print(f"✅ LSTM Accuracy: {accuracy_lstm:.4f}")
print(f"✅ LSTM Precision: {precision_lstm:.4f}")
print(f"✅ LSTM Recall: {recall_lstm:.4f}")
print(f"✅ LSTM F1-score: {f1_lstm:.4f}")

[1m928/928[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 14ms/step - accuracy: 0.9797 - loss: 0.0779
Test Accuracy: 0.9798
[1m928/928[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 11ms/step

📌 LSTM Model Evaluation:
✅ LSTM Accuracy: 0.9798
✅ LSTM Precision: 0.9889
✅ LSTM Recall: 0.9724
✅ LSTM F1-score: 0.9805


In [9]:
import joblib

# Save the tokenizer
joblib.dump(tokenizer, 'tokenizer.pkl')
print("✅ Tokenizer saved successfully!")

✅ Tokenizer saved successfully!


In [10]:
# Save the LSTM model
model_lstm.save('lstm_model.h5')
print("✅ LSTM model saved successfully!")



✅ LSTM model saved successfully!


In [12]:
from tensorflow.keras.models import load_model

def predict_sql_injection_lstm(query):
    """
    Loads the saved LSTM model and tokenizer to predict if a query is malicious.
    """
    # Load the saved tokenizer
    tokenizer = joblib.load('tokenizer.pkl')
    model_lstm = load_model('lstm_model.h5')

    query_sequence = tokenizer.texts_to_sequences([query])
    query_padded = pad_sequences(query_sequence, maxlen=100)

    # Predict (0 = Benign, 1 = Malicious)
    prediction = model_lstm.predict(query_padded)[0][0]
    
    # Return the classification result
    return "Malicious SQL Injection" if prediction > 0.5 else "Benign Query"





[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 420ms/step
Prediction: Malicious SQL Injection


In [15]:
query_lstm = "45017c or DROP table users"
result_lstm = predict_sql_injection_lstm(query_lstm)
print(f"Prediction: {result_lstm}")



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 398ms/step
Prediction: Malicious SQL Injection


In [16]:
import os

# Check if the tokenizer is saved
if os.path.exists("tokenizer.pkl"):
    print("✅ Tokenizer is saved.")

# Check if the LSTM model is saved
if os.path.exists("lstm_model.h5"):
    print("✅ LSTM model is saved.")

✅ Tokenizer is saved.
✅ LSTM model is saved.


In [18]:
from tensorflow.keras.models import load_model

# Load the LSTM model
model_lstm = load_model('lstm_model.h5')

# Load the tokenizer
tokenizer = joblib.load('tokenizer.pkl')



In [19]:
# Preprocess for RF and SVM
X_test_tfidf = vectorizer.transform(df['Query'])

# Preprocess for LSTM
X_test_sequences = tokenizer.texts_to_sequences(df['Query'])

X_test_padded = pad_sequences(X_test_sequences, maxlen=100)

In [21]:
# RF predictions (probabilities)
y_pred_rf = model_rf.predict_proba(X_test_tfidf)[:, 1]

# SVM predictions (probabilities)
y_pred_svm = model_svm.predict_proba(X_test_tfidf)[:, 1]

# LSTM predictions (probabilities)
y_pred_lstm = model_lstm.predict(X_test_padded).flatten()

[1m4636/4636[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m173s[0m 37ms/step


In [22]:
# Combine predictions into a new dataset
meta_features = np.column_stack((y_pred_rf, y_pred_svm, y_pred_lstm))

In [25]:
from sklearn.linear_model import LogisticRegression
# Train the meta-model
meta_model = LogisticRegression()
meta_model.fit(meta_features, df['Label'])  # df['Label'] is the true labels

# Make final predictions
final_predictions = meta_model.predict(meta_features)

In [26]:
# Compute evaluation metrics
accuracy = accuracy_score(df['Label'], final_predictions)
precision = precision_score(df['Label'], final_predictions)
recall = recall_score(df['Label'], final_predictions)
f1 = f1_score(df['Label'], final_predictions)

print(f"Ensemble Model Evaluation:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

Ensemble Model Evaluation:
Accuracy: 0.9954
Precision: 0.9979
Recall: 0.9933
F1-score: 0.9956


In [27]:
import joblib

# Save the meta-model (e.g., Logistic Regression)
joblib.dump(meta_model, 'ensemble_model.pkl')

# Save the tokenizer and vectorizer
joblib.dump(tokenizer, 'tokenizer.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')

print("✅ Ensemble model, tokenizer, and vectorizer saved successfully!")

✅ Ensemble model, tokenizer, and vectorizer saved successfully!
