In [1]:
import faiss
import numpy as np
import pandas as pd
import pickle
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline




In [2]:
sbert_model = SentenceTransformer("all-MiniLM-L6-v2")



In [3]:
faiss_index = faiss.read_index("sqli_faiss.index")
print(f"Loaded FAISS index with {faiss_index.ntotal} vectors.")

Loaded FAISS index with 5 vectors.


In [4]:
with open("sqli_metadata.pkl", "rb") as f:
    metadata_dict = pickle.load(f)

In [5]:
def retrieve_similar_payloads(query, k=3):
    """
    Given a SQLi payload query, generate its embedding using SBERT,
    then search the FAISS index for the top-k similar payloads.
    Retrieve the metadata for each from the metadata_dict.
    """
    # Generate the embedding for the query
    query_embedding = sbert_model.encode([query], convert_to_numpy=True).astype(np.float32)
    
    # Search the FAISS index for the top-k nearest neighbors
    distances, indices = faiss_index.search(query_embedding, k)
    
    results = []
    for i in range(k):
        idx = int(indices[0][i])
        # Retrieve metadata using the dictionary (if available)
        record = metadata_dict.get(idx)
        if record is not None:
            results.append({
                "payload": record["payload"],
                "attack_type": record["attack_type"],
                "target_waf": record["target_waf"],
                "source": record["source"],
                "timestamp": record["timestamp"],
                "distance": distances[0][i]
            })
    return results

In [6]:
# import torch
# Load your local LLM model (update the path to your pre-trained model folder)
llm_model_path = "gpt_neo_1m"  # Replace with your local folder path
llm_tokenizer = AutoTokenizer.from_pretrained(llm_model_path)
llm_model = AutoModelForCausalLM.from_pretrained(llm_model_path)
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Create a text-generation pipeline using the local model
llm = pipeline("text-generation", model=llm_model, tokenizer=llm_tokenizer, device=0)

In [7]:
def generate_analysis(query, retrieved_payloads):
    """
    Create a prompt containing the user query and retrieved SQLi payloads,
    then use the local LLM to generate an analysis.
    """
    prompt = f"User query: {query}\n\nHere are similar SQLi payloads retrieved:\n"
    for i, payload in enumerate(retrieved_payloads):
        prompt += (f"{i+1}. Payload: {payload['payload']}\n"
                   f"   - Attack Type: {payload['attack_type']}\n"
                   f"   - Target WAF: {payload['target_waf']}\n"
                   f"   - Source: {payload['source']}\n"
                   f"   - Timestamp: {payload['timestamp']}\n\n")
    prompt += "Explain these SQL injection techniques and suggest mitigation strategies."
    
    # Generate text with the local LLM (adjust max_length and other parameters as needed)
    output = llm(prompt, max_length=512, do_sample=False)
    return output[0]['generated_text']

In [8]:
def rag_sqli_pipeline(user_query, k=3):
    """
    Full Retrieval-Augmented Generation (RAG) pipeline:
      1. Retrieve similar SQLi payloads from FAISS.
      2. Generate analysis using the local LLM.
    """
    # Retrieve similar payloads
    retrieved_payloads = retrieve_similar_payloads(user_query, k)
    
    # Generate analysis using the retrieved payloads
    analysis = generate_analysis(user_query, retrieved_payloads)
    
    # Display the results
    print("\n🔍 Retrieved SQLi Payloads:")
    for payload in retrieved_payloads:
        print(f"- {payload['payload']} (Attack: {payload['attack_type']}, Distance: {payload['distance']:.4f})")
    
    print("\n🤖 LLM Analysis:")
    print(analysis)
    
    return retrieved_payloads, analysis

In [9]:
user_query = "1 UNION SELECT username, password FROM users --"
    
# Run the RAG pipeline
retrieved_payloads, analysis = rag_sqli_pipeline(user_query, k=3)

  attn_output = torch.nn.functional.scaled_dot_product_attention(
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.



🔍 Retrieved SQLi Payloads:
- 1 UNION SELECT username, password FROM users -- (Attack: Data Extraction, Distance: 0.0000)
- SELECT * FROM users WHERE id='1' -- (Attack: Enumeration, Distance: 0.9033)
- ' UNION SELECT email, credit_card FROM payments -- (Attack: Data Extraction, Distance: 1.0425)

🤖 LLM Analysis:
User query: 1 UNION SELECT username, password FROM users --

Here are similar SQLi payloads retrieved:
1. Payload: 1 UNION SELECT username, password FROM users --
   - Attack Type: Data Extraction
   - Target WAF: Cloudflare
   - Source: Online Source
   - Timestamp: 2025-03-03

2. Payload: SELECT * FROM users WHERE id='1' --
   - Attack Type: Enumeration
   - Target WAF: Imperva
   - Source: Manual
   - Timestamp: 2025-03-01

3. Payload: ' UNION SELECT email, credit_card FROM payments --
   - Attack Type: Data Extraction
   - Target WAF: AWS WAF
   - Source: Online Source
   - Timestamp: 2025-03-02

Explain these SQL injection techniques and suggest mitigation strategies.%0b%2