In [2]:
import numpy as np
from sentence_transformers import SentenceTransformer




In [3]:
# Sentence-embedding checkpoint used to vectorise the payloads in this notebook.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(EMBEDDING_MODEL_NAME)



In [None]:
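# Optional alternative to the inline payload list in the next cell: load the
# payloads from a CSV file instead. Uncomment this cell to use it. Later cells
# read the keys payload, attack_type, target_waf, source and timestamp from
# each record, so the CSV needs those columns.
# import pandas as pd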

# # Load CSV file
# csv_path = "sqli_data.csv"  # Change to your actual file path
# df = pd.read_csv(csv_path)

# # Ensure correct column names
# print(df.head())  # Check if column names match expectations

# # Convert DataFrame to a list of dictionaries
# sqli_payloads = df.to_dict(orient="records")  # Each row as a dictionary
# print(f"Loaded {len(sqli_payloads)} SQLi payloads.")

In [13]:
# Hand-curated SQL-injection samples. Each record carries the raw payload plus
# descriptive metadata (attack type, targeted WAF, collection source, date).
sqli_payloads = [
    dict(
        payload="' OR 1=1 --",
        attack_type="Authentication Bypass",
        target_waf="ModSecurity",
        source="Attack Grammar",
        timestamp="2025-03-04",
    ),
    dict(
        payload="1 UNION SELECT username, password FROM users --",
        attack_type="Data Extraction",
        target_waf="Cloudflare",
        source="Online Source",
        timestamp="2025-03-03",
    ),
    dict(
        payload="' UNION SELECT email, credit_card FROM payments --",
        attack_type="Data Extraction",
        target_waf="AWS WAF",
        source="Online Source",
        timestamp="2025-03-02",
    ),
    dict(
        payload="SELECT * FROM users WHERE id='1' --",
        attack_type="Enumeration",
        target_waf="Imperva",
        source="Manual",
        timestamp="2025-03-01",
    ),
    dict(
        payload="DROP TABLE users; --",
        attack_type="Destructive Attack",
        target_waf="Akamai",
        source="Exploit DB",
        timestamp="2025-02-28",
    ),
]

# Only the raw payload strings are embedded; the metadata stays in the records.
payload_texts = [record["payload"] for record in sqli_payloads]

In [14]:
# Encode every payload string, then cast the resulting array to float32
# before it is handed to the FAISS index in the cells below.
encoded_payloads = model.encode(payload_texts, convert_to_numpy=True)
embeddings = encoded_payloads.astype(np.float32)

# Report how many vectors were produced and their dimensionality.
print(f"Generated {embeddings.shape[0]} embeddings with dimension {embeddings.shape[1]}")

Generated 5 embeddings with dimension 384


In [1]:
import faiss

In [15]:
# Build the FAISS IVF index; an HNSW graph serves as its coarse quantizer.
# The number of IVF clusters is the integer square root of the payload count.
num_clusters = int(np.sqrt(len(sqli_payloads)))
embedding_dim = embeddings.shape[1]

# 32 is the second constructor argument of IndexHNSWFlat
# (presumably the HNSW link count "M" - confirm against the FAISS API docs).
quantizer = faiss.IndexHNSWFlat(embedding_dim, 32)
index = faiss.IndexIVFFlat(
    quantizer,
    embedding_dim,
    num_clusters,
    faiss.METRIC_L2,  # distances reported by search are L2-based
)

In [16]:
# Train the FAISS index on the payload embeddings before adding any vectors.
index.train(embeddings)

# Assign metadata IDs and store embeddings.
# FAISS ids are 64-bit integers. np.arange's default integer width is
# platform-dependent, so request int64 explicitly rather than relying on the
# wrapper to accept or convert a narrower dtype.
ids = np.arange(len(sqli_payloads), dtype=np.int64)  # Generate unique IDs
index.add_with_ids(embeddings, ids)

# Store metadata in a dictionary (key = FAISS ID).
# Keys are taken from the same id array, converted to plain Python ints;
# lookups with NumPy integers (as returned by index.search) still match.
metadata_dict = dict(zip(ids.tolist(), sqli_payloads))

print(f"FAISS index contains {index.ntotal} vectors")

FAISS index contains 5 vectors


In [17]:
# Save index
# Writes the FAISS index to "sqli_faiss.index" (a path relative to the current
# working directory). Only `index` is passed to write_index, so metadata_dict
# is not part of the saved file.
faiss.write_index(index, "sqli_faiss.index")

In [18]:
# Load index
# Reads "sqli_faiss.index" back and rebinds `index` to the loaded object, so
# the search below runs against the copy read from disk. metadata_dict is not
# reloaded here; it still comes from the in-memory kernel state.
index = faiss.read_index("sqli_faiss.index")

In [19]:
# Embed a query payload the same way as the stored payloads (float32 array).
new_payload = ["1 UNION SELECT username, password FROM users --"]
new_embedding = model.encode(new_payload, convert_to_numpy=True).astype(np.float32)

# Search in FAISS
k = 3  # Number of nearest neighbors
distances, indices = index.search(new_embedding, k)

# FAISS pads the result with id -1 when it finds fewer than k neighbours
# (for an IVF index this can happen when the probed clusters hold fewer than
# k vectors). Drop those slots so metadata_dict is never queried with -1.
hits = [
    (int(faiss_id), float(distance))
    for faiss_id, distance in zip(indices[0], distances[0])
    if faiss_id >= 0
]

print("\nMost similar payloads with metadata:")
if not hits:
    print("   (no neighbours found)")
for rank, (faiss_id, distance) in enumerate(hits, start=1):
    metadata = metadata_dict[faiss_id]  # Retrieve metadata by FAISS ID

    print(f"{rank}. Payload: {metadata['payload']}")
    print(f"   Attack Type: {metadata['attack_type']}")
    print(f"   Target WAF: {metadata['target_waf']}")
    print(f"   Collection Source: {metadata['source']}")
    print(f"   Timestamp: {metadata['timestamp']}")
    print(f"   Distance: {distance:.4f}\n")


Most similar payloads with metadata:
1. Payload: 1 UNION SELECT username, password FROM users --
   Attack Type: Data Extraction
   Target WAF: Cloudflare
   Collection Source: Online Source
   Timestamp: 2025-03-03
   Distance: 0.0000

2. Payload: SELECT * FROM users WHERE id='1' --
   Attack Type: Enumeration
   Target WAF: Imperva
   Collection Source: Manual
   Timestamp: 2025-03-01
   Distance: 0.9033

3. Payload: ' UNION SELECT email, credit_card FROM payments --
   Attack Type: Data Extraction
   Target WAF: AWS WAF
   Collection Source: Online Source
   Timestamp: 2025-03-02
   Distance: 1.0425

