In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!pip install langchain
!pip install -U langchain-community
#!pip install bitsandbytes
!pip install --upgrade trl
# !pip install transformers==4.35.0
!pip install accelerate
!pip install -i https://test.pypi.org/simple/ bitsandbytes
!pip install --upgrade bitsandbytes
!pip install --upgrade accelerate
!pip install peft==0.10.0
!pip install transformers==4.37.2
!pip install chromadb

Collecting langchain-community
  Downloading langchain_community-0.3.19-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain-core<1.0.0,>=0.3.41 (from langchain-community)
  Downloading langchain_core-0.3.43-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain<1.0.0,>=0.3.20 (from langchain-community)
  Downloading langchain-0.3.20-py3-none-any.whl.metadata (7.7 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.8.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-

In [None]:
import torch
import chromadb
import sys
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document

# Cập nhật đường dẫn
VECTOR_DB_PATH = "/content/drive/MyDrive/NCKH/vectorDB"
DATASET_PATH = "/content/drive/MyDrive/NCKH/Datasets/XSS/small_XSS_Dataset.txt"
MODEL_PATH = "/content/drive/MyDrive/NCKH/Inference/models/pretrain_model_xss"

# Load mô hình embedding để nhúng payloads
embedding = HuggingFaceEmbeddings(model_name="/content/drive/MyDrive/NCKH/all-MiniLM-L6-v2")

# Khởi tạo ChromaDB
vector_db = Chroma(persist_directory=VECTOR_DB_PATH, embedding_function=embedding)

# Đọc dữ liệu từ file payloads
with open(DATASET_PATH, "r", encoding="utf-8") as f:
    xss_payloads = f.readlines()

# Kiểm tra nếu payloads không rỗng
if not xss_payloads:
    print("Lỗi: File payloads rỗng!")
    exit()

# Chuyển đổi từng payload thành Document và lưu vào Vector Database
documents = [Document(page_content=payload.strip(), metadata={"type": "XSS"}) for payload in xss_payloads]
vector_db.add_documents(documents)
vector_db.persist()
print(f"Đã lưu {len(documents)} payloads vào Vector Database tại {VECTOR_DB_PATH}!")

# Kết nối ChromaDB với PersistentClient
chroma_client = chromadb.PersistentClient(path=VECTOR_DB_PATH)
collection = chroma_client.get_or_create_collection(name="payloads")

# Lưu payloads vào collection nếu chưa có
if len(collection.get()["documents"]) == 0:
    ids = [str(i) for i in range(len(xss_payloads))]
    collection.add(ids=ids, documents=[p.strip() for p in xss_payloads])
    print(f"Đã lưu payloads vào ChromaDB với {len(ids)} entries!")

# Kiểm tra dữ liệu đã lưu thành công chưa
print(f"Dữ liệu trong ChromaDB: {collection.get()}")

# Load GPT-2 Model & Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt2_model_pretrain = GPT2LMHeadModel.from_pretrained(MODEL_PATH)

# Kiểm tra GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
gpt2_model_pretrain.to(device).eval()

def retrieve_payloads():
    """Lấy toàn bộ payload từ ChromaDB"""
    results = collection.get()
    return results["documents"] if "documents" in results else []

def generate_payload():
    """Dùng GPT-2 để tạo payload mới từ toàn bộ dữ liệu trong VectorDB"""
    retrieved_payloads = retrieve_payloads()

    if not retrieved_payloads:
        print("Không tìm thấy payload nào trong VectorDB!")
        return None

    input_text = " ".join(retrieved_payloads)[:1024]  # Giới hạn độ dài input
    tokens = tokenizer.encode(input_text, return_tensors="pt").to(device)

    # Sinh output từ mô hình pretrain
    with torch.no_grad():
        output_tokens = gpt2_model_pretrain.generate(
            tokens,
            max_length=100,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=True,
            top_k=50,
            top_p=0.95
        )

    return tokenizer.decode(output_tokens[0], skip_special_tokens=True)

# Test sinh payload mới
new_payload = generate_payload()
print(f"Payload mới từ VectorDB + GPT-2 Pretrain: {new_payload}")

Đã lưu 234 payloads vào Vector Database tại /content/drive/MyDrive/NCKH/vectorDB!
Dữ liệu trong ChromaDB: {'ids': ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99', '100', '101', '102', '103', '104', '105', '106', '107', '108', '109', '110', '111', '112', '113', '114', '115', '116', '117', '118', '119', '120', '121', '122', '123', '124', '125', '126', '127', '128', '129', '130', '131', '132', '133', '134', '135', '136', '137', '138', '139', '140', '141', '