# Import libraries

In [1]:
import getpass
import os
from langchain_openai import OpenAIEmbeddings
from langchain_milvus import Milvus
from dotenv import load_dotenv

# Constant

In [2]:
NUMBER_OF_SAMPLE = 100
FILE_PATH = 'data/filtered_ReposVul_K_samples.jsonl'

# Define env variables

In [3]:

load_dotenv()
os.environ["LANGSMITH_TRACING"]=os.getenv("LANGSMITH_TRACING")
os.environ["LANGSMITH_ENDPOINT"]=os.getenv("LANGSMITH_ENDPOINT")
os.environ["LANGSMITH_API_KEY"]=os.getenv("LANGSMITH_API_KEY")
os.environ["LANGSMITH_PROJECT"]="RAG"
os.environ["OPENAI_API_BASE"]=os.getenv("OPENAI_API_BASE")
os.environ["OPENROUTER_API_KEY"] = os.getenv("OPENROUTER_API_KEY")
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

MODEL_NAME = "mistralai/mistral-small-3.1-24b-instruct:free"
OPENAI_API_BASE = "https://openrouter.ai/api/v1"

# Embedding model

In [4]:
import getpass
import os

from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# Create a Milvus database

In [5]:
DB_NAME = "project_2_v2"
COLLECTION_NAME = "V_dataset"
FILE_PATH = 'data/filtered_ReposVul_V_samples.jsonl'
CHUNK_SIZE = 15000
CHUNK_OVERLAP = 200

In [6]:
from pymilvus import Collection, MilvusException, connections, db, utility

conn = connections.connect(host="127.0.0.1", port=19530)

# Check if the database exists
db_name = DB_NAME
try:
    existing_databases = db.list_database()
    if db_name in existing_databases:
        print(f"Database '{db_name}' already exists.")

        # # Use the database context
        # db.using_database(db_name)

        # # Drop all collections in the database
        # collections = utility.list_collections()
        # for collection_name in collections:
        #     collection = Collection(name=collection_name)
        #     collection.drop()
        #     print(f"Collection '{collection_name}' has been dropped.")

        # db.drop_database(db_name)
        # print(f"Database '{db_name}' has been deleted.")
    else:
        print(f"Database '{db_name}' does not exist.")
        database = db.create_database(db_name)
        print(f"Database '{db_name}' created successfully.")
except MilvusException as e:
    print(f"An error occurred: {e}")

Database 'project_2_v2' already exists.


In [7]:
from langchain_milvus import BM25BuiltInFunction, Milvus

URI = "http://localhost:19530"

vectorstore = Milvus(
    embedding_function=embeddings,
    connection_args={"uri": URI, "token": "root:Milvus", "db_name": DB_NAME},
    index_params={"index_type": "FLAT", "metric_type": "COSINE"},
    consistency_level="Strong",
    drop_old=False,  # set to True if seeking to drop the collection with that name if it exists
    auto_id=True,
    collection_name=COLLECTION_NAME,
)

# Indexing

## Load filtered_ReposVul_V_samples.jsonl

In [8]:
import json

# Đường dẫn đến file .jsonl
file_path = FILE_PATH

samples = []
with open(file_path, 'r', encoding='utf-8') as f:
    for line in f:
        data = json.loads(line)  # Chuyển dòng JSON thành dict
        samples.append(data)

# In ra số lượng mẫu đọc được
print(f"Đã đọc {len(samples)} mẫu.")

# # In thử 5 mẫu đầu tiên để kiểm tra
for i, sample in enumerate(samples[:5], start=1):
    print(f"Sample {i}: {sample}")


Đã đọc 6774 mẫu.
Sample 4: {'index': 14, 'cve_id': 'CVE-2011-0991', 'cwe_id': ['CWE-399'], 'cve_language': 'C', 'commit_message': "Implement a reference queue API.\n\n\t* gc.c: A reference queue allows one to queue\n\tcallbcks for when objects are collected.\n\tIt allows for safe cleanup of objects that can\n\tonly be done when it is effectively collected.\n\tThe major difference with regular finalization\n\tis that the collector makes sure the object\n\twas collected - and can't be resurrected.\n\n\t* gc-internal.h: Export entrypoints for the\n\tnew API.", 'code': "/*\n * metadata/gc-internal.h: Internal GC interface\n *\n * Author: Paolo Molaro <lupus@ximian.com>\n *\n * (C) 2002 Ximian, Inc.\n */\n\n#ifndef __MONO_METADATA_GC_INTERNAL_H__\n#define __MONO_METADATA_GC_INTERNAL_H__\n\n#include <glib.h>\n#include <mono/metadata/object-internals.h>\n#include <mono/metadata/threads-types.h>\n#include <mono/utils/gc_wrapper.h>\n\n#define mono_domain_finalizers_lock(domain) EnterCriticalSec

In [9]:
print(samples[0].keys())


dict_keys(['index', 'cve_id', 'cwe_id', 'cve_language', 'commit_message', 'code', 'code_before'])


## Store documents to Collection

In [10]:
import re
import json
from uuid import uuid4
from langchain_core.documents import Document
# Cần import thêm Text Splitter từ LangChain
from langchain.text_splitter import RecursiveCharacterTextSplitter # Hoặc CharacterTextSplitter, ...

# --- Giả định vectorstore và samples đã được khởi tạo ---
# (Sử dụng lại ví dụ samples)

# --- Khởi tạo Text Splitter ---
# RecursiveCharacterTextSplitter thường tốt cho code vì nó cố gắng tách tại các dấu xuống dòng, khoảng trắng, v.v.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len, # Sử dụng hàm len chuẩn
    is_separator_regex=False, # Tách theo ký tự, không phải regex phức tạp
)

In [11]:
from math import ceil

documents_to_add = []
ids_to_add = []

for sample_index, sample in enumerate(samples[:NUMBER_OF_SAMPLE]): # Lấy 5 sample đầu ví dụ
    # Lấy code gốc, không làm sạch comment
    # print(f"\rĐang xử lý sample {sample_index + 1}/{len(samples)}...")
    original_code = sample.get('code', '')

    # Lấy metadata gốc một lần
    original_metadata = {}
    metadata_keys = ['index', 'cve_id', 'cwe_id', 'cve_language']
    for key in metadata_keys:
        if key in sample:
            value = sample[key]
            if key == 'cwe_id':
                if isinstance(value, list):
                    original_metadata[key] = ",".join(map(str, value))
                elif value is not None:
                    original_metadata[key] = str(value)
                else:
                    original_metadata[key] = ""
            else:
                 original_metadata[key] = value # Tạm gán trực tiếp

    # --- CHIA NHỎ CODE THÀNH CÁC CHUNKS ---
    code_chunks = text_splitter.split_text(original_code)

    # --- TẠO DOCUMENT CHO MỖI CHUNK ---
    for chunk_index, chunk_content in enumerate(code_chunks):
        # Tạo metadata cho chunk này, bao gồm thông tin gốc và thông tin chunk
        chunk_metadata = original_metadata.copy() # Sao chép metadata gốc
        chunk_metadata['original_sample_index'] = sample_index # Liên kết về sample gốc
        chunk_metadata['chunk_id'] = chunk_index # Đánh số thứ tự chunk
        chunk_metadata['total_chunks'] = len(code_chunks) # Tổng số chunk của sample này

        # Tạo đối tượng Document cho chunk
        doc = Document(page_content=chunk_content, metadata=chunk_metadata)
        documents_to_add.append(doc)

        # Tạo ID duy nhất cho mỗi chunk Document
        ids_to_add.append(str(uuid4()))


# Thêm vào vectorstore theo từng batch
if documents_to_add:
    try:
        print(f"Đã chuẩn bị {len(documents_to_add)} chunk documents")
        
        batch_size = 10
        total_batches = ceil(len(documents_to_add) / batch_size)

        for i in range(total_batches):
            start = i * batch_size
            end = start + batch_size
            batch_docs = documents_to_add[start:end]
            batch_ids = ids_to_add[start:end]

            print(f"\rĐang thêm batch {i + 1}/{total_batches}...", end="")
            vectorstore.add_documents(documents=batch_docs)

        print("\nThêm thành công tất cả các chunk documents vào vectorstore.")
    except Exception as e:
        print(f"\nĐã xảy ra lỗi khi thêm documents: {e}")
else:
    print("Không có documents nào được chuẩn bị.")


Đã chuẩn bị 350 chunk documents
Đang thêm batch 35/35...
Thêm thành công tất cả các chunk documents vào vectorstore.
