In [1]:
from qdrant_client import QdrantClient
from qdrant_client.http import models
from sentence_transformers import SentenceTransformer

# Connect to the local Qdrant instance that stores the schema index.
client = QdrantClient("http://localhost:6333")

# Dense embedding model used for the first (dense-only) indexing pass.
model = SentenceTransformer("BAAI/bge-m3")

collection_name = "db_schema"

# `recreate_collection` is deprecated in qdrant-client (it emits a
# DeprecationWarning); drop-and-create explicitly instead. This still wipes any
# existing data in the collection, exactly like the deprecated call did.
if client.collection_exists(collection_name):
    client.delete_collection(collection_name)

client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        # Ask the model for its output width instead of hard-coding 1024, so the
        # collection cannot silently drift out of sync with the encoder.
        size=model.get_sentence_embedding_dimension(),
        distance=models.Distance.COSINE,
    ),
)

  from .autonotebook import tqdm as notebook_tqdm
  client.recreate_collection(


True

In [2]:
import json

# Load the schema catalogue; entries are looked up by table name and expose
# a 'ddl' and a 'des' field (both are read just below).
with open("schemas.json", encoding="utf-8") as schema_file:
    schema_data = json.load(schema_file)

# Spot-check one DDL statement and one description to confirm the file parsed.
for table_name, field in (("customer", "ddl"), ("actor", "des")):
    print(schema_data[table_name][field])

CREATE TABLE public.customer ( customer_id integer DEFAULT nextval('public.customer_customer_id_seq'::regclass) NOT NULL, store_id integer NOT NULL, first_name text NOT NULL, last_name text NOT NULL, email text, address_id integer NOT NULL, activebool boolean DEFAULT true NOT NULL, create_date date DEFAULT CURRENT_DATE NOT NULL, last_update timestamp with time zone DEFAULT now(), active integer );
Contains information about actors, including their names and last update timestamp.


In [3]:
# One Qdrant payload per table, in the iteration order of `schema_data`.
payloads = [
    {'table': name, 'description': info['des'], 'ddl': info['ddl']}
    for name, info in schema_data.items()
]

# The text that gets embedded for each table: name, description and DDL,
# positionally aligned with `payloads`.
texts = [
    f"Table: {p['table']}\nDescription: {p['description']}\nSchema: {p['ddl']}"
    for p in payloads
]

In [4]:
# Embed every schema text in one call, then convert the result to plain
# Python lists so it can be handed to the Qdrant client.
dense_matrix = model.encode(texts)
embeddings = dense_matrix.tolist()

In [5]:
# Store each embedding with its payload; the position in `embeddings` doubles
# as the point id and as the index into `payloads`.
client.upsert(
    collection_name=collection_name,
    points=[
        models.PointStruct(id=idx, vector=vector, payload=payloads[idx])
        for idx, vector in enumerate(embeddings)
    ],
)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [11]:
import warnings
warnings.filterwarnings('ignore')

In [12]:
# Dense-only retrieval for a natural-language question.
query = "Find the top 5 customers who spent the most money renting films in the ‘Action’ category during 2022, showing their full name, email, total amount spent, and the store where they rented the most"
# Encode as a one-element batch and convert to a plain list of floats for the client.
query_vector = model.encode([query])[0].tolist()

# `client.search` is deprecated in qdrant-client; `query_points` is its
# replacement (the hybrid search further down already uses it). `.points`
# unwraps the response so `search_result` is still a list of scored points.
search_result = client.query_points(
    collection_name=collection_name,
    query=query_vector,
    limit=7,
    with_payload=True,
).points

for r in search_result:
    print(r.payload, r.score)

{'table': 'store', 'description': 'Represents a video rental store, linking staff (manager) and address.', 'ddl': "CREATE TABLE public.store ( store_id integer DEFAULT nextval('public.store_store_id_seq'::regclass) NOT NULL, manager_staff_id integer NOT NULL, address_id integer NOT NULL, last_update timestamp with time zone DEFAULT now() NOT NULL );"} 0.5286474
{'table': 'rental', 'description': 'Stores rental transactions, linking inventory, customers, and staff.', 'ddl': "CREATE TABLE public.rental ( rental_id integer DEFAULT nextval('public.rental_rental_id_seq'::regclass) NOT NULL, rental_date timestamp with time zone NOT NULL, inventory_id integer NOT NULL, customer_id integer NOT NULL, return_date timestamp with time zone, staff_id integer NOT NULL, last_update timestamp with time zone DEFAULT now() NOT NULL );"} 0.5238225
{'table': 'category', 'description': 'Stores film categories such as Action, Comedy, Drama.', 'ddl': "CREATE TABLE public.category ( category_id integer DEFAUL

In [18]:
# Dense-only retrieval again, this time querying with a compact sketch of the
# expected tables and keys instead of a natural-language question.
query = "Table customer (customer_id PK, first_name, last_name, email), Table payment (payment_id PK, customer_id FK, rental_id FK, amount, payment_date), Table rental (rental_id PK, inventory_id FK, customer_id FK, rental_date), Table inventory (inventory_id PK, film_id FK, store_id FK), Table film (film_id PK), Table film_category (film_id PK FK, category_id PK FK), Table category (category_id PK, name), Table store (store_id PK)."
# Encode as a one-element batch and convert to a plain list of floats for the client.
query_vector = model.encode([query])[0].tolist()

# `client.search` is deprecated in qdrant-client; `query_points` is its
# replacement (the hybrid search further down already uses it). `.points`
# unwraps the response so `search_result` is still a list of scored points.
search_result = client.query_points(
    collection_name=collection_name,
    query=query_vector,
    limit=7,
    with_payload=True,
).points

for r in search_result:
    print(r.payload, r.score)

{'table': 'film', 'description': 'Contains film details such as title, description, release year, language, length, rental rates, and special features.', 'ddl': "CREATE TABLE public.film ( film_id integer DEFAULT nextval('public.film_film_id_seq'::regclass) NOT NULL, title text NOT NULL, description text, release_year public.year, language_id integer NOT NULL, original_language_id integer, rental_duration smallint DEFAULT 3 NOT NULL, rental_rate numeric(4,2) DEFAULT 4.99 NOT NULL, length smallint, replacement_cost numeric(5,2) DEFAULT 19.99 NOT NULL, rating public.mpaa_rating DEFAULT 'G'::public.mpaa_rating, last_update timestamp with time zone DEFAULT now() NOT NULL, special_features text[], fulltext tsvector NOT NULL );"} 0.70965326
{'table': 'payment', 'description': 'Records payments made by customers for rentals, including amount and payment date.', 'ddl': "CREATE TABLE public.payment ( payment_id integer DEFAULT nextval('public.payment_payment_id_seq'::regclass) NOT NULL, custome

In [22]:
from qdrant_client.models import VectorParams, Distance, SparseIndexParams
# Rebuild the collection with two named vector spaces so that dense and sparse
# representations can be stored side by side for hybrid search.
# `recreate_collection` is deprecated in qdrant-client, so drop-and-create
# explicitly; as before, this discards everything already in the collection.
if client.collection_exists(collection_name):
    client.delete_collection(collection_name)

client.create_collection(
    collection_name=collection_name,
    vectors_config={
        # NOTE(review): 1024 is presumably the dense output width of
        # BAAI/bge-m3 (same size as the first collection) — confirm.
        "dense": models.VectorParams(size=1024, distance=models.Distance.COSINE)
    },
    sparse_vectors_config={
        "sparse": models.SparseVectorParams()
    },
)

True

In [25]:
from FlagEmbedding import BGEM3FlagModel

In [26]:
# Rebind `model` to the FlagEmbedding wrapper for the same checkpoint. From
# here on `model.encode` is called with return_dense/return_sparse and its
# result is read as a dict ('dense_vecs', 'lexical_weights'), unlike the
# SentenceTransformer instance bound to this name earlier.
model = BGEM3FlagModel(
    'BAAI/bge-m3',
    use_fp16=True,
)

Fetching 30 files: 100%|██████████| 30/30 [00:34<00:00,  1.16s/it]


In [29]:
from qdrant_client.models import VectorParams, Distance, SparseIndexParams, SparseVector

In [None]:
# Encode all schema texts in a single batched call instead of invoking the
# model once per text; `encode` accepts a list and returns one dense vector and
# one lexical-weight dict per input, in input order.
if texts:
    encoded = model.encode(
        texts,
        return_dense=True,
        return_sparse=True,
    )
    dense_vecs = encoded['dense_vecs']
    lexical_weights_per_text = encoded['lexical_weights']
else:
    # Nothing to encode: keep the result an empty list rather than calling the
    # model with an empty batch.
    dense_vecs, lexical_weights_per_text = [], []

points = []
for i, (dense_vec, lexical_weights) in enumerate(zip(dense_vecs, lexical_weights_per_text)):
    # The lexical weights map token id -> weight; Qdrant wants them as two
    # parallel lists of integer indices and float values.
    sparse_vec = models.SparseVector(
        indices=[int(token_id) for token_id in lexical_weights],
        values=[float(weight) for weight in lexical_weights.values()],
    )

    # The position in `texts` doubles as the point id and as the index into
    # `payloads`; the vector names match the collection's named vector spaces.
    points.append(
        models.PointStruct(
            id=i,
            payload=payloads[i],
            vector={
                'dense': dense_vec.tolist(),
                'sparse': sparse_vec,
            },
        )
    )

Đã hoàn thành việc tạo list points. Sẵn sàng cho việc upsert vào Qdrant.


In [None]:
# Write the dense+sparse points to the collection; wait=True is passed so the
# call is asked to block until the write has been applied.
client.upsert(collection_name=collection_name, points=points, wait=True)


Đã chèn 15 điểm dữ liệu vào collection.


In [None]:
def hybrid_search_bge_m3(
    client: QdrantClient,
    model: BGEM3FlagModel,
    query_text: str,
    limit: int = 5,
    prefetch_limit: int = 10,
    collection_name: str = collection_name
):
    """Run a dense + sparse hybrid search and return the fused hits.

    The query is encoded once with ``model`` to obtain both a dense vector and
    lexical (sparse) weights. Each representation is used for its own prefetch
    against the 'dense' / 'sparse' named vectors, and the two candidate lists
    are merged with ``models.Fusion.RRF``.

    Args:
        client: Qdrant client used to issue the query.
        model: Encoder whose ``encode`` returns 'dense_vecs' and
            'lexical_weights' for the query.
        query_text: Text to search for.
        limit: Maximum number of fused results to return.
        prefetch_limit: Maximum number of candidates fetched per branch
            (dense and sparse) before fusion.
        collection_name: Collection to query; defaults to the module-level
            ``collection_name`` as it was when this function was defined.

    Returns:
        A list of dicts with keys "id", "score" and "payload", one per hit.
    """
    encoded = model.encode([query_text], return_dense=True, return_sparse=True)

    # Dense side: first (only) row of the batch, as a plain list.
    dense_query = encoded['dense_vecs'][0].tolist()

    # Sparse side: token id -> weight mapping turned into parallel lists.
    lexical_weights = encoded['lexical_weights'][0]
    sparse_query = models.SparseVector(
        indices=[int(token_id) for token_id in lexical_weights],
        values=[float(weight) for weight in lexical_weights.values()],
    )

    # One candidate-retrieval branch per vector space.
    branches = [
        models.Prefetch(query=dense_query, using='dense', limit=prefetch_limit),
        models.Prefetch(query=sparse_query, using='sparse', limit=prefetch_limit),
    ]

    response = client.query_points(
        collection_name=collection_name,
        query=models.FusionQuery(fusion=models.Fusion.RRF),
        prefetch=branches,
        limit=limit,
        with_payload=True,
    )

    # Flatten the scored points into plain dicts for the caller.
    return [
        {"id": hit.id, "score": hit.score, "payload": hit.payload}
        for hit in response.points
    ]

In [48]:
# NOTE(review): presumably the tables a correct retrieval should surface for
# the customer-spend query used in the search cells — confirm with the author.
# Kept as a bare string expression, so the cell still echoes it as its output.
"""
customer
payment
rental
inventory
film
film_category
category
"""

'\ncustomer\npayment\nrental\ninventory\nfilm\nfilm_category\ncategory\n'

In [54]:
# Hybrid search with the natural-language question only; print the table name
# and fused score of the top 8 hits.
query = "Find the top 5 customers who spent the most money renting films in the ‘Action’ category during 2022, showing their full name, email, total amount spent, and the store where they rented the most"
results = hybrid_search_bge_m3(client, model, query, limit=8)
for hit in results:
    table_name = hit['payload']['table']
    print(table_name, hit['score'])

store 0.5909091
customer 0.5
film_category 0.44444445
rental 0.44444445
category 0.4166667
inventory 0.375
payment 0.34285715
address 0.2909091


In [53]:
# Hybrid search with the natural-language question followed by a compact
# sketch of the tables and keys; print the table name and fused score of the
# top 8 hits.
query = "Find the top 5 customers who spent the most money renting films in the ‘Action’ category during 2022, showing their full name, email, total amount spent, and the store where they rented the most. Table customer (customer_id PK, first_name, last_name, email), Table payment (payment_id PK, customer_id FK, rental_id FK, amount, payment_date), Table rental (rental_id PK, inventory_id FK, customer_id FK, rental_date), Table inventory (inventory_id PK, film_id FK, store_id FK), Table film (film_id PK), Table film_category (film_id PK FK, category_id PK FK), Table category (category_id PK, name), Table store (store_id PK)."
results = hybrid_search_bge_m3(client, model, query, limit=8)
for hit in results:
    table_name = hit['payload']['table']
    print(table_name, hit['score'])

payment 0.8333334
customer 0.5909091
rental 0.5833334
store 0.375
film 0.31111112
film_category 0.31111112
category 0.30952382
inventory 0.2916667
