# Azure AI Search: vector search, step by step

## Set up the API client


In [2]:
import os
import dotenv
from azure.search.documents.indexes import SearchIndexClient
from azure.core.credentials import AzureKeyCredential

# Load variables from a local .env file (if one exists) into the process environment.
dotenv.load_dotenv()

AZURE_SEARCH_SERVICE = os.getenv("AZURE_SEARCH_SERVICE")
AZURE_SEARCH_SERVICE_KEY = os.getenv("AZURE_SEARCH_SERVICE_KEY")

# Fail fast on missing settings: os.getenv returns None for an unset variable,
# which would otherwise turn the endpoint below into
# "https://None.search.windows.net" and only surface as an error much later.
_missing_search_settings = [
    name
    for name, value in (
        ("AZURE_SEARCH_SERVICE", AZURE_SEARCH_SERVICE),
        ("AZURE_SEARCH_SERVICE_KEY", AZURE_SEARCH_SERVICE_KEY),
    )
    if not value
]
if _missing_search_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_search_settings)
    )

AZURE_SEARCH_ENDPOINT = f"https://{AZURE_SEARCH_SERVICE}.search.windows.net"

# Key-based credential, shared by the index client here and the search clients created later.
search_service_cred = AzureKeyCredential(AZURE_SEARCH_SERVICE_KEY)
index_client = SearchIndexClient(endpoint=AZURE_SEARCH_ENDPOINT, credential=search_service_cred)

## Tìm kiếm trên index

### Tạo index
- Bước đầu tiên chúng ta phải tạo một index.
- Index giống như một collection chứa các tài liệu (document).
- Chúng ta cần định nghĩa:
    - Cấu trúc của index: các field và kiểu dữ liệu
    - Thuật toán tìm kiếm trên vector (ở đây là HNSW với độ đo cosine)
- Đoạn mã sau tạo ra index có 2 field:
    - id: id của document
    - embedding: vector đại diện cho document; ở đây chúng ta định nghĩa số chiều của vector (`vector_search_dimensions`) là 3

In [6]:
from azure.search.documents.indexes.models import (
    HnswAlgorithmConfiguration,
    HnswParameters,
    SearchField,
    SearchFieldDataType,
    SearchIndex,
    SimpleField,
    VectorSearch,
    VectorSearchAlgorithmKind,
    VectorSearchProfile,
)

AZURE_SEARCH_TINY_INDEX = "teeenytinyindex"

# Minimal index: a string key field plus a 3-dimensional vector field.
index = SearchIndex(
    name=AZURE_SEARCH_TINY_INDEX,
    fields=[
        SimpleField(name="id", type=SearchFieldDataType.String, key=True),
        SearchField(
            name="embedding",
            type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
            searchable=True,
            vector_search_dimensions=3,
            # Same name as the profile declared under `vector_search` below.
            vector_search_profile_name="embedding_profile",
        ),
    ],
    vector_search=VectorSearch(
        algorithms=[
            # HNSW = Hierarchical Navigable Small World; vectors are compared with the cosine metric.
            HnswAlgorithmConfiguration(
                name="hnsw_config",
                kind=VectorSearchAlgorithmKind.HNSW,
                parameters=HnswParameters(metric="cosine"),
            )
        ],
        profiles=[
            VectorSearchProfile(name="embedding_profile", algorithm_configuration_name="hnsw_config")
        ],
    ),
)

# create_or_update_index keeps this cell re-runnable (Restart & Run All):
# a plain create_index call fails once the index already exists on the service.
index_client.create_or_update_index(index)

<azure.search.documents.indexes.models._index.SearchIndex at 0x7f95dfed30d0>

### Thêm một vài document vào index

In [7]:
from azure.search.documents import SearchClient

# Client bound to the tiny demo index.
search_client = SearchClient(AZURE_SEARCH_ENDPOINT, AZURE_SEARCH_TINY_INDEX, credential=search_service_cred)

# Three hand-written 3-dimensional vectors, matching the index's vector_search_dimensions.
tiny_upload_results = search_client.upload_documents(documents=[
    {"id": "1", "embedding": [1, 2, 3]},
    {"id": "2", "embedding": [1, 1, 3]},
    {"id": "3", "embedding": [4, 5, 6]}])

# upload_documents reports success per document; check the flags so that a
# rejected document does not go unnoticed.
tiny_upload_failures = [result.key for result in tiny_upload_results if not result.succeeded]
if tiny_upload_failures:
    raise RuntimeError(f"Upload failed for document id(s): {tiny_upload_failures}")

# Last expression of the cell: display the per-document indexing results.
tiny_upload_results

[<azure.search.documents._generated.models._models_py3.IndexingResult at 0x7f95dfef7e10>,
 <azure.search.documents._generated.models._models_py3.IndexingResult at 0x7f95dfef7510>,
 <azure.search.documents._generated.models._models_py3.IndexingResult at 0x7f95dfef7310>]

### Tìm kiếm với vector similarity

In [10]:
from azure.search.documents.models import VectorizedQuery

# Pure vector query (no text part): ask for the 3 nearest neighbours of [1, 2, 3]
# in the "embedding" field.
tiny_query = VectorizedQuery(
    fields="embedding",
    vector=[1, 2, 3],
    k_nearest_neighbors=3,
)
r = search_client.search(vector_queries=[tiny_query], search_text=None)

# Print each hit's key together with its similarity score.
for doc in r:
    print(f"id: {doc['id']}, score: {doc['@search.score']}")

id: 1, score: 0.9999999
id: 3, score: 0.9752594
id: 2, score: 0.9680425


## Tìm kiếm với index lớn hơn
Giả sử chúng ta có thông tin sản phẩm như trong file products.txt, chúng ta cần lưu trữ các sản phẩm trong vector DB là Azure AI Search.

In [15]:
# Read one product description per line.
# - The encoding is explicit so decoding does not depend on the platform default.
# - Blank lines are skipped so they do not become empty products downstream.
# - Each kept entry still ends with its original newline, as before.
with open("../data/products.txt", "r", encoding="utf-8") as f:
    products = [line for line in f if line.strip()]

In [21]:
# Display the loaded product lines to check what was read from the file.
products

['Dell, $999.00, Dell laptops are known for their durability and performance, making them a popular choice for both personal and professional use.\n',
 'Lenovo, $899.00, Lenovo laptops offer a great balance of performance and price, with a variety of models suitable for different needs.\n',
 'HP, $799.00, HP laptops are versatile and reliable, with a range of options from budget-friendly to high-end models.\n',
 'Apple, $1299.00, Apple laptops are known for their sleek design and powerful performance, ideal for creative professionals.\n',
 'Asus, $699.00, Asus laptops provide excellent value for money, with a focus on gaming and high-performance computing.\n',
 'Dell Laptop, $999.00, Dell laptops are known for their durability and performance, making them a popular choice for both personal and professional use. They come with a variety of features including high-resolution displays, powerful processors, and long battery life, ensuring that users can work efficiently and effectively.\n'

In [23]:
import dotenv
import openai

# Load variables from a local .env file (if one exists); a single call covers the whole cell.
dotenv.load_dotenv()

# Initialize Azure search variables
AZURE_SEARCH_SERVICE = os.getenv("AZURE_SEARCH_SERVICE")
AZURE_SEARCH_ENDPOINT = f"https://{AZURE_SEARCH_SERVICE}.search.windows.net"

# Set up OpenAI client based on environment variables
AZURE_OPENAI_SERVICE = os.getenv("AZURE_OPENAI_SERVICE")
AZURE_OPENAI_EMBEDDING_DEPLOYMENT = os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT")

# Fail fast on missing settings: an unset variable would otherwise produce the
# endpoint "https://None.openai.azure.com" or a None deployment name, and only
# fail on the first embedding request.
_missing_openai_settings = [
    name
    for name, value in (
        ("AZURE_OPENAI_SERVICE", AZURE_OPENAI_SERVICE),
        ("AZURE_OPENAI_EMBEDDING_DEPLOYMENT", AZURE_OPENAI_EMBEDDING_DEPLOYMENT),
    )
    if not value
]
if _missing_openai_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_openai_settings)
    )

# NOTE(review): no key or token is passed here, so the client presumably picks
# its credentials up from the environment — confirm which variable it expects.
openai_client = openai.AzureOpenAI(
    api_version="2023-07-01-preview",
    azure_endpoint=f"https://{AZURE_OPENAI_SERVICE}.openai.azure.com")

def get_embedding(text):
    """Return the embedding for ``text`` from the configured embedding deployment.

    Sends ``text`` to ``openai_client.embeddings.create`` using the deployment
    named by ``AZURE_OPENAI_EMBEDDING_DEPLOYMENT`` and returns the ``embedding``
    attribute of the first item in the response's ``data``.
    """
    response = openai_client.embeddings.create(
        input=text,
        model=AZURE_OPENAI_EMBEDDING_DEPLOYMENT,
    )
    first_item = response.data[0]
    return first_item.embedding

Trước tiên, tạo một index khác.

Index bao gồm các field:
- id
- content: Lưu thông tin sản phẩm
- sourcefile: Lưu tên file để sử dụng cho việc trích dẫn sau này
- embedding: Lưu vector đại diện cho thông tin sản phẩm

Lưu ý: Kích thước vector là 1536 nếu sử dụng mô hình text-embedding-ada-002; giá trị `vector_search_dimensions` của index phải khớp với số chiều mà mô hình embedding trả về.

In [17]:
from azure.search.documents.indexes.models import SearchableField
AZURE_SEARCH_FULL_INDEX = "gptkbindex"

# Product index: string key, two full-text searchable fields, and a vector field.
index = SearchIndex(
    name=AZURE_SEARCH_FULL_INDEX,
    fields=[
        SimpleField(name="id", type=SearchFieldDataType.String, key=True),
        SearchableField(name="content", type=SearchFieldDataType.String),
        SearchableField(name="sourcefile", type=SearchFieldDataType.String),
        SearchField(
            name="embedding",
            type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
            searchable=True,
            # 1536 is the vector size this notebook notes for text-embedding-ada-002;
            # TODO confirm it matches the embedding deployment actually in use.
            vector_search_dimensions=1536,
            # Same name as the profile declared under `vector_search` below.
            vector_search_profile_name="embedding_profile",
        ),
    ],
    vector_search=VectorSearch(
        algorithms=[
            # HNSW = Hierarchical Navigable Small World; vectors are compared with the cosine metric.
            HnswAlgorithmConfiguration(
                name="hnsw_config",
                kind=VectorSearchAlgorithmKind.HNSW,
                parameters=HnswParameters(metric="cosine"),
            )
        ],
        profiles=[
            VectorSearchProfile(name="embedding_profile", algorithm_configuration_name="hnsw_config")
        ],
    ),
)

# create_or_update_index keeps this cell re-runnable (Restart & Run All):
# a plain create_index call fails once the index already exists on the service.
index_client.create_or_update_index(index)

<azure.search.documents.indexes.models._index.SearchIndex at 0x7f95def84cd0>

Bước tiếp theo, chúng ta sẽ cần chuẩn bị danh sách documents từ products

In [24]:
# Build one index document per product line: the position in `products` becomes
# the string key, and the raw line is both stored and embedded.
documents = [
    {
        "id": str(position),
        "content": product_line,
        "embedding": get_embedding(product_line),
        "sourcefile": "products.txt",
    }
    for position, product_line in enumerate(products)
]

In [25]:
# Display the first prepared document (id, content, embedding, sourcefile) as a sanity check.
documents[0]

{'id': '0',
 'content': 'Dell, $999.00, Dell laptops are known for their durability and performance, making them a popular choice for both personal and professional use.\n',
 'embedding': [0.02280965819954872,
  -0.011798543855547905,
  0.0019088727422058582,
  -0.03614433482289314,
  -0.02052481845021248,
  0.010339861735701561,
  -0.019621210172772408,
  -0.025055769830942154,
  0.00036951128276996315,
  -0.011908267624676228,
  0.018575606867671013,
  0.04043002054095268,
  -0.014057564549148083,
  -0.004685855004936457,
  0.005705641582608223,
  0.014289921149611473,
  0.012598882429301739,
  -0.017400914803147316,
  0.024229614064097404,
  -0.01763327233493328,
  -0.046135663986206055,
  -0.006609249860048294,
  -0.002218681387603283,
  0.013057141564786434,
  -0.03431130200624466,
  -0.01076584868133068,
  0.02235785312950611,
  -0.007351499516516924,
  0.027779502794146538,
  0.012360071763396263,
  0.015425886027514935,
  0.009061900898814201,
  -0.03653159737586975,
  -0.01137

Upload document lên index

In [26]:

# Rebind search_client to the product index; the search cell below uses this client.
search_client = SearchClient(AZURE_SEARCH_ENDPOINT, AZURE_SEARCH_FULL_INDEX, credential=search_service_cred)
upload_results = search_client.upload_documents(documents=documents)

# upload_documents reports success per document; check the flags so that a
# rejected document does not go unnoticed.
upload_failures = [result.key for result in upload_results if not result.succeeded]
if upload_failures:
    raise RuntimeError(f"Upload failed for document id(s): {upload_failures}")

# Last expression of the cell: display the per-document indexing results.
upload_results

[<azure.search.documents._generated.models._models_py3.IndexingResult at 0x7f95d44fa7d0>,
 <azure.search.documents._generated.models._models_py3.IndexingResult at 0x7f95d4299050>,
 <azure.search.documents._generated.models._models_py3.IndexingResult at 0x7f95d4299010>,
 <azure.search.documents._generated.models._models_py3.IndexingResult at 0x7f95d42990d0>,
 <azure.search.documents._generated.models._models_py3.IndexingResult at 0x7f95d4299150>,
 <azure.search.documents._generated.models._models_py3.IndexingResult at 0x7f95d4299250>,
 <azure.search.documents._generated.models._models_py3.IndexingResult at 0x7f95d4299290>,
 <azure.search.documents._generated.models._models_py3.IndexingResult at 0x7f95d4299310>,
 <azure.search.documents._generated.models._models_py3.IndexingResult at 0x7f95d4299390>,
 <azure.search.documents._generated.models._models_py3.IndexingResult at 0x7f95d4299450>,
 <azure.search.documents._generated.models._models_py3.IndexingResult at 0x7f95d4299410>,
 <azure.se

### Bây giờ chúng ta có thể tìm kiếm thông tin sản phẩm

In [29]:
# Embed the user's question, then run a pure vector query (no text part)
# against the "embedding" field of the index.
search_query = "do you have laptops?"
search_vector = get_embedding(search_query)

product_query = VectorizedQuery(
    fields="embedding",
    vector=search_vector,
    k_nearest_neighbors=5,
)
r = search_client.search(vector_queries=[product_query], search_text=None, top=5)

# Show each hit on one line: score, then the first 150 characters of the
# content with newlines flattened to spaces.
for doc in r:
    content = doc["content"].replace("\n", " ")[:150]
    print(f"Score: {doc['@search.score']:.5f}\tContent:{content}")

Score: 0.81818	Content:Dell, $999.00, Dell laptops are known for their durability and performance, making them a popular choice for both personal and professional use. 
Score: 0.81443	Content:Lenovo, $899.00, Lenovo laptops offer a great balance of performance and price, with a variety of models suitable for different needs. 
Score: 0.80903	Content:Asus, $699.00, Asus laptops provide excellent value for money, with a focus on gaming and high-performance computing. 
Score: 0.80895	Content:Samsung Smartphone, $999.00, Samsung smartphones are known for their cutting-edge technology and innovative features. They offer high-resolution displ
Score: 0.80821	Content:HP, $799.00, HP laptops are versatile and reliable, with a range of options from budget-friendly to high-end models. 
