1. Initializes an Elasticsearch client, defines an index named "documents" with custom text analysis and dense vector embeddings, and creates the index with specified mappings and settings. It includes fields for text, base64-encoded images, and multiple dense vector embeddings for AI-based search and retrieval

In [6]:
from elasticsearch import Elasticsearch

# Connect to the local Elasticsearch node
es = Elasticsearch(["http://localhost:9200"])

# Name of the index this cell creates
index_name = "documents"

# Index definition: analysis settings plus field mappings.
# The custom analyzer uses the standard tokenizer, lowercases tokens and
# removes English stop words (the built-in "_english_" list).
settings = {
    "settings": {
        "analysis": {
            "analyzer": {
                "custom_english_analyzer": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": [
                        "lowercase",
                        "english_stop",
                    ]
                }
            },
            "filter": {
                "english_stop": {
                    "type": "stop",
                    "stopwords": "_english_"
                }
            }
        }
    },
    "mappings": {
        "properties": {
            # Free-text fields analyzed with the custom analyzer above
            "title": {"type": "text", "analyzer": "custom_english_analyzer"},
            "content": {"type": "text", "analyzer": "custom_english_analyzer"},
            "llava_content": {"type": "text", "analyzer": "custom_english_analyzer"},
            # Storing Base64-encoded image data.
            # NOTE(review): a "text" field is analyzed and indexed; if the payload is
            # never searched, a non-indexed type may be cheaper - confirm before changing.
            "image_data": {"type": "text"},
            # Dense vector fields; dims must match the size of the vectors indexed later
            "gte_embedding": {"type": "dense_vector", "dims": 3584},
            "mistral_embedding": {"type": "dense_vector", "dims": 4096},
            "image_embedding": {"type": "dense_vector", "dims": 768},
            "llava_content_embedding": {"type": "dense_vector", "dims": 4096}
        }
    }
}

# Create the index only when it is missing: indices.create raises an error if the
# index already exists, which would break re-running this cell on a fresh kernel.
if es.indices.exists(index=index_name):
    print(f"Index '{index_name}' already exists; skipping creation.")
else:
    create_response = es.indices.create(index=index_name, body=settings)
    print(create_response)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'documents'})

2. Checks whether the specified indices exist and retrieves their storage size in gigabytes (GB), printing the results for each index. This helps monitor Elasticsearch indices by verifying their presence and estimating their storage usage.

In [1]:
from elasticsearch import Elasticsearch

# Client for the local Elasticsearch node
es = Elasticsearch(hosts=["http://localhost:9200"])

# Names of the indices to inspect (replace with your actual index names)
indices = [
    "documents",
    "documents1",
    "docutrail",
    "clipdocuments",
]

# Existence check for a single index
def index_exists(index_name):
    """Ask the cluster whether ``index_name`` exists.

    Returns whatever ``es.indices.exists`` returns for that index; callers
    use the result as a truth value.
    """
    exists_response = es.indices.exists(index=index_name)
    return exists_response

# Store size of an index, expressed in GB
def get_index_size_gb(index_name):
    """Return the store size of ``index_name`` in GB, or 0 when unavailable.

    The size is read from ``total.store.size_in_bytes`` in the index stats and
    divided by 1024 ** 3. Returns 0 when the index does not exist or when it
    is absent from the stats response.
    """
    if not index_exists(index_name):
        return 0

    per_index_stats = es.indices.stats(index=index_name)['indices']
    if index_name not in per_index_stats:
        return 0

    store_bytes = per_index_stats[index_name]['total']['store']['size_in_bytes']
    bytes_per_gb = 1024 ** 3
    return store_bytes / bytes_per_gb

# Report, for each index, whether it exists and (if so) its size in GB
for index in indices:
    exists = index_exists(index)
    if not exists:
        print(f"Index '{index}' does not exist.")
        continue
    # Look up the size only for indices that exist; asking for the size of a
    # missing index would just repeat the existence check and return 0.
    size_gb = get_index_size_gb(index)
    print(f"Index '{index}' exists. Size: {size_gb:.2f} GB.")

Index 'documents' exists. Size: 9.11 GB.
Index 'documents1' does not exist.
Index 'docutrail' does not exist.
Index 'clipdocuments' does not exist.


- (Optional)
3. Retrieves the mapping (schema) of the "documents" index, displaying its structure and field types. 

In [23]:
from elasticsearch import Elasticsearch

# Client for the local Elasticsearch node
es = Elasticsearch(hosts=["http://localhost:9200"])

# Index whose schema we want to inspect
index_name = "documents"

# Fetch the mapping (field names, types and options) for that index
mapping = es.indices.get_mapping(index=index_name)

# Show the raw mapping response
print(mapping)

{'documents': {'mappings': {'properties': {'content': {'type': 'text', 'analyzer': 'custom_english_analyzer'}, 'gte_embedding': {'type': 'dense_vector', 'dims': 3584, 'index': True, 'similarity': 'cosine', 'index_options': {'type': 'int8_hnsw', 'm': 16, 'ef_construction': 100}}, 'image_data': {'type': 'text'}, 'image_embedding': {'type': 'dense_vector', 'dims': 768, 'index': True, 'similarity': 'cosine', 'index_options': {'type': 'int8_hnsw', 'm': 16, 'ef_construction': 100}}, 'llava_content': {'type': 'text', 'analyzer': 'custom_english_analyzer'}, 'llava_content_embedding': {'type': 'dense_vector', 'dims': 4096, 'index': True, 'similarity': 'cosine', 'index_options': {'type': 'int8_hnsw', 'm': 16, 'ef_construction': 100}}, 'mistral_embedding': {'type': 'dense_vector', 'dims': 4096, 'index': True, 'similarity': 'cosine', 'index_options': {'type': 'int8_hnsw', 'm': 16, 'ef_construction': 100}}, 'title': {'type': 'text', 'analyzer': 'custom_english_analyzer'}}}}}


In [None]:
# to check the index and its size: curl -X GET "http://localhost:9200/_cat/indices?v"
# delete the index: curl -X DELETE "http://localhost:9200/index_name"

- (Optional)
4. Updates the mapping of the "documents" index by adding a new clip_embedding field as a dense vector with 768 dimensions, then verifies the update by retrieving and printing the updated mapping.

In [3]:
from elasticsearch import Elasticsearch

# Client for the local Elasticsearch node
es = Elasticsearch(hosts=["http://localhost:9200"])

# Index whose mapping is being extended
index_name = "documents"

# Definition of the field to add: a 768-dimensional dense vector
clip_embedding_field = {"type": "dense_vector", "dims": 768}
new_field_mapping = {"properties": {"clip_embedding": clip_embedding_field}}

# Send the mapping update; on failure, report the error and carry on so the
# current mapping is still printed below.
try:
    response = es.indices.put_mapping(index=index_name, body=new_field_mapping)
    print("Mapping update successful:", response)
except Exception as exc:
    print("An error occurred while updating mapping:", str(exc))

# Read the mapping back to confirm what the index now contains
updated_mapping = es.indices.get_mapping(index=index_name)
print("Updated mapping:", updated_mapping)

Mapping update successful: {'acknowledged': True}
Updated mapping: {'documents': {'mappings': {'properties': {'clip_embedding': {'type': 'dense_vector', 'dims': 768, 'index': True, 'similarity': 'cosine', 'index_options': {'type': 'int8_hnsw', 'm': 16, 'ef_construction': 100}}, 'content': {'type': 'text'}, 'gte_embedding': {'type': 'dense_vector', 'dims': 3584, 'index': True, 'similarity': 'cosine', 'index_options': {'type': 'int8_hnsw', 'm': 16, 'ef_construction': 100}}, 'image_data': {'type': 'text'}, 'mistral_embedding': {'type': 'dense_vector', 'dims': 4096, 'index': True, 'similarity': 'cosine', 'index_options': {'type': 'int8_hnsw', 'm': 16, 'ef_construction': 100}}, 'title': {'type': 'text'}}}}}


- (Optional)

In [24]:
import torch
import gc

def clear_gpu_memory():
    """Free unreferenced Python objects, then release PyTorch's unused cached GPU memory.

    Garbage collection runs first so that memory belonging to tensors which are
    only reclaimed by the collector (e.g. those caught in reference cycles) is
    handed back to PyTorch's allocator before the cache is emptied. Calling
    ``empty_cache()`` first would leave that memory cached.

    Returns:
        None
    """
    # Collect garbage to drop objects that are no longer reachable
    gc.collect()
    # Release cached GPU memory that is no longer occupied
    torch.cuda.empty_cache()

# Clear GPU memory
clear_gpu_memory()

- (Optional)

In [2]:
import pandas as pd

# Describe the index fields one row at a time, mirroring the index mapping
column_names = ['Field', 'Type', 'Dimensions', 'Analyzer/Index Options']
analyzer_name = 'custom_english_analyzer'
vector_options = 'cosine (int8_hnsw, m=16, ef_construction=100)'
field_rows = [
    ('content', 'text', '-', analyzer_name),
    ('gte_embedding', 'dense_vector', '3584', vector_options),
    ('image_data', 'text', '-', '-'),
    ('image_embedding', 'dense_vector', '768', vector_options),
    ('llava_content', 'text', '-', analyzer_name),
    ('llava_content_embedding', 'dense_vector', '4096', vector_options),
    ('mistral_embedding', 'dense_vector', '4096', vector_options),
    ('title', 'text', '-', analyzer_name),
]

# Pivot the rows into a column-oriented dict (column name -> list of values)
data = {name: list(values) for name, values in zip(column_names, zip(*field_rows))}

# Build the table from the column-oriented data
df = pd.DataFrame(data)

# Render the table as the cell output
df

Unnamed: 0,Field,Type,Dimensions,Analyzer/Index Options
0,content,text,-,custom_english_analyzer
1,gte_embedding,dense_vector,3584,"cosine (int8_hnsw, m=16, ef_construction=100)"
2,image_data,text,-,-
3,image_embedding,dense_vector,768,"cosine (int8_hnsw, m=16, ef_construction=100)"
4,llava_content,text,-,custom_english_analyzer
5,llava_content_embedding,dense_vector,4096,"cosine (int8_hnsw, m=16, ef_construction=100)"
6,mistral_embedding,dense_vector,4096,"cosine (int8_hnsw, m=16, ef_construction=100)"
7,title,text,-,custom_english_analyzer
