## Re-indexing Script for Sparse Index

### Step 1. Import Required Libraries

In [None]:
! pip install elasticsearch

In [2]:
import re
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from transformers import AutoTokenizer
import os
from elasticsearch import Elasticsearch
import json
from datetime import datetime
from elasticsearch import helpers, exceptions, RequestError
import ast

### Step 2. Elasticsearch Instance

In [None]:
# Create a global client connection to elastic search.
# The endpoint and credentials below are placeholders; replace them before running.
# The functions in this notebook that talk to the cluster read this module-level name.
es_client = Elasticsearch(
    "https://es-endpoint:port",
    basic_auth=("username", "password"),
    # NOTE(review): certificate verification is switched off here; confirm this is
    # acceptable for the target cluster (e.g. self-signed certs in a dev setup).
    verify_certs=False,
    # NOTE(review): very large per-request timeout — presumably chosen to survive slow
    # bulk/inference calls; confirm the unit (seconds in elasticsearch-py) and value.
    request_timeout=10000
)

In [None]:
es_client.info()

### Step 3. Creating Index

In [5]:
## get the files from specific folder
def get_all_files(folder_name):
    """Return the paths of every entry directly inside ``folder_name``.

    Each entry name is printed and returned as ``"{folder_name}/{entry}"``.
    The listing is not recursive, and sub-directories are returned like files.

    The folder is listed in place instead of via ``os.chdir``: changing the
    working directory is a process-wide side effect, and with a relative
    ``folder_name`` it made the returned paths unresolvable afterwards.

    Args:
        folder_name: Directory to list (absolute or relative path).

    Returns:
        list[str]: One path per directory entry, in ``os.listdir`` order.

    Raises:
        OSError: If ``folder_name`` does not exist or cannot be read.
    """
    file_path_list = []
    for file in os.listdir(folder_name):
        print(file)
        file_path_list.append(f"{folder_name}/{file}")
    return file_path_list


## create the index
def create_index(index_name,mapping):
    """Create ``index_name`` on the cluster and report the outcome on stdout.

    Uses the module-level ``es_client``. A ``RequestError`` is not propagated:
    an "already exists" error and any other request error are each reported
    with their own message.

    Args:
        index_name: Name of the index to create.
        mapping: Request body (settings and mappings) passed to the create call.
    """
    try:
        es_client.indices.create(index=index_name, body=mapping)
    except RequestError as err:
        already_exists = err.error == 'resource_already_exists_exception'
        if already_exists:
            print(f"Index '{index_name}' already exists.")
        else:
            print(f"An error occurred while creating index '{index_name}': {err}")
    else:
        # Only reached when the create call did not raise.
        print(f"Index '{index_name}' created successfully.")


### Step 4. ELSER Model Name

In [6]:
# Model id handed to the inference processor of the ingest pipeline
# (read as "model_id" inside create_ingest_pipeline).
# NOTE(review): presumably this model must already be deployed on the cluster — confirm.
es_model_name    = ".elser_model_2_linux-x86_64"

### Step 5.  Create ELSER ingest pipeline

In [7]:
def create_ingest_pipeline(client):
    """Register the "elser-ingest-pipeline" ingest pipeline on ``client``.

    The pipeline has a single inference processor that runs the model named by
    the module-level ``es_model_name`` on the ``chunk_text`` field and writes
    the result to ``chunk_tokens``.

    Args:
        client: Elasticsearch client whose ``ingest.put_pipeline`` is called.
    """
    # Which document field feeds the model and where its output is stored.
    field_mapping = {
        "input_field": "chunk_text",
        "output_field": "chunk_tokens",
    }
    inference_processor = {
        "inference": {
            "model_id": es_model_name,
            "input_output": [field_mapping],
        }
    }
    client.ingest.put_pipeline(
        id="elser-ingest-pipeline",
        description="Ingest pipeline for ELSER",
        processors=[inference_processor],
    )

In [None]:
create_ingest_pipeline(es_client)

### Step 6. Create ELSER index 

In [9]:
## Index mapping 
# Settings and mappings used as the request body when creating the ELSER index.
elser_index_mapping = {
    "settings" :{
    # Every document indexed without an explicit pipeline goes through the pipeline
    # with this id — the same id that create_ingest_pipeline registers.
    "index": {"default_pipeline": "elser-ingest-pipeline"},
    "number_of_replicas": 0,
        "number_of_shards": 1,
        "refresh_interval": "1m",
        # NOTE(review): the filters, analyzers and normalizer defined below are not
        # referenced by any field in "mappings" in this dict — confirm whether they
        # are still needed or should be attached to the text fields.
        "analysis": {
            "filter": {
                "possessive_english_stemmer": {
                    "type": "stemmer",
                    "language": "possessive_english"
                },
                "light_english_stemmer": {
                    "type": "stemmer",
                    "language": "light_english"
                },
                "english_stop": {
                    "ignore_case": "true",
                    "type": "stop",
                    "stopwords": ["a", "about", "all", "also", "am", "an", "and", "any", "are", "as", "at",
                                  "be", "been", "but", "by", "can", "de", "did", "do", "does", "for", "from",
                                  "had", "has", "have", "he", "her", "him", "his", "how", "if", "in", "into",
                                  "is", "it", "its", "more", "my", "nbsp", "new", "no", "non", "not", "of",
                                  "on", "one", "or", "other", "our", "she", "so", "some", "such", "than",
                                  "that", "the", "their", "then", "there", "these", "they", "this", "those",
                                  "thus", "to", "up", "us", "use", "was", "we", "were", "what", "when", "where",
                                  "which", "while", "why", "will", "with", "would", "you", "your", "yours"]
                }
            },
            "analyzer": {
                # Lowercase + stemming, stop words kept.
                "text_en_no_stop": {
                    "filter": [
                        "lowercase",
                        "possessive_english_stemmer",
                        "light_english_stemmer"
                    ],
                    "tokenizer": "standard"
                },
                # Same chain as above with the custom stop-word filter inserted.
                "text_en_stop": {
                    "filter": [
                        "lowercase",
                        "possessive_english_stemmer",
                        "english_stop",
                        "light_english_stemmer"
                    ],
                    "tokenizer": "standard"
                },
                "whitespace_lowercase": {
                    "tokenizer": "whitespace",
                    "filter": [
                        "lowercase"
                    ]
                }
            },
            "normalizer": {
                "keyword_lowercase": {
                    "filter": [
                        "lowercase"
                    ]
                }
            }
        }
    },
    "mappings": {
        "properties": {
            # Fields copied over from the source (crawled) documents.
            "id": {"type": "keyword"},
            "body_content": {"type": "text"},
            "content_heading": {"type": "text"},
            "domains": {"type": "keyword"},
            "headings": {"type": "text"},
            "last_crawled_at": {"type": "date"},
            "links": {"type": "keyword"},
            "main_content": { "type": "text"},
            "meta_description": {"type": "text"},
            "meta_keywords": {"type": "keyword"},
            "title": {"type": "text"},
            "url": {"type": "keyword"},
            "url_path": {"type": "keyword"},
            # Per-chunk fields added by create_index_doc_elser.
            # NOTE(review): chunk_no is produced as an integer but mapped as text — confirm intent.
            "chunk_no":{"type":"text"},
            "chunk_text":{"type":"text"},
            "chunk_heading":{"type":"text"},
            # Output field of the ingest pipeline's inference processor.
            "chunk_tokens": { "type": "rank_features"}
                 }
            }
        
}

In [10]:
# Destination index: created from elser_index_mapping and used as the "_index"
# of every bulk action built later in this notebook. Placeholder — change before running.
index_name = "main-index-name"

In [None]:
create_index(index_name,elser_index_mapping) ## CHANGE INDEX NAME

### Step 7. Chunking

In [12]:
# create tokenizer for chunking
# NOTE(review): from_pretrained presumably downloads/caches the tokenizer files on
# first use — confirm network access is available where this runs.
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
def length_function(text):
    """Return the number of tokens ``tokenizer`` produces for ``text``.

    Special tokens are excluded from the count (``add_special_tokens=False``).
    """
    return len(tokenizer.encode(text, add_special_tokens=False))

# create text splitter
# Sizes are measured with length_function above, so chunk_size and chunk_overlap
# are expressed in tokens rather than characters.
text_splitter = RecursiveCharacterTextSplitter(
    length_function = length_function,
    chunk_size = 512,
    chunk_overlap = 100
)

### Step 8. Get all docs from Index

In [13]:
def get_all_docs_es(index_name):
    """Fetch every document of ``index_name`` through the scroll API.

    The original implementation opened a scroll context but only read the
    first page, so indexes with more than 800 documents were silently
    truncated and the scroll context was left open on the cluster. This
    version keeps requesting pages until one comes back empty and then
    releases the scroll context.

    Uses the module-level ``es_client``.

    Args:
        index_name: Name of the index to read.

    Returns:
        list[tuple[dict, float]]: ``(_source, _score)`` for every hit, or, if
        any search/scroll request fails, a dict of the form
        ``{"msg": "Error searching indexes", "error": <exception>}`` (kept for
        backward compatibility with existing callers).
    """
    search_query = {
        "query": {
            "match_all": {}
        }
    }
    scroll_timeout = '5m'  # how long the cluster keeps the scroll context alive between pages
    page_size = 800        # documents fetched per scroll page

    references_context1 = []
    scroll_id = None
    try:
        page = es_client.search(
            index=index_name,
            body=search_query,
            scroll=scroll_timeout,
            size=page_size,
        )
        while True:
            # The scroll id may change between pages; always keep the latest one.
            scroll_id = page["_scroll_id"]
            hits = page["hits"]["hits"]
            if not hits:
                break
            references_context1.extend(
                (hit["_source"], hit["_score"]) for hit in hits
            )
            page = es_client.scroll(scroll_id=scroll_id, scroll=scroll_timeout)
    except Exception as e:
        return {"msg": "Error searching indexes", "error": e}
    finally:
        if scroll_id is not None:
            try:
                es_client.clear_scroll(scroll_id=scroll_id)
            except Exception as cleanup_error:
                # Deliberately best-effort: the context expires on its own after
                # scroll_timeout, so a failed cleanup must not hide the result.
                print("Could not clear scroll context:", cleanup_error)

    return references_context1

### Step 9. Create docs for ELSER index

In [14]:
def _build_chunk_docs(ind):
    """Turn one source document into its list of per-chunk documents."""
    body_content = ind.get('body_content', '')
    content_heading = ind.get('content_heading', '')
    # Fall back to body_content when main_content is missing, empty or null;
    # the final '' keeps len() below safe when both are null.
    main_content = ind.get('main_content') or body_content or ''

    # Fields shared by every chunk of this document.
    # NOTE(review): last_crawled_at defaults to '' but the target mapping declares it
    # as a date — confirm an empty string is accepted by the index.
    base_doc = {
        "id": ind.get('id'),
        "title": ind.get('title', ''),
        "headings": ind.get('headings', ''),
        "body_content": body_content,
        "content_heading": content_heading,
        "domains": ind.get('domains', ''),
        "url": ind.get('url', None),
        "last_crawled_at": ind.get('last_crawled_at', ''),
        "links": ind.get('links', ''),
        "main_content": main_content,
        "meta_description": ind.get('meta_description', ''),
        "meta_keywords": ind.get('meta_keywords', ''),
        "url_path": ind.get('url_path', ''),
    }

    # Content of at most 512 characters is kept as a single chunk without
    # invoking the splitter.
    if len(main_content) > 512:
        chunks = text_splitter.split_text(main_content)
    else:
        chunks = [main_content]

    return [
        {
            **base_doc,
            "chunk_no": chunk_no,
            "chunk_heading": content_heading,
            "chunk_text": chunk,
        }
        for chunk_no, chunk in enumerate(chunks)
    ]


def create_index_doc_elser(resp):
    """Build the chunked documents to index from the fetched source documents.

    Each source document yields one output document per text chunk. All source
    fields are repeated on every chunk, plus ``chunk_no``, ``chunk_heading``
    and ``chunk_text``. Long content is split with the module-level
    ``text_splitter``.

    Compared with the previous version, one bad document no longer discards
    the whole batch: it is reported and skipped, and the function always
    returns a list (it used to return ``None`` after any error).

    Args:
        resp: Iterable of ``(source_dict, score)`` pairs as returned by
            ``get_all_docs_es``. The error dict that function returns on
            failure is also accepted and produces an empty list.

    Returns:
        list[dict]: The chunk documents, in input order.
    """
    document_list = []

    if isinstance(resp, dict):
        # get_all_docs_es signals failure with a dict instead of a list of pairs.
        print(f"No documents to process: {resp}")
        return document_list

    for position, item in enumerate(resp):
        print("***********************", position)
        try:
            ind, _score = item
            document_list.extend(_build_chunk_docs(ind))
        except Exception as e:
            # Keep going: losing one document is better than losing all of them.
            print(f"Skipping document {position}: {e}")

    return document_list
  

### Step 10. Getting Documents from base_crawled_index

In [None]:
# Source index holding the crawled documents that will be re-indexed (placeholder name).
base_index_name = 'base-index-name'  ## Change the index_name
# List of (source, score) pairs; on a search failure get_all_docs_es returns an error dict instead.
references_context1 = get_all_docs_es(base_index_name)
# Displays how many entries were fetched (for an error dict this is its number of keys).
len(references_context1)

In [None]:
document_list = create_index_doc_elser(references_context1)

In [None]:
len(document_list)

### Step 11. Bulk Ingestion

In [18]:
import time
# Wrap each chunk document in a bulk action addressed to the destination index.
documents = [
    {
        "_index": index_name,  ## CHANGE INDEX NAME
        "_source": chunk_doc,
    }
    for chunk_doc in document_list
]

In [None]:
len(documents)

In [None]:
documents[0:1]

In [None]:
def chunk_documents(documents, num_chunks):
    """Yield ``num_chunks`` consecutive slices that together cover ``documents``.

    Slice sizes differ by at most one; the earlier slices receive the extra
    items. When there are fewer items than ``num_chunks``, the trailing slices
    are empty.

    Args:
        documents: Sliceable sequence to partition.
        num_chunks: Number of slices to produce.

    Yields:
        Slices of ``documents`` in their original order.
    """
    base_size, extra = divmod(len(documents), num_chunks)

    lower = 0
    for position in range(num_chunks):
        # The first `extra` slices take one additional item each.
        upper = lower + base_size + int(position < extra)
        yield documents[lower:upper]
        lower = upper

# Send the prepared bulk actions to the cluster in num_chunks separate bulk requests.
total_docs = len(documents)
num_chunks = 19  # number of batches the action list is split into

start_time = time.time()
for i, chunk in enumerate(chunk_documents(documents, num_chunks)):
    #clear_output(wait=True)
    print(f"Chunk {i+1}: {len(chunk)} documents")
    try:
        response = helpers.bulk(es_client, chunk)
        print(response)
        # Previously this passed {index_name} and {len(chunk)} as set literals,
        # which printed e.g. {'main-index-name'} and {42} instead of the values.
        print(f"Done indexing {len(chunk)} documents into '{index_name}' index!")
    except Exception as e:
        # Deliberately best-effort: report the failed batch and continue with the next one.
        print("An error occurred:", e)


end_time = time.time()

In [None]:
elapsed_time =end_time - start_time
print(f"Elapsed time: {elapsed_time} seconds")