# Elastic Search Labs Workplace Search document loader notebook

Ingest JSON documents into Elasticsearch for use with the [Elastic Search Labs Workplace Search](https://github.com/elastic/elasticsearch-labs/tree/main/example-apps/workplace-search) example application.


## Import Libraries


In [11]:
import os

from dotenv import load_dotenv
from elasticsearch import Elasticsearch
from langchain.document_loaders import JSONLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import ElasticsearchStore

from getpass import getpass

## Define Functions


In [12]:
def metadata_func(record: dict, metadata: dict) -> dict:
    """Copy selected fields of a JSON record into a document's metadata.

    Args:
        record: One JSON record. Fields that are absent are stored as ``None``.
        metadata: Metadata dict to extend; it is modified in place.

    Returns:
        The same ``metadata`` object, now holding the copied fields plus a
        fixed ``"source"`` value of ``"gov.uk"``.
    """
    copied_fields = ("name", "summary", "url", "category", "updated_at")
    for field in copied_fields:
        metadata[field] = record.get(field)

    # Every document ingested by this notebook is tagged with the same source.
    metadata["source"] = "gov.uk"

    return metadata

## Define Variables


In [13]:
# Path of the JSON file to ingest and name of the Elasticsearch index to write to
FILE = "../data/data.json"
INDEX = "search-hmrc.app-docs"

# Define the ML model to use for embeddings
ml_model = ".elser_model_1"


# Prompt for the Elastic Cloud connection details at runtime so they are not
# stored in the notebook. All three values are read with getpass, so the typed
# input is not echoed.
ELASTIC_CLOUD_ID = getpass("What is your ELASTIC_CLOUD_ID: ")
ELASTIC_USERNAME = getpass("What is your ELASTIC_USERNAME: ")
ELASTIC_PASSWORD = getpass("What is your ELASTIC_PASSWORD: ")

# Alternatively, load the same values from environment variables using dotenv.
# Make sure you have defined a `.env` file, then uncomment the lines below
# (and remove the getpass prompts above).
# load_dotenv()
#
# ELASTIC_CLOUD_ID = os.getenv("ELASTIC_CLOUD_ID")
# ELASTIC_USERNAME = os.getenv("ELASTIC_USERNAME")
# ELASTIC_PASSWORD = os.getenv("ELASTIC_PASSWORD")

## Create Elasticsearch Client


In [None]:
# Build the client for the Elastic Cloud deployment identified by the Cloud ID,
# authenticating with the username/password pair entered earlier.
elasticsearch_client = Elasticsearch(
    cloud_id=ELASTIC_CLOUD_ID,
    basic_auth=(ELASTIC_USERNAME, ELASTIC_PASSWORD),
)

## Load Data


In [None]:
# Read the JSON file into LangChain documents: the "content" field of each
# record becomes the document text and metadata_func fills in the metadata.
# (The f-string uses {FILE}; the previous "${FILE}" printed a stray "$".)
print(f"Loading data from {FILE}")

loader = JSONLoader(
    file_path=FILE,
    jq_schema=".[]",  # iterate over the elements of the top-level JSON value
    content_key="content",
    metadata_func=metadata_func,
)

# Load the workplace documents using the loader
workplace_docs = loader.load()

# Print the number of loaded documents
print(f"Loaded {len(workplace_docs)} documents")

# Configure the splitter: a chunk size of 800 with an overlap of 400 between
# consecutive chunks.
text_splitter = CharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=400,
)

# Split the loaded documents into chunks; `docs` is what gets indexed.
docs = text_splitter.transform_documents(workplace_docs)

# Report how many chunks the documents produced
print(f"Split {len(workplace_docs)} documents into {len(docs)} chunks")

# Create an Elasticsearch sparse vector store in Elastic Cloud
# NOTE(review): this prints the Cloud ID, which was entered through getpass;
# clear this cell's output before sharing the notebook if that is a concern.
print(
    f"Creating Elasticsearch sparse vector store in Elastic Cloud: {ELASTIC_CLOUD_ID}"
)

# Index the chunks using the sparse vector retrieval strategy.
# The model is handed to the strategy through `model_id`: `from_documents` has
# no `ml_model` parameter, so passing it there left the configured model unused
# and the strategy fell back to its built-in default.
ElasticsearchStore.from_documents(
    docs,
    es_connection=elasticsearch_client,
    index_name=INDEX,
    strategy=ElasticsearchStore.SparseVectorRetrievalStrategy(model_id=ml_model),
)