In [1]:
import yaml
import os

In [2]:
import os
from dotenv import load_dotenv

from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient 
from azure.search.documents import SearchClient
from azure.search.documents.indexes.models import (
    ComplexField,
    CorsOptions,
    SearchIndex,
    ScoringProfile,
    SearchFieldDataType,
    SimpleField,
    SearchableField
)


In [3]:
# Load variables from a local .env file (if present) into the environment
load_dotenv()

index_name = "lucasmeyer-test"
service_name = "dsna-wu-cognitivesearch"

# Endpoint of the Azure Cognitive Search service
endpoint = f"https://{service_name}.search.windows.net/"

print(endpoint)

# The admin key is read from the environment so it never lives in the notebook
admin_key = os.getenv("AZURE_SEMANTIC_SEARCH_KEY")
if not admin_key:
    # Fail here with an actionable message instead of inside AzureKeyCredential
    raise RuntimeError(
        "AZURE_SEMANTIC_SEARCH_KEY is not set; "
        "define it in the environment or in a .env file"
    )

credential = AzureKeyCredential(admin_key)

# Service-level client: creates and deletes indexes. It is not bound to a
# single index, so no index_name is passed.
admin_client = SearchIndexClient(endpoint=endpoint,
                      credential=credential)

# Index-level client: uploads documents to and queries `index_name`
search_client = SearchClient(endpoint=endpoint,
                      index_name=index_name,
                      credential=credential)

https://dsna-wu-cognitivesearch.search.windows.net/


In [19]:
# Delete the index named by `index_name`. Any error is printed rather than
# raised, so the notebook keeps running.
try:
    result = admin_client.delete_index(index_name)
except Exception as err:
    print (err)
else:
    print ('Index', index_name, 'Deleted')

Index lucasmeyer-test Deleted


In [24]:
# Index schema: one document per chunk of a post.
name = index_name

fields = [
    # Document key
    SimpleField(
        name="chunk_id",
        type=SearchFieldDataType.String,
        key=True,
    ),
    SimpleField(
        name="filename",
        type=SearchFieldDataType.String,
        sortable=True,
    ),
    # Full-text searchable fields
    SearchableField(
        name="title",
        type=SearchFieldDataType.String,
        sortable=True,
    ),
    SearchableField(
        name="content",
        type=SearchFieldDataType.String,
        analyzer_name="en.lucene",
    ),
    SimpleField(
        name="date",
        type=SearchFieldDataType.DateTimeOffset,
        filterable=True,
        sortable=True,
        facetable=True,
    ),
]

# Allow browser calls from any origin; preflight responses cached for 60 seconds
cors_options = CorsOptions(allowed_origins=["*"], max_age_in_seconds=60)

# No custom scoring profiles
scoring_profiles = []


In [25]:
# Semantic configuration: `title` is the title field and `content` is the
# only prioritized content field; no keyword fields are configured.
semantic_settings = {
    "configurations": [
        {
            "name": "lucasmeyer-test-semantic-config",
            "prioritizedFields": {
                "titleField": {"fieldName": "title"},
                "prioritizedContentFields": [{"fieldName": "content"}],
                "prioritizedKeywordsFields": [],
            },
        }
    ]
}

index = SearchIndex(
    name=name,
    fields=fields,
    scoring_profiles=scoring_profiles,
    semantic_settings=semantic_settings,
    cors_options=cors_options,
)

# Create the index; any error is printed rather than raised.
try:
    result = admin_client.create_index(index)
except Exception as err:
    print (err)
else:
    print ('Index', result.name, 'created')

Index lucasmeyer-test created


In [28]:
# Build the documents to upload: one dict per paragraph-sized chunk of every
# Quarto post (*.qmd) in the "posts" directory. Each dict carries the upload
# action, a unique chunk_id, the post's filename, title and date, and the
# chunk text.

search_items = []

for filename in os.listdir("posts"):
    # if the file is not .qmd, skip it
    if not filename.endswith(".qmd"):
        continue

    # File name without its extension; stored as "filename" and used as the
    # prefix of every chunk_id
    post_name = os.path.splitext(filename)[0]

    # read the whole file
    with open(os.path.join("posts", filename), "r", encoding='utf-8') as f:
        content = f.read()

    # Separate the YAML front matter from the body. Splitting at most twice
    # keeps any later "---" (horizontal rule, table separator) inside the
    # body instead of truncating the post at that point.
    parts = content.split("---", 2)
    if len(parts) < 3:
        print(f"Skipping {filename}: no YAML front matter found")
        continue
    yaml_content = parts[1]
    post_content = parts[2]

    # from the YAML, get the title and the date
    # NOTE(review): FullLoader is acceptable for trusted local posts; switch to
    # yaml.safe_load if the files can come from an untrusted source.
    yaml_dict = yaml.load(yaml_content, Loader=yaml.FullLoader)
    title = yaml_dict["title"]
    date = yaml_dict["date"]

    # Break the body into chunks of about one paragraph
    # (separated by two newlines)
    chunks = post_content.split("\n\n")

    # The counter advances for skipped chunks too, so chunk_id values stay
    # stable for the chunks that are kept.
    for i, c in enumerate(chunks, start=1):
        # skip empty and whitespace-only chunks
        if not c.strip():
            continue

        search_items.append({
            "@search.action": "upload",
            "chunk_id": f"{post_name}-{i}",
            "filename": post_name,
            "title": title,
            # Midnight-UTC timestamp string built from the post date.
            # Assumes the front-matter date parsed to a date/datetime object;
            # a quoted date string would fail on strftime — TODO confirm.
            "date": f'{date.strftime("%Y-%m-%d")}T00:00:00Z',
            # flatten the paragraph onto a single line
            "content": c.replace("\n", " "),
        })

In [29]:
# Upload every chunk to the index and report whether ALL documents succeeded,
# listing any that failed. Errors are printed rather than raised.
if not search_items:
    print("No documents to upload")
else:
    try:
        result = search_client.upload_documents(documents=search_items)
        failed = [r for r in result if not r.succeeded]
        print(f"Upload of new document succeeded: {not failed}")
        for r in failed:
            print(f"  failed: {r.key}: {r.error_message}")
    except Exception as ex:
        # Generic exceptions have no `.message` attribute in Python 3;
        # printing the exception itself works for every exception type.
        print (ex)

Upload of new document succeeded: True


In [42]:
# Run a semantic query against the index and print every hit that carries a
# reranker score.
results = search_client.search(search_text="which hurricane has the author of this blog post experienced?", 
                               include_total_count=True, 
                               query_type="semantic", 
                               query_language="en", 
                               semantic_configuration_name="lucasmeyer-test-semantic-config",
                               top=10)

print ('Total Documents Matching Query:', results.get_count())
for result in results:
    # Compare against None explicitly: a plain truthiness test would also
    # drop a hit whose reranker score is exactly 0.0.
    if result.get("@search.reranker_score") is not None:
        print(result)

Total Documents Matching Query: 460
{'date': '2017-09-11T00:00:00Z', 'filename': '2017-09-11-hurricane-irma', 'content': 'Hurricane Irma happened when I was out on a business trip, and my family was in Parkland, FL. Forecasts went from "maybe it will hit you" to "if you stay you\'ll surely die" to "I think it\'s not going to hit you anymore" to "Phew, it will miss" in four days.', 'chunk_id': '2017-09-11-hurricane-irma-3', 'title': 'Hurricane Irma', '@search.score': 8.536456, '@search.reranker_score': 0.616607666015625, '@search.highlights': None, '@search.captions': None}
{'date': '2017-09-11T00:00:00Z', 'filename': '2017-09-11-hurricane-irma', 'content': 'The uncertainty seemed higher for places that were north of Tampa: it was hard to tell whether the hurricane would turn either way in 3 days. On the other hand, the forecasts are better in close proximity to the hurricane, and it seemed pretty certain that the hurricane would miss the Parkland/Boca Raton area. So we made the somewha