**Copy Books Dataset to Azure AI Search**

- Create the Index
- Validate Data
- Copy Data to the Index


In [None]:
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import SearchIndex, SimpleField, SearchFieldDataType, SearchField, ComplexField

# Create (or update) the "books-index" index in Azure AI Search.
#
# Connection settings are taken from the environment when available so the
# admin key does not have to be pasted into the notebook; the placeholders
# below are only fallbacks and must be replaced before running otherwise.
import os  # imported here so this cell stays self-contained

service_name = os.environ.get('AZURE_SEARCH_SERVICE_NAME', 'xxxxxxxxx')
admin_key = os.environ.get('AZURE_SEARCH_ADMIN_KEY', 'xxxxxxxxxx')
index_name = 'books-index'

# Create a client for index management operations
endpoint = f'https://{service_name}.search.windows.net'
admin_client = SearchIndexClient(endpoint=endpoint, credential=AzureKeyCredential(admin_key))

# Define the fields of the index.
# NOTE: the Python SDK expresses "retrievable" through `hidden` (hidden=False
# means the field is returned in results); `retrievable` is not a documented
# SearchField/SimpleField parameter, so it is spelled as `hidden=False` here.
fields = [
    # Document key: every uploaded document must carry a unique string "id".
    SimpleField(name="id", type=SearchFieldDataType.String, key=True, hidden=False, stored=True),
    SearchField(name="Title", type=SearchFieldDataType.String, searchable=True, filterable=True, hidden=False, stored=True, analyzer_name="standard.lucene"),
    SearchField(name="Description", type=SearchFieldDataType.String, searchable=True, filterable=False, hidden=False, stored=True, analyzer_name="standard.lucene"),
    SearchField(name="Author", type=SearchFieldDataType.String, searchable=True, filterable=True, hidden=False, stored=True, sortable=True, facetable=True, analyzer_name="standard.lucene"),
    # Collection fields cannot be sortable, hence no `sortable` on Genres.
    SearchField(name="Genres", type=SearchFieldDataType.Collection(SearchFieldDataType.String), searchable=True, filterable=True, hidden=False, stored=True, facetable=True, analyzer_name="standard.lucene"),
    SimpleField(name="Rating", type=SearchFieldDataType.Double, filterable=True, hidden=False, stored=True, sortable=True, facetable=True),
]

# Define the index
index = SearchIndex(name=index_name, fields=fields)

# Create the index, or update it if it already exists, so that re-running
# this cell does not fail with a "resource already exists" error.
admin_client.create_or_update_index(index)
print(f'Index "{index_name}" created or updated successfully.')


In [None]:
from azure.search.documents import SearchClient
from azure.core.credentials import AzureKeyCredential
import json

# Validate the books JSON export and upload the valid documents to the
# "books-index" index in Azure AI Search.
import os  # imported here so this cell stays self-contained

# Azure AI Search accepts at most 1000 documents per indexing request.
MAX_BATCH_SIZE = 1000


def _coerce_rating(doc):
    """Convert ``doc['Rating']`` to ``float`` in place.

    Returns an error message if the value cannot be converted, else ``None``.
    A missing ``Rating`` key is not an error.
    """
    if 'Rating' not in doc:
        return None
    try:
        doc['Rating'] = float(doc['Rating'])
    except (TypeError, ValueError):
        # TypeError covers None / list values, ValueError covers bad strings.
        return f"Invalid value for Rating in document ID {doc.get('id', 'unknown')}: {doc['Rating']}"
    return None


def _coerce_genres(doc):
    """Ensure ``doc['Genres']`` is a list of strings, decoding JSON text in place.

    Returns an error message if the value is not (or does not decode to) a
    list of strings, else ``None``. A missing ``Genres`` key is not an error.
    """
    if 'Genres' not in doc:
        return None
    genres = doc['Genres']
    doc_id = doc.get('id', 'unknown')
    if isinstance(genres, str):
        try:
            genres = json.loads(genres)
        except json.JSONDecodeError:
            return f"Invalid JSON format for Genres in document ID {doc_id}: {doc['Genres']}"
    # Checked after decoding too: valid JSON is not necessarily a list of strings.
    if not isinstance(genres, list) or not all(isinstance(genre, str) for genre in genres):
        return f"Unexpected format for Genres in document ID {doc_id}: {doc['Genres']}"
    doc['Genres'] = genres
    return None


def validate_documents(documents):
    """Normalise *documents* in place and split them into valid and invalid.

    Each document gets an empty ``Description`` if it has none, ``Rating`` is
    converted to ``float`` and ``Genres`` to a list of strings. Every problem
    found is printed. A document appears in exactly one of the returned lists,
    even when several of its fields are invalid.

    Returns:
        A ``(valid_documents, invalid_documents)`` tuple of lists.
    """
    valid, invalid = [], []
    for doc in documents:
        doc.setdefault('Description', '')  # Fill missing descriptions with an empty string
        problems = [msg for msg in (_coerce_rating(doc), _coerce_genres(doc)) if msg]
        for msg in problems:
            print(msg)
        (invalid if problems else valid).append(doc)
    return valid, invalid


def iter_batches(items, size=MAX_BATCH_SIZE):
    """Yield consecutive slices of *items* containing at most *size* elements.

    Raises:
        ValueError: if *size* is not a positive integer.
    """
    if size <= 0:
        raise ValueError("size must be a positive integer")
    for start in range(0, len(items), size):
        yield items[start:start + size]


def upload_in_batches(client, documents, batch_size=MAX_BATCH_SIZE):
    """Upload *documents* through *client* in batches of *batch_size*.

    Returns:
        A ``(succeeded_count, failed_keys)`` tuple built from the per-document
        indexing results returned by ``client.upload_documents``.
    """
    succeeded = 0
    failed_keys = []
    for batch in iter_batches(documents, batch_size):
        for item in client.upload_documents(documents=batch):
            if item.succeeded:
                succeeded += 1
            else:
                failed_keys.append(item.key)
    return succeeded, failed_keys


# True both in a notebook kernel and when run as a script; the guard only
# keeps the network/file I/O from running if the helpers above are imported.
if __name__ == "__main__":
    # Connection settings. The admin key is a secret and must not be committed
    # with the notebook: supply it through the environment (and rotate any key
    # that was ever embedded in source).
    service_endpoint = os.environ.get("AZURE_SEARCH_ENDPOINT", "https://azaivztqx.search.windows.net")
    index_name = "books-index"
    admin_key = os.environ.get("AZURE_SEARCH_ADMIN_KEY")
    if not admin_key:
        raise RuntimeError("Set the AZURE_SEARCH_ADMIN_KEY environment variable to the search service admin key.")

    # Create a SearchClient
    client = SearchClient(endpoint=service_endpoint, index_name=index_name, credential=AzureKeyCredential(admin_key))

    # Load the JSON data (the file is expected to hold a JSON array of documents)
    output_file = "/lakehouse/default/Files/bookersdata.json"
    with open(output_file, 'r', encoding='utf-8') as file:
        documents = json.load(file)

    # Validate each document
    valid_documents, invalid_documents = validate_documents(documents)

    # Log the number of valid and invalid documents
    print(f"Valid documents: {len(valid_documents)}")
    print(f"Invalid documents: {len(invalid_documents)}")

    # Upload valid documents to the Azure Search index
    if valid_documents:
        uploaded_count, failed_keys = upload_in_batches(client, valid_documents)
        print(f"Uploaded {uploaded_count} of {len(valid_documents)} documents to the Azure Search index.")
        if failed_keys:
            print(f"Failed to index {len(failed_keys)} documents; keys: {failed_keys}")
    else:
        print("No valid documents to upload.")
