# SharePoint Indexer Setup (Traditional Approach)

This notebook sets up the **indexer-based pipeline** to crawl SharePoint documents into an Azure AI Search index with:
- Text chunking
- Vector embeddings (text-embedding-ada-002)
- Semantic ranking
- ACL-based security trimming

**Pipeline**: Data Source → Index → Skillset → Indexer

**API Version**: `2025-11-01-preview` (SharePoint indexer is in public preview)

> **Prerequisites**:
> - Azure AI Search (Basic+) with system-assigned managed identity enabled
> - Azure OpenAI with `text-embedding-ada-002` deployed
> - Entra ID app registration with `Files.Read.All` + `Sites.FullControl.All` (Application permissions)
> - Admin consent granted in the tenant
> - Copy `.env.template` to `.env` in this `notebooks/` folder and fill in values

## Step 0: Configuration

Loads values from `notebooks/.env` (same folder as this notebook).

In [None]:
import os
import json
import requests
import time
from dotenv import load_dotenv

# Read settings from a .env file into the process environment.
# load_dotenv() is called without an explicit path.
load_dotenv()

# --- Azure AI Search service ---
SEARCH_URL = os.environ.get("AZURE_SEARCH_ENDPOINT")
SEARCH_API_KEY = os.environ.get("AZURE_SEARCH_ADMIN_KEY")
API_VERSION = os.environ.get("AZURE_SEARCH_API_VERSION", "2025-11-01-preview")

# --- Azure OpenAI embedding model (deployment and model default to ada-002) ---
AOAI_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT")
AOAI_DEPLOYMENT = os.environ.get("AZURE_OPENAI_EMBEDDING_DEPLOYMENT", "text-embedding-ada-002")
AOAI_MODEL = os.environ.get("AZURE_OPENAI_EMBEDDING_MODEL", "text-embedding-ada-002")

# --- SharePoint site and the Entra ID app registration used to read it ---
SPO_ENDPOINT = os.environ.get("SPO_ENDPOINT")
SPO_APP_ID = os.environ.get("SPO_APP_ID")
SPO_APP_SECRET = os.environ.get("SPO_APP_SECRET")
SPO_TENANT_ID = os.environ.get("SPO_TENANT_ID")

# Headers sent with every Azure AI Search request (admin-key authentication).
HEADERS = {"Content-Type": "application/json", "api-key": SEARCH_API_KEY}

# Names of the four pipeline resources created by this notebook.
DATASOURCE_NAME = "sharepoint-datasource"
INDEX_NAME = "sharepoint-docs-index"
SKILLSET_NAME = "sharepoint-vectorization-skillset"
INDEXER_NAME = "sharepoint-indexer"

# Settings that have no default and must therefore be present (and non-empty).
required_env_vars = (
    "AZURE_SEARCH_ENDPOINT",
    "AZURE_SEARCH_ADMIN_KEY",
    "AZURE_OPENAI_ENDPOINT",
    "SPO_ENDPOINT",
    "SPO_APP_ID",
    "SPO_APP_SECRET",
    "SPO_TENANT_ID",
)
missing = [name for name in required_env_vars if not os.environ.get(name)]

# Report the outcome; secrets are deliberately never echoed.
if not missing:
    print(f"Search endpoint: {SEARCH_URL}")
    print(f"OpenAI endpoint: {AOAI_ENDPOINT}")
    print(f"SharePoint site: {SPO_ENDPOINT}")
    print("Configuration loaded \u2713")
else:
    print(f"\u274c Missing env vars: {', '.join(missing)}")
    print("   Copy .env.template to .env and fill in values.")

### Helper function

In [None]:
def call_search_api(method, path, body=None, expected_status=None, timeout=30):
    """Make a call to the Azure AI Search REST API.

    Args:
        method: HTTP verb, e.g. "GET", "POST", "PUT" or "DELETE".
        path: Resource path relative to the service endpoint (e.g. "indexes").
            A leading slash is tolerated.
        body: Optional JSON-serializable payload sent as the request body.
        expected_status: Optional status code, or collection of status codes,
            that count as success. When omitted or empty, every status is
            treated as success.
        timeout: Seconds to wait for the service before giving up.

    Returns:
        The parsed JSON response (or the raw text if it is not valid JSON)
        when the call succeeded and returned a body. None when the endpoint
        is not configured, the request could not be sent, the status was not
        expected, or the response body was empty.
    """
    if not SEARCH_URL:
        print(f"\u274c {method} {path} \u2192 AZURE_SEARCH_ENDPOINT is not configured")
        return None

    # Join endpoint and path with exactly one slash, even if the configured
    # endpoint ends with "/" or the path starts with one.
    base_url = SEARCH_URL.rstrip("/")
    relative_path = path.lstrip("/")
    url = f"{base_url}/{relative_path}"
    params = {"api-version": API_VERSION}

    # Allow a bare status code as well as a list of them.
    if isinstance(expected_status, int):
        expected_status = [expected_status]

    try:
        # Without a timeout, requests waits forever on an unresponsive service.
        response = requests.request(
            method, url, headers=HEADERS, params=params, json=body, timeout=timeout
        )
    except requests.RequestException as exc:
        print(f"\u274c {method} {path} \u2192 request failed: {exc}")
        return None

    status = response.status_code
    if expected_status and status not in expected_status:
        print(f"\u274c {method} {path} \u2192 HTTP {status}")
        print(response.text)
        return None

    print(f"\u2713 {method} {path} \u2192 HTTP {status}")
    if response.text:
        try:
            return response.json()
        except ValueError:
            # Body is present but not JSON; hand back the raw text.
            return response.text
    return None

## Step 1: Create Data Source

Creates a SharePoint data source with application permissions and ACL ingestion (`userIds`, `groupIds`).

In [None]:
# The SharePoint credentials travel as one semicolon-delimited connection string.
spo_connection_parts = [
    f"SharePointOnlineEndpoint={SPO_ENDPOINT}",
    f"ApplicationId={SPO_APP_ID}",
    f"ApplicationSecret={SPO_APP_SECRET}",
    f"TenantId={SPO_TENANT_ID}",
]

datasource_body = dict(
    name=DATASOURCE_NAME,
    type="sharepoint",
    # Ask the indexer to ingest user and group ACL entries.
    indexerPermissionOptions=["userIds", "groupIds"],
    credentials={"connectionString": ";".join(spo_connection_parts)},
    # No query is supplied for the "defaultSiteLibrary" container.
    container={"name": "defaultSiteLibrary", "query": None},
)

# POST creates the data source; only HTTP 201 counts as success.
result = call_search_api("POST", "datasources", body=datasource_body, expected_status=[201])
if result:
    print(f"Data source '{DATASOURCE_NAME}' created successfully.")

## Step 2: Create Search Index

Creates the search index with:
- **Text fields**: `content`, `title` (keyword + semantic search)
- **Vector field**: `content_vector` (1536 dims, HNSW, cosine)
- **ACL fields**: `UserIds`, `GroupIds` (permission filtering)
- **Semantic config**: title + content prioritization
- **Vectorizer**: Azure OpenAI for query-time vectorization

> This must be created **before** the skillset (index projections reference the index).

In [None]:
# Names that are referenced from more than one place in the definition.
hnsw_algorithm_name = "hnsw-algorithm"
vector_profile_name = "vector-profile-hnsw"
vectorizer_name = "openai-vectorizer"
semantic_config_name = "default"

# Field schema: key, parent link, text, vector, metadata and ACL fields.
index_fields = [
    dict(name="id", type="Edm.String", key=True, filterable=True, analyzer="keyword"),
    dict(name="parent_id", type="Edm.String", filterable=True),
    dict(name="metadata_spo_item_path", type="Edm.String", filterable=False, retrievable=True),
    dict(name="title", type="Edm.String", searchable=True, filterable=True, retrievable=True),
    dict(name="content", type="Edm.String", searchable=True, retrievable=True),
    # Embedding of the chunk text; searchable but neither retrievable nor stored.
    dict(
        name="content_vector",
        type="Collection(Edm.Single)",
        searchable=True,
        retrievable=False,
        stored=False,
        dimensions=1536,
        vectorSearchProfile=vector_profile_name,
    ),
    dict(name="source_url", type="Edm.String", retrievable=True, filterable=False),
    dict(name="last_modified", type="Edm.DateTimeOffset", filterable=True, sortable=True),
    dict(name="file_type", type="Edm.String", filterable=True, facetable=True),
    # Permission-filter fields; not returned to callers.
    dict(
        name="UserIds",
        type="Collection(Edm.String)",
        permissionFilter="userIds",
        filterable=True,
        retrievable=False,
    ),
    dict(
        name="GroupIds",
        type="Collection(Edm.String)",
        permissionFilter="groupIds",
        filterable=True,
        retrievable=False,
    ),
]

# HNSW with cosine distance, plus an Azure OpenAI vectorizer attached to the profile.
vector_search_settings = {
    "algorithms": [
        {
            "name": hnsw_algorithm_name,
            "kind": "hnsw",
            "hnswParameters": {"m": 4, "efConstruction": 400, "efSearch": 100, "metric": "cosine"},
        }
    ],
    "profiles": [
        {
            "name": vector_profile_name,
            "algorithm": hnsw_algorithm_name,
            "vectorizer": vectorizer_name,
        }
    ],
    "vectorizers": [
        {
            "name": vectorizer_name,
            "kind": "azureOpenAI",
            "azureOpenAIParameters": {
                "resourceUri": AOAI_ENDPOINT,
                "deploymentId": AOAI_DEPLOYMENT,
                "modelName": AOAI_MODEL,
            },
        }
    ],
}

# Semantic configuration: title as the title field, content as the content field.
semantic_settings = {
    "defaultConfiguration": semantic_config_name,
    "configurations": [
        {
            "name": semantic_config_name,
            "prioritizedFields": {
                "titleField": {"fieldName": "title"},
                "prioritizedContentFields": [{"fieldName": "content"}],
            },
        }
    ],
}

index_body = {
    "name": INDEX_NAME,
    "fields": index_fields,
    "permissionFilterOption": "enabled",
    "vectorSearch": vector_search_settings,
    "semantic": semantic_settings,
}

# POST creates the index; only HTTP 201 counts as success.
result = call_search_api("POST", "indexes", body=index_body, expected_status=[201])
if result:
    print(f"Index '{INDEX_NAME}' created successfully.")

## Step 3: Create Skillset

Creates the skillset with:
- **Text Split Skill**: chunks documents into 2000-character pages with 500-char overlap
- **Azure OpenAI Embedding Skill**: generates 1536-dim vectors for each chunk
- **Index Projections**: maps chunks into the index as separate documents

> Run this **after** the index is created (index projections reference the index).

In [None]:
# Enrichment-tree path of the chunks produced by the split skill.
chunk_path = "/document/chunks/*"

# Skill 1: split /document/content into overlapping pages, emitted as "chunks".
split_skill = {
    "@odata.type": "#Microsoft.Skills.Text.SplitSkill",
    "name": "text-chunking",
    "description": "Split documents into chunks for embedding",
    "context": "/document",
    "textSplitMode": "pages",
    "maximumPageLength": 2000,
    "pageOverlapLength": 500,
    "unit": "characters",
    "defaultLanguageCode": "en",
    "inputs": [{"name": "text", "source": "/document/content"}],
    "outputs": [{"name": "textItems", "targetName": "chunks"}],
}

# Skill 2: embed every chunk; the vector is stored as "content_vector" on the chunk.
embedding_skill = {
    "@odata.type": "#Microsoft.Skills.Text.AzureOpenAIEmbeddingSkill",
    "name": "embedding",
    "description": "Generate embeddings for each chunk",
    "context": chunk_path,
    "resourceUri": AOAI_ENDPOINT,
    "deploymentId": AOAI_DEPLOYMENT,
    "modelName": AOAI_MODEL,
    "dimensions": 1536,
    "inputs": [{"name": "text", "source": chunk_path}],
    "outputs": [{"name": "embedding", "targetName": "content_vector"}],
}

# (index field, enrichment source) pairs used to project each chunk into the index.
projection_mappings = [
    {"name": field_name, "source": source_path}
    for field_name, source_path in [
        ("content", "/document/chunks/*"),
        ("content_vector", "/document/chunks/*/content_vector"),
        ("title", "/document/metadata_spo_item_name"),
        ("source_url", "/document/metadata_spo_item_weburi"),
        ("last_modified", "/document/metadata_spo_item_last_modified"),
        ("file_type", "/document/metadata_spo_item_extension"),
        ("UserIds", "/document/metadata_user_ids"),
        ("GroupIds", "/document/metadata_group_ids"),
    ]
]

skillset_body = {
    "name": SKILLSET_NAME,
    "skills": [split_skill, embedding_skill],
    "indexProjections": {
        "selectors": [
            {
                "targetIndexName": INDEX_NAME,
                "parentKeyFieldName": "parent_id",
                "sourceContext": chunk_path,
                "mappings": projection_mappings,
            }
        ],
        # Projection mode is set so that parent documents are not indexed.
        "parameters": {"projectionMode": "skipIndexingParentDocuments"},
    },
}

# POST creates the skillset; only HTTP 201 counts as success.
result = call_search_api("POST", "skillsets", body=skillset_body, expected_status=[201])
if result:
    print(f"Skillset '{SKILLSET_NAME}' created successfully.")

## Step 4: Create Indexer

Creates the indexer that ties everything together:
- Connects to the SharePoint data source
- Targets the search index
- Uses the skillset for chunking + embedding
- Runs on a 2-hour schedule
- Maps ACL fields (`UserIds`, `GroupIds`) from SharePoint metadata

In [None]:
# File types to crawl and to skip, sent as comma-separated extension lists.
indexed_extensions = [".pdf", ".docx", ".pptx", ".xlsx", ".txt"]
excluded_extensions = [".png", ".jpg", ".jpeg", ".gif"]

# Batch size and failure limits (-1 for both limits), plus extraction settings.
indexer_parameters = {
    "batchSize": 10,
    "maxFailedItems": -1,
    "maxFailedItemsPerBatch": -1,
    "configuration": {
        "indexedFileNameExtensions": ",".join(indexed_extensions),
        "excludedFileNameExtensions": ",".join(excluded_extensions),
        "dataToExtract": "contentAndMetadata",
    },
}

# Source-to-index field mappings: the base64-encoded item id becomes the key,
# and the ACL metadata feeds the permission-filter fields.
indexer_field_mappings = [
    {
        "sourceFieldName": "metadata_spo_site_library_item_id",
        "targetFieldName": "id",
        "mappingFunction": {"name": "base64Encode"},
    },
    {"sourceFieldName": "metadata_user_ids", "targetFieldName": "UserIds"},
    {"sourceFieldName": "metadata_group_ids", "targetFieldName": "GroupIds"},
]

indexer_body = dict(
    name=INDEXER_NAME,
    dataSourceName=DATASOURCE_NAME,
    targetIndexName=INDEX_NAME,
    skillsetName=SKILLSET_NAME,
    schedule={"interval": "PT2H"},  # ISO-8601 duration: every 2 hours
    parameters=indexer_parameters,
    fieldMappings=indexer_field_mappings,
)

# POST creates the indexer; only HTTP 201 counts as success.
result = call_search_api("POST", "indexers", body=indexer_body, expected_status=[201])
if result:
    print(f"Indexer '{INDEXER_NAME}' created successfully.")
    print("The indexer will begin crawling SharePoint. This may take several minutes.")
    print("The indexer will begin crawling SharePoint. This may take several minutes.")

## Step 5: Check Indexer Status

Monitor the indexer's progress. Run this cell periodically until the status shows `success` (a run that is still going reports `inProgress`).

In [None]:
result = call_search_api("GET", f"indexers/{INDEXER_NAME}/status", expected_status=[200])
if result:
    # "lastResult" can be present but null (e.g. before any run has been
    # recorded), so fall back to an empty dict instead of calling .get() on None.
    last_result = result.get("lastResult") or {}

    status = last_result.get("status", "unknown")
    # The REST payload reports counts as itemsProcessed / itemsFailed; the
    # older key names are kept as a fallback.
    doc_count = last_result.get("itemsProcessed", last_result.get("itemCount", 0))
    failed = last_result.get("itemsFailed", last_result.get("failedItemCount", 0))

    print(f"Status:         {status}")
    print(f"Documents:      {doc_count}")
    print(f"Failed items:   {failed}")

    # Show at most the first five errors to keep the cell output short.
    errors = last_result.get("errors") or []
    if errors:
        print("Errors:")
        for err in errors[:5]:
            # Item-level errors carry their text in "errorMessage".
            message = err.get("errorMessage") or err.get("message", "")
            print(f"  - {message}")

## Step 6: Test Search (Hybrid: Keyword + Vector + Semantic)

Run a hybrid search query against the index.

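> If `permissionFilterOption` is `enabled`, queries sent without a user access token (the `x-ms-query-source-authorization` header) return no results. This cell authenticates with the admin API key only, so use Step 7 to toggle ACLs off for testing.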

In [None]:
SEARCH_QUERY = "What do you know about ZAVA?"  # Change this to your query

# Maximum number of content characters shown per result.
preview_length = 200

# Hybrid request: keyword search with semantic ranking, plus a vector query
# whose text is vectorized by the service at query time.
search_body = {
    "search": SEARCH_QUERY,
    "queryType": "semantic",
    "semanticConfiguration": "default",
    "select": "title, content, source_url",
    "top": 5,
    "count": True,
    "vectorQueries": [
        {
            "kind": "text",
            "text": SEARCH_QUERY,
            "fields": "content_vector",
            "k": 5
        }
    ]
}

result = call_search_api("POST", f"indexes/{INDEX_NAME}/docs/search", search_body, expected_status=[200])
if result:
    count = result.get("@odata.count", 0)
    print(f"Total matching documents: {count}\n")
    for i, doc in enumerate(result.get("value", []), 1):
        # Selected fields can come back as null rather than being absent, so
        # use `or` instead of a .get() default to apply the fallbacks.
        title = doc.get("title") or "N/A"
        content = doc.get("content") or ""
        url = doc.get("source_url") or "N/A"

        # Only mark the preview as truncated when text was actually cut off.
        content_preview = content[:preview_length]
        suffix = "..." if len(content) > preview_length else ""

        print(f"--- Result {i} ---")
        print(f"Title:   {title}")
        print(f"URL:     {url}")
        print(f"Content: {content_preview}{suffix}\n")

## Step 7: Toggle ACL Filter (for testing)

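Toggles the index's `permissionFilterOption` between `enabled` and `disabled`. Disable it temporarily to allow queries without a user token; run the cell again to re-enable it.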

**Always re-enable before going to production!**

In [None]:
index_def = call_search_api("GET", f"indexes/{INDEX_NAME}", expected_status=[200])

if index_def:
    # Treat a missing or null setting as "unknown"; anything other than
    # "enabled" is switched to "enabled".
    current = index_def.get("permissionFilterOption") or "unknown"
    new_value = "disabled" if current == "enabled" else "enabled"

    # Build the update payload without the read-only OData annotations.
    read_only_keys = ("@odata.context", "@odata.etag")
    updated_def = {key: value for key, value in index_def.items() if key not in read_only_keys}
    updated_def["permissionFilterOption"] = new_value

    call_search_api("PUT", f"indexes/{INDEX_NAME}", updated_def, expected_status=[200, 204])

    # The helper returns None both for a failed call and for a successful call
    # with an empty body, so re-read the index to confirm the change.
    confirmed_def = call_search_api("GET", f"indexes/{INDEX_NAME}", expected_status=[200])
    applied = (confirmed_def or {}).get("permissionFilterOption")

    if applied == new_value:
        print(f"\nACL filter toggled: {current} \u2192 {new_value}")
    else:
        print(f"\n\u274c ACL filter was not changed (current value: {applied or current})")

## Cleanup (Optional)

Delete all pipeline resources in reverse order of creation (indexer → skillset → index → data source).

> This will permanently delete the index and all indexed data.

In [None]:
# Uncomment the lines below to delete all pipeline resources (a 404 means the resource was already gone)
# call_search_api("DELETE", f"indexers/{INDEXER_NAME}", expected_status=[204, 404])
# call_search_api("DELETE", f"skillsets/{SKILLSET_NAME}", expected_status=[204, 404])
# call_search_api("DELETE", f"indexes/{INDEX_NAME}", expected_status=[204, 404])
# call_search_api("DELETE", f"datasources/{DATASOURCE_NAME}", expected_status=[204, 404])
# print("All resources deleted.")