
# SharePoint Search Pipeline (Graph + Azure OpenAI + Azure AI Search)

This notebook ingests content from **SharePoint via Microsoft Graph Search**, chunks & embeds it with **Azure OpenAI**, indexes it into **Azure AI Search**, and lets you run **hybrid search**.

> Fill in the config in the **Configuration** cell below (tenant/app details and Azure keys), or set the matching environment variables. Then run the cells from top to bottom.



## Prerequisites

- **Azure AD App (application permissions):**
  - `SearchQuery.All` (Graph)
  - `Sites.Read.All` (Graph)
  - Optionally `Files.Read.All` if you need file bodies
  - **Admin consent granted**
- **Azure OpenAI** with an embeddings deployment (e.g., `text-embedding-3-large`)
- **Azure AI Search** (vector-enabled service)

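> ⚠️ This notebook uses `requests` against REST APIs (Microsoft Graph, Azure OpenAI, and Azure AI Search). No Azure service SDKs are required; `msal` is used only to acquire the Graph access token.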


In [None]:

# If running locally, uncomment to install requirements
# %pip install msal python-dotenv requests


## Configuration

In [None]:

import os

# Every setting below is looked up in the process environment first; the
# second argument is only a placeholder/default used when the variable is unset.

# --- Microsoft Entra ID / Graph ---
TENANT_ID = os.environ.get("TENANT_ID", "YOUR_TENANT_ID")
CLIENT_ID = os.environ.get("CLIENT_ID", "YOUR_CLIENT_ID")
CLIENT_SECRET = os.environ.get("CLIENT_SECRET", "YOUR_CLIENT_SECRET")
GRAPH_SCOPE = os.environ.get(
    "GRAPH_SCOPE",
    "https://graph.microsoft.com/.default",
)

# --- Azure OpenAI (Embeddings) ---
AZURE_OPENAI_ENDPOINT = os.environ.get(
    "AZURE_OPENAI_ENDPOINT",
    "https://YOUR-RESOURCE.openai.azure.com",
)
AZURE_OPENAI_KEY = os.environ.get("AZURE_OPENAI_KEY", "YOUR_AOAI_KEY")
AZURE_OPENAI_EMBED_DEPLOY = os.environ.get(
    "AZURE_OPENAI_EMBED_DEPLOY",
    "text-embedding-3-large",
)

# --- Azure AI Search ---
AI_SEARCH_ENDPOINT = os.environ.get(
    "AI_SEARCH_ENDPOINT",
    "https://YOUR-SEARCH.search.windows.net",
)
AI_SEARCH_KEY = os.environ.get("AI_SEARCH_KEY", "YOUR_SEARCH_KEY")
AI_SEARCH_INDEX = os.environ.get("AI_SEARCH_INDEX", "sp-hybrid-index")

# --- Chunking ---
# Sizes are measured in characters and parsed eagerly, so a non-numeric
# environment value fails here rather than deep inside the pipeline.
CHUNK_SIZE = int(os.environ.get("CHUNK_SIZE", "1500"))
CHUNK_OVERLAP = int(os.environ.get("CHUNK_OVERLAP", "200"))

print("Config loaded. Edit values above or set environment variables before running.")


## Auth & HTTP helpers

In [None]:

import msal, requests

def get_graph_token(tenant_id, client_id, client_secret, scope):
    """Acquire an app-only access token through the MSAL client-credentials flow.

    Args:
        tenant_id: Directory (tenant) id used to build the login authority URL.
        client_id: Application (client) id of the app registration.
        client_secret: Client secret of the app registration.
        scope: Single scope string to request.

    Returns:
        The ``access_token`` string from the MSAL result.

    Raises:
        RuntimeError: If the MSAL result carries no ``access_token``; the
            full result is embedded in the message.
    """
    authority_url = f"https://login.microsoftonline.com/{tenant_id}"
    confidential_app = msal.ConfidentialClientApplication(
        client_id=client_id,
        client_credential=client_secret,
        authority=authority_url,
    )

    # Try the token cache first and only fall back to a fresh request
    # when the silent lookup returns nothing usable.
    token_result = confidential_app.acquire_token_silent([scope], account=None)
    if not token_result:
        token_result = confidential_app.acquire_token_for_client(scopes=[scope])

    if "access_token" in token_result:
        return token_result["access_token"]
    raise RuntimeError(f"Graph token error: {token_result}")

def graph_get(url, token, **kwargs):
    """Send an authenticated GET request to Microsoft Graph.

    Args:
        url: Absolute request URL.
        token: Bearer access token.
        **kwargs: Extra keyword arguments forwarded to ``requests.get``.
            Caller-supplied ``headers`` are merged with the Authorization
            header instead of colliding with it, and ``timeout`` defaults
            to 30 seconds when not given.

    Returns:
        The ``requests.Response`` object; status is not checked here.
    """
    # Merge rather than pass both: a second ``headers=`` keyword would raise TypeError.
    headers = {**(kwargs.pop("headers", None) or {}), "Authorization": f"Bearer {token}"}
    # Without a timeout a stalled connection blocks the notebook cell forever.
    kwargs.setdefault("timeout", 30)
    return requests.get(url, headers=headers, **kwargs)

def graph_post(url, token, json, timeout=30):
    """Send an authenticated JSON POST request to Microsoft Graph.

    Args:
        url: Absolute request URL.
        token: Bearer access token.
        json: JSON-serialisable request body. The parameter name is kept
            for existing keyword callers even though it shadows the
            ``json`` module inside this function.
        timeout: Seconds to wait before giving up, so a stalled connection
            cannot hang the cell indefinitely.

    Returns:
        The ``requests.Response`` object; status is not checked here.
    """
    headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
    return requests.post(url, headers=headers, json=json, timeout=timeout)


## Microsoft Graph SharePoint Search

In [None]:

GRAPH_SEARCH_URL = "https://graph.microsoft.com/v1.0/search/query"

def sp_search(query="*", size=100, region=None):
    """Run a Microsoft Graph search over SharePoint content and collect the hits.

    Results are fetched page by page (``from``/``size``) until ``size`` hits
    have been collected or the service reports no more results.

    Args:
        query: Query string passed to Graph as ``queryString``.
        size: Maximum number of hits to return across all pages.
        region: Geographic region code for the search (for example ``"NAM"``
            or ``"EUR"``). Graph requires it for SharePoint/OneDrive searches
            made with application permissions, which is the flow this
            notebook uses. Falls back to the ``GRAPH_SEARCH_REGION``
            environment variable when omitted.

    Returns:
        A list of raw search-hit dicts (at most ``size`` items).

    Raises:
        requests.HTTPError: If Graph answers with an error status.
        RuntimeError: If a token cannot be acquired.
    """
    token = get_graph_token(TENANT_ID, CLIENT_ID, CLIENT_SECRET, GRAPH_SCOPE)
    region = region or os.getenv("GRAPH_SEARCH_REGION")
    max_page_size = 500  # documented upper bound for a single search page

    hits, offset = [], 0
    while len(hits) < size:
        request = {
            "entityTypes": ["driveItem", "listItem", "site", "list"],
            "query": {"queryString": query},
            # "contentSources" is intentionally not sent: it is documented as
            # applying only to externalItem (connector) searches.
            "from": offset,
            "size": min(size - len(hits), max_page_size),
        }
        if region:
            request["region"] = region

        resp = graph_post(GRAPH_SEARCH_URL, token, json={"requests": [request]})
        resp.raise_for_status()

        page, more_available = [], False
        for res in resp.json().get("value", []):
            for container in res.get("hitsContainers", []):
                page += container.get("hits", [])
                more_available = more_available or bool(container.get("moreResultsAvailable"))

        hits += page
        # Stop on an empty page as well, so a misreported flag cannot loop forever.
        if not page or not more_available:
            break
        offset += len(page)
    return hits[:size]

# Quick smoke test (requires valid credentials):
# hits = sp_search("policy", size=5)
# len(hits), hits[0].keys() if hits else None


## Chunking utility

In [None]:

import re

def split_text(text, chunk_size=1500, overlap=200):
    """Split text into overlapping chunks, preferring sentence/line boundaries.

    The text is first cut at blank lines, newlines and sentence punctuation
    (delimiters are kept). Pieces are then packed greedily into chunks of at
    most ``chunk_size`` characters. Each chunk after the first starts with
    the last ``overlap`` characters of the previous chunk, so context carries
    across boundaries. No input text is dropped: a piece that does not fit
    starts the next chunk in full, and a piece longer than a chunk is
    hard-split.

    Args:
        text: Text to split. ``None`` or an empty string yields nothing.
        chunk_size: Maximum chunk length in characters; must be positive.
        overlap: Characters repeated from the end of the previous chunk.
            Clamped to the range ``0 .. chunk_size - 1``.

    Yields:
        Non-blank chunk strings, each at most ``chunk_size`` characters long.

    Raises:
        ValueError: If ``chunk_size`` is not positive (raised on first iteration).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not text:
        return

    overlap = max(0, min(overlap, chunk_size - 1))
    # Largest amount of new text that always fits after the overlap seed.
    step = chunk_size - overlap

    buf, cur = [], 0
    for piece in re.split(r"(\n\n|\n|\.|\?|!)", text):
        # Hard-split oversize pieces so every segment fits in a fresh chunk;
        # empty pieces produce no segments and are skipped naturally.
        for start in range(0, len(piece), step):
            segment = piece[start:start + step]
            if buf and cur + len(segment) > chunk_size:
                chunk = "".join(buf)
                if chunk.strip():
                    yield chunk
                # Seed the next chunk with the tail of the one just emitted.
                tail = chunk[-overlap:] if overlap else ""
                buf, cur = ([tail], len(tail)) if tail else ([], 0)
            buf.append(segment)
            cur += len(segment)

    if buf:
        chunk = "".join(buf)
        if chunk.strip():
            yield chunk


## Azure OpenAI Embeddings

In [None]:

def embed_texts(texts, batch_size=64, timeout=60):
    """Embed texts with the configured Azure OpenAI embeddings deployment.

    Inputs are sent in batches so a large ingestion run does not exceed the
    service's per-request input limits, and each batch's embeddings are
    ordered by the ``index`` field of the response so the output lines up
    with the input.

    Args:
        texts: A list of strings (a single string is treated as one input).
        batch_size: Maximum number of inputs per request (minimum 1).
        timeout: Per-request timeout in seconds.

    Returns:
        A list of embedding vectors, one per input text, in input order.
        An empty input returns an empty list without calling the service.

    Raises:
        requests.HTTPError: If the service answers with an error status.
    """
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)
    if not texts:
        return []

    url = f"{AZURE_OPENAI_ENDPOINT.rstrip('/')}"
    url += f"/openai/deployments/{AZURE_OPENAI_EMBED_DEPLOY}/embeddings?api-version=2024-02-01"
    headers = {"api-key": AZURE_OPENAI_KEY, "Content-Type": "application/json"}
    batch_size = max(1, int(batch_size))

    vectors = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        r = requests.post(url, headers=headers, json={"input": batch}, timeout=timeout)
        r.raise_for_status()
        items = sorted(r.json()["data"], key=lambda d: d.get("index", 0))
        vectors.extend(d["embedding"] for d in items)
    return vectors


## Azure AI Search client

In [None]:

import json

def ai_search_headers():
    """Build the HTTP headers used for every Azure AI Search REST call.

    Returns:
        A new dict with the JSON content type and the service ``api-key``.
    """
    headers = {"Content-Type": "application/json"}
    headers["api-key"] = AI_SEARCH_KEY
    return headers

def ai_search_create_index_if_missing(dimensions=None, timeout=30):
    """Create the Azure AI Search index unless it already exists.

    The index is looked up first and left untouched when present, so an
    existing schema is never overwritten. The schema uses the REST property
    names for vector fields (``dimensions`` and ``vectorSearchProfile``) and
    defines the HNSW algorithm configuration that the profile references.

    Args:
        dimensions: Length of the embedding vectors stored in ``vector``.
            Defaults to the ``EMBEDDING_DIMENSIONS`` environment variable,
            or 3072 (the native output size of ``text-embedding-3-large``,
            the default embeddings deployment here). It must match what the
            embeddings deployment actually returns.
        timeout: Per-request timeout in seconds.

    Raises:
        requests.HTTPError: If the lookup or creation fails.
    """
    url = f"{AI_SEARCH_ENDPOINT.rstrip('/')}/indexes/{AI_SEARCH_INDEX}?api-version=2024-07-01"

    existing = requests.get(url, headers=ai_search_headers(), timeout=timeout)
    if existing.status_code == 200:
        return  # already there; do not attempt an in-place schema update
    if existing.status_code != 404:
        existing.raise_for_status()

    if dimensions is None:
        dimensions = int(os.getenv("EMBEDDING_DIMENSIONS", "3072"))

    schema = {
      "name": AI_SEARCH_INDEX,
      "fields": [
        {"name":"id","type":"Edm.String","key":True,"searchable":False},
        {"name":"title","type":"Edm.String","searchable":True},
        {"name":"content","type":"Edm.String","searchable":True},
        {"name":"url","type":"Edm.String","searchable":False},
        {"name":"siteDomain","type":"Edm.String","filterable":True},
        {"name":"fileType","type":"Edm.String","filterable":True},
        {"name":"vector","type":"Collection(Edm.Single)","searchable":True,
         "dimensions":dimensions,"vectorSearchProfile":"vprof"}
      ],
      "vectorSearch": {
        # A profile must reference an algorithm configuration by name.
        "algorithms": [{"name":"hnsw-default","kind":"hnsw"}],
        "profiles": [{"name":"vprof","algorithm":"hnsw-default"}]
      }
    }
    r = requests.put(url, headers=ai_search_headers(), data=json.dumps(schema), timeout=timeout)
    # 409 is tolerated in case the index was created concurrently.
    if r.status_code not in (200,201,204,409):
        r.raise_for_status()

def ai_search_upsert(docs, batch_size=100, timeout=60):
    """Upload documents to the index with ``mergeOrUpload`` semantics.

    Documents are sent in batches to stay under the service's per-request
    document-count and payload-size limits (vector fields make documents
    large). Per-document results are inspected, because the service can
    answer with a success-class status while individual documents failed.

    Args:
        docs: Iterable of document dicts matching the index schema.
        batch_size: Maximum number of documents per request (minimum 1).
        timeout: Per-request timeout in seconds.

    Raises:
        requests.HTTPError: If a request fails outright.
        RuntimeError: If the service reports failed documents in a batch.
    """
    docs = list(docs)
    if not docs:
        return  # nothing to send; avoid posting an empty batch

    url = f"{AI_SEARCH_ENDPOINT.rstrip('/')}/indexes/{AI_SEARCH_INDEX}/docs/index?api-version=2024-07-01"
    batch_size = max(1, int(batch_size))

    for start in range(0, len(docs), batch_size):
        batch = docs[start:start + batch_size]
        payload = {"value": [{"@search.action":"mergeOrUpload", **d} for d in batch]}
        r = requests.post(url, headers=ai_search_headers(), data=json.dumps(payload), timeout=timeout)
        r.raise_for_status()
        failed = [item for item in r.json().get("value", []) if not item.get("status", True)]
        if failed:
            sample = failed[0]
            raise RuntimeError(
                f"Azure AI Search rejected {len(failed)} of {len(batch)} documents; "
                f"first failure: key={sample.get('key')!r} "
                f"status={sample.get('statusCode')} error={sample.get('errorMessage')!r}"
            )

def ai_search_search(query, top=8, filters=None, vector=None):
    """Query the index with keyword search and, optionally, a vector query.

    Args:
        query: Full-text search string.
        top: Number of results to return; also used as ``k`` for the
            vector query.
        filters: Optional mapping of field name to string value. Each entry
            becomes an OData ``field eq 'value'`` clause and the clauses are
            joined with ``and``. Non-string values are ignored. Field names
            are inserted verbatim and must come from trusted code.
        vector: Optional query embedding; when given, a vector query against
            the ``vector`` field is added, making the search hybrid.

    Returns:
        The list of result documents from the response's ``value`` array.

    Raises:
        requests.HTTPError: If the service answers with an error status.
    """
    url = f"{AI_SEARCH_ENDPOINT.rstrip('/')}/indexes/{AI_SEARCH_INDEX}/docs/search?api-version=2024-07-01"
    body = {"search": query, "top": top}
    if vector is not None:
        # The REST API requires the "kind" discriminator and names the
        # neighbour count "k".
        body["vectorQueries"] = [{"kind": "vector", "vector": vector, "k": top, "fields": "vector"}]
    if filters:
        clauses = []
        for k,v in filters.items():
            if isinstance(v, str):
                # OData string literals escape a single quote by doubling it;
                # unescaped input would break (or alter) the filter expression.
                escaped = v.replace("'", "''")
                clauses.append(f"{k} eq '{escaped}'")
        if clauses:
            body["filter"] = " and ".join(clauses)
    r = requests.post(url, headers=ai_search_headers(), data=json.dumps(body), timeout=30)
    r.raise_for_status()
    return r.json().get("value", [])


## Ingestion pipeline

In [None]:

import hashlib

def fetch_text_from_hit(hit):
    """Return the best available text for a Graph search hit.

    The hit's summary is used as the baseline. Graph puts ``summary`` on
    the hit itself, next to ``resource``; a ``summary`` inside the resource
    is still honoured as a fallback. Hit-highlighting markup is removed so
    it does not end up in the index. If the resource exposes a download URL
    and the download is textual, the downloaded body replaces the summary.

    Args:
        hit: A search-hit dict.

    Returns:
        The extracted text, or an empty string when nothing is available.
    """
    props = hit.get("resource") or {}
    summary = hit.get("summary") or props.get("summary") or ""
    # Summaries carry <c0>…</c0> highlight tags and <ddd/> truncation markers.
    text = re.sub(r"</?c\d+>", "", summary).replace("<ddd/>", "…")

    # driveItem resources name the pre-authenticated link "@microsoft.graph.downloadUrl".
    download = props.get("@microsoft.graph.downloadUrl") or props.get("downloadUrl")
    if download:
        try:
            r = requests.get(download, timeout=10)
            if r.ok and r.headers.get("Content-Type","").startswith(("text/", "application/json")):
                text = r.text
        except Exception:
            # Deliberate best effort: any download problem falls back to the summary.
            pass
    return text

def build_doc_records(hits):
    """Turn search hits into index-ready chunk records.

    Each hit's text is split into chunks and every chunk becomes one record
    whose id is the SHA-1 of the hit URL plus the chunk text. Blank chunks
    are skipped, because they carry no content and would be sent to the
    embeddings call as empty input. Records with an id already seen are
    skipped, so the same URL + chunk is never emitted twice.

    Args:
        hits: Iterable of search-hit dicts.

    Returns:
        A ``(docs, chunks)`` tuple of equal-length lists: ``docs`` holds the
        record dicts (without vectors) and ``chunks[i]`` is the text of
        ``docs[i]``.
    """
    docs, chunks = [], []
    seen_ids = set()
    for h in hits:
        props = h.get("resource") or {}
        url = props.get("webUrl") or props.get("webUrlPreview") or ""
        title = props.get("name") or props.get("title") or "Untitled"
        fileType = (props.get("fileType") or props.get("fileExtension") or "").lower()
        # For "scheme://host/path" the host is the third "/"-separated part.
        siteDomain = url.split("/")[2] if "//" in url else ""
        body_text = fetch_text_from_hit(h)

        for c in split_text(body_text, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
            if not c or not c.strip():
                continue
            doc_id = hashlib.sha1((url + c).encode()).hexdigest()
            if doc_id in seen_ids:
                continue
            seen_ids.add(doc_id)
            docs.append({
                "id": doc_id,
                "title": title,
                "content": c,
                "url": url,
                "fileType": fileType,
                "siteDomain": siteDomain
            })
            chunks.append(c)
    return docs, chunks

def ingest_sharepoint(query="*", size=100):
    """Search SharePoint, embed the resulting chunks and index them.

    Steps: ensure the index exists, run the search, build chunk records,
    embed the chunk texts, attach each vector to its record and upload.

    Args:
        query: Search query string forwarded to ``sp_search``.
        size: Number of hits to request from ``sp_search``.

    Returns:
        ``{"indexed": n}`` where ``n`` is the number of records uploaded
        (0 when the search returns no hits or the hits produce no chunks).
    """
    ai_search_create_index_if_missing()

    search_hits = sp_search(query=query, size=size)
    records, texts = build_doc_records(search_hits) if search_hits else ([], [])
    if not texts:
        return {"indexed": 0}

    # Pair each record with its embedding positionally.
    for record, embedding in zip(records, embed_texts(texts)):
        record["vector"] = embedding

    ai_search_upsert(records)
    return {"indexed": len(records)}


## Search convenience function

In [None]:

def hybrid_search(query, top_k=8, filters=None):
    """Run a hybrid (keyword + vector) search and return display-ready results.

    Args:
        query: Natural-language query; used both as the keyword search text
            and as the input for the query embedding.
        top_k: Number of results to request.
        filters: Optional field-to-value mapping forwarded to ``ai_search_search``.

    Returns:
        A list of dicts with ``score``, ``title``, ``snippet`` (content cut
        to 320 characters, with an ellipsis when truncated), ``url``,
        ``fileType`` and ``siteDomain``.
    """
    query_vector = embed_texts([query])[0]
    raw_results = ai_search_search(query, top=top_k, filters=filters, vector=query_vector)

    shaped = []
    for item in raw_results:
        content = item.get("content", "")
        snippet = content[:320] + ("…" if len(content) > 320 else "")
        shaped.append({
            "score": item.get("@search.score", 0.0),
            "title": item.get("title", "Untitled"),
            "snippet": snippet,
            "url": item.get("url", ""),
            "fileType": item.get("fileType", ""),
            "siteDomain": item.get("siteDomain", ""),
        })
    return shaped

# Example (after ingest):
# hybrid_search("expense reimbursement policy", top_k=5, filters={"fileType":"pdf"})


## Run the pipeline

In [None]:

# 1) Create/ensure index exists, then ingest some SharePoint content
# result = ingest_sharepoint(query="*", size=100)
# result


In [None]:

# 2) Try a search
# results = hybrid_search("travel expense policy", top_k=5, filters=None)
# for r in results:
#     print(f"[{r['score']:.2f}] {r['title']} — {r['url']}")
#     print(r['snippet'])
#     print()
