In [None]:
# Install the third-party packages this notebook depends on onto the Databricks cluster.
# `%pip` is a notebook magic command, so this line is only valid inside a notebook cell.
%pip install openai pydantic python-dotenv

In [None]:
# Restart the Python process so that newly installed packages can be imported.
dbutils.library.restartPython()

In [None]:
# Import required modules
from pydantic import BaseModel, Field, field_validator
from datetime import datetime
import json
from pathlib import Path

# ==========================================
# PYDANTIC SCHEMAS - Data structure definitions
# ==========================================
# These classes define the EXPECTED FORMAT of the LLM output.
# Pydantic provides:
# 1. Type safety - every field has an explicit data type (str, list[str], dict...)
# 2. Validation - automatic format checks (e.g. an ISO 8601 date)
# 3. Default values - if the LLM omits a field, its default value is used
# 4. Field descriptions - metadata for the LLM prompt (explains what a field contains)

class ProjectMetadata(BaseModel):
    """Metadata identifying a business project.

    Fields:
    - project_name: Name of the project (e.g. "Supplier Risk Insights 2.0").
    - sponsor: Name of the project sponsor / requester.
    - submitted_at: Date the request was submitted, as a ``YYYY-MM-DD`` string.
    - extra: Dictionary of additional string fields (e.g. department,
      priority); defaults to an empty dict.

    Pydantic validates that every field has the declared type.
    """

    project_name: str = Field(description="Name of the project")
    sponsor: str = Field(description="Name of the project sponsor")
    submitted_at: str = Field(description="Date when the request was submitted, in ISO 8601 format (YYYY-MM-DD)")
    extra: dict[str, str] = Field(default_factory=dict, description="Additional metadata fields as key-value pairs")

    @field_validator('submitted_at')
    @classmethod
    def validate_iso_date(cls, v: str) -> str:
        """Check that ``submitted_at`` is a calendar date written as YYYY-MM-DD.

        Pydantic runs this validator after its own type check, so ``v`` is
        already a ``str``. The value is parsed with the ``%Y-%m-%d`` format
        and then compared with the canonical rendering of the parsed date.
        The round trip rejects anything that merely *parses* but is not
        written exactly as YYYY-MM-DD (missing zero padding, stray
        whitespace, non-ASCII digits).

        Args:
            v: The raw ``submitted_at`` string.

        Returns:
            The unchanged input string when it is a valid YYYY-MM-DD date.

        Raises:
            ValueError: If ``v`` is not a valid date in YYYY-MM-DD form.

        Examples:
            - "2025-10-28"          -> accepted
            - "2025-10-28T09:30:00" -> ValueError (time component)
            - "2025-1-5"            -> ValueError (not zero padded)
            - "28.10.2025"          -> ValueError
            - "unknown"             -> ValueError
        """
        message = f"Date must be in ISO 8601 format (YYYY-MM-DD), got: {v}"
        try:
            parsed = datetime.strptime(v, "%Y-%m-%d").date()
        except ValueError as exc:
            # Keep the parser's error as the cause for easier debugging.
            raise ValueError(message) from exc
        # strptime is lenient about zero padding; require the canonical form.
        if parsed.isoformat() != v:
            raise ValueError(message)
        return v


class BusinessRequest(BaseModel):
    """Complete structure of a parsed business request document.

    The model has nine parts:
    1. project_metadata - basic project information (nested object)
    2. goal - main objective of the project (string)
    3. scope_in - what IS in scope (string)
    4. scope_out - what is NOT in scope (string)
    5. entities - key business entities (list of strings)
    6. metrics - KPI metrics (list of strings)
    7. sources - data sources (list of strings)
    8. constraints - constraints and requirements (list of strings)
    9. deliverables - required outputs (list of strings)

    Defaults:
    - String fields default to "unknown" (used when the field is absent).
    - List fields default to an empty list rather than None.
    - project_metadata has no default and is therefore required.

    Typical usage:
    - validated = BusinessRequest(**json_dict)   -> validates the data
    - validated.model_dump()                     -> back to a plain dict for JSON storage
    - validated.project_metadata.sponsor         -> typed attribute access
    """

    project_metadata: ProjectMetadata = Field(
        description="Project metadata including name, sponsor, and submission date",
    )

    # Free-text sections; "unknown" marks a section that was not provided.
    goal: str = Field(
        description="Main goal or objective of the project",
        default="unknown",
    )
    scope_in: str = Field(
        description="What is included in the project scope",
        default="unknown",
    )
    scope_out: str = Field(
        description="What is explicitly excluded from the project scope",
        default="unknown",
    )

    # List sections; each instance gets its own fresh empty list by default.
    entities: list[str] = Field(
        description="Key business entities involved in the project",
        default_factory=list,
    )
    metrics: list[str] = Field(
        description="Key metrics or KPIs to be tracked",
        default_factory=list,
    )
    sources: list[str] = Field(
        description="Expected data sources for the project",
        default_factory=list,
    )
    constraints: list[str] = Field(
        description="Constraints, limitations, or special requirements",
        default_factory=list,
    )
    deliverables: list[str] = Field(
        description="Required deliverables or artifacts from the project",
        default_factory=list,
    )

# Confirmation message printed once the statements above in this cell have run.
print("‚úÖ Schemas defined successfully")


## 1. Configure Azure OpenAI with Databricks Secrets

**Setup Instructions:**
```bash
# Create secret scope (run once in Databricks CLI or UI)
databricks secrets create-scope --scope mcop

# Add Azure OpenAI credentials
databricks secrets put --scope mcop --key azure-openai-endpoint
databricks secrets put --scope mcop --key azure-openai-api-key
databricks secrets put --scope mcop --key azure-openai-deployment-name
```

**Expected values:**
- `azure-openai-endpoint`: https://minar-mhi2wuzy-swedencentral.cognitiveservices.azure.com/openai/v1/
- `azure-openai-deployment-name`: test-gpt-5-mini

In [None]:
# Load the Azure OpenAI connection settings from the "mcop" Databricks secret scope.
def _mcop_secret(key):
    """Return the secret stored under *key* in the ``mcop`` scope, stripped of surrounding whitespace."""
    return dbutils.secrets.get(scope="mcop", key=key).strip()


AZURE_ENDPOINT = _mcop_secret("azure-openai-endpoint")
AZURE_API_KEY = _mcop_secret("azure-openai-api-key")
DEPLOYMENT_NAME = _mcop_secret("azure-openai-deployment-name")

# A value that is empty after stripping counts as missing configuration.
if not (AZURE_ENDPOINT and AZURE_API_KEY and DEPLOYMENT_NAME):
    raise ValueError("Missing Azure configuration in Databricks secrets (scope: mcop)")

# Report the non-sensitive settings; the API key is deliberately not printed.
print("‚òÅÔ∏è Azure OpenAI configured from Databricks secrets")
print(f"   Endpoint: {AZURE_ENDPOINT}")
print(f"   Deployment: {DEPLOYMENT_NAME}")


## 2. Load Sample Business Document

In [None]:
# Hardcoded sample business document: a Markdown-formatted project request
# (Czech/English mix) stored as one multi-line string. It is the input text
# that is later embedded into the user message sent to the LLM.
business_document = """# ≈Ω√°dost o datov√Ω projekt ‚Äì Supplier Risk Insights 2.0

## Projekt
**N√°zev:** Supplier Risk Insights 2.0
**Sponzor:** Marek Hrub√Ω (VP Procurement Excellence)
**Datum:** 2025-10-28
**Oddƒõlen√≠:** Group Procurement Analytics
**Priorita:** Kritick√° ‚Äì Q4 OKR "Stabilizace dodavatelsk√©ho ≈ôetƒõzce"

## C√≠l
Dodat konsolidovan√Ω pohled na spolehlivost dodavatel≈Ø nap≈ô√≠ƒç BA/BS datamar≈•y a SAP ECC zdroji. V√Ωsledn√Ω reporting mus√≠ upozor≈àovat na dodavatele s rostouc√≠m lead time, ƒçast√Ωmi reklamacemi nebo blokacemi plateb, aby procurement dok√°zal vƒças p≈ôesmƒõrovat objem a eskalovat smluvn√≠ pokuty.

## Rozsah

### In Scope
- Historick√° data o purchase orders (posledn√≠ch 36 mƒõs√≠c≈Ø) vƒçetnƒõ RU/DE regionu.
- Dimenze dodavatel, produkt, dodac√≠ lokace, n√°kupn√≠ organizace.
- SLA metriky: on-time delivery, defect rate, invoice dispute count.
- Sp√°rov√°n√≠ se security klasifikac√≠ (Confidential vs Internal).
- Export KPI do Power BI workspace "Supplier Control Tower".

### Out of Scope
- Forecasting budouc√≠ch objedn√°vek (≈ôe≈°√≠ Supply Planning t√Ωm).
- Integrace s CRM a risk ratingy t≈ôet√≠ch stran.
- Real-time streaming ze SCADA nebo IoT senzor≈Ø.
- Detailn√≠ finanƒçn√≠ mar≈æe ‚Äì pou≈æ√≠v√° Finance Controlling.

## Kl√≠ƒçov√© entity & metriky

### Entity
- Supplier Master (Collibra/Unity Catalog `dimv_supplier`).
- Purchase Order Header + Item (`factv_purchase_order`, `factv_purchase_order_item`).
- Quality Incident (`factv_quality_notification`).
- Delivery Calendar Dimension (`dimv_delivery_date`).

### Metriky
- Supplier Reliability Index (v√°≈æen√Ω mix on-time %, dispute rate, defect rate).
- Average Goods Receipt Lead Time (dny).
- % PO s ‚Äûblocked for payment" statusem.
- NCR Count (non-conformance reports) za posledn√≠ kvart√°l.
- Spend concentration top 10 dodavatel≈Ø.

## Oƒçek√°van√© zdroje
- Databricks Unity Catalog: `dm_ba_purchase`, `dm_bs_purchase` schemata.
- Collibra Data Catalog export (zaji≈°≈•uje lineage a vlastn√≠ky).
- SAP ECC tabulky: `EKKO`, `EKPO`, `LFA1`, `MKPF`.
- SharePoint slo≈æka "Supplier Audits" pro manu√°ln√≠ NCR z√°pisy.

## Omezen√≠
- GDPR: ≈æ√°dn√° osobn√≠ data supplier kontakt≈Ø v datasetu; pseudonymizace ID.
- Data retention: pouze 3 roky historie v produkƒçn√≠m modelu.
- Ka≈æd√Ω dashboard refresh < 5 min, jinak neprojde SLA.
- Row Level Security podle regionu (EMEA, AMER, APAC).
- Pouze read-only p≈ô√≠stup do SAP; ≈æ√°dn√© z√°pisy zpƒõt.

## Po≈æadovan√© artefakty
- Kur√°torovan√© `business_request.json` a `structure.json` pro Tool 3/7.
- Quality report shrnuj√≠c√≠ articulationScore + missingFromSource flagy.
- Power BI semantic model + definice DAX measures.
- Governance runbook popisuj√≠c√≠ validace a kontakty (owner, steward).
- Checklist P0/P1/P2 mitigac√≠ pro Supplier Risk komisi.
"""

# Sanity check: report the document length and preview its first 300 characters.
print(f"üìÑ Business document loaded ({len(business_document)} characters)")
print("\nFirst 300 characters:")
print("=" * 60)
print(business_document[:300])
print("...")

## 3. Parse Document Using Azure OpenAI

In [None]:
# ==========================================
# AZURE OPENAI CLIENT - LLM initialization
# ==========================================
from openai import AzureOpenAI

# Progress message naming the deployment that will handle the request.
print(f"üîÑ Parsing document with Azure OpenAI ({DEPLOYMENT_NAME})...")

# System prompt - instructions telling the LLM how to parse the document.
# It describes:
# - what the task is (extract structured information)
# - which sections to expect (Projekt, Cil, Rozsah... and their English equivalents)
# - how the LLM must format its output (JSON, ISO 8601 dates, arrays...)
# - what to do when something is missing (use "unknown")
# The JSON skeleton at the end of the prompt lists the same keys as the
# BusinessRequest / ProjectMetadata fields.
SYSTEM_PROMPT = """You are a business requirements parser. Your task is to extract structured information from business request documents.

Documents may contain a mix of Czech and English. Common section headers include:
- "Projekt" / "Project" - project metadata (name, sponsor, date)
- "C√≠l" / "Goal" - main project objective
- "Rozsah" / "Scope" - what is in/out of scope
- "Kl√≠ƒçov√© entity & metriky" / "Key entities & metrics" - business entities and KPIs
- "Oƒçek√°van√© zdroje" / "Expected sources" - data sources
- "Omezen√≠" / "Constraints" - limitations and requirements
- "Po≈æadovan√© artefakty" / "Required artifacts" - deliverables

IMPORTANT INSTRUCTIONS:
1. Extract information into the structured JSON format exactly as specified
2. Use "unknown" for any missing sections
3. Ensure dates are in ISO 8601 format (YYYY-MM-DD)
4. Extract lists as arrays of strings, not concatenated text
5. For project metadata, look for project name, sponsor name, and submission date
6. Any additional metadata fields should go into the "extra" dictionary
7. Be thorough - extract all relevant information from the document
8. Return ONLY valid JSON, no markdown or code blocks

Expected JSON schema:
{
  "project_metadata": {
    "project_name": "string",
    "sponsor": "string",
    "submitted_at": "YYYY-MM-DD",
    "extra": {}
  },
  "goal": "string",
  "scope_in": "string",
  "scope_out": "string",
  "entities": [],
  "metrics": [],
  "sources": [],
  "constraints": [],
  "deliverables": []
}
"""

# ==========================================
# AZURE OPENAI CLIENT SETUP
# ==========================================
# Important: the AzureOpenAI client is given the endpoint WITHOUT the
# /openai/v1/ path (and without a trailing slash); the SDK is expected to
# build the request path itself from the API version.
endpoint_without_api_path = AZURE_ENDPOINT.replace("/openai/v1/", "").replace("/openai/v1", "")
azure_endpoint = endpoint_without_api_path.rstrip("/")

client_options = {
    "azure_endpoint": azure_endpoint,  # bare resource URL computed above
    "api_key": AZURE_API_KEY,          # API key read from Databricks secrets
    "api_version": "2024-10-21",       # Azure OpenAI API version
}
client = AzureOpenAI(**client_options)

# User message - the business document wrapped between the two instruction lines.
user_message = "\n\n".join(
    (
        "Parse the following business request document:",
        business_document,
        "Extract all information into the structured JSON format.",
    )
)

# ==========================================
# LLM CALL - Azure OpenAI request
# ==========================================
# Key parameters:
# - model: the deployment name (e.g. test-gpt-5-mini)
# - messages: system prompt + user message (standard Chat Completion format)
# - response_format: {"type": "json_object"} requests JSON mode
#
# About JSON mode:
# 1. The reply is meant to be parseable JSON (not markdown, not prose text)
# 2. The system prompt must explicitly ask for JSON output
# 3. The JSON is validated against the Pydantic schema afterwards
chat_messages = [
    {"role": "system", "content": SYSTEM_PROMPT},
    {"role": "user", "content": user_message},
]
response = client.chat.completions.create(
    model=DEPLOYMENT_NAME,
    messages=chat_messages,
    response_format={"type": "json_object"},  # CRITICAL: forces JSON output
)

# The JSON text is the content of the first (only requested) choice.
first_choice = response.choices[0]
raw_response = first_choice.message.content

# ==========================================
# PYDANTIC VALIDATION - structure check
# ==========================================
# Imported locally so this step is self-contained; pydantic is already a
# dependency of this notebook.
from pydantic import ValidationError

# NOTE(review): `message.content` is optional in the OpenAI SDK response type
# (confirm against the SDK reference). json.loads(None) would fail with an
# unhelpful TypeError, so stop early with a clear message instead.
if raw_response is None:
    raise ValueError("Azure OpenAI response contained no message content to parse")

# 1. Parse JSON string -> Python object.
#    Kept in its own try block so the handler only reports JSON syntax problems.
try:
    llm_payload = json.loads(raw_response)
except json.JSONDecodeError as e:
    # JSON mode failed -> the LLM returned invalid JSON (should not happen)
    print(f"‚ùå JSON parsing error: {e}")
    print(f"Raw response: {raw_response}")
    raise

# 2. Validate the payload against the Pydantic schema. This performs:
#    - type checking (str? list? dict?)
#    - custom validators (@field_validator for the date)
#    - default value injection (when a field is missing)
#    model_validate also rejects non-object JSON with a ValidationError,
#    where unpacking with ** would raise a bare TypeError.
try:
    validated = BusinessRequest.model_validate(llm_payload)
except ValidationError as e:
    # The JSON has the wrong structure (e.g. a required field is missing,
    # a value has the wrong type, or a custom validator rejected it).
    print(f"‚ùå Validation error: {e}")
    print(f"Parsed JSON: {llm_payload}")
    raise

# 3. Convert the validated model back to a plain dict (for saving).
#    `parsed_json` is bound only after validation succeeded, so later cells
#    never pick up unvalidated data from this run.
parsed_json = validated.model_dump()

print("‚úÖ Parsing complete!")
print(f"   Model: {response.model}")
print(f"   Tokens: {response.usage.total_tokens}")
print(f"   Validation: ‚úÖ Passed")

# Complete prompt text (system + user part) kept for auditing; it is written to DBFS later.
prompt_used = "\n\n".join((f"System: {SYSTEM_PROMPT}", f"User: {user_message}"))


## 4. Display Parsed JSON

In [None]:
# Pretty-print the parsed result as JSON, keeping non-ASCII characters readable.
divider = "=" * 60
print("üìä Parsed Business Request:")
print(divider)
print(json.dumps(parsed_json, indent=2, ensure_ascii=False))

# Rebuild the Pydantic model from the dict and show a few headline fields.
# Any failure is reported as a message rather than stopping the notebook.
print(f"\n{divider}")
print("üìã Validation:")
try:
    validated = BusinessRequest.model_validate(parsed_json)
    metadata = validated.project_metadata
    print(f"‚úÖ Schema valid: {metadata.project_name}")
    print(f"   Sponsor: {metadata.sponsor}")
    print(f"   Date: {metadata.submitted_at}")
    print(f"   Entities: {len(validated.entities)} found")
    print(f"   Sources: {len(validated.sources)} found")
except Exception as e:
    print(f"‚ùå Validation error: {e}")

## 5. Save Results to DBFS

Save to `/dbfs/FileStore/mcop/tool0_samples/` for persistence.

In [None]:
# Persist the run to DBFS: one JSON file with the parsed result and one
# Markdown file with the prompt, the raw reply and the parsed result.
# The timestamp is the shared file stem; ':' is replaced to keep it DBFS-safe.
timestamp = datetime.now().isoformat().replace(':', '-')
output_dir = Path('/dbfs/FileStore/mcop/tool0_samples')
output_dir.mkdir(parents=True, exist_ok=True)

# Serialise once and reuse the text for both output files.
parsed_json_text = json.dumps(parsed_json, indent=2, ensure_ascii=False)

# JSON result
json_path = output_dir / f"{timestamp}.json"
json_path.write_text(parsed_json_text, encoding='utf-8')

# Markdown audit record
md_path = output_dir / f"{timestamp}.md"
audit_sections = [
    f"# Parse Request - {timestamp}\n\n",
    f"## Prompt Used\n\n```\n{prompt_used}\n```\n\n",
    f"## Raw Response\n\n```\n{raw_response}\n```\n\n",
    f"## Parsed JSON\n\n```json\n{parsed_json_text}\n```\n",
]
md_path.write_text("".join(audit_sections), encoding='utf-8')

print("üíæ Results saved to DBFS:")
print(f"   JSON: {json_path}")
print(f"   Markdown: {md_path}")
print("\nüìÇ View files in Databricks:")
print("   dbfs:/FileStore/mcop/tool0_samples/")

## 6. Summary

✅ **Databricks Deployment Complete:**
- [x] Azure OpenAI credentials from Databricks secrets (scope: `mcop`)
- [x] Pydantic schemas for validation (ProjectMetadata, BusinessRequest)
- [x] Sample business document (Czech/English mix)
- [x] OpenAI SDK with Azure endpoint and JSON mode
- [x] Pydantic validation after parsing
- [x] Results saved to DBFS (`/dbfs/FileStore/mcop/tool0_samples/`)

**Databricks-Specific Changes:**
- ✅ `dbutils.secrets.get()` instead of `.env` file
- ✅ `/dbfs/FileStore/` paths instead of local `data/` directory
- ✅ `%pip install` cell for package installation
- ✅ `dbutils.library.restartPython()` to reload packages
- ✅ DBFS-safe filenames (replaced `:` with `-` in timestamps)

**Azure AI Foundry Configuration:**
- **Endpoint:** https://minar-mhi2wuzy-swedencentral.cognitiveservices.azure.com/openai/v1/
- **Deployment:** test-gpt-5-mini
- **Model:** gpt-5-mini-2025-08-07
- **Pattern A:** Direct OpenAI SDK with JSON mode + Pydantic validation

**Next Steps:**
1. Create secret scope: `databricks secrets create-scope --scope mcop`
2. Add credentials: `databricks secrets put --scope mcop --key azure-openai-endpoint`
3. Run notebook on Databricks cluster
4. Verify files in DBFS: `dbfs:/FileStore/mcop/tool0_samples/`