# Workflow

## Setup

Set up the `autoreload` magic so that edits to imported modules are picked up without restarting the kernel.

In [1]:
# Load IPython's autoreload extension and switch it to mode 2, so imported
# modules are reloaded before each cell runs and source edits take effect
# without a kernel restart.
%load_ext autoreload
%autoreload 2

Set up logging

In [2]:
import logging
from pathlib import Path
from IPython.display import Image, display, Markdown
from langchain_core.runnables.graph import CurveStyle, MermaidDrawMethod, NodeStyles
# Configure the root logger: INFO and above, each record prefixed with the
# timestamp, logger name and level. For debugging, change `level` to
# logging.DEBUG.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

Ensure that you have already set the `LLM_DSL_CONFIG_PATH` environment variable by following the instructions in the [README](../README.md).

In [5]:
import os
# Fail fast if the configuration path has not been exported to this kernel.
assert 'LLM_DSL_CONFIG_PATH' in os.environ, (
    "Please set the LLM_DSL_CONFIG_PATH environment variable to the path of the config file"
)

# 1. First glance

In [None]:
from dsl_gen.core.flows import build_rag_flow

# Build the RAG workflow and show its graph as a PNG rendered from Mermaid.
flow = build_rag_flow()
flow_graph = flow.get_graph()
# NOTE(review): MermaidDrawMethod.API presumably renders through a remote
# service, so this cell likely needs network access — confirm.
flow_png = flow_graph.draw_mermaid_png(draw_method=MermaidDrawMethod.API)
display(Image(flow_png))

## How to use

In [None]:
from dsl_gen import CFG

# Select the "openai" model configuration before the flow is built.
CFG.MODEL_CFG.active_model = "openai"
challenge_path = Path(CFG.PATH_CFG.CHALLENGES_PATH) / "c008.json"

flow = build_rag_flow()

# The flow is invoked with the path of a challenge file; the generated code is
# read back from the "completion" key of the result.
result = flow.invoke({"challenge_path": str(challenge_path)})

# Render the completion as a fenced `envision` block. The closing fence must
# sit on its own line, otherwise it is glued to the last line of the
# completion and the Markdown code block is never properly closed.
display(Markdown(f"```envision\n{result['completion']}\n```"))

Or you can simply pass a string as the question for the workflow.

In [None]:
# A free-text question can be passed directly under the "query" key instead of
# a challenge file path. Reuses the `flow` built in the previous cell.
query = 'Define a text literal called greetings with value "Hello" and display greetings on the dashboard as a label.'

result = flow.invoke({"query": query})

# Put the closing fence on its own line so the Markdown code block is properly
# terminated even when the completion does not end with a newline.
display(Markdown(f"```envision\n{result['completion']}\n```"))

# 2. Components

## VectorDB

Uncomment the following lines to build the vector store manually.

In [3]:
# from dsl_gen.core.vector_store import _build_vectorstore
# store = _build_vectorstore()

### Visualize a question

Before building the vector store, let's render a sample question together with its expected answer to get a feel for the data.

In [None]:
from IPython.display import Markdown


# Expected Envision answer for the sample `query` defined in an earlier cell.
answer = '```envision\ngreetings = "Hello" // define the text literal\nshow label greetings // show the text literal as a label. There should be no \'with\' !\n```'

# Render the question first, then the answer, both as Markdown.
for rendered_text in (query, answer):
    display(Markdown(rendered_text))

## Build vectorstore

In [None]:
from dsl_gen.core.vector_store import get_vectorstore
from dsl_gen.config import CFG

# Obtain the vector store through the project helper.
# An "INFO - Failed to load GPU Faiss" log line is expected at this point:
# the CPU build of Faiss is in use, so the GPU variant cannot be loaded.
vectorstore = get_vectorstore()

<span style="color:green;"><b>It is normal that you see the message</b></span> `Failed to load GPU Faiss` <span style="color:green;"><b>since we are using</b></span> `Faiss-CPU`.

Let's see how to retrieve docs by their indices

In [None]:
from IPython.display import Markdown
# Collect the docstore ids of the first five indexed chunks, then look each
# one up in the docstore and render its text as Markdown.
doc_ids = [
    doc_id
    for _, doc_id in zip(range(5), vectorstore.index_to_docstore_id.values())
]
for doc_id in doc_ids:
    document = vectorstore.docstore.search(doc_id)
    display(Markdown(document.page_content))

In [None]:
# Index range of the chunks to preview (stop is exclusive, as in a slice).
preview_start, preview_stop = 50, 53
doc_ids = list(vectorstore.index_to_docstore_id.values())[preview_start:preview_stop]

# Build the heading from the chunks actually returned, so it stays truthful
# when the store holds fewer chunks than the requested range.
if doc_ids:
    shown = ", ".join(
        str(position)
        for position in range(preview_start, preview_start + len(doc_ids))
    )
    display(Markdown(f'### Displaying documents {shown}'))
else:
    display(Markdown(f'### No documents found in range {preview_start}-{preview_stop - 1}'))

for doc_id in doc_ids:
    document = vectorstore.docstore.search(doc_id)

    display(Markdown(document.page_content))

## Similarity search

In [9]:
# Bind `vectorstore` for the similarity searches in this section.
# NOTE(review): this repeats the call from the "Build vectorstore" cell —
# confirm whether get_vectorstore() caches or rebuilds on each call.
vectorstore = get_vectorstore()

In [None]:
# Retrieve the three chunks most similar to the sample question.
query = "Define a text literal called greetings with value \"Hello\" and display greetings on the dashboard as a label."
documents = vectorstore.similarity_search(query=query, k=3)
# Show the search hits. The original echoed `document`, a stale loop variable
# from an earlier cell, instead of the `documents` list computed here.
documents

In [None]:
# Render every hit under its own numbered heading, then close the listing
# with a horizontal rule.
for position, hit in enumerate(documents):
    hit_markdown = f"### Document {position} \n {hit.page_content}"
    display(Markdown(hit_markdown))
display(Markdown('---'))

## Try queries with a few questions

In [None]:
from dsl_gen.core.vector_store import get_vectorstore
from dsl_gen.config import CFG
from pathlib import Path
import json

vectorstore = get_vectorstore()

# Challenge files live under the configured challenges directory.
file_path = Path(CFG.PATH_CFG.CHALLENGES_PATH) / "c001.json"

# Decode explicitly as UTF-8 so that reading the JSON does not depend on the
# platform's default locale encoding.
with open(file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# The natural-language task is stored under the "question" key.
question = data['question']

print(question)

In [None]:
# Retrieve the twenty chunks most similar to the challenge question.
documents = vectorstore.similarity_search(query=question, k=20)
# Show the search hits. The original echoed `document`, a stale loop variable
# from an earlier cell, instead of the `documents` list computed here.
documents

In [None]:
# Print each hit as plain text: a numbered header, the chunk content, then
# blank lines as a separator.
for position, hit in enumerate(documents):
    print(f"Document {position}", hit.page_content, '\n', sep='\n')

# Test RAG flow

## Visualize workflow

In [None]:
from IPython.display import Image, display
from langchain_core.runnables.graph import CurveStyle, MermaidDrawMethod, NodeStyles

from dsl_gen.core.flows import build_rag_flow

# Rebuild the RAG workflow and show its graph as a PNG rendered from Mermaid.
flow = build_rag_flow()
workflow_graph = flow.get_graph()
# NOTE(review): MermaidDrawMethod.API presumably renders through a remote
# service, so this cell likely needs network access — confirm.
workflow_png = workflow_graph.draw_mermaid_png(draw_method=MermaidDrawMethod.API)
display(Image(workflow_png))

In [None]:
# Print the Mermaid source of the workflow graph as plain text.
mermaid_source = flow.get_graph().draw_mermaid()
print(mermaid_source)

---

## Invoke workflow

In [None]:
from dsl_gen import CFG
from pathlib import Path
from dsl_gen.core.flows import build_rag_flow
import logging
from IPython.display import Image, display
from langchain_core.runnables.graph import CurveStyle, MermaidDrawMethod, NodeStyles

logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Switch the active model back to "openai" (chosen for faster inference).
CFG.MODEL_CFG.active_model = "openai"

# Challenge to solve, located under the configured challenges directory.
challenge_path = Path(CFG.PATH_CFG.CHALLENGES_PATH).joinpath("c001.json")

flow = build_rag_flow()

# An 'INFO - Failed to load GPU Faiss' log line is expected here because the
# CPU build of Faiss is in use.
flow_input = {"challenge_path": str(challenge_path)}
result = flow.invoke(flow_input)

# The generated code is returned under the "completion" key.
print(result["completion"])

---

# Appendix: VectorStore

Below is a detailed explanation of VectorStore content display and similarity search filters:

---

## How to View Chunk Content in VectorStore

### Method 1: Directly Accessing the Underlying Storage (Using FAISS as an Example)

```python
# Assuming an initialized vectorstore object

from langchain_community.vectorstores import FAISS

def show_all_chunks(vectorstore: FAISS) -> list:
    """Displays all stored chunks and their metadata"""
    chunks = []
    # Iterate through all document IDs
    for doc_id in vectorstore.index_to_docstore_id.values():
        document = vectorstore.docstore.search(doc_id)
        chunks.append({
            "id": doc_id,
            "content": document.page_content,
            "metadata": document.metadata
        })
    return chunks

# Example usage
for chunk in show_all_chunks(vectorstore):
    print(f"[ID: {chunk['id']}]")
    print(f"Metadata: {chunk['metadata']}")
    print(f"Content: {chunk['content'][:50]}...\n")

# Sample Output:
# [ID: 89a3b2f1]
# Metadata: {'source': 'manual.pdf', 'page': 23}
# Content: Envision language supports time series analysis using window functions...
```

### Method 2: Retrieve All Chunks via Search (Temporary Approach)
```python
# Retrieve all documents by searching an empty string (Use with caution)
all_docs = vectorstore.similarity_search(query="", k=1000)  # Set k to a sufficiently large number
for i, doc in enumerate(all_docs):
    print(f"Chunk {i+1}: {doc.page_content[:80]}...")
```

---

## **Understanding the `similarity_search` Filter Parameter**

### Purpose
- **Metadata Filtering**: Restrict searches to documents that meet specific criteria.
- **Performance Optimization**: Reduce the number of candidate documents that need to be compared.
- **Business Adaptability**: Dynamically adjust the search scope based on use cases.

### Filtering Syntax Examples
```python
# Basic filtering (Equality)
vectorstore.similarity_search(
    "Time Series Forecasting",
    filter={"source": "finance_docs"},  # Search only financial documents
    k=3
)

# Comparison Operators
vectorstore.similarity_search(
    "Data Cleaning",
    filter={
        "page": {"$gte": 50},          # Page number >= 50
        "version": {"$ne": "draft"}    # Exclude draft versions
    }
)

# Multiple Conditions
vectorstore.similarity_search(
    "Machine Learning",
    filter={
        "$and": [
            {"category": "AI"},
            {"security_level": {"$lte": 2}}
        ]
    }
)

# Array Containment
vectorstore.similarity_search(
    "Distributed Computing", 
    filter={
        "tags": {"$in": ["spark", "hadoop"]}  # Includes any specified tags
    }
)
```

### Supported Operators
| Operator | Description             | Example                          |
|----------|-------------------------|----------------------------------|
| `$eq`    | Equals (default)         | `{"author": "John"}`            |
| `$ne`    | Not equals               | `{"status": {"$ne": "draft"}}`  |
| `$gt`    | Greater than             | `{"views": {"$gt": 1000}}`      |
| `$gte`   | Greater than or equal to | `{"year": {"$gte": 2020}}`      |
| `$lt`    | Less than                | `{"priority": {"$lt": 5}}`      |
| `$lte`   | Less than or equal to    | `{"rating": {"$lte": 4.5}}`     |
| `$in`    | Contained in array       | `{"tags": {"$in": ["AI"]}}`     |
| `$nin`   | Not contained in array   | `{"lang": {"$nin": ["zh"]}}`    |
| `$and`   | Logical AND              | See combined example above      |
| `$or`    | Logical OR               | `{"$or": [cond1, cond2]}`       |

---

## **Best Practices and Practical Advice**

### 1. **Metadata Design Tips**
- Inject structured metadata when splitting documents.
```python
from langchain.docstore.document import Document

Document(
    page_content=chunk_text,
    metadata={
        "source": "user_manual_v3.pdf",
        "section": "API Reference",
        "last_updated": "2024-03-15"
    }
)
```

### 2. **Efficient Filtering Strategies**
```python
# Predefined common filters
FILTERS = {
    "technical_docs": {"doc_type": "technical"},
    "recent_updates": {"last_updated": {"$gte": "2024-01-01"}},
    "high_priority": {"priority": {"$gte": 8}}
}

# Dynamically build filters
def build_filter(include_filters: list[str]):
    return {"$and": [FILTERS[f] for f in include_filters]}
```

### 3. **Debugging and Metadata Analysis**
```python
# Display metadata field distributions
from collections import defaultdict

def analyze_metadata(vectorstore):
    field_stats = defaultdict(set)
    for doc_id in vectorstore.index_to_docstore_id.values():
        doc = vectorstore.docstore.search(doc_id)
        for k, v in doc.metadata.items():
            field_stats[k].add(str(v))  # Convert to string to avoid type issues
  
    for field, values in field_stats.items():
        print(f"Field '{field}':")
        print(f"  Unique values ({len(values)}): {', '.join(list(values)[:3])}...")

# Sample Output:
# Field 'source': 
#   Unique values (4): manual_v2.pdf, blog_post.md, api_spec.json...
```

By effectively utilizing metadata filtering and content inspection methods, you can significantly enhance the **controllability** and **interpretability** of a Retrieval-Augmented Generation (RAG) system.