# Imports

In [None]:
# Import python packages
import warnings
warnings.filterwarnings("ignore")
import streamlit as st

# We can also use Snowpark for our analyses!
from snowflake.snowpark.context import get_active_session
from snowflake.core import Root
from snowflake.cortex import complete
# Fetch the currently active Snowpark session
session = get_active_session()
# Root object of the snowflake.core API, built on top of that session;
# used further down to look up the Cortex Search service
root = Root(session)

# 1. Unstructured Data

## 1.1 Overview
In this notebook we are preparing unstructured documents for usage in Cortex Agents.
The documents are __PDF and Word documents__ as well as __PowerPoint presentations__ for marketing campaigns and news articles.

Users will be able to ask questions like this to the Agent:
> ___"What products were featured in the stay hydrated campaign?"___ 

This involves a classic RAG approach (based on [Cortex Search](https://docs.snowflake.com/en/user-guide/snowflake-cortex/cortex-search/cortex-search-overview)): the Agent first identifies the relevant documents from the marketing campaigns that contain information about the featured products, and then uses these documents as context for an LLM to generate an answer.

## 2. Retrieval Services using Cortex Search (RAG)

Snowflake's [TO_FILE](https://docs.snowflake.com/en/sql-reference/functions/to_file) function returns a FILE object from a file url.  
[FL_GET_CONTENT_TYPE](https://docs.snowflake.com/en/sql-reference/functions/fl_get_content_type) and [FL_GET_FILE_TYPE](https://docs.snowflake.com/en/sql-reference/functions/fl_get_file_type) return the content type and the modality of a file, which is useful for filtering for specific files when a stage contains multiple file types.

In [None]:
-- Preview five files from the DOCUMENTS stage together with
-- their content type and file modality
SELECT
    RELATIVE_PATH,
    FL_GET_CONTENT_TYPE(TO_FILE(FILE_URL)) AS MIME_TYPE,
    FL_GET_FILE_TYPE(TO_FILE(FILE_URL))    AS FILE_MODALITY
FROM
    DIRECTORY(@DOCUMENTS)
LIMIT 5;

## 2.1 Extract Contents from Documents

Snowflake's [PARSE_DOCUMENT](https://docs.snowflake.com/en/sql-reference/functions/parse_document-snowflake-cortex) function runs OCR and (optionally) layout detection on documents.

Supported Document Types:
* PDF
* DOCX
* PPTX

In [None]:
-- Layout extraction for every staged file whose modality is 'document'.
-- The result is materialised in _DOCUMENT_CONTENTS (one row per file).
CREATE OR REPLACE TABLE _DOCUMENT_CONTENTS AS
WITH STAGED_FILES AS (
    -- Step 1: pick the document files and run PARSE_DOCUMENT in LAYOUT mode.
    -- The pre-signed URL is requested with an expiration of 604800 seconds
    -- (7 days); it is stored as-is, so it is only usable for that long
    -- after this table was built.
    SELECT
        RELATIVE_PATH,
        GET_PRESIGNED_URL(@DOCUMENTS, RELATIVE_PATH, 604800) AS URL,
        SNOWFLAKE.CORTEX.PARSE_DOCUMENT(
            '@DOCUMENTS',
            RELATIVE_PATH,
            {'mode': 'LAYOUT'}
        ) AS PARSE_DOCUMENT_OUTPUT
    FROM
        DIRECTORY('@DOCUMENTS')
    WHERE
        FL_GET_FILE_TYPE(TO_FILE(FILE_URL)) = 'document'
),
PARSED AS (
    -- Step 2: unpack the fields we need from the PARSE_DOCUMENT result
    SELECT
        RELATIVE_PATH,
        URL,
        PARSE_DOCUMENT_OUTPUT['content']::TEXT AS RAW_CONTENT,
        PARSE_DOCUMENT_OUTPUT['metadata']['pageCount']::INT AS PAGE_COUNT
    FROM
        STAGED_FILES
)
-- Step 3: keep the text starting at the first '#'
-- (presumably the first Markdown header of the LAYOUT output)
SELECT
    RELATIVE_PATH,
    URL,
    PAGE_COUNT,
    SUBSTRING(RAW_CONTENT, POSITION('#', RAW_CONTENT)) AS DOCUMENT_CONTENT
FROM
    PARSED;

-- Show the extracted contents
SELECT * FROM _DOCUMENT_CONTENTS;

## 2.2 Create Retrieval Services (Cortex Search)

[Cortex Search](https://docs.snowflake.com/en/user-guide/snowflake-cortex/cortex-search/cortex-search-overview) enables low-latency, high-quality “fuzzy” search over your Snowflake data. It powers a broad array of search experiences for Snowflake users including Retrieval Augmented Generation (RAG) applications leveraging Large Language Models (LLMs).

In [None]:
-- Cortex Search Service over the marketing campaign documents.
-- IF NOT EXISTS: an already existing service with this name is left untouched.
CREATE CORTEX SEARCH SERVICE IF NOT EXISTS SEARCH_MARKETING_CAMPAIGNS
    ON DOCUMENT_CONTENT                -- column that is searched
    ATTRIBUTES RELATIVE_PATH, URL      -- columns declared as attributes
    WAREHOUSE = AI_WH
    TARGET_LAG = '12 hours'
    EMBEDDING_MODEL = 'snowflake-arctic-embed-l-v2.0-8k'
AS (
    SELECT
        RELATIVE_PATH,
        URL,
        DOCUMENT_CONTENT
    FROM _DOCUMENT_CONTENTS
    -- only files whose relative path starts with 'marketing_campaigns'
    WHERE STARTSWITH(RELATIVE_PATH, 'marketing_campaigns')
);

In [None]:
-- Cortex Search Service over the news article documents.
-- IF NOT EXISTS: an already existing service with this name is left untouched.
CREATE CORTEX SEARCH SERVICE IF NOT EXISTS SEARCH_NEWS_ARTICLES
    ON DOCUMENT_CONTENT                -- column that is searched
    ATTRIBUTES RELATIVE_PATH, URL      -- columns declared as attributes
    WAREHOUSE = AI_WH
    TARGET_LAG = '12 hours'
    EMBEDDING_MODEL = 'snowflake-arctic-embed-l-v2.0-8k'
AS (
    SELECT
        RELATIVE_PATH,
        URL,
        DOCUMENT_CONTENT
    FROM _DOCUMENT_CONTENTS
    -- only files whose relative path starts with 'news_articles'
    WHERE STARTSWITH(RELATIVE_PATH, 'news_articles')
);

## 3. Test the Retrieval Service in a (minimalistic) RAG pipeline

We can quickly test the created Search Services by using the [Python-API](https://docs.snowflake.com/en/user-guide/snowflake-cortex/cortex-search/query-cortex-search-service#python-api). The API returns the most relevant documents for the user's question. The retrieved context is then passed to an LLM through the Python `complete` function (see also the SQL function [AI_COMPLETE](https://docs.snowflake.com/en/sql-reference/functions/ai_complete-single-string)) to generate an answer.

In [None]:
# Minimal RAG test: retrieve the best-matching document from Cortex Search
# and let an LLM answer the question based on that document.

# Question to answer and the LLM used to generate the response
question = 'What products were featured in the stay hydrated campaign?'
model = 'mistral-large2'

# Connect to Search Service
# NOTE(review): database and schema are hard-coded here, while the SQL cells of
# this notebook use unqualified object names — confirm both point to the same schema.
search_service = (
    root
    .databases["AI_DEVELOPMENT"]
    .schemas["SI_THE_FOOD_AND_BEVERAGE_COMPANY"]
    .cortex_search_services["SEARCH_MARKETING_CAMPAIGNS"]
)

# Search relevant documents (limit=1: only the top hit is used as context)
search_results = search_service.search(
    query=question,
    columns=["RELATIVE_PATH", "DOCUMENT_CONTENT"],
    limit=1,
)

if not search_results.results:
    # Guard against an empty result list; indexing [0] would raise IndexError
    retrieved_document = None
    st.warning("The search service returned no documents for this question.")
else:
    retrieved_document = search_results.results[0]['DOCUMENT_CONTENT']

    # Generate response with provided context (streamed into the cell output);
    # the return value of write_stream is discarded on purpose
    _ = st.write_stream(
        complete(
            model,
            f"{question} Answer based on this context: {retrieved_document}",
            stream=True,
        )
    )

    # Show the document that was used as context
    with st.expander('Source:'):
        st.markdown(retrieved_document)