### Set up Snowflake

Before running this notebook, make sure you have set up your Snowflake database, schema and stages. 

- Go to your Snowflake UI and create a database called "RAG_DEMO". 
- In that database, create a schema called "RAG_DEMO_SCHEMA". 
- In that schema, create one internal stage called "UDF" and one internal stage called "FED_PRESS_CONF".
- Open the "FED_PRESS_CONF" stage, click "+ Files", and upload the FOMC PDF to it.




In [None]:
### Import Libraries
import json
from io import BytesIO

import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from PyPDF2 import PdfFileReader
from snowflake.snowpark.context import get_active_session
from snowflake.snowpark.files import SnowflakeFile
from snowflake.snowpark.session import Session
from snowflake.snowpark.types import StringType, StructField, StructType

# get_active_session must be imported explicitly; without the import above this
# line raises NameError on a fresh kernel.
# NOTE(review): this presumes the notebook runs where an active Snowpark session
# already exists (e.g. a Snowflake-hosted notebook) - confirm for other environments.
session = get_active_session()

In [None]:
# Echo the session object as this cell's output.
session

### Create PDF Extract Function using PyPDF2

Create a function that extracts the text from a PDF and returns it as a single string.

In [None]:


def readpdf(file_path):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        file_path: Location of the PDF, passed unchanged to ``SnowflakeFile.open``.

    Returns:
        The text of all pages concatenated in page order (no separator is
        inserted between pages). Pages that yield no text contribute nothing.
    """
    # Imported here so the function carries its own dependency.
    # PdfReader is the current PyPDF2 reader class; the legacy PdfFileReader
    # name can no longer be instantiated in PyPDF2 3.x.
    from PyPDF2 import PdfReader

    with SnowflakeFile.open(file_path, 'rb') as file:
        # Buffer the whole file in memory before handing it to the PDF reader.
        pdf_reader = PdfReader(BytesIO(file.readall()))
        # `or ""` keeps the join safe if a page's extraction returns None.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)

In [None]:
### Register Function as User Defined Function

# Publish readpdf as the permanent UDF SNOWPARK_PDF_TWO (string in, string out),
# replacing any existing definition; its code is uploaded to the UDF stage.
session.udf.register(
    func=readpdf,
    return_type=StringType(),
    input_types=[StringType()],
    is_permanent=True,
    name='SNOWPARK_PDF_TWO',
    replace=True,
    packages=['snowflake-snowpark-python', 'pypdf2'],
    stage_location='RAG_DEMO.RAG_DEMO_SCHEMA.UDF',
)

In [None]:
# Create UDTF For Chunking via Langchain
# A class for chunking text and returning a table via UDTF
from snowflake.snowpark.types import StringType, StructField, StructType
from langchain.text_splitter import RecursiveCharacterTextSplitter

class text_chunker:
    """UDTF handler that splits one text value into overlapping chunks.

    For every input row, ``process`` yields one ``(chunk, meta)`` tuple per
    chunk. ``chunk`` is the chunk text and ``meta`` is the chunk's LangChain
    metadata (including ``start_index``) serialized as a JSON string, so both
    columns are genuine strings.
    """

    # Tunables for the splitter; adjust as you see fit.
    SEPARATORS = ["\n"]   # Split on new lines, which is usually a good default.
    CHUNK_SIZE = 1000     # Maximum chunk length, measured with len().
    CHUNK_OVERLAP = 50    # Characters shared by neighbouring chunks to keep context.

    def __init__(self):
        # Build the splitter once per handler instance instead of once per row.
        self._splitter = RecursiveCharacterTextSplitter(
            separators=self.SEPARATORS,
            chunk_size=self.CHUNK_SIZE,
            chunk_overlap=self.CHUNK_OVERLAP,
            length_function=len,
            # Records each chunk's offset in the source text, which is useful
            # for fetching the chunk before/after a match.
            add_start_index=True,
        )

    def process(self, text):
        """Yield ``(chunk, meta_json)`` tuples for ``text``.

        Args:
            text: The text to split. ``None`` (SQL NULL) or an empty string
                produces no rows instead of raising.

        Yields:
            Tuples of the chunk text and its metadata encoded as a JSON string.
        """
        if not text:
            return

        for doc in self._splitter.create_documents([text]):
            # Serialize the metadata dict: the declared output column is a string.
            yield doc.page_content, json.dumps(doc.metadata)

In [None]:
### Register the UDTF - set the stage location

# Columns emitted by the handler for each chunk, in order.
schema = StructType(
    [
        StructField("chunk", StringType()),
        StructField("meta", StringType()),
    ]
)

# Publish text_chunker as the permanent table function CHUNK_TEXT_TWO
# (one string argument), replacing any existing definition.
session.udtf.register(
    handler=text_chunker,
    output_schema=schema,
    input_types=[StringType()],
    is_permanent=True,
    name='CHUNK_TEXT_TWO',
    replace=True,
    packages=['pandas', 'langchain'],
    stage_location='RAG_DEMO.RAG_DEMO_SCHEMA.UDF',
)

In [None]:
-- Extract the text of every PDF in the FED_PRESS_CONF stage with Cortex PARSE_DOCUMENT.
-- PARSE_DOCUMENT returns an OBJECT; its "content" key holds the extracted text, so
-- pull that out as a string. This keeps RAW_TEXT plain text, which is what the
-- string-typed chunking UDTF expects as input.
CREATE OR REPLACE TABLE FED_RAW_TEXT AS
SELECT
    relative_path
    , file_url
    , TO_VARCHAR(SNOWFLAKE.CORTEX.PARSE_DOCUMENT(@fed_press_conf, relative_path):content) AS raw_text
FROM DIRECTORY(@fed_press_conf);

SELECT *
FROM FED_RAW_TEXT;


In [None]:
-- Show every row of the extracted-text table.
SELECT * FROM FED_RAW_TEXT;

In [None]:
-- Build the chunk table: run the CHUNK_TEXT_TWO table function over each
-- document's raw text and keep the source path beside every output row.
CREATE OR REPLACE TABLE FED_CHUNK_TEXT AS
SELECT
    raw.relative_path,
    func.*
FROM
    FED_RAW_TEXT AS raw,
    TABLE(CHUNK_TEXT_TWO(raw.raw_text)) AS func;



In [None]:
-- Peek at the first ten chunk rows.
SELECT * FROM FED_CHUNK_TEXT
LIMIT 10;

In [None]:
-- Build the vector store: one row per chunk with its 768-dimension embedding
-- from the snowflake-arctic-embed-m model.
CREATE OR REPLACE TABLE FED_VECTOR_STORE AS
SELECT
    relative_path AS press_conf,
    chunk,
    SNOWFLAKE.CORTEX.EMBED_TEXT_768('snowflake-arctic-embed-m', chunk) AS chunk_embedding
FROM FED_CHUNK_TEXT;

SELECT * FROM FED_VECTOR_STORE;



In [None]:
-- Peek at the first ten rows of the vector store.
SELECT * FROM FED_VECTOR_STORE
LIMIT 10;

In [None]:
-- Retrieve the five chunks most similar to the question.
-- Cosine similarity is larger for closer vectors, so sort DESC; the default
-- ascending order would return the five LEAST relevant chunks.
SELECT PRESS_CONF, CHUNK, CHUNK_EMBEDDING
FROM RAG_DEMO.RAG_DEMO_SCHEMA.FED_VECTOR_STORE
ORDER BY VECTOR_COSINE_SIMILARITY(
    SNOWFLAKE.CORTEX.EMBED_TEXT_768('snowflake-arctic-embed-m', 'have rates peaked?'),
    CHUNK_EMBEDDING
) DESC
LIMIT 5;