In [1]:
import ray
import yaml
import logging
import tiktoken
from IPython.display import display, Markdown
logger = logging.getLogger(__name__)

In [2]:
from flows_new.components import PrintComponent, SquareComponent
from flows_new.blocks import Block
from flows_new.workflow_builder import WorkflowBuilder

from dotenv import load_dotenv
load_dotenv()

True

### Test 1 - Serial block

In [3]:
# Serial demo: only the first component is given a literal message; the other
# two set expects_input=True and carry no message of their own.
comp1 = PrintComponent("Hello, World!")
comp2 = PrintComponent(expects_input=True, component_order=1)
comp3 = PrintComponent(expects_input=True, component_order=2)

block = Block(
    [comp1, comp2, comp3],
    type='serial',
)
block.execute()

0 message: Hello, World!
1 message: Hello, World!
2 message: Hello, World!


In [4]:
# Print the output stored on each of the block's three components, in order.
for idx in range(3):
    print(block.components[idx].output)

Hello, World!
Hello, World!
Hello, World!


### Test 2 - Parallel block

In [5]:
# Parallel demo: every component carries its own message, so none of them
# sets expects_input.
comp1 = PrintComponent("Hello, World!")
comp2 = PrintComponent(component_order=1, message="Hello")
comp3 = PrintComponent(component_order=2, message="World")
block = Block(
    [comp1, comp2, comp3],
    type='parallel',
)
block.execute()

2023-08-29 10:55:11,295	INFO worker.py:1621 -- Started a local Ray instance.


In [6]:
# Print the output stored on each of the block's three components, in order.
for idx in range(3):
    print(block.components[idx].output)

Hello, World!
Hello
World


### Test 3 - Workflow builder with YAML

In [7]:
# Construct a WorkflowBuilder from 'test2.yaml' and execute it.
builder = WorkflowBuilder('test2.yaml')
builder.execute()

0 message: Hello, World!
1 message: Hello, World!


In [8]:
# Print the outputs of the first two components of the builder's first block.
for idx in range(2):
    print(builder.blocks[0].components[idx].output)

Hello, World!
Hello, World!


### Test 4 - Square component

In [9]:
# Serial block of three SquareComponents: the first is seeded with number=2,
# the remaining two set expects_input=True instead of a number.
comp1 = SquareComponent(number=2)
comp2 = SquareComponent(component_order=1, expects_input=True)
comp3 = SquareComponent(component_order=2, expects_input=True)
block = Block(
    [comp1, comp2, comp3],
    type='serial',
)
block.execute()

0 message: 4
1 message: 16
2 message: 256


In [10]:
# Print the output stored on each of the block's three components, in order.
for idx in range(3):
    print(block.components[idx].output)

4
16
256


In [11]:
# Construct a WorkflowBuilder from 'test_sq.yaml' and execute it.
workflow = WorkflowBuilder('test_sq.yaml')
workflow.execute()

0 message: 4
1 message: 16
2 message: 256


In [12]:
# Print the outputs of the three components of the workflow's *second* block
# (index 1).
for idx in range(3):
    print(workflow.blocks[1].components[idx].output)

4
9
hello, world!


### Develop reader component

In [3]:
from flows_new.components import Component
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.prompts import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain
import faiss
import pickle

In [4]:
class PDFReaderComponent(Component):
    """Component that loads a PDF file through LangChain's ``PyPDFLoader``."""

    def __init__(self, filename, **kwargs):
        """Remember the PDF path and build the loader for it.

        Args:
            filename: Path of the PDF handed to ``PyPDFLoader``.
            **kwargs: Forwarded unchanged to ``Component.__init__``.
        """
        super().__init__(**kwargs)
        self.filename = filename
        self.loader = PyPDFLoader(file_path=filename)

    def execute(self):
        """Run the loader and keep whatever it returns in ``self.output``."""
        loaded = self.loader.load()
        self.output = loaded


comp1 = PDFReaderComponent(filename='./data/zhou2020.pdf')

In [5]:
class ChunkerComponent(Component):
    """Splits the upstream documents with a ``RecursiveCharacterTextSplitter``."""

    def __init__(self, chunk_size=1500, chunk_overlap=100, **kwargs):
        """Store the chunking settings and build the splitter.

        Args:
            chunk_size: ``chunk_size`` passed to the splitter.
            chunk_overlap: ``chunk_overlap`` passed to the splitter.
            **kwargs: Forwarded unchanged to ``Component.__init__``.
        """
        super().__init__(**kwargs)
        self.chunk_size, self.chunk_overlap = chunk_size, chunk_overlap
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_overlap=chunk_overlap,
            chunk_size=chunk_size,
        )

    def execute(self):
        """Split ``self.input_from_prev`` and store the result in ``self.output``.

        Raises:
            AssertionError: If ``self.input_from_prev`` is ``None``.
        """
        upstream = self.input_from_prev
        assert upstream is not None, "Component expects input but none was provided"
        self.output = self.splitter.split_documents(upstream)

In [6]:
class OpenAIFAISSComponent(Component):
    """Embeds the upstream documents with OpenAI and stores them in a FAISS store.

    When ``save_path`` is not ``None`` the store is also persisted as two
    files inside that directory: ``docs.index`` (written with
    ``faiss.write_index``) and ``faiss_store.pkl`` (the pickled store with its
    index detached).
    """

    def __init__(self, save_path, **kwargs):
        """
        Args:
            save_path: Directory to persist the store into, or ``None`` to
                skip persistence. The directory is not created here.
            **kwargs: Forwarded unchanged to ``Component.__init__``.
        """
        super().__init__(**kwargs)
        self.save_path = save_path

    def execute(self):
        """Build the vector store, expose it as ``self.output`` and optionally save it.

        Raises:
            AssertionError: If ``self.input_from_prev`` is ``None``.
        """
        assert self.input_from_prev is not None, "Component expects input but none was provided"

        documents = self.input_from_prev
        store = FAISS.from_documents(documents, embedding=OpenAIEmbeddings())
        self.output = store

        if self.save_path is not None:
            index = store.index
            faiss.write_index(index, f"{self.save_path}/docs.index")

            # The index is saved separately above, so it is detached while the
            # rest of the store is pickled. ``self.output`` is this very same
            # object, therefore the index must be put back afterwards;
            # otherwise downstream components receive a store without an index.
            store.index = None
            try:
                with open(f"{self.save_path}/faiss_store.pkl", "wb") as f:
                    pickle.dump(store, f)
            finally:
                store.index = index


In [7]:
# Read -> chunk -> embed/store pipeline. Each component gets a distinct
# component_order (0 by default, then 1, then 2), as in the other serial
# blocks in this notebook; comp3 previously reused component_order=1.
comp1 = PDFReaderComponent(filename='./data/zhou2020.pdf')
comp2 = ChunkerComponent(expects_input=True, component_order=1)
comp3 = OpenAIFAISSComponent(save_path='./data', component_order=2, expects_input=True)

block = Block([comp1, comp2, comp3], type='serial')
block.execute()

In [8]:
# Construct a WorkflowBuilder from 'read_pdf.yaml' and execute it.
workflow = WorkflowBuilder('read_pdf.yaml')
workflow.execute()

2023-08-29 11:59:46,872	INFO worker.py:1621 -- Started a local Ray instance.


In [9]:
# Display the component objects held by the workflow's first block.
workflow.blocks[0].components

[<flows_new.components.PDFReaderComponent at 0x15f0fbaf0>,
 <flows_new.components.ChunkerComponent at 0x1074e6950>,
 <flows_new.components.OpenAIFAISSComponent at 0x15f0f9d50>]

### Load retriever and chat

In [10]:
class LoadRetrieversComponent(Component):
    """Rebuilds a persisted FAISS vector store and exposes it as a retriever.

    Expects ``<path>/docs.index`` and ``<path>/faiss_store.pkl`` to exist,
    i.e. the two files ``OpenAIFAISSComponent`` writes to its ``save_path``.
    """

    def __init__(self, path, k=4, **kwargs):
        """
        Args:
            path: Directory containing ``docs.index`` and ``faiss_store.pkl``.
            k: Value passed to the retriever as ``search_kwargs['k']``.
            **kwargs: Forwarded unchanged to ``Component.__init__``.
        """
        super().__init__(**kwargs)
        self.path = path
        # Honour the caller's value; it used to be overwritten with a literal 4.
        self.k = k

    def execute(self):
        """Load index and store from disk and set ``self.output`` to a retriever."""
        index = faiss.read_index(f"{self.path}/docs.index")

        # SECURITY: pickle.load executes arbitrary code embedded in the file.
        # Only point ``path`` at directories whose contents you trust.
        with open(f"{self.path}/faiss_store.pkl", "rb") as f:
            vectorstore = pickle.load(f)

        # The store was pickled without its index; reattach the one read above.
        vectorstore.index = index

        self.output = vectorstore.as_retriever(search_kwargs={'k': self.k})

In [11]:
class RetrieverComponent(Component):
    """Queries the upstream retriever and publishes the matching documents.

    ``self.input_from_prev`` must be an object with a
    ``get_relevant_documents(query)`` method. ``self.output`` is either the
    raw list of documents, or, when ``concatenate_docs`` is truthy, the
    ``(combined_text, used_docs)`` tuple built by ``concatenate_documents``.
    """

    def __init__(self, query, concatenate_docs, max_tokens=3000, enc_model="gpt-3.5-turbo", **kwargs):
        """
        Args:
            query: Query string sent to the retriever.
            concatenate_docs: If truthy, merge the retrieved documents into a
                single token-limited string.
            max_tokens: Token budget used by ``concatenate_documents``.
            enc_model: Model name used to look up the tiktoken encoding; falls
                back to ``cl100k_base`` when tiktoken does not know the model.
            **kwargs: Forwarded unchanged to ``Component.__init__``.
        """
        super().__init__(**kwargs)
        self.query = query
        self.concatenate_docs = concatenate_docs
        self.max_tokens = max_tokens

        self.enc_model = enc_model
        try:
            self.encoding = tiktoken.encoding_for_model(self.enc_model)
        except KeyError:
            logger.info(f"Encoding for model {self.enc_model} not found. Using default encoding.")
            self.encoding = tiktoken.get_encoding("cl100k_base")

    def execute(self):
        """Retrieve documents for ``self.query`` and store them in ``self.output``.

        Raises:
            AssertionError: If ``self.input_from_prev`` is ``None``.
        """
        assert self.input_from_prev is not None, "Component expects input but none was provided"

        retriever = self.input_from_prev
        docs = retriever.get_relevant_documents(self.query)

        if self.concatenate_docs:
            self.output = self.concatenate_documents(docs)
        else:
            # Previously nothing was assigned on this path, so the retrieved
            # documents were silently dropped.
            self.output = docs

    def concatenate_documents(self, documents):
        """Combine documents up to ``self.max_tokens`` tokens.

        A document that would exceed the remaining budget is skipped, but
        later (smaller) documents are still considered. Only ``page_content``
        counts towards the budget, not the appended ``Source:`` line.

        Returns:
            tuple: ``(combined_text, used_docs)`` where ``combined_text`` is
            the concatenated string and ``used_docs`` the documents included.
        """
        combined_docs = ""
        token_count = 0
        used_docs = []

        for doc in documents:
            doc_tokens = self.calculate_tokens(doc.page_content)
            if (token_count + doc_tokens) <= self.max_tokens:
                # Tolerate documents whose metadata has no 'source' entry
                # instead of failing with KeyError.
                source = doc.metadata.get('source', 'unknown')
                combined_docs += f"\n\n{doc.page_content}\nSource: {source}"
                token_count += doc_tokens
                used_docs.append(doc)

        return combined_docs, used_docs

    def calculate_tokens(self, document):
        """Return the number of tokens ``self.encoding`` produces for the string ``document``."""
        return len(self.encoding.encode(document))

In [12]:
# System prompt for the final-answer chain. It contains no template
# placeholders. A leftover instruction to link to "the Chainlink
# documentation" was removed: it came from a different project and is
# meaningless for question answering over arbitrary PDFs.
final_answer_system_template = """
As an AI assistant helping answer a user's question, your task is to provide the answer to the user's question based on the collection of documents provided. Each document is demarcated by the 'Source:' tag. 

In most cases, the answer to the user's question can be found in one of the documents.

If the documents do not contain the required information to answer user's question, respond with 'I don't know'.

Each point in your answer should be formatted with corresponding reference(s) using markdown. Conclude your response with a footnote that enumerates all the references involved. Please make sure to use only the references provided in the documents and not to use any external references. 

The footnote should be formatted as follows: 
```
References:
[^1^]: <reference 1> 
[^2^]: <reference 2> 
[^3^]: <reference 3>
```
Please avoid duplicating references. For example, if the same reference is used twice in the answer, please only include it once in the footnote.
"""

# Human prompt; expects the template variables `question` and `document`.
final_answer_human_template = """
User's question: {question}

Document: {document}

Answer:
"""



In [13]:
class QandAComponent(Component):
    """Answers a fixed question from the upstream document text using a chat LLM."""

    def __init__(self, query, system_message, user_message, model='gpt-3.5-turbo', temperature=0., **kwargs):
        """
        Args:
            query: The user's question, passed to the chain as ``question``.
            system_message: Template string for the system message.
            user_message: Template string for the human message; the chain is
                called with the variables ``question`` and ``document``.
            model: Chat model name given to ``ChatOpenAI``.
            temperature: Sampling temperature given to ``ChatOpenAI``.
            **kwargs: Forwarded unchanged to ``Component.__init__``.
        """
        super().__init__(**kwargs)
        self.query = query
        self.system_message = system_message
        self.user_message = user_message
        FINAL_ANSWER_PROMPT = ChatPromptTemplate.from_messages(
            [
                SystemMessagePromptTemplate.from_template(self.system_message),
                HumanMessagePromptTemplate.from_template(self.user_message),
            ]
        )
        llm = ChatOpenAI(model=model, temperature=temperature)
        self.chain = LLMChain(llm=llm, prompt=FINAL_ANSWER_PROMPT)

    def execute(self):
        """Run the chain on the upstream input and store the answer in ``self.output``.

        Raises:
            AssertionError: If ``self.input_from_prev`` is ``None``.
        """
        assert self.input_from_prev is not None, "Component expects input but none was provided"

        relevant_docs = self.input_from_prev
        # RetrieverComponent.concatenate_documents returns a
        # (combined_text, used_docs) tuple. Only the text belongs in the
        # prompt; formatting the whole tuple would inject its repr (escaped
        # newlines plus the Document objects) into the LLM input.
        if isinstance(relevant_docs, tuple) and relevant_docs and isinstance(relevant_docs[0], str):
            relevant_docs = relevant_docs[0]

        self.output = self.chain.predict(question=self.query, document=relevant_docs)


In [14]:
# Load retriever -> retrieve for the query -> answer the query, as one serial block.
query = "What is this about?"

comp1 = LoadRetrieversComponent(path='./data', k=4)
comp2 = RetrieverComponent(
    query=query,
    concatenate_docs=True,
    expects_input=True,
    component_order=1,
)
comp3 = QandAComponent(
    query=query,
    system_message=final_answer_system_template,
    user_message=final_answer_human_template,
    expects_input=True,
    component_order=2,
)

block = Block(
    [comp1, comp2, comp3],
    type='serial',
)
block.execute()

In [15]:
# Render the output of the block's last component as Markdown.
Markdown(block.components[-1].output)

This document discusses the research on multifunctional bioactive peptides (MBAPs), specifically focusing on antihypertensive peptides[^1^]. MBAPs have various properties, such as antitumor, antioxidant, and immunomodulating activities[^1^]. The document mentions that many achievements have been made in the field of MBAPs, particularly in terms of antihypertensive peptides[^1^]. However, the commercial products that make use of MBAPs are still very rare[^1^]. The document suggests that more in-depth studies, including in vivo testing and investigation of the underlying mechanism of action, should be the focus of future research[^1^].

References:
[^1^]: J. Zhou, et al. Food Research International 134 (2020) 109230

In [3]:
# Construct a WorkflowBuilder from 'qanda.yaml' and execute it.
workflow = WorkflowBuilder('qanda.yaml')
workflow.execute()

2023-08-29 12:08:41,356	INFO worker.py:1621 -- Started a local Ray instance.


In [7]:
# Render the output of the last component in the workflow's first block as Markdown.
Markdown(workflow.blocks[0].components[-1].output)

The document is a review on mushroom-derived bioactive peptides, focusing on their preparation and biological activities[^1^]. It discusses the extraction of bioactive peptides directly or indirectly from mushrooms or their mycelia. Mushrooms are considered promising sources of bioactive peptides due to their high-quality proteins[^1^]. The review highlights the beneficial effects of mushroom bioactive peptides, including antihypertensive, antioxidant, and antimicrobial activities[^1^]. It also mentions that more in-depth studies, such as in vivo testing and investigation of the underlying mechanism of action, should be the focus of future research in this field[^1^].

References:
[^1^]: Juanjuan Zhou et al. "A review on mushroom-derived bioactive peptides: Preparation and biological activities." Food Research International 134 (2020): 109230.