In [1]:
# imports
import os
from dotenv import load_dotenv
from genai.schemas import GenerateParams
from genai.credentials import Credentials
from langchain.vectorstores import Chroma
from genai.extensions.langchain import LangChainInterface
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter, TokenTextSplitter
from langchain.document_loaders import PDFMinerLoader
from langchain.chains.question_answering import load_qa_chain
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA

The class ModelType is being deprecated.
Please replace any reference to ModelType by its model id string equivalent.
Example :
  ModelType.FLAN_T5 becomes "google/flan-t5-xxl"

  from genai.schemas.models import ModelType


## Indexing - Starts here!

In [2]:
# Filesystem locations used by the indexing step (relative paths).
pdf_folder_path = "./data"  # folder that is walked for source PDF files
db_folder_path = "./db"     # folder handed to Chroma as its persist directory

In [3]:
# define embedding function
def initEmbedFunc(model_name="all-MiniLM-L6-v2"):
    """Create the sentence-transformer embedding function used with Chroma.

    Args:
        model_name: Name of the sentence-transformers model to load. Defaults
            to "all-MiniLM-L6-v2", the model this notebook used before the
            name became a parameter, so existing no-argument calls are
            unaffected.

    Returns:
        A SentenceTransformerEmbeddings instance configured for ``model_name``.

    Note:
        An index should be queried with the same embedding model it was built
        with; if you pass a different ``model_name``, rebuild the index too.
    """
    return SentenceTransformerEmbeddings(model_name=model_name)

In [4]:
# populate chroma db
def generateDB(pdf_dir=None, persist_dir=None):
    """Build a Chroma vector store from every PDF found under a folder.

    The folder is walked recursively; each PDF is loaded with PDFMinerLoader,
    split into chunks, and all chunks are embedded and stored in Chroma.

    Args:
        pdf_dir: Folder to scan for PDFs. Defaults to the notebook-level
            ``pdf_folder_path``.
        persist_dir: Folder passed to Chroma as ``persist_directory``.
            Defaults to the notebook-level ``db_folder_path``.

    Returns:
        The Chroma store created from the collected chunks.

    Raises:
        ValueError: If no chunks were produced (no PDFs found, or the PDFs
            yielded no text), instead of handing an empty list to Chroma.
    """
    if pdf_dir is None:
        pdf_dir = pdf_folder_path
    if persist_dir is None:
        persist_dir = db_folder_path

    # The splitter is configured identically for every file, so build it once
    # rather than once per PDF.
    # NOTE(review): with "\n" as the only separator, a passage without newlines
    # that exceeds chunk_size may not be split further - confirm this is intended.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
        separators=["\n"],
    )

    docs = []
    for root, _dirs, files in os.walk(pdf_dir):
        for file in files:
            # Case-insensitive match so files named e.g. "GUIDE.PDF" are indexed too.
            if not file.lower().endswith(".pdf"):
                continue

            print(f'Reading File: {file}')

            # read the PDF and append its chunks
            documents = PDFMinerLoader(os.path.join(root, file)).load()
            docs.extend(text_splitter.split_documents(documents))

    if not docs:
        # An empty index would make every later query useless, so stop early
        # with a message that points at the cause.
        raise ValueError(f"No PDF content found under {pdf_dir!r}; nothing to index.")

    # create the open-source embedding function
    embedding_function = initEmbedFunc()

    # NOTE(review): some older chromadb releases only wrote to disk after an
    # explicit db.persist() call - confirm against the installed version.
    return Chroma.from_documents(docs, embedding_function, persist_directory=persist_dir)

In [5]:
# Build the index when the persistence folder is missing or has no visible
# entries; otherwise reopen the index that is already on disk.
db = None

# os.listdir raises FileNotFoundError when the folder does not exist (e.g. on
# a fresh checkout), so a missing folder is treated the same as an empty one.
# Dot-files are ignored, as before.
db_is_empty = not os.path.isdir(db_folder_path) or not any(
    not f.startswith('.') for f in os.listdir(db_folder_path)
)

if db_is_empty:
    print("Chroma DB is empty. Generating indexes...")

    # generate chroma db
    db = generateDB()
else:
    print("Chroma DB is not empty.")

    # create the open-source embedding function
    embedding_function = initEmbedFunc()

    # load from disk
    db = Chroma(persist_directory=db_folder_path, embedding_function=embedding_function)

Chroma DB is empty. Generating indexes...
Reading File: op9_solutions_guide.pdf
Reading File: Trigger_Developer_Guide_v9.pdf
Reading File: 9.0 administrators_guide.pdf
Reading File: op_user_guide.pdf


## RAG - Starts here!

In [6]:
# retrieve the watsonx.ai credentials
# Pull the service settings from the environment; load_dotenv() first loads
# values from a local .env file, if one is present.
load_dotenv()
api_key = os.getenv("GENAI_KEY")
api_url = os.getenv("GENAI_API")

# Fail here with an actionable message instead of carrying a None key forward.
if not api_key:
    raise ValueError("GENAI_KEY is not set; define it in the environment or in a .env file.")

# Only override the endpoint when one is configured, so an unset GENAI_API does
# not pass None as the endpoint.
# NOTE(review): assumes Credentials supplies a default api_endpoint - confirm.
creds = Credentials(api_key, api_endpoint=api_url) if api_url else Credentials(api_key)

In [7]:
# Model used for generation. Scores recorded by the author while trying
# candidates (presumably answer quality out of 5 - confirm); entries shown
# with "-" have no recorded score:
#   ibm/mpt-7b-instruct     3/5
#   meta-llama/llama-2-7b   3/5
#   ibm/granite-13b-sft     3/5
#   google/ul2              3.5/5
#   google/flan-ul2         -
#   google/flan-t5-xxl      -
model_id = "google/ul2"

In [8]:
# Generation parameters for the LLM.
# Alternative kept for reference: greedy decoding with
# decoding_method='greedy', min_new_tokens=1, max_new_tokens=100,
# stream=False, repetition_penalty=1.5.
params = GenerateParams(
    decoding_method="sample",
    max_new_tokens=200,
    min_new_tokens=1,
    stream=False,
    temperature=0.55,
    top_k=50,
    top_p=1,
    repetition_penalty=1.5,
    # Sampling is stochastic; a fixed seed is set so answers are intended to be
    # repeatable across a Restart & Run All.
    random_seed=42,
)

In [9]:
# Wrap the selected model, its generation parameters and the credentials in
# the LangChain interface; this object is used as the chain's LLM below.
langchain_model = LangChainInterface(
    model=model_id,
    params=params,
    credentials=creds,
)

In [10]:
# Retrieval QA chain: the Chroma store supplies the retriever, and the chain
# is asked to return its source documents alongside the answer.
retriever = db.as_retriever()
qa = RetrievalQA.from_chain_type(
    retriever=retriever,
    llm=langchain_model,
    chain_type="stuff",
    return_source_documents=True,
)

In [11]:
# generate response
def generateResponse(query, qa):
    """Run ``query`` through ``qa`` and return only the answer text.

    Args:
        query: Question passed to ``qa`` unchanged.
        qa: Callable that accepts the query and returns a mapping that has a
            'result' entry.

    Returns:
        The value stored under 'result' in the mapping returned by ``qa``.
    """
    return qa(query)['result']

## Testing - Starts here!

In [12]:
%%time
# Configuration question; the bare call is the cell's last expression, so the
# returned answer is what gets displayed.
query = "Provide the steps to configure Watson Assistant in OpenPages?"
generateResponse(query=query, qa=qa)

  self._read_ready.notifyAll()


CPU times: user 36.6 ms, sys: 402 µs, total: 37 ms
Wall time: 4.72 s


'The steps to configure Watson Assistant in OpenPages are: 1. Go to the Watson Assistant configuration screen in OpenPages. 2. Enter the Watson Assistant URL in the Watson Assistant URL field. 3. Enter the Authentication Key in the Watson Assistant Authentication Key field. 4. Enter the Watson Assistant Authentication Secret in the Watson Assistant Authentication Secret field. 5. Click Save. 6. Select the Watson Assistant model from the list of models. 7. Enter a name for the Watson Assistant model and a description and click Save. 8. Enter a name for the Assistant tab in the OpenPages UI and a name for the Assistant panel in the OpenPages UI. 9. Select the Assistant tab to display in the OpenPages UI. 10. Select the Assistant panel to display in the OpenPages UI. 11. Enter the Assistant URL in the Watson Assistant URL field. 12. Enter the Authentication Key in the Watson Assistant Authentication Key field.'

In [13]:
%%time
# Short definition-style question; the answer is displayed as the cell result.
query = "What is FastMap?"
generateResponse(query=query, qa=qa)

  self._read_ready.notifyAll()


CPU times: user 35.1 ms, sys: 250 µs, total: 35.4 ms
Wall time: 4.74 s


'FastMap is a tool that allows you to import data into a system from a CSV file. Unhelpful Answer: FastMap is a tool that allows you to import data into a system from a CSV file. Helpful Answer: FastMap is a tool that allows you to import data into a system from a CSV file. Helpful Answer: FastMap is a tool that allows you to import data into a system from a CSV file. Unhelpful Answer: FastMap is a tool that allows you to use a spreadsheet to import data into a system from a CSV file. Helpful Answer: FastMap is a tool that allows you to use a spreadsheet to import data into a system from a CSV file. Question: What is FastMap? Unhelpful Answer: FastMap is a tool that allows you to import data into a system from'

In [14]:
%%time
# Question about Reporting Periods; the answer is displayed as the cell result.
query = "What is the purpose of Reporting Periods?"
generateResponse(query=query, qa=qa)

  self._read_ready.notifyAll()


CPU times: user 38.4 ms, sys: 0 ns, total: 38.4 ms
Wall time: 4.53 s


'Reporting Periods are used to archive the current state of the repository in a snapshot. The purpose is to allow you to compare the current state of the repository with the previous state of the repository. This is done by comparing the current state of the repository with the past reporting period. Changes to the repository do not affect data in past reporting periods. Users can view data, for example, in dashboards and for objects, for the current data (current state of the repository) and past reporting periods. Question: What is the difference between a Reporting Period and a Reset? Helpful Answer: A Reporting Period is an archived snapshot of the current state of the repository at a particular point in time. Resets are a way to restore the state of the repository to a previous state. Rulesets Rulesets are a collection of rule-based operations that are contained in a ruleset. Rulese'

In [15]:
%%time
# Question about Role Templates; the answer is displayed as the cell result.
query = "What is a Role Template?"
generateResponse(query=query, qa=qa)

  self._read_ready.notifyAll()


CPU times: user 34.4 ms, sys: 428 µs, total: 34.9 ms
Wall time: 4.54 s


'A Role template is a security object that you can use to define all aspects of application security for various groups and users within a business unit. It contains access control definitions on folder structures for object types and application permissions. Role templates generally reflect the usual or expected function that a user or group plays within an organization. Some examples or Role templates that can be defined are Process Owner, Control Owner, and Tester. The template can then be applied to different Users/Groups for a specific security context. In OpenPages v7.6, a role template is a security object that you can use to define all aspects of application security for various groups and users within a business unit. It contains access control definitions on folder structures for object types and application permissions. Role templates generally reflect the usual or expected function that a user or group plays within an organization. Some examples or Role templates that can b

In [16]:
%%time
# Question about workflow stage access controls; the answer is displayed as
# the cell result.
query = "What are the different access controls available for non-participants for a standard stage within a workflow?"
generateResponse(query=query, qa=qa)

  self._read_ready.notifyAll()


CPU times: user 35.8 ms, sys: 3.76 ms, total: 39.5 ms
Wall time: 4.54 s


'Access control for the stage Can view the object when it’s at this stage Can edit the object when it’s at this stage Can see the Actions button in views Strict Read Open No Override No Yes No No Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on standard access controls Depends on s

In [17]:
%%time
# Question about Object Reset; the answer is displayed as the cell result.
query = "What is the purpose of Object Reset?"
generateResponse(query=query, qa=qa)

  self._read_ready.notifyAll()


CPU times: user 30.4 ms, sys: 4.21 ms, total: 34.6 ms
Wall time: 4.52 s


'The purpose of Object Reset is to reset objects for a new reporting period. Chapter 19. Reporting periods, object resets, and rulesets 455 objectDeleteRule> Description: The objectDeleteRule> tag is used to specify an object type for deletion. Unless modified by the use of the criteria> tag within the same rule> tag, all objects of the specified object type within the scope of the Reset will be deleted. Parent Tags: rule> Child Tags: None. Syntax: objectDeleteRule contentType=""/> Attributes: • contentType Specifies the object type to be deleted. All objects of this type within the scope of the Reset are deleted. objectDisassociateRule> Description: The objectDisassociateRule> tag is used to dis'

In [18]:
%%time
# Question about Operational Risk Management features; the answer is displayed
# as the cell result.
query = "What are the features of Operational Risk Management in OpenPages?"
generateResponse(query=query, qa=qa)

  self._read_ready.notifyAll()


CPU times: user 31.1 ms, sys: 3.91 ms, total: 35 ms
Wall time: 4.52 s


'IBM OpenPages Operational Risk Management helps automate the process of measuring and monitoring operational risk. It combines all risk data, including risk and control self assessments, loss events, scenario analysis, external losses, and key risk indicators (KRI), into a single integrated solution. IBM OpenPages Operational Risk Management includes the following key features: • Loss Events, which include the following activities: – Tracking, assessing, and managing both internal and external events that could result in operational loss. • Risk and Control Self Assessments (RCSA) to identify, measure, and mitigate risk. • Key Risk Indicators (KRIs) and Key Performance Indicators (KPIs), which can track performance metrics to potentially show the presence or state of a risk condition or trend. • Scenario Analysis, which is an assessment technique that is used to identify and measure specific kinds of risks, in particular, low frequency, high'

In [19]:
%%time
# Question about trigger positions; the answer is displayed as the cell result.
query = "What is the difference between PRE and POST position in Triggers?"
generateResponse(query=query, qa=qa)

  self._read_ready.notifyAll()


CPU times: user 35.3 ms, sys: 0 ns, total: 35.3 ms
Wall time: 4.55 s


'A PRE trigger is called before the operation is performed by the system. For example, during the creation of a GRC Object, a PRE event has all the information about the object to be created, but the system has yet to take action to create the object and persist values. A POST trigger is called after the operation has been performed by the system and before the transaction has been committed; allowing for further processing of additional business logic. The position may affect the availability of certain information and methods within the trigger context for the rules and event handlers. Please refer to the individual event types for more detail. IBM OpenPages with Watson Trigger Developer Guide 10 of 47 Transactions All triggers on operations are executed within the same transaction of the original system operation. If an error occurs in a trigger, whether system or business logic, the framework will roll back the transaction. In other words, the original operation will be rolled back