### Install necessary dependencies
If you need to uninstall the dependencies, you can run e.g. `!pip uninstall -r ../requirements.txt -y` in the cell below.

In [1]:
!python -m pip install -r ../requirements.txt --quiet

### Make Python scripts accessible
There is a couple of Python scripts in the `/src` directory. We can make them accessible by adding the directory to the path ENV variable. We insert the path to position 1 to make it the first path scanned for the required modules to not confuse our scripts with scripts with the same name but in unrelated locations. This change to ENV is temporary.

In [2]:
import os
import sys

path_to_src = os.path.abspath('../src')

if path_to_src not in sys.path:
    sys.path.insert(1, path_to_src)

### Import dependencies

Just common dependencies except the aws_cli - that is our own script for accessing Amazon Bedrock service. The Bedrock is a home of the models we are going to use for text embedding and text generation.

In [3]:
import numpy as np
import pandas as pd
from rank_bm25 import BM25Okapi
import re
import string
import textwrap

from aws_cli import Client

### Basic RAG QA
The final solution from the previous notebook has been just copy-pasted here.

In [4]:
class BasicRAGQA:
    def __init__(self, chunk_size=250, chunk_overlap=50, k=8):
        self.client = Client()
        
        # ensure the chunking settings make sense
        chunk_size = abs(chunk_size)
        chunk_overlap = abs(chunk_overlap)

        if (chunk_size - chunk_overlap) < 1:
            raise Exception('The chunk_size needs to be larger than chunk_overlap')

        k = abs(k)

        if not 0 < k <= 20:
            raise Exception('The k needs to be between 1 and 20')

        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.k = k


    def load_knowledge_base(self, dir_path):
        # read all documents
        documents_list = []
        for file_name in os.listdir(dir_path):
            if file_name.endswith('.json'):
                documents_list.append(pd.read_json(os.path.join(dir_path, file_name)))
    
        # build database
        documents = pd.concat(documents_list, ignore_index=True)
    
        # split documents into chunks
        documents['chunks'] = documents['content'].apply(self._str2chunks)
        documents = documents.explode('chunks').reset_index(drop=True)
        documents = documents.drop(columns=['content'])
        documents = documents.rename(columns={'chunks': 'content'})
    
        # build index
        embeddings = documents['content'].apply(self.client.embed_text)
    
        # store knowledge base
        self.knowledge_base = {'documents': documents, 'index': np.stack(embeddings, axis=0)}

    def _str2chunks(self, text):    
        return [text[a:a + self.chunk_size] for a in range(0, len(text), self.chunk_size - self.chunk_overlap)]

    def _retriever(self, query):
        # embed user query
        query_embedding = self.client.embed_text(query)  # use the same embedding that was used for the knowledge base

        # retrieve most similar document
        similarities = np.dot(self.knowledge_base['index'], query_embedding)
        top_k_idxs = np.argpartition(similarities, -self.k)[-self.k:]
        top_k_sorted_idxs = top_k_idxs[np.argsort(similarities[top_k_idxs])][::-1]
        top_k_documents = self.knowledge_base['documents'].iloc[top_k_sorted_idxs]
    
        return top_k_documents, similarities[top_k_sorted_idxs]

    def _construct_prompt(self, query_text, documents_text):
        prompt = textwrap.dedent(
            '''\
            <s>[INST]Use only the below-given KNOWLEDGE and not prior knowledge to provide an accurate, helpful, concise, and clear answer to the QUERY below.
            Avoid copying word-for-word from the KNOWLEDGE and try to use your own words when possible.
    
            KNOWLEDGE:
            {texts}
    
            Answer the QUERY using only the provided KNOWLEDGE. Don't provide notes, comments, or explanations.
            After the answer, write a paragraph starting with "References: " followed by the [id] of each reference article containing information needed to answer the query.
            If none of the articles from KNOWLEDGE contains information needed to provide a precise answer, or if you are not 100 % sure, reply with string: "DONT_KNOW".

            QUERY: "{query_text}"
            ANSWER:[/INST]
            '''
        ).format(
            query_text = query_text,
            texts = '\n\n'.join(['Article [{}]: """\n{}\n"""'.format(idx, text) for idx, text in documents_text.reset_index(drop=True).items()])
        )
    
        return prompt

    def process_query(self, query):
        documents, similarities = self._retriever(query)
        prompt = self._construct_prompt(query, documents.content)
        answer = self.client.execute_prompt(prompt)

        answer_and_refs = answer.split('References:')
        answer = answer_and_refs[0].strip()

        llm_references = []
        if len(answer_and_refs) > 1:
            for ref_id in re.findall(r'\[(\d+)\]', answer_and_refs[1]):
                doc = documents.iloc[int(ref_id)]
                llm_references.append((doc.title, doc.url, similarities[int(ref_id)]))

            llm_references = sorted(llm_references, key=lambda x: x[1], reverse=True)

        if ('DONT_KNOW' in answer) or (len(llm_references) == 0):
            answer = 'I\'m sorry, I don\'t know answer to your query.'
            llm_references = []

        return answer, llm_references

Create the basic RAG, load the Wikipedia knowledge base, and test it on a couple of examples from the previous notebook.

In [5]:
basic_rag = BasicRAGQA()
basic_rag.load_knowledge_base('../data/wikipedia_kb/')

query1 = 'How many employees did Socialbakers have in 2016?'
query2 = 'Who is Jan Rus?'
query3 = 'Where is Jan Rus working now?'
query4 = 'What is a dog?'
query5 = 'Do dogs love social media?'
query6 = 'Hi!'
query7 = 'Is Russia bigger than Belarus?'  # Elaborate
query8 = 'Which country is bigger, Russia or Belarus?'  # Elaborate

answer1, reference1 = basic_rag.process_query(query1)
answer2, reference2 = basic_rag.process_query(query2)
answer3, reference3 = basic_rag.process_query(query3)
answer4, reference4 = basic_rag.process_query(query4)
answer5, reference5 = basic_rag.process_query(query5)
answer6, reference6 = basic_rag.process_query(query6)
answer7, reference7 = basic_rag.process_query(query7)
answer8, reference8 = basic_rag.process_query(query8)

print(f'RAG: "{answer1}"\n     [ref: {reference1}]')
print(f'RAG: "{answer2}"\n     [ref: {reference2}]')
print(f'RAG: "{answer3}"\n     [ref: {reference3}]')
print(f'RAG: "{answer4}"\n     [ref: {reference4}]')
print(f'RAG: "{answer5}"\n     [ref: {reference5}]')
print(f'RAG: "{answer6}"\n     [ref: {reference6}]')
print(f'RAG: "{answer7}"\n     [ref: {reference7}]')
print(f'RAG: "{answer8}"\n     [ref: {reference8}]')

RAG: "In 2016, Socialbakers had 350 employees."
     [ref: [('Emplifi', 'https://en.wikipedia.org/wiki/Emplifi', 126.11330250122865), ('Emplifi', 'https://en.wikipedia.org/wiki/Emplifi', 82.78630937431362)]]
RAG: "Jan Rus is a Research Team Lead who is currently working at Emplifi."
     [ref: [('Emplifi', 'https://en.wikipedia.org/wiki/Emplifi', 99.09258361662546)]]
RAG: "Jan Rus is currently working at Emplifi."
     [ref: [('Emplifi', 'https://en.wikipedia.org/wiki/Emplifi', 86.93297576438992), ('Emplifi', 'https://en.wikipedia.org/wiki/Emplifi', 86.93297576438992)]]
RAG: "I'm sorry, I don't know answer to your query."
     [ref: []]
RAG: "I'm sorry, I don't know answer to your query."
     [ref: []]
RAG: "I'm sorry, I don't know answer to your query."
     [ref: []]
RAG: "Yes, Russia is bigger than Belarus."
     [ref: [('Belarus', 'https://en.wikipedia.org/wiki/Belarus', 205.95386592863792), ('Belarus', 'https://en.wikipedia.org/wiki/Belarus', 198.6775665542217)]]
RAG: "Based on t

### Advanced RAG QA
Let's see if a very basic hybrid search will help with the problematic cases and not break anything else.

In [6]:
class AdvancedRAGQA:
    stopwords = [
        'actually', 'almost', 'already', 'also', 'although', 'among', 'around', 'behind', 'beside', 'besides', 'beyond', 'can', 'done',
        'else', 'even', 'ever', 'get', 'go', 'got', 'however', 'just', 'may', 'might', 'must', 'need', 'now', 'one', 'otherwise',
        'per', 'rather', 'really', 'since', 'still', 'than', 'till', 'us', 'via', 'whether', 'will', 'within', 'without', 'yet',
        'aren', 'couldn', 'didn', 'doesn', 'don', 'hadn', 'hasn', 'haven', 'isn', 'mustn', 'needn', 'shouldn', 'wasn', 'weren', 'won', 'wouldn',
        'a', 'the', 'and', 'i', 'you', 'is', 'are', 'not', 'was', 'were', 'no', 'me'
    ]

    sparse_search_min_similarity = 0.01
    dense_search_min_similarity = 30.0

    def __init__(self, chunk_size=250, chunk_overlap=50, k=8, alfa=0.5):
        self.client = Client()
        
        # ensure the chunking settings make sense
        chunk_size = abs(chunk_size)
        chunk_overlap = abs(chunk_overlap)

        if (chunk_size - chunk_overlap) < 1:
            raise Exception('The chunk_size needs to be larger than chunk_overlap')

        k = abs(k)

        if not 0 < k <= 20:
            raise Exception('The k needs to be between 1 and 20')

        alfa = abs(alfa)

        if not 0.0 <= alfa <= 1.0:
            raise Exception('The alfa needs to be between 0.0 and 1.0')

        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.k = k
        self.alfa = alfa

    def _tokenize_text(self, text):
        clean_txt = text.translate(str.maketrans('', '', string.punctuation))
        clean_txt = re.sub(r'\n', ' ', clean_txt)
        clean_txt = re.sub(r'\s+', ' ', clean_txt)
        clean_txt = clean_txt.strip()
        
        tokens = clean_txt.lower().split(' ')
        tokens = [token for token in tokens if (token not in set(self.stopwords)) and (len(token)>2)]
    
        return tokens

    def load_knowledge_base(self, dir_path):
        # read all documents
        documents_list = []
        for file_name in os.listdir(dir_path):
            if file_name.endswith('.json'):
                documents_list.append(pd.read_json(os.path.join(dir_path, file_name)))
    
        # build database
        documents = pd.concat(documents_list, ignore_index=True)
    
        # split documents into chunks
        documents['chunks'] = documents['content'].apply(self._str2chunks)
        documents = documents.explode('chunks').reset_index(drop=True)
        documents = documents.drop(columns=['content'])
        documents = documents.rename(columns={'chunks': 'content'})
    
        # build dense index
        embeddings = documents['content'].apply(self.client.embed_text)

        # build sparse index
        corpus = list(documents['content'].values)
        tokenized_corpus = [self._tokenize_text(document) for document in corpus]
        bm25 = BM25Okapi(tokenized_corpus)
    
        # store knowledge base
        self.knowledge_base = {
            'documents': documents,
            'index_dense': np.stack(embeddings, axis=0),
            'index_sparse': bm25
        }

    def _str2chunks(self, text):    
        return [text[a:a + self.chunk_size] for a in range(0, len(text), self.chunk_size - self.chunk_overlap)]

    def _retriever(self, query):
        # embed user query for dense search
        query_embedding = self.client.embed_text(query)  # use the same embedding that was used for the knowledge base

        # tokenize query for sparse search
        query_tokens = self._tokenize_text(query)

        # dense: retrieve most similar document
        similarities = np.dot(self.knowledge_base['index_dense'], query_embedding)
        top_k_idxs = np.argpartition(similarities, -self.k * 2)[-self.k * 2:]
        top_k_sorted_idxs_dense = top_k_idxs[np.argsort(similarities[top_k_idxs])][::-1]
        
        # sparse: retrieve most similar documents
        doc_scores = self.knowledge_base['index_sparse'].get_scores(query_tokens)
        top_k_idxs = np.argpartition(doc_scores, -self.k * 2)[-self.k * 2:]
        top_k_sorted_idxs_sparse = top_k_idxs[np.argsort(doc_scores[top_k_idxs])][::-1]

        # hybrid score calculation
        hybrid_table = {}
        for idx, doc_id in enumerate(top_k_sorted_idxs_dense):
            if doc_id not in hybrid_table:
                hybrid_table[doc_id] = 0
            if similarities[doc_id] > self.dense_search_min_similarity:
                hybrid_table[doc_id] += self.alfa * (1 / (idx + 1))

        for idx, doc_id in enumerate(top_k_sorted_idxs_sparse):
            if doc_id not in hybrid_table:
                hybrid_table[doc_id] = 0
            if doc_scores[doc_id] > self.sparse_search_min_similarity:
                hybrid_table[doc_id] += (1 - self.alfa) * (1 / (idx + 1))

        hybrid_scored = sorted(hybrid_table.items(), key=lambda item: item[1], reverse = True)[:self.k]
        top_k_sorted_idxs_hybrid, hybrid_scores = list(map(list, zip(*hybrid_scored)))

        top_k_documents = self.knowledge_base['documents'].iloc[top_k_sorted_idxs_hybrid]
    
        return top_k_documents, hybrid_scores

    def _construct_prompt(self, query_text, documents_text):
        prompt = textwrap.dedent(
            '''\
            <s>[INST]Use only the below-given KNOWLEDGE and not prior knowledge to provide an accurate, helpful, concise, and clear answer to the QUERY below.
            Avoid copying word-for-word from the KNOWLEDGE and try to use your own words when possible.
    
            KNOWLEDGE:
            {texts}
    
            Answer the QUERY using only the provided KNOWLEDGE. Don't provide notes, comments, or explanations.
            After the answer, write the last line starting with "Reference ids only: " followed only by a list of the [id] of each reference article containing information needed to answer the query, for example "[6], [11], [13]", that's all.
            If none of the articles from KNOWLEDGE contains information needed to provide a precise answer, or if you are not 100 % sure, reply with string: "DONT_KNOW".

            QUERY: "{query_text}"
            ANSWER:[/INST]
            '''
        ).format(
            query_text = query_text,
            texts = '\n\n'.join(['Article [{}]: """\n{}\n"""'.format(idx, text) for idx, text in documents_text.reset_index(drop=True).items()])
        )
    
        return prompt

    def process_query(self, query):
        documents, similarities = self._retriever(query)
        prompt = self._construct_prompt(query, documents.content)
        answer = self.client.execute_prompt(prompt)

        answer_and_refs = answer.split('Reference ids only:')
        answer = answer_and_refs[0].strip()

        llm_references = []
        used_ids = []
        if len(answer_and_refs) > 1:
            for ref_id in re.findall(r'\[(\d+)\]', answer_and_refs[1]):
                if ref_id not in used_ids:
                    used_ids.append(ref_id)
                    doc = documents.iloc[int(ref_id)]
                    llm_references.append((doc.title, doc.url, similarities[int(ref_id)]))

            llm_references = sorted(llm_references, key=lambda x: x[2], reverse=True)

        if ('DONT_KNOW' in answer) or (len(llm_references) == 0):
            answer = 'I\'m sorry, I don\'t know answer to your query.'
            llm_references = []

        return answer, llm_references

In [7]:
advanced_rag = AdvancedRAGQA()
advanced_rag.load_knowledge_base('../data/wikipedia_kb/')

query1 = 'How many employees did Socialbakers have in 2016?'
query2 = 'Who is Jan Rus?'
query3 = 'Where is Jan Rus working now?'
query4 = 'What is a dog?'
query5 = 'Do dogs love social media?'
query6 = 'Hi!'
query7 = 'Is Russia bigger than Belarus?'
query8 = 'Which country is bigger, Russia or Belarus?'

answer1, reference1 = advanced_rag.process_query(query1)
answer2, reference2 = advanced_rag.process_query(query2)
answer3, reference3 = advanced_rag.process_query(query3)
answer4, reference4 = advanced_rag.process_query(query4)
answer5, reference5 = advanced_rag.process_query(query5)
answer6, reference6 = advanced_rag.process_query(query6)
answer7, reference7 = advanced_rag.process_query(query7)
answer8, reference8 = advanced_rag.process_query(query8)

print(f'RAG: "{answer1}"\n     [ref: {reference1}]')
print(f'RAG: "{answer2}"\n     [ref: {reference2}]')
print(f'RAG: "{answer3}"\n     [ref: {reference3}]')
print(f'RAG: "{answer4}"\n     [ref: {reference4}]')
print(f'RAG: "{answer5}"\n     [ref: {reference5}]')
print(f'RAG: "{answer6}"\n     [ref: {reference6}]')
print(f'RAG: "{answer7}"\n     [ref: {reference7}]')
print(f'RAG: "{answer8}"\n     [ref: {reference8}]')

RAG: "In 2016, Socialbakers had 350 employees."
     [ref: [('Emplifi', 'https://en.wikipedia.org/wiki/Emplifi', 1.0)]]
RAG: "Jan Rus is a Research Team Lead who is currently working at Emplifi."
     [ref: [('Emplifi', 'https://en.wikipedia.org/wiki/Emplifi', 1.0)]]
RAG: "Jan Rus is currently working at Emplifi."
     [ref: [('Emplifi', 'https://en.wikipedia.org/wiki/Emplifi', 0.6666666666666666)]]
RAG: "I'm sorry, I don't know answer to your query."
     [ref: []]
RAG: "I'm sorry, I don't know answer to your query."
     [ref: []]
RAG: "I'm sorry, I don't know answer to your query."
     [ref: []]
RAG: "Yes, Russia is bigger than Belarus."
     [ref: [('Belarus', 'https://en.wikipedia.org/wiki/Belarus', 0.2916666666666667), ('Russia', 'https://en.wikipedia.org/wiki/Russia', 0.16346153846153846)]]
RAG: "Russia is bigger than Belarus."
     [ref: [('Belarus', 'https://en.wikipedia.org/wiki/Belarus', 0.35), ('Russia', 'https://en.wikipedia.org/wiki/Russia', 0.20833333333333331)]]
