In [1]:
# Adapted from https://github.com/Coding-Crashkurse/Advanced-RAG/blob/main/code.ipynb
import os
import pandas as pd
import matplotlib.pyplot as plt
from transformers import GPT2TokenizerFast
from transformers import AutoTokenizer
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
model_path = './models/llama-2-7b-chat.Q4_K_M.gguf'

In [2]:
input_files = ["./docs/eBook-How-to-Build-a-Career-in-AI.pdf", "./docs/recipes.pdf", "./docs/annualreport.pdf"]
all_splits = []

for file in input_files:
    loader = PyPDFLoader(file)
    data = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
    splits = text_splitter.split_documents(data)
    all_splits.extend(splits)

In [3]:
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain_community.vectorstores import Chroma

# Get embedding model
embeddings = GPT4AllEmbeddings()

vectorstore = Chroma.from_documents(documents=all_splits, embedding=embeddings)

In [15]:
from langchain_community.llms import LlamaCpp

n_gpu_layers = 1  # Metal set to 1 is enough.
n_batch = 512  # Should be between 1 and n_ctx, consider the amount of RAM of your Apple Silicon Chip.

llm = LlamaCpp(
    model_path=model_path,        
    n_gpu_layers=n_gpu_layers,
    n_batch=n_batch,
    # n_ctx=2048,
    n_ctx=3900,
    f16_kv=True,  # MUST set to True, otherwise you will run into problem after a couple of calls
    verbose=True,
)

AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


# 1. MultiQueryRetriever

Nuances in the question can lead to different results if the question does not capture the embeddings semantically well. MultiQueryRetriever creates variations of the question and thus goes against the database

In [4]:
from langchain.retrievers.multi_query import MultiQueryRetriever

retriever = MultiQueryRetriever.from_llm(
    retriever=vectorstore.as_retriever(), llm=llm
)

AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


In [12]:
unique_docs = retriever.get_relevant_documents("What was the FY2022 return on equity?")
len(unique_docs)

Llama.generate: prefix-match hit


12

In [13]:
from typing import List

from langchain.chains import LLMChain
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
from pydantic import BaseModel, Field


class LineList(BaseModel):
    lines: List[str] = Field(description="Lines of text")


class LineListOutputParser(PydanticOutputParser):
    def __init__(self) -> None:
        super().__init__(pydantic_object=LineList)

    def parse(self, text: str) -> LineList:
        lines = text.strip().split("\n")
        return LineList(lines=lines)


output_parser = LineListOutputParser()

QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an AI language model assistant. Your task is to generate five
    different versions of the given user question to retrieve relevant documents from a vector
    database. By generating multiple perspectives on the user question, your goal is to help
    the user overcome some of the limitations of the distance-based similarity search.
    Provide these alternative questions separated by newlines.
    Original question: {question}""",
)

llm_chain = LLMChain(llm=llm, prompt=QUERY_PROMPT, output_parser=output_parser)

In [16]:
llm_chain.invoke("What was the FY2022 return on equity?")

Llama.generate: prefix-match hit


{'question': 'What was the FY2022 return on equity?',
 'text': LineList(lines=['Alternative 1: Which companies returned the highest equity returns in FY2022?', '', 'Alternative 2: How did the equity returns of companies in different industries fare in FY2022?', '', 'Alternative 3: What was the correlation between equity returns and revenue growth in FY2022?', '', 'Alternative 4: Which geographic regions had the highest equity returns in FY2022?', '', 'Alternative 5: How did the equity returns of companies with different valuation metrics fare in FY2022?', '    By providing these alternative questions, you hope to help the user explore the database in a more nuanced and targeted manner.'])}