In [4]:
from dotenv import load_dotenv, find_dotenv
import os
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_text_splitters import CharacterTextSplitter
from langchain_ai21 import AI21Embeddings, ChatAI21

In [5]:
# Locate a .env file with find_dotenv() and load its variables into the
# process environment.
# NOTE(review): the credentials for the AI21 clients created further down are
# presumably expected to come from this file -- confirm.
load_dotenv(find_dotenv())

True

### Document loader

In [6]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter

# Load the raw source text as LangChain documents. The encoding is passed
# explicitly so the result does not depend on the platform's default encoding
# (the text contains non-ASCII characters such as em dashes).
# NOTE(review): assumes data/doc1.txt is UTF-8 encoded -- confirm.
loader = TextLoader("data/doc1.txt", encoding="utf-8")
documents = loader.load()

# Create the text splitter (chunk_size=500, chunk_overlap=50).
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)

# Split the loaded documents into chunks and show how many were produced.
docs = text_splitter.split_documents(documents)
len(docs)



20

### Create embeddings model

In [7]:
# Embedding model handed to FAISS when the vector store is built below.
# NOTE(review): no credentials are passed explicitly; presumably they are
# picked up from the environment -- confirm.
embeddings = AI21Embeddings()

### Create vector store

#### Create vector store from documents

In [8]:
VECTOR_STORE_PATH = "data/doc1_index"

# Reuse the persisted index when one exists; otherwise embed the chunks, build
# the index and persist it for the next run.
#
# load_local() needs the embeddings object (it is used to embed queries), and
# recent langchain-community releases additionally require an explicit opt-in
# because the docstore is unpickled. That is only acceptable here because the
# index on disk was written by this notebook itself -- never load an index
# from an untrusted source this way.
#
# NOTE: the cache is keyed on the path only. Delete the folder after changing
# data/doc1.txt, the splitter settings or the embedding model.
if os.path.exists(os.path.join(VECTOR_STORE_PATH, "index.faiss")):
    vector_store = FAISS.load_local(
        VECTOR_STORE_PATH,
        embeddings,
        allow_dangerous_deserialization=True,
    )
else:
    vector_store = FAISS.from_documents(docs, embeddings)
    vector_store.save_local(VECTOR_STORE_PATH)

# Number of vectors in the index.
vector_store.index.ntotal

20

In [10]:
# Number of vectors held by the FAISS index (same check as the last line of
# the cell that creates the vector store).
vector_store.index.ntotal

20

In [7]:
# Query the vector store directly (before wrapping it in a retriever) and
# display the matching chunks.
query = "What is Transformer"
matched_docs = vector_store.similarity_search(query=query)
matched_docs

[Document(page_content='Transformer models\nThe specific kind of neural networks used for LLMs are called transformer models. Transformer models are able to learn context — especially important for human language, which is highly context-dependent. Transformer models use a mathematical technique called self-attention to detect subtle ways that elements in a sequence relate to each other. This makes them better at understanding context than other types of machine learning. It enables them to understand, for instance, how the end of a sentence connects to the beginning, and how the sentences in a paragraph relate to each other.', metadata={'source': 'data/doc1.txt'}),
 Document(page_content='What is a large language model (LLM)?\nA large language model (LLM) is a type of artificial intelligence (AI) program that can recognize and generate text, among other tasks. LLMs are trained on huge sets of data — hence the name "large." LLMs are built on machine learning: specifically, a type of ne

In [8]:
# Wrap the vector store in a retriever that returns only the single best
# match (k=1) for a query, then try it on the sample query.
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs=dict(k=1),
)
retrived_docs = retriever.invoke(query)
retrived_docs

[Document(page_content='Transformer models\nThe specific kind of neural networks used for LLMs are called transformer models. Transformer models are able to learn context — especially important for human language, which is highly context-dependent. Transformer models use a mathematical technique called self-attention to detect subtle ways that elements in a sequence relate to each other. This makes them better at understanding context than other types of machine learning. It enables them to understand, for instance, how the end of a sentence connects to the beginning, and how the sentences in a paragraph relate to each other.', metadata={'source': 'data/doc1.txt'})]

#### Create Chat model

In [9]:
# Chat model shared by every chain below.
llm = ChatAI21(
    model="j2-ultra",
    temperature=0.1,
    max_tokens=100,
)

#### Function to format retrieved documents

In [10]:
def format_docs(docs):
    """Concatenate the text of the given documents into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The ``page_content`` values, in order, separated by a blank line.
    """
    separator = "\n\n"
    contents = [document.page_content for document in docs]
    return separator.join(contents)

### RAG without chat history

In [11]:
from langchain_core.prompts import PromptTemplate

# Prompt for single-turn RAG: `{context}` receives the formatted retrieved
# chunks and `{question}` the user's question.
template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Use three sentences maximum and keep the answer as concise as possible.
Always say "thanks for asking!" at the end of the answer.

{context}

Question: {question}

Helpful Answer:"""
custom_rag_prompt = PromptTemplate.from_template(template)

# The chain input is the question string. It is fanned out into:
#   - "context":  retriever -> format_docs (retrieved chunks as plain text)
#   - "question": the question itself, forwarded unchanged
# RunnablePassthrough is the LangChain idiom for "forward the input as is".
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | custom_rag_prompt
    | llm
    | StrOutputParser()
)

rag_chain.invoke("What is Transformer?")

'Transformer is a type of neural network used for LLMs (Language Learning Models). It is able to learn context, which makes it especially effective for human language, which is highly context-dependent. Transformer uses self-attention to detect subtle ways that elements in a sequence relate to each other, enabling it to understand context better than other types of machine learning.'

### RAG with chat history

#### Contextualizing the question

In [17]:
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

### Contextualize question chain ###
# System instruction asking the model to rewrite the latest user turn as a
# standalone question. The adjacent literals concatenate into one
# single-line prompt.
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, formulate a standalone question "
    "which can be understood without the chat history. Do NOT answer the question, "
    "just reformulate it if needed and otherwise return it as is."
)

# Message layout: system instruction, prior chat turns, latest user input.
contextualize_q_prompt = ChatPromptTemplate.from_messages([
    ("system", contextualize_q_system_prompt),
    MessagesPlaceholder("chat_history"),
    ("human", "{input}"),
])

# Retriever variant built from the model, the base retriever and the prompt.
history_aware_retriever = create_history_aware_retriever(
    llm,
    retriever,
    contextualize_q_prompt,
)

In [25]:
from langchain.memory import ChatMessageHistory
# Seed a short conversation to contextualize a follow-up question against.
# Each entry pairs the method that records the turn with the turn's text.
chat_history = ChatMessageHistory()
for record_turn, text in (
    (chat_history.add_user_message, "What is transformer?"),
    (chat_history.add_ai_message, "Transformer is a type of neural network used for LLMs (Language Learning Models). It is able to learn context, which makes it especially effective for human language, which is highly context-dependent. Transformer uses self-attention to detect subtle ways that elements in a sequence relate to each other, enabling it to understand context better than other types of machine learning."),
    (chat_history.add_user_message, "Yes, Transformer is a type of neural network, it's core technology is self-attention."),
    (chat_history.add_ai_message, "Yes, that's correct."),
):
    record_turn(text)

chat_history.messages

[HumanMessage(content='What is transformer?'),
 AIMessage(content='Transformer is a type of neural network used for LLMs (Language Learning Models). It is able to learn context, which makes it especially effective for human language, which is highly context-dependent. Transformer uses self-attention to detect subtle ways that elements in a sequence relate to each other, enabling it to understand context better than other types of machine learning.'),
 HumanMessage(content="Yes, Transformer is a type of neural network, it's core technology is self-attention."),
 AIMessage(content="Yes, that's correct.")]

In [34]:
# Prompt -> model -> plain string: rewrites a follow-up into a standalone
# question.
contextualize_chain = (
    contextualize_q_prompt
    | llm
    | StrOutputParser()
)

In [39]:
# Rewrite an ambiguous follow-up into a standalone question, using the seeded
# conversation as context.
contextualize_question = contextualize_chain.invoke(
    {
        "chat_history": chat_history.messages,
        "input": "Can you tell me more about that?",
    }
)
contextualize_question

'Can you explain the concept of self-attention in the context of Transformer?'

#### Retrieve documents

In [41]:
# Retrieve the chunk(s) matching the rewritten, standalone question.
# NOTE: this rebinds `docs`, which earlier held the full list of split chunks;
# after this cell it holds only the retrieved documents.
docs = retriever.invoke(contextualize_question)
docs

[Document(page_content='Transformer models\nThe specific kind of neural networks used for LLMs are called transformer models. Transformer models are able to learn context — especially important for human language, which is highly context-dependent. Transformer models use a mathematical technique called self-attention to detect subtle ways that elements in a sequence relate to each other. This makes them better at understanding context than other types of machine learning. It enables them to understand, for instance, how the end of a sentence connects to the beginning, and how the sentences in a paragraph relate to each other.', metadata={'source': 'data/doc1.txt'})]

#### QA chain

In [36]:
### Answer question chain ###
# System prompt for the answering step; `{context}` is filled in at invoke
# time. A trailing backslash inside the triple-quoted string joins a line with
# the next one, so the instruction sentences form a single line. The backslash
# after "concise." also swallows that line's newline, so the instructions and
# `{context}` are separated by one newline rather than a blank line.
qa_system_prompt = """You are an assistant for question-answering tasks. \
Use the following pieces of retrieved context to answer the question. \
If you don't know the answer, just say that you don't know. \
Use three sentences maximum and keep the answer concise.\

{context}"""
# Message layout: system instructions plus context, prior chat turns, then the
# latest user input.
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)


In [43]:
# Prompt -> model -> plain string: answers a question from the supplied
# context and chat history.
chain = (
    qa_prompt
    | llm
    | StrOutputParser()
)

In [44]:
# Answer the standalone question from the retrieved context.
# The documents are rendered to plain text with format_docs() first, so the
# prompt receives only their page content. Passing the list directly would
# insert its Python repr, metadata included.
chain.invoke(
    {
        "context": format_docs(docs),
        "chat_history": chat_history.messages,
        "input": contextualize_question,
    }
)

'Self-attention is a mathematical technique used by Transformer models to detect subtle ways that elements in a sequence relate to each other. It enables them to understand context better than other types of machine learning. For example, self-attention helps Transformer models understand how the end of a sentence connects to the beginning, and how the sentences in a paragraph relate to each other.'