# Challenges encountered

1. Open-source models could not be used because of their large size.
2. Hit the limit on the Hugging Face Hub (HF Hub), which resulted in errors.
3. Fine-tuning could not be completed because of the large model size and limited PC specifications.

# Sections

1. Main: contains the final code.
2. Rough/Experiments: provides a detailed, step-by-step record of the process.


# Main

This section includes the final code.


In [None]:
from langchain_community.document_loaders import PyPDFLoader, PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain_openai import OpenAI, ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
import os
from dotenv import load_dotenv


class QAsystem:
    """Conversational retrieval-augmented QA over a directory of PDF files.

    Instantiating the class runs the whole pipeline eagerly: the PDFs are
    loaded, joined into one corpus, split into chunks, embedded into a FAISS
    index and wrapped in a history-aware retrieval chain.  Afterwards
    :meth:`ask_question` can be called repeatedly; chat history is kept in an
    in-memory dict keyed by ``session_id``.

    Args:
        pdf_directory: Directory handed to ``PyPDFDirectoryLoader``.
        openai_api_key: Key passed to ``ChatOpenAI`` and ``OpenAIEmbeddings``.
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Overlap between neighbouring chunks, measured with ``len``.

    Raises:
        RuntimeError: If the PDF directory cannot be loaded.
    """

    def __init__(self, pdf_directory, openai_api_key, chunk_size=100, chunk_overlap=10):
        self.pdf_directory = pdf_directory
        self.openai_api_key = openai_api_key
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        # Each step stores its result on ``self`` and relies on the attributes
        # produced by the steps before it, so the order below matters.
        self.load_pdf_documents()
        self.clean_corpus()
        self.create_text_splitter()
        self.create_openai_client()
        self.create_retriever()
        self.create_contextualize_question_prompt()
        self.create_qa_prompt()
        self.create_rag_chain()
        self.create_conversational_rag_chain()

    def load_pdf_documents(self):
        """Load every PDF in ``self.pdf_directory`` into ``self.pages``.

        Raises:
            RuntimeError: If loading fails.  The original error is chained so
                the root cause stays visible.
        """
        try:
            self.loader = PyPDFDirectoryLoader(self.pdf_directory)
            self.pages = self.loader.load()
        except Exception as e:
            # Fail fast: continuing without ``self.pages`` would only surface
            # later as an unrelated AttributeError in ``clean_corpus``.
            raise RuntimeError(f"Error loading PDF: {e}") from e
        print(f"Total Docs: {len(self.pages)}")

    def clean_corpus(self):
        """Join all page texts into one corpus, replacing tabs with spaces.

        Sets ``self.corpus`` and ``self.cleaned_corpus``.  No RegEx cleaning
        rules are applied yet, so both attributes hold the same string.
        """
        self.corpus = " ".join(
            page.page_content.replace("\t", " ") for page in self.pages
        )
        print("Preprocessing corpus... ")
        self.cleaned_corpus = self.corpus

    def create_text_splitter(self):
        """Split the cleaned corpus into chunks wrapped as documents.

        Sets ``self.text_splitter`` and ``self.splitted_corpus_in_docs``.
        """
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            length_function=len,
        )
        splitted_corpus = self.text_splitter.split_text(self.cleaned_corpus)
        self.splitted_corpus_in_docs = self.text_splitter.create_documents(
            splitted_corpus
        )

    def create_openai_client(self):
        """Create the chat model used for question rewriting and answering."""
        self.openAIclient = ChatOpenAI(
            api_key=self.openai_api_key,
        )

    def create_retriever(self):
        """Embed the chunks into a FAISS index and expose it as a retriever."""
        self.openaiEmbeddings = OpenAIEmbeddings(openai_api_key=self.openai_api_key)
        self.vectordb = FAISS.from_documents(
            self.splitted_corpus_in_docs, self.openaiEmbeddings
        )
        self.retriever = self.vectordb.as_retriever()

    def create_contextualize_question_prompt(self):
        """Build the prompt and retriever that rewrite follow-up questions.

        The system prompt asks the model to turn the latest user question into
        a standalone question using the chat history, without answering it.
        """
        self.contextualize_q_system_prompt = (
            "Given a chat history and the latest user question "
            "which might reference context in the chat history, "
            "formulate a standalone question which can be understood "
            "without the chat history. Do NOT answer the question, "
            "just reformulate it if needed and otherwise return it as is."
        )
        self.contextualize_q_prompt = ChatPromptTemplate.from_messages(
            [
                ("system", self.contextualize_q_system_prompt),
                MessagesPlaceholder("chat_history"),
                ("human", "{input}"),
            ]
        )
        self.history_aware_retriever = create_history_aware_retriever(
            self.openAIclient, self.retriever, self.contextualize_q_prompt
        )

    def create_qa_prompt(self):
        """Build the answering prompt and the chain that fills in ``{context}``."""
        self.system_prompt = (
            "You are an assistant for question-answering tasks. "
            "Use the following pieces of retrieved context to answer "
            "the question. If you don't know the answer, say that you "
            "don't know. Use three sentences maximum and keep the "
            "answer concise."
            "\n\n"
            "{context}"
        )
        self.qa_prompt = ChatPromptTemplate.from_messages(
            [
                ("system", self.system_prompt),
                MessagesPlaceholder("chat_history"),
                ("human", "{input}"),
            ]
        )
        self.question_answer_chain = create_stuff_documents_chain(
            self.openAIclient, self.qa_prompt
        )

    def create_rag_chain(self):
        """Combine the history-aware retriever with the answering chain."""
        self.rag_chain = create_retrieval_chain(
            self.history_aware_retriever, self.question_answer_chain
        )

    def create_conversational_rag_chain(self):
        """Wrap the RAG chain so chat history is tracked per session id."""
        # In-memory registry: session id -> that session's message history.
        self.store = {}

        def get_session_history(session_id: str) -> BaseChatMessageHistory:
            """Return the history for ``session_id``, creating it on first use."""
            if session_id not in self.store:
                self.store[session_id] = ChatMessageHistory()
            return self.store[session_id]

        self.conversational_rag_chain = RunnableWithMessageHistory(
            self.rag_chain,
            get_session_history,
            input_messages_key="input",
            history_messages_key="chat_history",
            output_messages_key="answer",
        )

    def ask_question(self, question, session_id="abc123"):
        """Ask ``question`` within the given session and return the answer.

        Args:
            question: The user's question.
            session_id: Key selecting which stored chat history to use.

        Returns:
            The ``"answer"`` entry of the chain's result.
        """
        response = self.conversational_rag_chain.invoke(
            {"input": question},
            config={"configurable": {"session_id": session_id}},
        )["answer"]
        return response


# Load variables from a local .env file (python-dotenv), then read the key.
load_dotenv()
openai_api_key = os.environ.get("OPENAIAPIKEY")
if openai_api_key in (None, ""):
    print("OPENAIAPIKEY environment variable not set.")

# Build the pipeline over ./pdfs and ask one sample question.
pdf_qa_system = QAsystem(openai_api_key=openai_api_key, pdf_directory="pdfs")
response = pdf_qa_system.ask_question(
    question="List the categories covered by the paper titled 'TextGrad: Automatic Differentiation viaText'."
)
print(response)

Total Docs: 2
Preprocessing corpus... 
The categories covered by the paper "TextGrad: Automatic Differentiation via Text" include Question Answering and Specificity.


# Rough/Experiments

This section provides a detailed, step-by-step process.


In [None]:
# !pip install langchain_openai
# !pip install langchain_core
# !pip install python-dotenv
# !pip install langchain_community
# !pip install pypdf
# !pip install langchainhub



! pip install langchain-community
! pip install pypdf
! pip install langchain-openai
! pip install python-dotenv
# ! pip install sentence_transformers
! pip install langchain_huggingface
! pip install ipywidgets
! pip install faiss-cpu
! pip install --upgrade context-python


In [3]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_community.llms import HuggingFaceHub


from langchain.prompts import PromptTemplate
import os

from dotenv import load_dotenv

load_dotenv()

True

In [4]:
from langchain_community.document_loaders import PyPDFDirectoryLoader

# Load every PDF found in ./pdfs; each entry of ``pages`` is one PDF page.
try:
    loader = PyPDFDirectoryLoader("pdfs")
    pages = loader.load()
except Exception as exc:
    print(f"Error loading PDF: {exc}")
else:
    print(f"Total Docs: {len(pages)}")

Total Docs: 2


In [5]:
# Inspect the first loaded page to sanity-check the extracted text.
print(pages[0])

page_content='Title:  MeshAnything: Artist -Created Mesh Generation with Autoregressive Transformers  
Authors:  buaacyw/meshanything  
Date:  14 Jun 2024  
Description:  Recently, 3D assets created via reconstruction and generation have matched the 
quality of manually crafted assets, highlighting their potential for replacement.  
Stats:  417, 5.09 stars / hour  
Categories:  Decoder  
Links:  Paper, Code  
 
Title:  Accessing GPT -4 level Mathematical Olympiad Solutions via Monte Carlo Tree Self -
refine with LLaMa -3 8B  
Authors:  trotsky1997/mathblackbox  
Date:  11 Jun 2024  
Description:  This paper introduces the MCT Self -Refine algorithm, an innovative integration of 
Large Language Models (LLMs) with Monte Carlo Tree Search (MCTS), designed to enhance 
performance in complex mathematical reasoning tasks.  
Stats:  279, 2.35 stars / hour  
Categories:  Decision Making, GSM8K +2  
Links:  Paper, Code  
 
Title:  TextGrad: Automatic 'Differentiation' via Text  
Authors:  zou-g

In [6]:
# Inspect the second loaded page (text plus its source/page metadata).
print(pages[1])

page_content='Categories:  Language Modelling  
Links:  Paper, Code  
 
Title:  VideoLLaMA 2: Advancing Spatial -Temporal Modeling and Audio Understanding in 
Video -LLMs  
Authors:  damo -nlp-sg/videollama2  
Date:  11 Jun 2024  
Description:  In this paper, we present the VideoLLaMA 2, a set of Video Large Language 
Models (Video -LLMs) designed to enhance spatial -temporal modeling and audio understanding 
in video and audio -oriented tasks.  
Stats:  318, 1.50 stars / hour  
Categories:  Multiple -choice, Question Answering +3  
Links:  Paper, Code  
 ' metadata={'source': 'pdfs/RAG Input Doc.pdf', 'page': 1}


In [7]:
# Flatten all pages into a single string, turning tabs into plain spaces.
content = pages
corpus = " ".join(page.page_content.replace("\t", " ") for page in content)
# Show the corpus length and its first 100 characters as a quick preview.
print(f"length of Corpus: {len(corpus)}, \n\n\ncorpus[:100]: {corpus[:100]}")

length of Corpus: 2462, 


corpus[:100]: Title:  MeshAnything: Artist -Created Mesh Generation with Autoregressive Transformers  
Authors:  b


In [8]:
# Dump the full joined corpus for manual inspection.
print(corpus)

Title:  MeshAnything: Artist -Created Mesh Generation with Autoregressive Transformers  
Authors:  buaacyw/meshanything  
Date:  14 Jun 2024  
Description:  Recently, 3D assets created via reconstruction and generation have matched the 
quality of manually crafted assets, highlighting their potential for replacement.  
Stats:  417, 5.09 stars / hour  
Categories:  Decoder  
Links:  Paper, Code  
 
Title:  Accessing GPT -4 level Mathematical Olympiad Solutions via Monte Carlo Tree Self -
refine with LLaMa -3 8B  
Authors:  trotsky1997/mathblackbox  
Date:  11 Jun 2024  
Description:  This paper introduces the MCT Self -Refine algorithm, an innovative integration of 
Large Language Models (LLMs) with Monte Carlo Tree Search (MCTS), designed to enhance 
performance in complex mathematical reasoning tasks.  
Stats:  279, 2.35 stars / hour  
Categories:  Decision Making, GSM8K +2  
Links:  Paper, Code  
 
Title:  TextGrad: Automatic 'Differentiation' via Text  
Authors:  zou-group/textgrad 

### Cleaning corpus


In [9]:
import re


def clean_corpus(text):
    """Return *text* unchanged.

    Placeholder hook for corpus cleaning: no cleaning rules exist yet.
    """
    cleaned = text
    return cleaned


# Run the (currently no-op) cleaning step; later cells split ``cleaned_corpus``.
cleaned_corpus = clean_corpus(corpus)

In [None]:
# Read the OpenAI key from the environment.
openai_api_key = os.getenv("OPENAIAPIKEY")
# Report only whether the key is present: printing the key itself would
# persist the secret in the saved notebook output.
if openai_api_key:
    print("OPENAIAPIKEY environment variable is set.")
else:
    print("OPENAIAPIKEY environment variable not set.")

In [None]:
from langchain_openai import OpenAI
from langchain_openai import ChatOpenAI
from langchain_community.callbacks.context_callback import ContextCallbackHandler

# The Context analytics token comes from the environment so that no credential
# is stored in the notebook; the callback is attached only when it is set.
context_token = os.getenv("CONTEXT_API_TOKEN")

openAIclient = ChatOpenAI(
    api_key=openai_api_key,
    # model_name = "gpt-3.5-turbo-16k", #default4k
    # temperature=0.1,
    callbacks=[ContextCallbackHandler(token=context_token)] if context_token else None,
)
# The API key is deliberately not echoed as the cell result: doing so would
# write the secret into the saved notebook output.

In [27]:
# to get the number of tokens in the corpus
# print(f"total number of tokens in token: {openAIclient.get_num_tokens(cleaned_corpus)}")

In [28]:
# Character-based splitter: chunks of at most 100 characters (measured with
# ``len``) that overlap by 10 characters.
# NOTE(review): 100 is tiny next to the 16k context window the docs quote for
# gpt-3.5-turbo-16k — presumably chosen for this small corpus; confirm.
text_splitter = RecursiveCharacterTextSplitter(
    length_function=len,
    chunk_overlap=10,
    chunk_size=100,
)


# Split the cleaned corpus into a list of plain-string chunks.
splitted_corpus = text_splitter.split_text(cleaned_corpus)

In [29]:
# Re-run the split and display the chunk count together with the first chunk.
splitted_corpus = text_splitter.split_text(cleaned_corpus)
len(splitted_corpus), splitted_corpus[0]

(31,
 'Title:  MeshAnything: Artist -Created Mesh Generation with Autoregressive Transformers')

In [30]:
# Wrap the string chunks as Document objects, the input type that
# FAISS.from_documents is given later in this notebook.
splitted_corpus_in_docs = text_splitter.create_documents(splitted_corpus)
splitted_corpus_in_docs

[Document(page_content='Title:  MeshAnything: Artist -Created Mesh Generation with Autoregressive Transformers'),
 Document(page_content='Authors:  buaacyw/meshanything  \nDate:  14 Jun 2024'),
 Document(page_content='Description:  Recently, 3D assets created via reconstruction and generation have matched the'),
 Document(page_content='quality of manually crafted assets, highlighting their potential for replacement.'),
 Document(page_content='Stats:  417, 5.09 stars / hour  \nCategories:  Decoder  \nLinks:  Paper, Code'),
 Document(page_content='Title:  Accessing GPT -4 level Mathematical Olympiad Solutions via Monte Carlo Tree Self -'),
 Document(page_content='refine with LLaMa -3 8B  \nAuthors:  trotsky1997/mathblackbox  \nDate:  11 Jun 2024'),
 Document(page_content='Description:  This paper introduces the MCT Self -Refine algorithm, an innovative integration of'),
 Document(page_content='Large Language Models (LLMs) with Monte Carlo Tree Search (MCTS), designed to enhance'),
 Docum

## LLMs


In [16]:
# HF_TOKEN = "<redacted — keep the token in the environment / .env file, never in the notebook>"

In [17]:
# ! huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): Traceback (most recent call last):
  File "/home/zohaib/anaconda3/envs/r

In [None]:
! huggingface-cli whoami

mzohaibnasir


In [None]:
# from sentence_transformers import SentenceTransformer

# # Load the model
# embeddingsModel = SentenceTransformer("Linq-AI-Research/Linq-Embed-Mistral")
# embeddingsModel

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings

# embeddingsModel = HuggingFaceEmbeddings(model_name="Alibaba-NLP/gte-Qwen2-7B-instruct")


# Experiment: embeddings from a Hugging Face model
# (Linq-AI-Research/Linq-Embed-Mistral).
# NOTE(review): presumably one of the models that proved too large to use, per
# the "Challenges encountered" notes — confirm.
embeddingsModel = HuggingFaceEmbeddings(
    model_name="Linq-AI-Research/Linq-Embed-Mistral"
)
embeddingsModel

In [None]:
from langchain_community.embeddings import OllamaEmbeddings

# Experiment: embeddings served by Ollama. With no arguments the client uses
# the llama2 model; run `ollama pull llama2` first to pull down the model.
ollamaEmbeddings = OllamaEmbeddings()
ollamaEmbeddings

OllamaEmbeddings(base_url='http://localhost:11434', model='llama2', embed_instruction='passage: ', query_instruction='query: ', mirostat=None, mirostat_eta=None, mirostat_tau=None, num_ctx=None, num_gpu=None, num_thread=None, repeat_last_n=None, repeat_penalty=None, temperature=None, stop=None, tfs_z=None, top_k=None, top_p=None, show_progress=False, headers=None, model_kwargs=None)

In [None]:
from langchain_openai import OpenAIEmbeddings


# OpenAI embeddings client; this is the one passed to FAISS.from_documents
# further down in the notebook.
openaiEmbeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
openaiEmbeddings

In [21]:
# Experiment: a hosted Hugging Face Hub model as the LLM.
# NOTE(review): hf_token is read here but never passed to HuggingFaceHub —
# presumably the class picks the token up from the same environment variable
# on its own; verify.
# NOTE(review): this repo_id also appears as an embeddings model name in a
# commented-out line of this notebook, yet the task is text2text-generation —
# confirm the pairing is intended.
hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
hf = HuggingFaceHub(
    # repo_id="Linq-AI-Research/Linq-Embed-Mistral",
    repo_id="Alibaba-NLP/gte-Qwen2-7B-instruct",
    task="text2text-generation",
    #  model_kwargs={
    #     "max_new_tokens": 512,
    #     "top_k": 30,
    #     "temperature": 0.1,
    #     "repetition_penalty": 1.03,
    # },
)


hf

HuggingFaceHub(client=<InferenceClient(model='Alibaba-NLP/gte-Qwen2-7B-instruct', timeout=None)>, repo_id='Alibaba-NLP/gte-Qwen2-7B-instruct', task='text2text-generation')

In [None]:
from langchain_huggingface import HuggingFaceEndpoint


# Experiment: the same Hugging Face repo accessed through HuggingFaceEndpoint.
# No token is passed explicitly; the recorded output shows a login using a
# token already saved on the machine.
llm = HuggingFaceEndpoint(
    # repo_id = "mistralai/Mistral-7B-Instruct-v0.2"
    repo_id="Alibaba-NLP/gte-Qwen2-7B-instruct",
    # max_length=128,
    # temperature=0.5,
    # huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
)
llm

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/zohaib/.cache/huggingface/token
Login successful


HuggingFaceEndpoint(repo_id='Alibaba-NLP/gte-Qwen2-7B-instruct', model='Alibaba-NLP/gte-Qwen2-7B-instruct', client=<InferenceClient(model='Alibaba-NLP/gte-Qwen2-7B-instruct', timeout=120)>, async_client=<InferenceClient(model='Alibaba-NLP/gte-Qwen2-7B-instruct', timeout=120)>)

In [None]:
# Re-display the chunk documents before they are embedded.
splitted_corpus_in_docs

[Document(page_content='Title:  MeshAnything: Artist -Created Mesh Generation with Autoregressive Transformers'),
 Document(page_content='Authors:  buaacyw/meshanything  \nDate:  14 Jun 2024'),
 Document(page_content='Description:  Recently, 3D assets created via reconstruction and generation have matched the'),
 Document(page_content='quality of manually crafted assets, highlighting their potential for replacement.'),
 Document(page_content='Stats:  417, 5.09 stars / hour  \nCategories:  Decoder  \nLinks:  Paper, Code'),
 Document(page_content='Title:  Accessing GPT -4 level Mathematical Olympiad Solutions via Monte Carlo Tree Self -'),
 Document(page_content='refine with LLaMa -3 8B  \nAuthors:  trotsky1997/mathblackbox  \nDate:  11 Jun 2024'),
 Document(page_content='Description:  This paper introduces the MCT Self -Refine algorithm, an innovative integration of'),
 Document(page_content='Large Language Models (LLMs) with Monte Carlo Tree Search (MCTS), designed to enhance'),
 Docum

In [32]:
from langchain_community.vectorstores import FAISS


# Embed the chunk documents with the OpenAI embeddings client, index them in
# FAISS, and expose the index through the retriever interface.
vectordb = FAISS.from_documents(splitted_corpus_in_docs, openaiEmbeddings)
retriever = vectordb.as_retriever()
retriever

VectorStoreRetriever(tags=['FAISS', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x7ae568b8f190>)

In [50]:
# # Load the llm
# from langchain_core.prompts import ChatPromptTemplate
# from langchain.chains import create_retrieval_chain
# from langchain.chains.combine_documents import create_stuff_documents_chain
# from langchain.memory import ConversationBufferMemory

# # Define prompt template
# template = """
# You are an assistant for question-answering tasks.
# Use the provided context only to answer the following question:

# <context>
# {context}
# </context>

# Question: {input}
# """

# # Create a prompt template
# prompt = ChatPromptTemplate.from_template(template)


# memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
# # Create a chain
# doc_chain = create_stuff_documents_chain(openAIclient, prompt)
# chain = create_retrieval_chain(retriever, doc_chain)


# # User query
# response = chain.invoke(
#     {"input": "Identify a paper that deals with language modeling and its scalability."}
# )

# # Get the Answer only
# print(response["answer"])

The paper "Scalable MatMul-free Language Modeling" by ridgerchu/matmulfreellm deals with language modeling and its scalability.


In [40]:
# Step 1 — question rewriting. The system prompt asks the model to turn the
# latest user question into a standalone question using the chat history,
# without answering it.
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, "
    "formulate a standalone question which can be understood "
    "without the chat history. Do NOT answer the question, "
    "just reformulate it if needed and otherwise return it as is."
)
# Prompt layout: system instructions, then the prior messages, then the new input.
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
# Wire the chat model, the FAISS retriever and the rewriting prompt together.
history_aware_retriever = create_history_aware_retriever(
    openAIclient, retriever, contextualize_q_prompt
)


### Answer question ###
# Step 2 — answering. The retrieved documents are inserted at {context}; the
# prompt limits the answer to three sentences and allows "I don't know".
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
# Chain that passes the retrieved documents and the question to the chat model.
question_answer_chain = create_stuff_documents_chain(openAIclient, qa_prompt)

# Full RAG chain: history-aware retrieval followed by answering.
rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)


### Statefully manage chat history ###
# In-memory registry: session id -> that session's message history.
store = {}


def get_session_history(session_id: str) -> BaseChatMessageHistory:
    """Return the history for *session_id*, creating an empty one on first use."""
    try:
        return store[session_id]
    except KeyError:
        fresh_history = ChatMessageHistory()
        store[session_id] = fresh_history
        return fresh_history


# Wrap the RAG chain so each call uses the history returned by
# get_session_history for the configured session id. The key names match the
# chain's "input" / "chat_history" prompt variables and its "answer" output.
conversational_rag_chain = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,
    input_messages_key="input",
    history_messages_key="chat_history",
    output_messages_key="answer",
)

In [44]:
# Ask one sample question in the "dummy" session and show only the answer text.
# NOTE(review): "viaText" in the question is missing a space; it is left as-is
# because it is the literal input sent to the chain.
conversational_rag_chain.invoke(
    {
        "input": "List the categories covered by the paper titled 'TextGrad: Automatic Differentiation viaText'."
    },
    config={"configurable": {"session_id": "dummy"}},
)["answer"]

'The paper titled "TextGrad: Automatic Differentiation via Text" covers the categories of Question Answering and Specificity.'

In [13]:
# Display the chain object. It only works after the cell that defines
# conversational_rag_chain has run; the recorded NameError comes from a run
# where it had not (this cell is In [13], the defining cell is In [40]).
conversational_rag_chain

NameError: name 'conversational_rag_chain' is not defined