In [15]:
import chainlit as cl
import torch

from chainlit.types import AskFileResponse

from transformers import BitsAndBytesConfig
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_huggingface.llms import HuggingFacePipeline

from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain_chroma import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain import hub
from pprint import pprint

In [2]:
# initialize text splitter and embedding
# Chunking parameters, hoisted out of the call so they can be tuned in one place.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 100

# Pin the embedding model explicitly instead of relying on the library default;
# this is the model the argument-less HuggingFaceEmbeddings() call reported loading.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP
)

embedding = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

2024-09-10 14:49:59 - Use pytorch device_name: mps
2024-09-10 14:49:59 - Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2




In [3]:
# function to load and split an uploaded text or PDF file
def process_file(file: AskFileResponse):
    """Load an uploaded file and split it into chunks.

    Args:
        file: Uploaded file; its ``type`` selects the loader and its ``path``
            is handed to that loader.

    Returns:
        The list of chunks produced by the module-level ``text_splitter``,
        with each chunk's ``metadata["source"]`` overwritten by
        ``"source_<index>"``.

    Raises:
        ValueError: If ``file.type`` is neither ``"text/plain"`` nor
            ``"application/pdf"``.
    """
    loaders_by_type = {
        "text/plain": TextLoader,
        "application/pdf": PyPDFLoader,
    }
    loader_cls = loaders_by_type.get(file.type)
    if loader_cls is None:
        # Fail with a clear message instead of an UnboundLocalError further down.
        supported = ", ".join(sorted(loaders_by_type))
        raise ValueError(
            f"Unsupported file type {file.type!r}; supported types: {supported}"
        )

    documents = loader_cls(file.path).load()
    docs = text_splitter.split_documents(documents)
    # Replace the loader-provided source with a per-chunk identifier.
    for i, doc in enumerate(docs):
        doc.metadata["source"] = f"source_{i}"
    return docs

In [4]:
# function to get vector database
def get_vector_db(file: AskFileResponse):
    """Chunk an uploaded file and index the chunks in a Chroma vector store.

    The chunks are also stored in the Chainlit user session under the key
    ``"docs"``.

    Args:
        file: Uploaded file, forwarded to ``process_file``.

    Returns:
        The Chroma vector store built from the chunks with the module-level
        ``embedding``.
    """
    chunks = process_file(file)
    cl.user_session.set("docs", chunks)
    return Chroma.from_documents(documents=chunks, embedding=embedding)

In [None]:
# function to get LLM model
def get_huggingface_llm(
    model_name: str = "lmsys/vicuna-7b-v1.5", max_new_token: int = 512
):
    """Build a LangChain LLM backed by a 4-bit quantized Hugging Face causal LM.

    Args:
        model_name: Model identifier passed to both the model and the
            tokenizer ``from_pretrained`` calls.
        max_new_token: Forwarded to the pipeline as ``max_new_tokens``.

    Returns:
        A ``HuggingFacePipeline`` wrapping the text-generation pipeline.
    """
    # 4-bit NF4 quantization with double quantization; compute in bfloat16.
    quantization = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    causal_lm = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quantization,
        low_cpu_mem_usage=True,
    )
    tok = AutoTokenizer.from_pretrained(model_name)

    # NOTE(review): device_map is given to the pipeline although the model is
    # already instantiated above — confirm this has the intended effect.
    generation_kwargs = dict(
        model=causal_lm,
        tokenizer=tok,
        max_new_tokens=max_new_token,
        # Use the EOS token id as the padding token id.
        pad_token_id=tok.eos_token_id,
        device_map="auto",
    )
    text_generator = pipeline("text-generation", **generation_kwargs)

    return HuggingFacePipeline(pipeline=text_generator)


# Build the LLM once at module level using the function's default arguments.
LLM = get_huggingface_llm()

## Read PDF file

In [11]:
# Load the sample article with PyPDFLoader and preview the first loaded Document.
FILE_PATH = "./docs/Writing_a_scientific_article__A_step-by-step_guide_for_beginners_.pdf"
Loader = PyPDFLoader

loader = Loader(FILE_PATH)
documents = loader.load()

print("Number of documents: ", len(documents))
# Bare last expression: shown as the cell output.
documents[0]

Number of documents:  7


Document(metadata={'source': './docs/Writing_a_scientific_article__A_step-by-step_guide_for_beginners_.pdf', 'page': 0}, page_content='Research  paper\nWriting  a  scientiﬁc  article:  A  step-by-step  guide  for  beginners\nF.  Ecarnot *,  M.-F.  Seronde,  R.  Chopard,  F.  Schiele,  N.  Meneveau\nEA3920,  Department  of  Cardiology,  University  Hospital  Jean-Minjoz,  3,  Boulevard  Fleming,  25000  Besanc ¸on,  France\n1.  Background\nEvery  researcher  has  been  face  to  face  with  a  blank  page  at  some\nstage  of  their  career,  wondering  where  to  start  and  what  to  write\nﬁrst.  Describing  one’s  research  work  in  a  format  that  is\ncomprehensible  to  others,  and  acceptable  for  publication  is  no\neasy  task.  When  you  invest  a  lot  of  time,  energy  and  often  money  in\nyour  research,  you  become  intimately  and  emotionally  involved.\nNaturally,  you  are  convinced  of  the  value  of  your  research,  and  of\nits  importance  for  the  sci

In [12]:
# Split the loaded documents into smaller chunks with the shared text_splitter.
docs = text_splitter.split_documents(documents)
print("Number of mini-documents: ", len(docs))
# Bare last expression: displays the first chunk as the cell output.
docs[0]

Number of mini-documents:  59


Document(metadata={'source': './docs/Writing_a_scientific_article__A_step-by-step_guide_for_beginners_.pdf', 'page': 0}, page_content='Research  paper\nWriting  a  scientiﬁc  article:  A  step-by-step  guide  for  beginners\nF.  Ecarnot *,  M.-F.  Seronde,  R.  Chopard,  F.  Schiele,  N.  Meneveau\nEA3920,  Department  of  Cardiology,  University  Hospital  Jean-Minjoz,  3,  Boulevard  Fleming,  25000  Besanc ¸on,  France\n1.  Background\nEvery  researcher  has  been  face  to  face  with  a  blank  page  at  some\nstage  of  their  career,  wondering  where  to  start  and  what  to  write\nﬁrst.  Describing  one’s  research  work  in  a  format  that  is\ncomprehensible  to  others,  and  acceptable  for  publication  is  no\neasy  task.  When  you  invest  a  lot  of  time,  energy  and  often  money  in\nyour  research,  you  become  intimately  and  emotionally  involved.\nNaturally,  you  are  convinced  of  the  value  of  your  research,  and  of\nits  importance  for  the  sci

In [16]:
# Question used to probe the retriever.
QUERY = "What are keypoints of this paper?"

# Index the chunks in a Chroma vector store using the shared embedding model,
# then expose the store through its retriever interface.
vector_db = Chroma.from_documents(documents=docs, embedding=embedding)
retriever = vector_db.as_retriever()

# Fetch the chunks the retriever returns for the query.
result = retriever.invoke(QUERY)

print("Number of relevant documents: ", len(result))
pprint(result[0])

Number of relevant documents:  4
Document(metadata={'page': 4, 'source': './docs/Writing_a_scientific_article__A_step-by-step_guide_for_beginners_.pdf'}, page_content='the  emphasis  of  the  sentence.  The  order  in  which  the  results  or\nelements  of  discussion  are  mentioned  may  subtly  move  the\nemphasis  away  from  that  originally  intended  by  the  other  author.\nHere  again,  careful  re-reading  by  co-authors  and  senior  mentors,  or\nmembers  of  your  publications  department  (if  you  have  one),  will\nhelp  to  avoid  these  pitfalls.\nWhat  are  the  novel  ﬁndings  of  your  study?  Underlining  how\nyour  ﬁndings  yield  new  evidence  or  a  new  contribution  to  the  state\nof  knowledge  will  substantiate  the  importance  of  your  paper,  and\nits  added  value  for  the  literature,  as  opposed  to  being  ‘‘just  another\npaper’’  on  a  ‘‘worn-out’’  topic.  In  this  regard,  you  can  discuss\nwhether  or  not  your  paper  has  succeeded  