In [41]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

loader = PyPDFLoader("pm_my.pdf")
data = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=0, length_function=len, is_separator_regex=False, separators=[
        "\n\n",
        ".",
        "\n",
        " ",
        ".",
        "",
        ",",
    ])
all_splits = text_splitter.split_documents(data)

In [3]:
from langchain_community.document_loaders import PDFMinerLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

loader = PDFMinerLoader("pm_my.pdf")
data = loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=0, length_function=len, is_separator_regex=False, separators=[
        "\n\n",
        ".",
        "\n",
        " ",
        ".",
        "",
        ",",
    ])
all_splits = text_splitter.split_documents(data)

In [4]:
all_splits

[Document(page_content='Anwar bin Ibrahim born 10 August 1947 is a Malaysian politician who has served as the tenth Prime \nMinister of Malaysia since 2022', metadata={'source': 'pm_my.pdf'}),
 Document(page_content='. He served as the 12th and 16th Leader of the Opposition from 2008 to \n2015 and again from 2020 to 2022', metadata={'source': 'pm_my.pdf'}),
 Document(page_content=". He has been the chairman of the Pakatan Harapan (PH) coalition \nsince 2020, the second President of the People's Justice Party (PKR) since 2018 and the Member of", metadata={'source': 'pm_my.pdf'}),
 Document(page_content='Parliament (MP) for Tambun since November 2022', metadata={'source': 'pm_my.pdf'}),
 Document(page_content='. He also served as Deputy Prime Minister and in \nmany other Cabinet positions in the Barisan Nasional (BN) administration under former Prime Minister', metadata={'source': 'pm_my.pdf'}),
 Document(page_content='Mahathir Mohamad from 1982 to his removal in 1998', metadata={'source

In [5]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
 
modelPath = "sentence-transformers/all-MiniLM-l6-v2"

model_kwargs = {'device':'cpu'}
 
encode_kwargs = {'normalize_embeddings': False}

hf = HuggingFaceEmbeddings(
    model_name=modelPath,     
    model_kwargs=model_kwargs, 
    encode_kwargs=encode_kwargs 
)

vectorstore = FAISS.from_documents(all_splits, hf)
vectorstore.save_local("local_vectors")
vectorstore = FAISS.load_local("local_vectors", hf, allow_dangerous_deserialization=True)


  from .autonotebook import tqdm as notebook_tqdm


In [35]:
retriever = vectorstore.as_retriever(search_kwargs={"k": 1, "score_threshold": 1.5})

In [36]:
docs = retriever.get_relevant_documents("10 August 1947")
 
for doc in docs:
  print(doc.page_content.replace("\n", ""))

Anwar bin Ibrahim born 10 August 1947 is a Malaysian politician who has served as the tenth Prime Minister of Malaysia since 2022


In [25]:
# Assuming your text is stored in a list of langchain_core.documents.base.Document objects called 'document_list'
document_list = doc

# Iterate over the list and process each Document object
for document in document_list:
    # Access the page_content from the Document object
    page_content = document.page_content

    # Remove newline characters from page_content
    cleaned_page_content = page_content.replace("\n", "")

    # Update the page_content attribute with the cleaned version
    document.page_content = cleaned_page_content

# Now each Document object in document_list has newline characters removed from its page_content attribute


AttributeError: 'tuple' object has no attribute 'page_content'

In [80]:
chain = prompt | model | parser

print(chain.invoke({"context": "My parents named me Naeem and my father's name is Hussien, Naeem has a cat named Luca", 
                    "question": "What's Naeem's father name and what his cat's name'?"}))

Hussien and Luca, respectively.

I don't know the other information.


In [53]:
from langchain.llms.huggingface_pipeline import HuggingFacePipeline
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from transformers import BitsAndBytesConfig

In [18]:
model_id = "pankajmathur/orca_mini_3b"
 
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
                 model_id,
                 device_map="auto"
                 )
pipe = pipeline("text-generation",
               model=model,
               tokenizer=tokenizer,
                max_new_tokens=1024
               )
 
llm = HuggingFacePipeline(pipeline=pipe)

Downloading shards: 100%|██████████| 3/3 [21:55<00:00, 438.45s/it]
Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 3000.22it/s]


ValueError: You are trying to offload the whole model to the disk. Please use the `disk_offload` function instead.

In [63]:
from langchain_community.llms import Ollama
from langchain_community.embeddings import OllamaEmbeddings


MODEL = "gemma:2b"
model = Ollama(model=MODEL)
embeddings = OllamaEmbeddings(model=MODEL)

model.invoke("Tell me a joke")

"What do you call a boomerang that won't come back?\n\nA stick."

In [64]:
from langchain_core.output_parsers import StrOutputParser

parser = StrOutputParser()

chain = model | parser 
chain.invoke("Tell me a joke")

"What do you call a boomerang that won't come back?\n\nA stick."

In [77]:
from langchain.prompts import PromptTemplate

template = """
Use the following pieces of context to answer the question at the end. 
If you don't know the answer, just say that you don't know, don't try to make up an answer. Do not try to change the context. Do not add any non context into the context.
Use three sentences maximum and keep the answer as concise as possible. 
{context}
Question: {question}
Helpful Answer:"""

prompt = PromptTemplate.from_template(template)
prompt.format(context="Here is some context", question="Here is a question")

'\nAnswer the question based on the context below. If you can\'t \nanswer the question, reply "I don\'t know".\n\nContext: Here is some context\n\nQuestion: Here is a question\n'

In [None]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
 
loader = PyPDFLoader("pm_my.pdf")
data = loader.load()
 
text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=20, separators=[
        "\n\n",
        ".",
        "\n",
        " ",
        
        "",
    ])
all_splits = text_splitter.split_documents(data)

In [66]:
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("pm_my.pdf")
pages = loader.load_and_split()
pages

[Document(page_content="Anwar bin Ibrahim (Jawi: انور بن ابراهيم IPA: [anwar ɪbrah ɪm]; born 10 August 1947) is a Malaysian \npolitician who has served as the tenth Prime Minister of Malaysia since 2022.  He served as the 12th and \n16th Leader of the Opposition from 2008 to 2015 and again from 2020 to 2022. He has been the \nchairman of the Pakatan Harapan (PH) coalition since 2020, the second President of the People's Justice \nParty (PKR) since 2018 and the Member of Parliament (MP) for Tambun since November 2022. He also \nserved as Deputy Prime Minister and in many other Cabinet positions in the Barisan Nasional (BN) \nadministration under former Prime Minister Mahathir Mohamad from 1982 to his removal in 1998.  \n \nA graduate of the University of Malaya, Anwar started his political career as President of student union \nPersatuan Kebangsaan Pelajar Islam Malaysia (PKPIM) and one of the founders and 2nd President of \nyouth organisation Angkatan Belia Islam Malaysia (ABIM).  Afte

In [68]:
from langchain_community.vectorstores import DocArrayInMemorySearch

vectorstore = DocArrayInMemorySearch.from_documents(pages, embedding=embeddings)



In [81]:
retriever = vectorstore.as_retriever()

retriever.invoke("10 August 1947")

[Document(page_content="Moreover, former Malaysian prime minister Najib Razak also saw his prison sentence halved and fine \nreduced during Anwar's tenure, sparking further controversy.  \nAnwar has long been an advocate for Islamic democracy and reforms to Malaysia's political system. \nOutside of politics, Anwar has held positions at various academic institutions. Unlike the previous Prime \nMinisters of Malaysia who held Hari Raya Aidilfitri open house at Seri Perdana, Anwar does not held Hari \nRaya Aidilfitri open house at Seri Perdana upon his appointment as tenth Prime Minister of Malaysia.", metadata={'source': 'pm_my.pdf', 'page': 1}),
 Document(page_content="Anwar bin Ibrahim (Jawi: انور بن ابراهيم IPA: [anwar ɪbrah ɪm]; born 10 August 1947) is a Malaysian \npolitician who has served as the tenth Prime Minister of Malaysia since 2022.  He served as the 12th and \n16th Leader of the Opposition from 2008 to 2015 and again from 2020 to 2022. He has been the \nchairman of the Pak

In [None]:
question = "What is the structure of ADLREAD?"
print(f"Answer: {chain.invoke({'question': question})}")

In [70]:
from operator import itemgetter

chain = (
    {
        "context": itemgetter("question") | retriever,
        "question": itemgetter("question"),
    }
    | prompt
    | model
    | parser
)

questions = [
    "Who is anwar ibrahim?",
    "who is najib razak?",
    "which university anwar ibrahim graduated from?",
]

for question in questions:
    print(f"Question: {question}")
    print(f"Answer: {chain.invoke({'question': question})}")
    print()

Question: Who is anwar ibrahim?
Answer: The context does not specify who Anwar Ibrahim is, so I cannot answer this question from the context.

Question: who is najib razak?
Answer: The context does not mention who Najib Razak is, so I cannot answer this question from the context.

Question: which university anwar ibrahim graduated from?
Answer: The context does not specify the university Anwar Ibrahim graduated from, so I cannot answer this question from the context.

