In [30]:
! pip install langchain langsmith huggingface_hub transformers sentence-transformers  unstructured tiktoken
! pip install pypdf python-docx
! pip install -U langchain-community
! pip install -q langchain ctransformers
! pip install langchain_chroma -q



In [31]:
# LangChain core
from langchain.chains import RetrievalQA
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFaceHub
from langchain.document_loaders import UnstructuredFileLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [32]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from typing import List
from langchain_core.documents import Document
import os

def load_docs(folder_path: str) -> List[Document]:
    documents = []
    for file in os.listdir(folder_path):
        if file.endswith(".pdf"):
            pdf_path = os.path.join(folder_path, file)
        else:
            print(f"Unsupported file format: {file}")
            continue
        loader = PyPDFLoader(file_path=pdf_path)
        documents.extend(loader.load())
    return documents

folder_path = "docs"
documents = load_docs(folder_path)

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=100,
    length_function=len,
)


splits = text_splitter.split_documents(documents)
print(f"Split the docs into {len(splits)} chunks")

Split the docs into 110 chunks


In [33]:
splits[0]

Document(metadata={'producer': 'OpenOffice.org 2.4', 'creator': 'Writer', 'creationdate': '2009-03-24T11:33:15-06:00', 'source': 'docs\\bitcoin_whitepaper.pdf', 'total_pages': 9, 'page': 0, 'page_label': '1'}, page_content='Bitcoin: A Peer-to-Peer Electronic Cash System\nSatoshi Nakamoto\nsatoshin@gmx.com\nwww.bitcoin.org\nAbstract.  A purely peer-to-peer version of  electronic cash would allow online  \npayments to be sent directly from one party to another without going through a')

In [34]:
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings

embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
doc_embeds = embeddings.embed_documents([split.page_content for split in splits])
doc_embeds[0]

[-0.0911194309592247,
 0.03640065714716911,
 -0.006011271849274635,
 -0.03416364639997482,
 -0.062013838440179825,
 -0.0771283432841301,
 0.09881779551506042,
 -0.005686826538294554,
 -0.020200684666633606,
 0.027311014011502266,
 -0.010944324545562267,
 0.007337226066738367,
 -0.005205228459089994,
 0.009099154733121395,
 0.07701083272695541,
 -0.07789695262908936,
 0.01362222246825695,
 -0.032222930341959,
 0.05881854146718979,
 0.03662131726741791,
 -0.038014620542526245,
 -0.07775763422250748,
 -0.019682137295603752,
 0.05596310272812843,
 0.04985125735402107,
 0.02223074622452259,
 0.00225585768930614,
 0.027888376265764236,
 -0.04071289673447609,
 -0.06968232244253159,
 -0.007013258058577776,
 0.04626130685210228,
 0.05172727257013321,
 0.0214830469340086,
 -0.035988643765449524,
 -0.035771969705820084,
 0.11243049055337906,
 0.0452144630253315,
 -0.07042466849088669,
 -0.012173456139862537,
 -0.05030030384659767,
 -0.05073942616581917,
 0.0015418595867231488,
 -0.017389036715030

In [35]:
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import SentenceTransformerEmbeddings


embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")


persist_directory = "db_rag"

# Create vector store
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=embedding_function,
    persist_directory=persist_directory,
    collection_name="rag-chatbot"
)


In [36]:
#smiliarity search

query = "What is the main objective of this project?"
search_results = vectorstore.similarity_search(query=query, k=2)
print(f"\nTop 2 most relevant chunks for the query '{query}':\n")
for i, result in enumerate(search_results, 1):
    print(f"Result {i}:")
    print(f"Source: {result.metadata.get('source', 'Unknown')}")
    print(f"Content: {result.page_content}")
    print()


Top 2 most relevant chunks for the query 'What is the main objective of this project?':

Result 1:
Source: docs\bitcoin (1).pdf
Content: making.  If the majority were based on one-IP-address-one-vote, it could be subverted by anyone  
able  to  allocate  many  IPs.   Proof-of-work  is  essentially  one-CPU-one-vote.   The  majority  
decision is represented by the longest chain, which has the greatest proof-of-work effort invested

Result 2:
Source: docs\bitcoin_whitepaper.pdf
Content: making.  If the majority were based on one-IP-address-one-vote, it could be subverted by anyone  
able  to  allocate  many  IPs.   Proof-of-work  is  essentially  one-CPU-one-vote.   The  majority  
decision is represented by the longest chain, which has the greatest proof-of-work effort invested



In [37]:
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})
retriever.invoke("Who wrote the Bitcoin whitepaper?")

[Document(metadata={'creationdate': '2009-03-24T11:33:15-06:00', 'creator': 'Writer', 'page': 7, 'page_label': '8', 'producer': 'OpenOffice.org 2.4', 'source': 'docs\\bitcoin_whitepaper.pdf', 'total_pages': 9}, page_content='the usual framework of coins made from digital signatures, which provides strong control of  \nownership, but is incomplete without a way to prevent double-spending.  To solve this, we  \nproposed a peer-to-peer network using proof-of-work to record a public history of transactions'),
 Document(metadata={'creationdate': '2009-03-24T11:33:15-06:00', 'creator': 'Writer', 'page': 7, 'page_label': '8', 'producer': 'OpenOffice.org 2.4', 'source': 'docs\\bitcoin (1).pdf', 'total_pages': 9}, page_content='the usual framework of coins made from digital signatures, which provides strong control of  \nownership, but is incomplete without a way to prevent double-spending.  To solve this, we  \nproposed a peer-to-peer network using proof-of-work to record a public history of t

In [38]:
from langchain_core.prompts import ChatPromptTemplate
template = """Answer the question based only on the following context:
{context}

Question: {question}

Answer: """
prompt = ChatPromptTemplate.from_template(template)

In [39]:
from langchain.schema.runnable import RunnablePassthrough
rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt

)
rag_chain.invoke("When was the Bitcoin whitepaper published?")

ChatPromptValue(messages=[HumanMessage(content="Answer the question based only on the following context:\n[Document(metadata={'creationdate': '2009-03-24T11:33:15-06:00', 'creator': 'Writer', 'page': 7, 'page_label': '8', 'producer': 'OpenOffice.org 2.4', 'source': 'docs\\\\bitcoin (1).pdf', 'total_pages': 9}, page_content='the usual framework of coins made from digital signatures, which provides strong control of  \\nownership, but is incomplete without a way to prevent double-spending.  To solve this, we  \\nproposed a peer-to-peer network using proof-of-work to record a public history of transactions'), Document(metadata={'creationdate': '2009-03-24T11:33:15-06:00', 'creator': 'Writer', 'page': 7, 'page_label': '8', 'producer': 'OpenOffice.org 2.4', 'source': 'docs\\\\bitcoin (1).pdf', 'total_pages': 9}, page_content='the usual framework of coins made from digital signatures, which provides strong control of  \\nownership, but is incomplete without a way to prevent double-spending. 

In [40]:
! pip install -U langchain-ollama




In [41]:
from langchain_ollama import OllamaLLM

llm_local = OllamaLLM(
    model="llama3",
    base_url="http://localhost:11434"
)



In [42]:
print(llm_local.invoke("Who created Bitcoin?"))


The creator of Bitcoin is a person or group of people known by the pseudonym Satoshi Nakamoto. Satoshi's true identity remains unknown, and it is not clear whether Satoshi is a single individual or a group of people working together.

Satoshi published the Bitcoin whitepaper in October 2008, which outlined the basic design and concepts for the Bitcoin protocol. The first block in the Bitcoin blockchain, known as the Genesis Block, was mined by Satoshi on January 3, 2009.

Throughout 2009 and early 2010, Satoshi was active in the online community surrounding Bitcoin, participating in discussions and helping to develop the network. However, in December 2010, Satoshi stopped contributing to the project and disappeared from public view.

Since then, many people have attempted to uncover Satoshi's true identity, but none have been successful. Despite this, Satoshi's contribution to the development of Bitcoin has had a profound impact on the world of finance and technology.

In 2019, Craig W

In [43]:
rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    |llm_local
)
question = "What is Bitcoin?"
response = rag_chain.invoke(question)
print(response)

Bitcoin: A Peer-to-Peer Electronic Cash System


In [44]:
from langchain_core.messages import HumanMessage, AIMessage
chat_history = []
chat_history.extend([HumanMessage(content=question), AIMessage(content=response)])

In [45]:
chat_history

[HumanMessage(content='What is Bitcoin?', additional_kwargs={}, response_metadata={}),
 AIMessage(content='Bitcoin: A Peer-to-Peer Electronic Cash System', additional_kwargs={}, response_metadata={})]

In [46]:
from langchain_core.prompts import MessagesPlaceholder
contextualize_q_system_prompt = (
"Your ONLY task is to reformulate the user’s question so that it can stand alone without the chat history. Do NOT answer it"
)
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
contextualize_chain = contextualize_q_prompt | llm_local
contextualize_chain.invoke({"input": "How does it solve the problem of double spending?", "chat_history": chat_history})

'How does Bitcoin, a peer-to-peer electronic cash system, prevent double spending of its digital currency units?'

In [47]:
from langchain.chains import create_history_aware_retriever
history_aware_retriever = create_history_aware_retriever(
    llm_local, retriever, contextualize_q_prompt
)
history_aware_retriever.invoke({"input":"How does it solve the problem of double spending?", "chat_history": chat_history})

[Document(metadata={'creationdate': '2009-03-24T11:33:15-06:00', 'creator': 'Writer', 'page': 0, 'page_label': '1', 'producer': 'OpenOffice.org 2.4', 'source': 'docs\\bitcoin (1).pdf', 'total_pages': 9}, page_content='Bitcoin: A Peer-to-Peer Electronic Cash System\nSatoshi Nakamoto\nsatoshin@gmx.com\nwww.bitcoin.org\nAbstract.  A purely peer-to-peer version of  electronic cash would allow online  \npayments to be sent directly from one party to another without going through a'),
 Document(metadata={'creationdate': '2009-03-24T11:33:15-06:00', 'creator': 'Writer', 'page': 0, 'page_label': '1', 'producer': 'OpenOffice.org 2.4', 'source': 'docs\\bitcoin (1).pdf', 'total_pages': 9}, page_content='Bitcoin: A Peer-to-Peer Electronic Cash System\nSatoshi Nakamoto\nsatoshin@gmx.com\nwww.bitcoin.org\nAbstract.  A purely peer-to-peer version of  electronic cash would allow online  \npayments to be sent directly from one party to another without going through a'),
 Document(metadata={'creationdat

In [59]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

qa_prompt = ChatPromptTemplate.from_messages([
    ("system",
 "You are an intelligent assistant specialized in analyzing cryptocurrency whitepapers."
 "You must answer only using the provided context. "
 "If multiple relevant parts exist, combine them concisely."
 "Think carefully, reason step by step if needed, and base your answer entirely on the evidence provided. "
 "If the answer is not clearly in the context, reply with: 'The information is not available in the document.' "
 "Avoid any external knowledge or assumptions, and never fabricate details."
 "If possible, reference the page number or section from which the information was derived."
 "Do not generate academic references or citations unless explicitly instructed. Only cite document sections if clearly provided in the context."
)
,

    ("system", "Context: {context}"),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{input}")
])

question_answer_chain = create_stuff_documents_chain(llm_local, qa_prompt)
rag_chain_final = create_retrieval_chain(history_aware_retriever, question_answer_chain)


In [60]:
response = rag_chain_final.invoke({
    "input": "How does Bitcoin prevent double spending without a central authority?",
    "chat_history": chat_history
})

print(response["answer"])

According to the document "Bitcoin: A Peer-to-Peer Electronic Cash System" by Satoshi Nakamoto, Bitcoin prevents double spending without a central authority through the use of a distributed ledger called the blockchain. The blockchain is maintained by a network of nodes that verify and validate transactions using cryptographic hash functions.

On page 1, it states: "The main problem with electronic payments has been the need to rely on a third party to process these transactions." This implies that traditional payment systems require a central authority, which can be prone to errors or manipulation. In contrast, Bitcoin's decentralized nature eliminates the need for such an intermediary.

To prevent double spending, the document explains that each transaction is combined into a block and then linked to the previous block through a unique digital fingerprint called a "hash." This creates a chain of blocks, hence the term blockchain. As new transactions are added to the blockchain, they 