In [1]:
! pip install langchain langsmith huggingface_hub transformers sentence-transformers  unstructured tiktoken
! pip install pypdf python-docx
! pip install -U langchain-community
! pip install -q langchain ctransformers
! pip install langchain_chroma -q



In [2]:
# LangChain core
from langchain.chains import RetrievalQA
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFaceHub
from langchain.document_loaders import UnstructuredFileLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [3]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from typing import List
from langchain_core.documents import Document
import os

def load_docs(folder_path: str) -> List[Document]:
    documents = []
    for file in os.listdir(folder_path):
        if file.endswith(".pdf"):
            pdf_path = os.path.join(folder_path, file)
        else:
            print(f"Unsupported file format: {file}")
            continue
        loader = PyPDFLoader(file_path=pdf_path)
        documents.extend(loader.load())
    return documents

folder_path = "docs"
documents = load_docs(folder_path)

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=100,
    length_function=len,
)


splits = text_splitter.split_documents(documents)
print(f"Split the docs into {len(splits)} chunks")

Split the docs into 110 chunks


In [4]:
splits[0]

Document(metadata={'producer': 'OpenOffice.org 2.4', 'creator': 'Writer', 'creationdate': '2009-03-24T11:33:15-06:00', 'source': 'docs\\bitcoin (1).pdf', 'total_pages': 9, 'page': 0, 'page_label': '1'}, page_content='Bitcoin: A Peer-to-Peer Electronic Cash System\nSatoshi Nakamoto\nsatoshin@gmx.com\nwww.bitcoin.org\nAbstract.  A purely peer-to-peer version of  electronic cash would allow online  \npayments to be sent directly from one party to another without going through a')

In [5]:
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings

embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
doc_embeds = embeddings.embed_documents([split.page_content for split in splits])
doc_embeds[0]

  embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


[-0.0911194309592247,
 0.03640065714716911,
 -0.006011271849274635,
 -0.03416364639997482,
 -0.062013838440179825,
 -0.0771283432841301,
 0.09881779551506042,
 -0.005686826538294554,
 -0.020200684666633606,
 0.027311014011502266,
 -0.010944324545562267,
 0.007337226066738367,
 -0.005205228459089994,
 0.009099154733121395,
 0.07701083272695541,
 -0.07789695262908936,
 0.01362222246825695,
 -0.032222930341959,
 0.05881854146718979,
 0.03662131726741791,
 -0.038014620542526245,
 -0.07775763422250748,
 -0.019682137295603752,
 0.05596310272812843,
 0.04985125735402107,
 0.02223074622452259,
 0.00225585768930614,
 0.027888376265764236,
 -0.04071289673447609,
 -0.06968232244253159,
 -0.007013258058577776,
 0.04626130685210228,
 0.05172727257013321,
 0.0214830469340086,
 -0.035988643765449524,
 -0.035771969705820084,
 0.11243049055337906,
 0.0452144630253315,
 -0.07042466849088669,
 -0.012173456139862537,
 -0.05030030384659767,
 -0.05073942616581917,
 0.0015418595867231488,
 -0.017389036715030

In [6]:
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import SentenceTransformerEmbeddings


embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")


persist_directory = "db_rag"

# Create vector store
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=embedding_function,
    persist_directory=persist_directory,
    collection_name="rag-chatbot"
)


In [7]:
#smiliarity search

query = "What is the main objective of this project?"
search_results = vectorstore.similarity_search(query=query, k=2)
print(f"\nTop 2 most relevant chunks for the query '{query}':\n")
for i, result in enumerate(search_results, 1):
    print(f"Result {i}:")
    print(f"Source: {result.metadata.get('source', 'Unknown')}")
    print(f"Content: {result.page_content}")
    print()


Top 2 most relevant chunks for the query 'What is the main objective of this project?':

Result 1:
Source: docs\bitcoin (1).pdf
Content: making.  If the majority were based on one-IP-address-one-vote, it could be subverted by anyone  
able  to  allocate  many  IPs.   Proof-of-work  is  essentially  one-CPU-one-vote.   The  majority  
decision is represented by the longest chain, which has the greatest proof-of-work effort invested

Result 2:
Source: docs\bitcoin (1).pdf
Content: making.  If the majority were based on one-IP-address-one-vote, it could be subverted by anyone  
able  to  allocate  many  IPs.   Proof-of-work  is  essentially  one-CPU-one-vote.   The  majority  
decision is represented by the longest chain, which has the greatest proof-of-work effort invested



In [8]:
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})
retriever.invoke("Who wrote the Bitcoin whitepaper?")

[Document(metadata={'creationdate': '2009-03-24T11:33:15-06:00', 'creator': 'Writer', 'page': 7, 'page_label': '8', 'producer': 'OpenOffice.org 2.4', 'source': 'docs\\bitcoin (1).pdf', 'total_pages': 9}, page_content='the usual framework of coins made from digital signatures, which provides strong control of  \nownership, but is incomplete without a way to prevent double-spending.  To solve this, we  \nproposed a peer-to-peer network using proof-of-work to record a public history of transactions'),
 Document(metadata={'creationdate': '2009-03-24T11:33:15-06:00', 'creator': 'Writer', 'page': 7, 'page_label': '8', 'producer': 'OpenOffice.org 2.4', 'source': 'docs\\bitcoin (1).pdf', 'total_pages': 9}, page_content='the usual framework of coins made from digital signatures, which provides strong control of  \nownership, but is incomplete without a way to prevent double-spending.  To solve this, we  \nproposed a peer-to-peer network using proof-of-work to record a public history of transact

In [9]:
from langchain_core.prompts import ChatPromptTemplate
template = """Answer the question based only on the following context:
{context}

Question: {question}

Answer: """
prompt = ChatPromptTemplate.from_template(template)

In [10]:
from langchain.schema.runnable import RunnablePassthrough
rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt

)
rag_chain.invoke("When was the Bitcoin whitepaper published?")

ChatPromptValue(messages=[HumanMessage(content="Answer the question based only on the following context:\n[Document(metadata={'creationdate': '2009-03-24T11:33:15-06:00', 'creator': 'Writer', 'page': 7, 'page_label': '8', 'producer': 'OpenOffice.org 2.4', 'source': 'docs\\\\bitcoin (1).pdf', 'total_pages': 9}, page_content='the usual framework of coins made from digital signatures, which provides strong control of  \\nownership, but is incomplete without a way to prevent double-spending.  To solve this, we  \\nproposed a peer-to-peer network using proof-of-work to record a public history of transactions'), Document(metadata={'creationdate': '2009-03-24T11:33:15-06:00', 'creator': 'Writer', 'page': 7, 'page_label': '8', 'producer': 'OpenOffice.org 2.4', 'source': 'docs\\\\bitcoin (1).pdf', 'total_pages': 9}, page_content='the usual framework of coins made from digital signatures, which provides strong control of  \\nownership, but is incomplete without a way to prevent double-spending. 

In [11]:
! pip install -U langchain-ollama




In [12]:
from langchain_ollama import OllamaLLM

llm_local = OllamaLLM(
    model="llama3",
    base_url="http://localhost:11434"
)



In [13]:
print(llm_local.invoke("Who created Bitcoin?"))


The origin of Bitcoin is a fascinating story!

Bitcoin was created by an individual or group of individuals using the pseudonym Satoshi Nakamoto. The true identity of Satoshi Nakamoto remains unknown to this day, despite numerous attempts to uncover their identity.

Satoshi Nakamoto published a whitepaper in October 2008 that proposed the concept of Bitcoin and its underlying technology, blockchain. The paper outlined the design principles and technical specifications for the first decentralized digital currency.

In January 2009, Satoshi Nakamoto created the first block in the Bitcoin blockchain, known as the Genesis Block. This marked the official launch of the Bitcoin network.

Throughout the early years of Bitcoin's development, Satoshi Nakamoto remained active on online forums and email lists, providing guidance and support to other developers working on the project. They also made significant contributions to the code and maintained a high level of involvement until December 2010

In [14]:
rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    |llm_local
)
question = "What is Bitcoin?"
response = rag_chain.invoke(question)
print(response)

Bitcoin: A Peer-to-Peer Electronic Cash System


In [15]:
from langchain_core.messages import HumanMessage, AIMessage
chat_history = []
chat_history.extend([HumanMessage(content=question), AIMessage(content=response)])

In [16]:
chat_history

[HumanMessage(content='What is Bitcoin?', additional_kwargs={}, response_metadata={}),
 AIMessage(content='Bitcoin: A Peer-to-Peer Electronic Cash System', additional_kwargs={}, response_metadata={})]

In [17]:
from langchain_core.prompts import MessagesPlaceholder
contextualize_q_system_prompt = (
"Your ONLY task is to reformulate the user’s question so that it can stand alone without the chat history. Do NOT answer it"
)
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
contextualize_chain = contextualize_q_prompt | llm_local
contextualize_chain.invoke({"input": "How does it solve the problem of double spending?", "chat_history": chat_history})

'What is the mechanism by which Bitcoin prevents duplicate transactions and maintains a secure, decentralized ledger?'

In [18]:
from langchain.chains import create_history_aware_retriever
history_aware_retriever = create_history_aware_retriever(
    llm_local, retriever, contextualize_q_prompt
)
history_aware_retriever.invoke({"input":"How does it solve the problem of double spending?", "chat_history": chat_history})

[Document(metadata={'creationdate': '2009-03-24T11:33:15-06:00', 'creator': 'Writer', 'page': 0, 'page_label': '1', 'producer': 'OpenOffice.org 2.4', 'source': 'docs\\bitcoin (1).pdf', 'total_pages': 9}, page_content='Bitcoin: A Peer-to-Peer Electronic Cash System\nSatoshi Nakamoto\nsatoshin@gmx.com\nwww.bitcoin.org\nAbstract.  A purely peer-to-peer version of  electronic cash would allow online  \npayments to be sent directly from one party to another without going through a'),
 Document(metadata={'creationdate': '2009-03-24T11:33:15-06:00', 'creator': 'Writer', 'page': 0, 'page_label': '1', 'producer': 'OpenOffice.org 2.4', 'source': 'docs\\bitcoin (1).pdf', 'total_pages': 9}, page_content='Bitcoin: A Peer-to-Peer Electronic Cash System\nSatoshi Nakamoto\nsatoshin@gmx.com\nwww.bitcoin.org\nAbstract.  A purely peer-to-peer version of  electronic cash would allow online  \npayments to be sent directly from one party to another without going through a'),
 Document(metadata={'creationdat

In [19]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

qa_prompt = ChatPromptTemplate.from_messages([
    ("system",
 "You are an intelligent assistant helping users understand cryptocurrency whitepapers. "
 "You must answer only using the provided context. "
 "Think carefully, reason step by step if needed, and base your answer entirely on the evidence provided. "
 "If the answer is not clearly in the context, reply with: 'The information is not available in the document.' "
 "Avoid any external knowledge or assumptions, and never fabricate details."

)
,

    ("system", "Context: {context}"),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{input}")
])

question_answer_chain = create_stuff_documents_chain(llm_local, qa_prompt)
rag_chain_final = create_retrieval_chain(history_aware_retriever, question_answer_chain)


In [20]:
response = rag_chain_final.invoke({
    "input": "How does Bitcoin prevent double spending without a central authority?",
    "chat_history": chat_history
})

print(response["answer"])

Based on the provided context, it seems that the current solution to preventing double-spending is not reliable because it relies on the transaction process itself. However, since this is the same document where we are trying to understand how Bitcoin prevents double-spending without a central authority, I will look for alternative information.

The relevant passage in this document is:

"A Peer-to-Peer Electronic Cash System"

This suggests that Bitcoin operates as a peer-to-peer system, where transactions between users are verified and recorded on a public ledger called the blockchain. This decentralized approach allows for secure and transparent transactions without relying on a central authority.

In fact, later in the whitepaper, it is explained how Bitcoin prevents double-spending:

"What is needed is some way to link each new coin to its predecessor, making it impossible to spend the same coin twice."

This is achieved through the use of cryptography, specifically digital signat

In [21]:
from langchain.prompts import ChatPromptTemplate

follow_up_prompt = ChatPromptTemplate.from_messages([
    ("system", 
     "You are an assistant that suggests thoughtful follow-up questions based on previous Q&A about cryptocurrency whitepapers. "
     "Your job is to propose ONE relevant follow-up question based on the context of the previous exchange. "
     "Keep the follow-up short, clear, and curiosity-driven."),
    ("human", "Previous question: {question}\nAnswer: {answer}\n\nSuggest a follow-up question:")
])

# Then create the chain
follow_up_chain = follow_up_prompt | llm_local

# Example use
follow_up_chain.invoke({
    "question": "What is double spending?",
    "answer": "It refers to the risk of the same digital coin being spent twice, which Bitcoin solves with timestamped blocks and proof-of-work."
})


'What are some potential implications if a cryptocurrency were to abandon the proof-of-work consensus mechanism in favor of another method, such as proof-of-stake or delegated proof-of-stake?'

In [22]:
import sqlite3
from datetime import datetime
import uuid

DB_NAME = "rag_app.db"

def get_db_connection():
    conn = sqlite3.connect(DB_NAME)
    conn.row_factory = sqlite3.Row
    return conn

def create_application_logs():
    conn = get_db_connection()
    conn.execute('''CREATE TABLE IF NOT EXISTS application_logs
    (id INTEGER PRIMARY KEY AUTOINCREMENT,
    session_id TEXT,
    user_query TEXT,
    gpt_response TEXT,
    model TEXT,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP)''')
    conn.close()

def insert_application_logs(session_id, user_query, gpt_response, model):
    conn = get_db_connection()
    conn.execute('INSERT INTO application_logs (session_id, user_query, gpt_response, model) VALUES (?, ?, ?, ?)',
                 (session_id, user_query, gpt_response, model))
    conn.commit()
    conn.close()

def get_chat_history(session_id):
    conn = get_db_connection()
    cursor = conn.cursor()
    cursor.execute('SELECT user_query, gpt_response FROM application_logs WHERE session_id = ? ORDER BY created_at', (session_id,))
    messages = []
    for row in cursor.fetchall():
        messages.extend([
            {"role": "human", "content": row['user_query']},
            {"role": "ai", "content": row['gpt_response']}
        ])
    conn.close()
    return messages

# Initialize the database
create_application_logs()


In [23]:
# Example usage for a new user
session_id = str(uuid.uuid4())
question = "What is Bitcoin?"
chat_history = get_chat_history(session_id)
answer = rag_chain_final.invoke({"input": question, "chat_history": chat_history})['answer']
insert_application_logs(session_id, question, answer, "llama3")
print(f"Human: {question}")
print(f"AI: {answer}\n")

# Example of a follow-up question
question2 = "Who invented it?"
chat_history = get_chat_history(session_id)
answer2 = rag_chain_final.invoke({"input": question2, "chat_history": chat_history})['answer']
insert_application_logs(session_id, question2, answer2, "llama3")
print(f"Human: {question2}")
print(f"AI: {answer2}")


Human: What is Bitcoin?
AI: According to the provided context, Satoshi Nakamoto's whitepaper "Bitcoin: A Peer-to-Peer Electronic Cash System" defines Bitcoin as "a purely peer-to-peer version of electronic cash". This implies that Bitcoin is an electronic payment system that allows online payments to be sent directly from one party to another without going through a central authority.

Human: Who invented it?
AI: According to the provided context, the author of the whitepaper "Bitcoin: A Peer-to-Peer Electronic Cash System" is Satoshi Nakamoto. However, there is no information provided about who Satoshi Nakamoto actually is or whether they are an individual or a group of people. The email address satoshin@gmx.com and website www.bitcoin.org are also listed as contact details for Satoshi Nakamoto.


In [24]:

question3 = "Who are the strategic partners or investors?"
chat_history = get_chat_history(session_id)
answer3 = rag_chain_final.invoke({"input": question3, "chat_history": chat_history})['answer']
insert_application_logs(session_id, question3, answer3, "llama3")
print(f"Human: {question3}")
print(f"AI: {answer3}")

Human: Who are the strategic partners or investors?
AI: The information is not available in the document.


In [25]:
question4 = "What is the technical architecture of the system?"
chat_history = get_chat_history(session_id)
answer4 = rag_chain_final.invoke({"input": question4, "chat_history": chat_history})['answer']
insert_application_logs(session_id, question4, answer4, "llama3")
print(f"Human: {question4}")
print(f"AI: {answer4}")



Human: What is the technical architecture of the system?
AI: According to the provided context, the whitepaper "Bitcoin: A Peer-to-Peer Electronic Cash System" does not provide detailed information about the technical architecture of the Bitcoin system. Therefore, I can only answer that the information is not available in the document.

Note that this whitepaper focuses on the concept and design of Bitcoin as a peer-to-peer electronic cash system, but it does not delve into technical details such as network protocols, algorithms, or infrastructure.


In [26]:
from langchain_core.runnables import RunnableMap

# This version extracts context explicitly
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

question_answer_chain_jus = RunnableMap({
    "context": lambda x: format_docs(x["context"]),
    "input": lambda x: x["input"],
    "chat_history": lambda x: x.get("chat_history", [])
}) | qa_prompt | llm_local


In [27]:
rag_chain_final_jus = create_retrieval_chain(history_aware_retriever, question_answer_chain)


In [28]:
result = rag_chain_final_jus.invoke({
    "input": "What is the technical architecture of the system?",
    "chat_history": chat_history
})

# result is a dict like: {'answer': '...', 'context': [...]}
answer = result['answer']
source_docs = result['context']

print(f"🤖 Answer: {answer}\n")

print("📚 Sources:")
for i, doc in enumerate(source_docs):
    print(f"\nSource {i+1}:")
    print(f"Page: {doc.metadata.get('page', 'Unknown')}")
    print(f"Preview: {doc.page_content[:300]}...")


🤖 Answer: According to the provided context, there is no information about the technical architecture of the Bitcoin system. The whitepaper only provides an abstract and does not delve into the technical details of the system.

📚 Sources:

Source 1:
Page: 0
Preview: Bitcoin: A Peer-to-Peer Electronic Cash System
Satoshi Nakamoto
satoshin@gmx.com
www.bitcoin.org
Abstract.  A purely peer-to-peer version of  electronic cash would allow online  
payments to be sent directly from one party to another without going through a...

Source 2:
Page: 0
Preview: Bitcoin: A Peer-to-Peer Electronic Cash System
Satoshi Nakamoto
satoshin@gmx.com
www.bitcoin.org
Abstract.  A purely peer-to-peer version of  electronic cash would allow online  
payments to be sent directly from one party to another without going through a...

Source 3:
Page: 0
Preview: Bitcoin: A Peer-to-Peer Electronic Cash System
Satoshi Nakamoto
satoshin@gmx.com
www.bitcoin.org
Abstract.  A purely peer-to-peer version of  electronic ca