In [None]:
! pip install langchain langsmith huggingface_hub transformers sentence-transformers  unstructured tiktoken
! pip install pypdf python-docx
! pip install -U langchain-community
! pip install -q langchain ctransformers
! pip install langchain_chroma -q

In [2]:
# LangChain core
from langchain.chains import RetrievalQA
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFaceHub
from langchain.document_loaders import UnstructuredFileLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
import sys
sys.path.append('./notebooks')  # path to reach /websearch/ folder

from websearch.test import main
from dotenv import load_dotenv

load_dotenv(dotenv_path="./websearch/.env")

In [8]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from typing import List
from langchain_core.documents import Document
import os

def load_docs(folder_path: str) -> List[Document]:
    documents = []
    for file in os.listdir(folder_path):
        if file.endswith(".pdf"):
            pdf_path = os.path.join(folder_path, file)
        else:
            print(f"Unsupported file format: {file}")
            continue
        loader = PyPDFLoader(file_path=pdf_path)
        documents.extend(loader.load())
    return documents

folder_path = "docs"
documents = load_docs(folder_path)

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=100,
    length_function=len,
)


splits = text_splitter.split_documents(documents)
print(f"Split the docs into {len(splits)} chunks")

Split the docs into 110 chunks


In [9]:
splits[0]

Document(metadata={'producer': 'OpenOffice.org 2.4', 'creator': 'Writer', 'creationdate': '2009-03-24T11:33:15-06:00', 'source': 'docs\\bitcoin_whitepaper.pdf', 'total_pages': 9, 'page': 0, 'page_label': '1'}, page_content='Bitcoin: A Peer-to-Peer Electronic Cash System\nSatoshi Nakamoto\nsatoshin@gmx.com\nwww.bitcoin.org\nAbstract.  A purely peer-to-peer version of  electronic cash would allow online  \npayments to be sent directly from one party to another without going through a')

In [None]:
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings

embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
doc_embeds = embeddings.embed_documents([split.page_content for split in splits])
doc_embeds[0]

In [11]:
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import SentenceTransformerEmbeddings


embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")


persist_directory = "db_rag"

# Create vector store
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=embedding_function,
    persist_directory=persist_directory,
    collection_name="rag-chatbot"
)


In [12]:
! pip install -U langchain-ollama





[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [13]:
from langchain_ollama import OllamaLLM

llm_local = OllamaLLM(
    model="llama3",
    base_url="http://localhost:11434"
)



In [14]:
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
retriever.invoke("How does Bitcoin prevent fraud?")

[Document(metadata={'creationdate': '2009-03-24T11:33:15-06:00', 'creator': 'Writer', 'page': 0, 'page_label': '1', 'producer': 'OpenOffice.org 2.4', 'source': 'docs\\bitcoin (1).pdf', 'total_pages': 9}, page_content='be wary of their customers, hassling them for more information than they would otherwise need.  \nA certain percentage of fraud is accepted as unavoidable.  These costs and payment uncertainties  \ncan be avoided in person by using physical currency, but no mechanism exists to make payments'),
 Document(metadata={'creationdate': '2009-03-24T11:33:15-06:00', 'creator': 'Writer', 'page': 0, 'page_label': '1', 'producer': 'OpenOffice.org 2.4', 'source': 'docs\\bitcoin_whitepaper.pdf', 'total_pages': 9}, page_content='be wary of their customers, hassling them for more information than they would otherwise need.  \nA certain percentage of fraud is accepted as unavoidable.  These costs and payment uncertainties  \ncan be avoided in person by using physical currency, but no mech

In [15]:
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.retrievers.multi_query import MultiQueryRetriever
multiquery_retriever = MultiQueryRetriever.from_llm(
    retriever=vectorstore.as_retriever(search_kwargs={"k": 2}),
    llm=llm_local,
)

In [16]:
from langchain_core.prompts import ChatPromptTemplate
template = """Answer the question based only on the following context:
{context}

Question: {question}

Answer: """
prompt = ChatPromptTemplate.from_template(template)

In [17]:
from langchain.schema.runnable import RunnablePassthrough
rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt

)
rag_chain.invoke("When was the Bitcoin whitepaper published?")

ChatPromptValue(messages=[HumanMessage(content="Answer the question based only on the following context:\n[Document(metadata={'creationdate': '2009-03-24T11:33:15-06:00', 'creator': 'Writer', 'page': 7, 'page_label': '8', 'producer': 'OpenOffice.org 2.4', 'source': 'docs\\\\bitcoin (1).pdf', 'total_pages': 9}, page_content='the usual framework of coins made from digital signatures, which provides strong control of  \\nownership, but is incomplete without a way to prevent double-spending.  To solve this, we  \\nproposed a peer-to-peer network using proof-of-work to record a public history of transactions'), Document(metadata={'creationdate': '2009-03-24T11:33:15-06:00', 'creator': 'Writer', 'page': 7, 'page_label': '8', 'producer': 'OpenOffice.org 2.4', 'source': 'docs\\\\bitcoin_whitepaper.pdf', 'total_pages': 9}, page_content='the usual framework of coins made from digital signatures, which provides strong control of  \\nownership, but is incomplete without a way to prevent double-spe

In [18]:
rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    |llm_local
)
question = "What is Bitcoin?"
response = rag_chain.invoke(question)
print(response)

Bitcoin: A Peer-to-Peer Electronic Cash System


In [19]:
from langchain_core.messages import HumanMessage, AIMessage
chat_history = []
chat_history.extend([HumanMessage(content=question), AIMessage(content=response)])

In [20]:
chat_history

[HumanMessage(content='What is Bitcoin?', additional_kwargs={}, response_metadata={}),
 AIMessage(content='Bitcoin: A Peer-to-Peer Electronic Cash System', additional_kwargs={}, response_metadata={})]

In [21]:
from langchain_core.prompts import MessagesPlaceholder
contextualize_q_system_prompt = (
"Your ONLY task is to reformulate the user’s question so that it can stand alone without the chat history. Do NOT answer it"
)
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
contextualize_chain = contextualize_q_prompt | llm_local
contextualize_chain.invoke({"input": "How does it solve the problem of double spending?", "chat_history": chat_history})

'What is the mechanism used by Bitcoin to prevent double-spending, which occurs when a digital coin is spent more than once?'

In [22]:
from langchain.chains import create_history_aware_retriever
history_aware_retriever = create_history_aware_retriever(
    llm_local, retriever, contextualize_q_prompt
)
history_aware_retriever.invoke({"input":"How does it solve the problem of double spending?", "chat_history": chat_history})

[Document(metadata={'creationdate': '2009-03-24T11:33:15-06:00', 'creator': 'Writer', 'page': 0, 'page_label': '1', 'producer': 'OpenOffice.org 2.4', 'source': 'docs\\bitcoin (1).pdf', 'total_pages': 9}, page_content='Bitcoin: A Peer-to-Peer Electronic Cash System\nSatoshi Nakamoto\nsatoshin@gmx.com\nwww.bitcoin.org\nAbstract.  A purely peer-to-peer version of  electronic cash would allow online  \npayments to be sent directly from one party to another without going through a'),
 Document(metadata={'creationdate': '2009-03-24T11:33:15-06:00', 'creator': 'Writer', 'page': 0, 'page_label': '1', 'producer': 'OpenOffice.org 2.4', 'source': 'docs\\bitcoin (1).pdf', 'total_pages': 9}, page_content='Bitcoin: A Peer-to-Peer Electronic Cash System\nSatoshi Nakamoto\nsatoshin@gmx.com\nwww.bitcoin.org\nAbstract.  A purely peer-to-peer version of  electronic cash would allow online  \npayments to be sent directly from one party to another without going through a'),
 Document(metadata={'creationdat

In [23]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

qa_prompt = ChatPromptTemplate.from_messages([
    ("system",
     "You are an intelligent assistant specialized in analyzing cryptocurrency whitepapers.\n"
     "You must answer strictly and only based on the provided context inserted below.\n"
     "Do not use any external knowledge, prior memory, or assumptions.\n"
     "If multiple relevant parts exist, combine them concisely and logically.\n"
     "If the question asks for a purpose, goal, or reason, prioritize explaining the *why* before the *how*.\n"
     "Think carefully and reason step-by-step if necessary, but always stay grounded in the provided context.\n"
     "If the answer cannot be clearly found in the context, you must respond exactly with:\n"
     "'The information is not available in the document.'\n"
     "You are strictly prohibited from guessing, assuming, completing, or inventing missing information.\n"
     "If possible, reference the page number or section from which the answer is derived.\n"
     "Do not generate academic references, bibliography entries, or external citations unless explicitly instructed.\n"
     "Only cite document sections if they are clearly included in the provided context."
    ),
    ("system", "Context: {context}"),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{input}")
])

question_answer_chain = create_stuff_documents_chain(llm_local, qa_prompt)
rag_chain_final = create_retrieval_chain(history_aware_retriever, question_answer_chain)

In [24]:
import sqlite3
from datetime import datetime
import uuid

DB_NAME = "rag_app.db"

def get_db_connection():
    conn = sqlite3.connect(DB_NAME)
    conn.row_factory = sqlite3.Row
    return conn

def create_application_logs():
    conn = get_db_connection()
    conn.execute('''CREATE TABLE IF NOT EXISTS application_logs
    (id INTEGER PRIMARY KEY AUTOINCREMENT,
    session_id TEXT,
    user_query TEXT,
    gpt_response TEXT,
    model TEXT,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP)''')
    conn.close()

def insert_application_logs(session_id, user_query, gpt_response, model):
    conn = get_db_connection()
    conn.execute('INSERT INTO application_logs (session_id, user_query, gpt_response, model) VALUES (?, ?, ?, ?)',
                 (session_id, user_query, gpt_response, model))
    conn.commit()
    conn.close()

def get_chat_history(session_id):
    conn = get_db_connection()
    cursor = conn.cursor()
    cursor.execute('SELECT user_query, gpt_response FROM application_logs WHERE session_id = ? ORDER BY created_at', (session_id,))
    messages = []
    for row in cursor.fetchall():
        messages.extend([
            {"role": "human", "content": row['user_query']},
            {"role": "ai", "content": row['gpt_response']}
        ])
    conn.close()
    return messages

# Initialize the database
create_application_logs()

In [25]:
import uuid

session_id = str(uuid.uuid4())

user_question = "How does Bitcoin prevent double-spending without relying on a central authority?"

chat_history = get_chat_history(session_id)

final_response = rag_chain_final.invoke({
    "input": user_question,
    "chat_history": chat_history
})


print(final_response['answer'])


According to the provided context, it seems that the original idea was to have a centralized minting process where coins must be returned to the mint after each transaction. However, this solution is problematic because it relies on a central authority.

The information is not available in the document regarding how Bitcoin prevents double-spending without relying on a central authority, as this concept seems to be specific to the original idea and not directly related to Bitcoin's actual implementation.


In [None]:


# Start a session
session_id = str(uuid.uuid4())

# User input
user_question = "Who are the founders and co founders of etherium ?"

# Load past messages if any
chat_history = get_chat_history(session_id)

# Step 1: Try RAG
final_response = rag_chain_final.invoke({
    "input": user_question,
    "chat_history": chat_history
})

# Step 2: Check if fallback is needed
if "The information is not available in the document." in final_response['answer']:
    print("\n🧠 No answer found in document. Triggering web search agent...")
    main(user_question)  # 🔁 Fallback to real-time web search
else:
    print("\n📄 RAG Answer:")
    print(final_response['answer'])



🧠 No answer found in document. Triggering web search agent...
USER (hardcoded): Who are the founders and co founders of etherium ?

GENERATING SEARCH QUERY.
EXA Search Results: 8 results found.
4
✅ First 500 chars of scraped text:
**The Uncanny Mind That Built Ethereum**

## Vitalik Buterin invented the world’s hottest new cryptocurrency and inspired a movement — before he’d turned 20.

--

I** met Vitalik Buterin** for the first time in Miami, during a Bitcoin conference in 2014. I had been invited by a Bitcoiner I knew in New York to stay at a beach house with a team of developers who were working on the next big thing, a technology called Ethereum. I was told it would blow Bitcoin out of the water.

Buterin and about a...

