# Blockchain RAG

In [3]:
!pip install ragatouille
!pip install chromadb
!pip install langchain langchain-community langchain-huggingface --upgrade
!pip install web3

Collecting langchain
  Downloading langchain-0.3.3-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.2-py3-none-any.whl.metadata (2.8 kB)
Collecting langchain-core<0.4.0,>=0.3.10 (from langchain)
  Downloading langchain_core-0.3.12-py3-none-any.whl.metadata (6.3 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.135-py3-none-any.whl.metadata (13 kB)
Downloading langchain-0.3.3-py3-none-any.whl (1.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading langchain_community-0.3.2-py3-none-any.whl (2.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading langchain_core-0.3.12-py3-none-any.whl (407 kB)
Downloading langsmith-0.1.135-py3-none-any.whl (295 kB)
Installing collected packages: langsmith, langchain-core, langchain, langchain-c

In [1]:
import chromadb
import pandas as pd
from ragatouille import RAGPretrainedModel
import ollama
from typing import Optional
from web3 import Web3

## Retrieve the Corpus from the Blockchain

In [2]:
# Contract interface (ABI) handed to web3 when building the contract handle.
# Each entry is preceded by a comment giving the signature it describes.
abi = [
    # constructor()
    {"inputs": [], "stateMutability": "nonpayable", "type": "constructor"},
    # corpus(uint256) -> (address sender, string metadata)
    {
        "inputs": [
            {"internalType": "uint256", "name": "", "type": "uint256"},
        ],
        "name": "corpus",
        "outputs": [
            {"internalType": "address", "name": "sender", "type": "address"},
            {"internalType": "string", "name": "metadata", "type": "string"},
        ],
        "stateMutability": "view",
        "type": "function",
    },
    # insert(uint256[] ids, string metadata, string[] chunks)
    {
        "inputs": [
            {"internalType": "uint256[]", "name": "ids", "type": "uint256[]"},
            {"internalType": "string", "name": "metadata", "type": "string"},
            {"internalType": "string[]", "name": "chunks", "type": "string[]"},
        ],
        "name": "insert",
        "outputs": [],
        "stateMutability": "nonpayable",
        "type": "function",
    },
    # removeDocument(uint256 key)
    {
        "inputs": [
            {"internalType": "uint256", "name": "key", "type": "uint256"},
        ],
        "name": "removeDocument",
        "outputs": [],
        "stateMutability": "nonpayable",
        "type": "function",
    },
    # removeDocumentBySender(address sender) -> uint256[]
    {
        "inputs": [
            {"internalType": "address", "name": "sender", "type": "address"},
        ],
        "name": "removeDocumentBySender",
        "outputs": [
            {"internalType": "uint256[]", "name": "", "type": "uint256[]"},
        ],
        "stateMutability": "nonpayable",
        "type": "function",
    },
    # retrieveDocument(uint256 key) -> Corpus.Document(sender, metadata, ids, chunks)
    {
        "inputs": [
            {"internalType": "uint256", "name": "key", "type": "uint256"},
        ],
        "name": "retrieveDocument",
        "outputs": [
            {
                "components": [
                    {"internalType": "address", "name": "sender", "type": "address"},
                    {"internalType": "string", "name": "metadata", "type": "string"},
                    {"internalType": "uint256[]", "name": "ids", "type": "uint256[]"},
                    {"internalType": "string[]", "name": "chunks", "type": "string[]"},
                ],
                "internalType": "struct Corpus.Document",
                "name": "",
                "type": "tuple",
            },
        ],
        "stateMutability": "view",
        "type": "function",
    },
    # retrieveDocumentKeysBySender(address sender) -> uint256[]
    {
        "inputs": [
            {"internalType": "address", "name": "sender", "type": "address"},
        ],
        "name": "retrieveDocumentKeysBySender",
        "outputs": [
            {"internalType": "uint256[]", "name": "", "type": "uint256[]"},
        ],
        "stateMutability": "view",
        "type": "function",
    },
    # retrieveLatestID() -> uint256
    {
        "inputs": [],
        "name": "retrieveLatestID",
        "outputs": [
            {"internalType": "uint256", "name": "", "type": "uint256"},
        ],
        "stateMutability": "view",
        "type": "function",
    },
    # retrieveLatestKey() -> int256
    {
        "inputs": [],
        "name": "retrieveLatestKey",
        "outputs": [
            {"internalType": "int256", "name": "", "type": "int256"},
        ],
        "stateMutability": "view",
        "type": "function",
    },
]

In [3]:
# Connection settings: edit these two values to target a different node or deployment.
RPC_URL = 'http://localhost:8545'
CONTRACT_ADDRESS = '0x5fbdb2315678afecb367f032d93f642f64180aa3'

# web3 client talking to the JSON-RPC endpoint over HTTP.
web3class = Web3(Web3.HTTPProvider(RPC_URL))
# Contract address converted to checksum form via to_checksum_address.
contractAddress = web3class.to_checksum_address(CONTRACT_ADDRESS)

In [14]:
def retrieveDocument(key, contract=None):
    """Fetch a single document from the Corpus contract by key.

    Calls the contract's ``retrieveDocument`` function and unpacks the returned
    tuple, which the ABI declares as ``(sender, metadata, ids, chunks)``.

    Args:
        key: Document key passed to the contract call.
        contract: Optional contract handle to reuse. When omitted, a handle is
            built from the notebook-level ``web3class``, ``abi`` and
            ``contractAddress``.

    Returns:
        ``(metadata, ids, chunks)`` when the document has at least one id,
        otherwise ``(None, None, None)``.
    """
    if contract is None:
        contract = web3class.eth.contract(abi=abi, address=contractAddress)
    doc = contract.functions.retrieveDocument(key).call()
    # A document whose id list is empty is reported as "not found".
    if len(doc[2]) > 0:
        print(f'Retrieved document {key} results')
        return doc[1], doc[2], doc[3]
    print(f'No results found for {key}')
    return None, None, None

In [5]:
# Look up the document stored under key 1 and unpack its metadata, ids and chunks.
metadata, ids, docs = retrieveDocument(1)

Retrieved document 1 results
Metadata: ghibli example 1
IDs: [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34]
Chunks: ["Hayao Miyazaki was born on January 5, 1941, in the town Akebono-cho in Hongō, Tokyo City, Empire of Japan, the second of four sons. His father, Katsuji Miyazaki (born 1915), was the director of Miyazaki Airplane, his brother's company, which manufactured rudders for fighter planes during World War II. The business allowed his family to remain affluent during Miyazaki's early life. Miyazaki's father enjoyed purchasing paintings and demonstrating them to guests, but otherwise had little known artistic understanding. He was in the Imperial Japanese Army around 1940, discharged and lectured about disloyalty after declaring to his commanding officer that he wished not to fight because of his wife and young child. According to Miyazaki, his father often told him about his exploits, claiming he conti

In [42]:
def retrieveCorpus():
    """Read every document from the contract and flatten it chunk by chunk.

    Asks the contract for its latest key, then fetches each key from 0 up to
    and including that key with ``retrieveDocument``. Keys for which no
    document is returned are skipped.

    Returns:
        ``(metadatas, ids, docs)`` - three parallel lists with one entry per
        chunk: a ``{'doc_type': <document metadata>}`` dict, the chunk id as a
        string, and the chunk text.
    """
    corpus_contract = web3class.eth.contract(abi=abi, address=contractAddress)
    last_key = corpus_contract.functions.retrieveLatestKey().call()

    metadatas, ids, docs = [], [], []
    for document_key in range(last_key + 1):
        doc_metadata, doc_ids, doc_chunks = retrieveDocument(document_key)
        if doc_chunks is None:
            continue
        for position, chunk in enumerate(doc_chunks):
            docs.append(chunk)
            # ids are stored as strings alongside the chunk they belong to
            ids.append(str(doc_ids[position]))
            metadatas.append({'doc_type': doc_metadata})
    return metadatas, ids, docs

In [43]:
# Fetch the whole corpus from the contract as three parallel per-chunk lists.
db_metadatas, db_ids, db_docs = retrieveCorpus()

Retrieved document 0 results
Retrieved document 1 results


In [44]:
# Dump the three corpus lists, one per line: metadatas, ids, then chunk texts.
print(db_metadatas, db_ids, db_docs, sep='\n')

[{'doc_type': 'init'}, {'doc_type': 'ghibli example 1'}, {'doc_type': 'ghibli example 1'}, {'doc_type': 'ghibli example 1'}, {'doc_type': 'ghibli example 1'}, {'doc_type': 'ghibli example 1'}, {'doc_type': 'ghibli example 1'}, {'doc_type': 'ghibli example 1'}, {'doc_type': 'ghibli example 1'}, {'doc_type': 'ghibli example 1'}, {'doc_type': 'ghibli example 1'}, {'doc_type': 'ghibli example 1'}, {'doc_type': 'ghibli example 1'}, {'doc_type': 'ghibli example 1'}, {'doc_type': 'ghibli example 1'}, {'doc_type': 'ghibli example 1'}, {'doc_type': 'ghibli example 1'}, {'doc_type': 'ghibli example 1'}, {'doc_type': 'ghibli example 1'}, {'doc_type': 'ghibli example 1'}, {'doc_type': 'ghibli example 1'}, {'doc_type': 'ghibli example 1'}, {'doc_type': 'ghibli example 1'}, {'doc_type': 'ghibli example 1'}, {'doc_type': 'ghibli example 1'}, {'doc_type': 'ghibli example 1'}, {'doc_type': 'ghibli example 1'}, {'doc_type': 'ghibli example 1'}, {'doc_type': 'ghibli example 1'}, {'doc_type': 'ghibli exam

## Preprocess and Load the Corpus into the Vector Store

In [45]:
# Create the Chroma client and the "ghibli" collection that backs retrieval.
chroma_client = chromadb.Client()
#chroma_client.delete_collection(name="ghibli") # In case we need to redo anything
collection = chroma_client.get_or_create_collection(name="ghibli")
# upsert instead of add: re-running this cell against an already populated
# collection updates the existing ids rather than colliding with them.
collection.upsert(documents=db_docs, ids=db_ids, metadatas=db_metadatas)

In [46]:
# Display an example search against chromadb: query the collection directly
# (no reranking, no LLM) and show the two nearest results.

results = collection.query(
    query_texts=["What was Miyazaki's first film?"],
    n_results=2
)
print(results)

{'ids': [['16', '17']], 'distances': [[0.770384669303894, 0.7783008217811584]], 'metadatas': [[{'doc_type': 'ghibli example 1'}, {'doc_type': 'ghibli example 1'}]], 'embeddings': None, 'documents': [["In 1987, Studio Ghibli acquired the rights to create a film adaptation of Eiko Kadono's novel Kiki's Delivery Service. Miyazaki's work on My Neighbor Totoro prevented him from directing the adaptation; he acted as producer, while Sunao Katabuchi was chosen as director and Nobuyuki Isshiki as script writer. Miyazaki's dissatisfaction of Isshiki's first draft led him to make changes to the project, ultimately taking the role of director. Kadono expressed her dissatisfaction with the differences between the book and screenplay, but Miyazaki and Takahata convinced her to let production continue. The film was originally intended to be a 60-minute special, but expanded into a feature film after Miyazaki completed the storyboards and screenplay. Miyazaki felt the struggles of the protagonist, Ki

## Define the RAG

In [47]:
# Define the reranker: load the pretrained "colbert-ir/colbertv2.0" checkpoint
# through RAGatouille; it is passed to rag_query as `reranker` further down.

llmreranker = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")

  self.scaler = torch.cuda.amp.GradScaler()


In [51]:
# The below takes the prompt, performs a retrieval against chromadb, optionally reranks it, then calls ollama.
# Most of this is derived from: https://huggingface.co/learn/cookbook/en/advanced_rag

def rag_query(
    question: str,
    llm: str,
    knowledge_index=collection,
    reranker: Optional[RAGPretrainedModel] = None,
    num_retrieved_docs: int = 10,
    num_docs_final: int = 5):
    """Answer ``question`` with retrieval-augmented generation.

    Args:
        question: The user question; used for retrieval, reranking and the prompt.
        llm: Name of the ollama model to chat with.
        knowledge_index: Collection to query for candidate passages.
        reranker: Optional reranker; when given, the retrieved passages are
            reranked against the question before being truncated.
        num_retrieved_docs: Number of passages requested from the collection.
        num_docs_final: Maximum number of passages placed in the prompt.

    Returns:
        ``(answer, relevant_docs)`` - the model's reply text and the list of
        passage strings that were used as context.
    """
    print("=> Retrieving documents...")
    retrieved = knowledge_index.query(query_texts=[question], n_results=num_retrieved_docs)
    # One query was sent, so take the passages of the first (only) result row.
    relevant_docs = retrieved['documents'][0]

    # Skip reranking when nothing was retrieved; there is nothing to order.
    if reranker and relevant_docs:
        print("=> Reranking documents...")
        reranked = reranker.rerank(question, relevant_docs, k=num_docs_final)
        # rerank() yields result records (content, score, rank); keep only the
        # passage text so the prompt and the return value hold plain strings on
        # both the reranked and the non-reranked path.
        relevant_docs = [doc["content"] for doc in reranked]

    relevant_docs = relevant_docs[:num_docs_final]

    final_prompt = f"""
        use the following CONTEXT to answer the QUESTION at the end.
        If you don't know the answer, just say that you don't know, don't try to make up an answer.
        
        CONTEXT: {relevant_docs}
        QUESTION: {question}
        
        """

    response = ollama.chat(model=llm, messages=[
        {
            'role': 'user',
            'content': final_prompt,
        },
    ])
    answer = response['message']['content']

    return answer, relevant_docs

In [49]:
# Perform a retrieval/generation without the reranker.
# `result` is the model's answer; `docs` is the context rag_query returned with it.

question="What was Miyazaki's first film?"
result, docs = rag_query(question, 'llama3.1')
print(result)

=> Retrieving documents...


In [50]:
# Show the answer currently held in `result`.
print(result)

I don't know. The context doesn't mention his first film explicitly, only that "Porco Rosso" was not his first film (as it did not top Animage's yearly reader poll).


In [None]:
# Display the retrieved documents used as context for the last rag_query call,
# followed by how many there were.

print(docs)
len(docs)

In [52]:
# Perform a retrieval/generation with the reranker: same question as before,
# but the retrieved passages are reranked by `llmreranker` before prompting.

result, docs = rag_query(question, 'llama3.1', reranker=llmreranker)
print(result)

=> Retrieving documents...
=> Reranking documents...


  return torch.cuda.amp.autocast() if self.activated else NullContextManager()
100%|█████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 27.36it/s]


Based on the context, Miyazaki's first feature anime film was "The Castle of Cagliostro" (1979), which he directed for Tokyo Movie Shinsha.
