In [None]:
import chromadb
import ollama
from pyzotero import zotero
from tqdm import tqdm
import os
import subprocess
from pushover import Client
import time

# Yet Another RAG system (YARAGS)

## RAG: Retrieval-Augmented Generation

1. retrieves external information from a knowledge base
2. uses that information to "ground" the LLM's response (context)

### How does it work?

1. Compute embeddings of knowledge base
2. Compute embedding of a user query
3. Find the embeddings most similar to the query embedding (typically using cosine similarity or a distance metric such as Euclidean distance)
4. Insert their associated content in the LLM's context
5. Submit query to LLM

## Problem to solve

Write a background chapter using references from my [Zotero](https://www.zotero.org/) library (for free).

### How?

Implement our own RAG system with:
- Knowledge base: Zotero annotations (retrieved using [pyzotero](https://github.com/urschrei/pyzotero))
- Embeddings and similarity: Use [Chroma](https://www.trychroma.com/), an AI-native open-source vector database. By default, it uses [Sentence transformers](https://www.sbert.net/) (`all-MiniLM-L6-v2`).
- LLM: Local using [Ollama](https://ollama.com/). Model: `gpt-oss:20b`

### Disclaimer

Heavily inspired by the blog post [DIY: Ground LLaMa on your papers from Zotero](https://medium.com/@emcf1/diy-ground-a-language-model-on-your-papers-from-zotero-with-finesse-a5c4ca7c187a)

## Setup

In [None]:
# Chroma client
# PersistentClient() is called without arguments, so the storage location is
# whatever chromadb picks by default.
# NOTE(review): presumably a directory on local disk — confirm where it lands.

chroma_client = chromadb.PersistentClient()

In [None]:
# Zotero client
# Credentials are read from the environment variables ZOTERO_USER_ID and
# ZOTERO_API_KEY; the library is addressed as a "user" library with local=False.

zotero_options = {
    "library_id": os.environ.get("ZOTERO_USER_ID"),
    "library_type": "user",
    "api_key": os.environ.get("ZOTERO_API_KEY"),
    "local": False,
}
zot_client = zotero.Zotero(**zotero_options)

## Import annotations from Zotero to Chromadb

In [None]:
# Create collection 'annotations' in Chroma (get_or_create: an existing
# collection with that name is reused instead of being created again)

collection = chroma_client.get_or_create_collection("annotations")
# Backup "all-my-documents"
# NOTE(review): presumably an alternative collection name kept for reference —
# confirm whether it is still needed.

In [None]:
# Retrieve Zotero annotations (items whose itemType is "annotation")

annotations = zot_client.items(itemType="annotation")
# Use zot_client.everything to get all annotations
# NOTE(review): a single items() call presumably returns only one page of
# results; zot_client.everything(zot_client.items(itemType="annotation"))
# would follow the pagination — confirm against the pyzotero documentation.

In [None]:
# Add each retrieved annotation to the collection 'annotations'
# Upon insert: automatic tokenization, embedding, and indexing.
#
# upsert is used instead of add so that this cell can be re-run against the
# persistent store: an annotation whose key is already present is updated in
# place rather than being inserted a second time under the same id.

for annotation in tqdm(annotations):
    annot = annotation['data']
    content = annot.get('annotationText')
    # Skip annotations that carry no highlighted text (key missing, empty, or
    # whitespace only): there is nothing meaningful to embed for them.
    if not content or not content.strip():
        continue
    collection.upsert(
        documents=[content],
        # The parent item key is kept so a zotero:// link can be rebuilt later.
        metadatas=[{"parent": annot['parentItem']}],
        ids=[annot["key"]],
    )

## Query most similar results

In [None]:
# Topic of interest: used as the similarity-search text and in the LLM prompt
QUERY = "Auscultation is not reliable"

In [None]:
# Ask the collection for the 5 stored annotations closest to QUERY
# (closeness is measured between embeddings)

results = collection.query(query_texts=[QUERY], n_results=5)

In [None]:
# Show every hit: a zotero:// deep link with its distance, then the annotation text.
# results[...] holds one list per query; index 0 selects the only query sent.

hit_ids = results['ids'][0]
hit_metadatas = results['metadatas'][0]
hit_documents = results['documents'][0]

for i in range(len(hit_ids)):
    key = hit_ids[i]
    parent = hit_metadatas[i]['parent']
    text = hit_documents[i]
    print(f"\nzotero://open-pdf/library/items/{parent}?annotation={key.ljust(40)}: {results['distances'][0][i]}")
    print(text)

## LLM

In [None]:
# We make sure our LLM is running
# `ollama run <MODEL>` is launched as a child process without waiting for it;
# the handle is kept in `process`.
# NOTE(review): the visible cells never wait on or terminate `process` —
# confirm that leaving it running is intended.

#MODEL = "phi3:mini"
MODEL = "gpt-oss:20b"
process = subprocess.Popen(["ollama", "run", MODEL])

In [None]:
# Build the LLM context: one line per retrieved annotation, each followed by
# its annotation key in square brackets so the answer can cite it

context = "".join(
    f"{text} [{key}]\n"
    for key, text in zip(results['ids'][0], results['documents'][0])
)
print(context)

In [None]:
# Ask the LLM to answer the user question using the context

# The system prompt is built from adjacent string literals. Each sentence ends
# with a trailing space so the sentences do not run together once joined, and
# the wording says "square brackets" to match the [key] markers in the context.
response = ollama.chat(model=MODEL, messages=[
        {"role": "system",
         "content": "You are a helpful scientific assistant. "
                    "Using the context provided, answer the user's question. "
                    "Make sure to insert the references (in square brackets) found in the context appropriately."
        },
        {"role": "user", "content": f"Context:\n{context}\nUser query: Write a short paragraph on this topic: {QUERY}"}
    ])
# The generated text is stored under message -> content in the chat response.
response_text = response['message']['content']
print(response_text)

### Bonus

Get notifications when running long-running code with [Pushover](https://pushover.net/):

In [None]:
# Start pushover client
# The user key and API token are read from the environment variables
# PUSHOVER_USER_KEY and PUSHOVER_API_TOKEN.
pushover_client = Client(
    os.environ.get("PUSHOVER_USER_KEY"),
    api_token=os.environ.get("PUSHOVER_API_TOKEN"),
)

In [None]:
# A long and risky task...
def long_risky_operation():
    """Stand-in for a slow job that fails: sleep 5 seconds, then divide by zero.

    Raises:
        ZeroDivisionError: always, once the sleep has finished.
    """
    pause_seconds = 5
    time.sleep(pause_seconds)
    numerator = denominator = 0
    numerator / denominator  # deliberate failure to exercise the error path

In [None]:
# Wrap code with exception handler
#
# The Pushover client created earlier is named `pushover_client`; using that
# name here avoids a NameError in both the success and the failure path.

try:
    long_risky_operation()
except Exception as e:
    # Any failure of the task is reported as a notification instead of
    # surfacing as a traceback.
    pushover_client.send_message(f"Error: {e}", title="Oh no!")
else:
    # Runs only when the task succeeded, so a problem while sending the
    # success notification is not misreported as a task failure.
    pushover_client.send_message(response_text, title="Done!")