In [1]:
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.chat_models import ChatOllama
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
import sys

In [11]:
def ingest():
    # Get the doc
    loader = PyPDFLoader("/Users/yehao/Desktop/projetos/wmb/dataset/QuickStatementsBasics.pdf")
    pages = loader.load_and_split()
    # Split the pages by char
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1024,
        chunk_overlap=100,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(pages)
    print(f"Split {len(pages)} documents into {len(chunks)} chunks.")
    #
    embedding = FastEmbedEmbeddings()
    #Create vector store
    Chroma.from_documents(documents=chunks,  embedding=embedding, persist_directory="./sql_chroma_db")

In [12]:
# only run this once to generate vector store
ingest()


Split 16 documents into 36 chunks.


  from .autonotebook import tqdm as notebook_tqdm
Fetching 5 files: 100%|██████████| 5/5 [00:16<00:00,  3.27s/it]


In [14]:
from huggingface_hub import login
access_token_read = "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"
access_token_write = "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"
login(token = access_token_read)

In [16]:
def rag_chain():
    model = ChatOllama(model="llama3")
    #
    prompt = PromptTemplate.from_template(
        """
            <s> [Instructions] You are an expert assistant in Wikidata tools. 
            Answer the following question strictly based on the given context, which may contain information about QuickStatements, 
            SPARQL queries, item creation, property formatting, or submission strategies. 
            If the context is insufficient to answer, reply with: "No relevant context available to answer this question." [/Instructions] </s> 

            [Instructions] 
            Question: {input} 
            Context: {context} 
            Answer: 
            [/Instructions]
        """
    )

    #Load vector store
    embedding = FastEmbedEmbeddings()
    vector_store = Chroma(persist_directory="./sql_chroma_db", embedding_function=embedding)

    #Create chain
    retriever = vector_store.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={
            "k": 3,
            "score_threshold": 0.5,
        },
    )

    document_chain = create_stuff_documents_chain(model, prompt)
    chain = create_retrieval_chain(retriever, document_chain)
    #
    return chain

In [24]:
print("Test")

Test


In [25]:
def ask(query: str):
    #
    chain = rag_chain()
    # invoke chain
    result = chain.invoke({"input": query})
    # print results
    print(result["answer"])
    for doc in result["context"]:
        print("Source: ", doc.metadata["source"])

In [26]:
ask("What is quicksatements?")


[32m2025-08-02 08:50:44.384[0m | [31m[1mERROR   [0m | [36mfastembed.common.model_management[0m:[36mdownload_model[0m:[36m430[0m - [31m[1mCould not download model from HuggingFace: (ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=None)"), '(Request ID: 1c0e2136-badf-42d6-9edb-f1dd6a30697e)') Falling back to other sources.[0m
[32m2025-08-02 08:50:44.387[0m | [31m[1mERROR   [0m | [36mfastembed.common.model_management[0m:[36mdownload_model[0m:[36m452[0m - [31m[1mCould not download model from either source, sleeping for 3.0 seconds, 2 retries left.[0m


QuickStatements (QS) is a tool that allows you to edit Wikidata items using a simple set of text commands. With QS, you can add and remove statements, labels, descriptions, aliases, and add statements with optional qualifiers and sources. The command sequence can be typed in the import window or created in a spreadsheet or text editor and pasted in, or even generated by external code like Lua, called from a template and passed as a URL.
Source:  /Users/yehao/Desktop/projetos/wmb/dataset/QuickStatementsBasics.pdf
