<a href="https://colab.research.google.com/github/moramnavadeep/pathway/blob/main/pathway.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!pip install -q pathway sentence-transformers transformers requests pandas beautifulsoup4
import os

import pandas as pd
import pathway as pw
import requests
import torch
from bs4 import BeautifulSoup
from pathway.stdlib.indexing import UsearchKnnFactory, USearchMetricKind
from pathway.xpacks.llm import document_store
from pathway.xpacks.llm.embedders import SentenceTransformerEmbedder
from pathway.xpacks.llm.question_answering import AdaptiveRAGQuestionAnswerer, SimpleContextProcessor
from pathway.xpacks.llm.splitters import TokenCountSplitter
from transformers import pipeline
# NewsAPI credentials. The NEWS_API_KEY environment variable takes precedence
# so the secret does not have to live in the notebook.
# SECURITY: the literal fallback is a key that was committed to source
# control. It is kept only so existing runs keep working; it should be rotated
# and then removed from this file.
NEWS_API_KEY = os.environ.get("NEWS_API_KEY") or "b097a7c0f7d24e13a2911e9850278e66"

# Fetch the US top headlines. Query parameters go through `params` so that
# requests URL-encodes them, and the timeout keeps a stalled connection from
# hanging the notebook indefinitely.
response = requests.get(
    "https://newsapi.org/v2/top-headlines",
    params={"country": "us", "apiKey": NEWS_API_KEY},
    timeout=30,
)
# Surface HTTP failures (e.g. a rejected key) here rather than silently
# continuing with an empty article list.
response.raise_for_status()
data = response.json()
# `or []` also covers a payload whose "articles" entry is present but null.
articles = data.get("articles") or []
# Flatten each raw article into a record of plain strings: missing or null
# fields become "", and the record id is the article's position in the
# response rendered as a string.
clean_articles = [
    {
        "id": str(position),
        "title": article.get("title") or "",
        "description": article.get("description") or "",
        "url": article.get("url") or "",
        "content": article.get("content") or "",
    }
    for position, article in enumerate(articles)
]
# Build the article frame with an explicit column list: when there are no
# articles, `pd.DataFrame([])` has no columns at all and the concatenation
# below would raise KeyError on "title".
df_articles = pd.DataFrame(
    clean_articles,
    columns=["id", "title", "description", "url", "content"],
)
# Text used for indexing: title, description and content separated by newlines.
df_articles["full_text"] = (
    df_articles["title"] + "\n" + df_articles["description"] + "\n" + df_articles["content"]
)
# Hand the frame to Pathway as a table.
news_table = pw.debug.table_from_pandas(df_articles)
# Sentence-transformers embedder (model "all-MiniLM-L6-v2"); it is passed to
# the KNN index factory below.
embedder=SentenceTransformerEmbedder(model="all-MiniLM-L6-v2")
# USearch-based KNN index factory using the cosine metric
# (USearchMetricKind.COS) over vectors produced by `embedder`.
# NOTE(review): reserved_space=2000 presumably pre-allocates room for that
# many vectors — confirm against the Pathway documentation.
retriever=UsearchKnnFactory(
    reserved_space=2000,
    embedder=embedder,
    metric=USearchMetricKind.COS
)
# Splitter configured with max_tokens=400; handed to the document store below.
splitter = TokenCountSplitter(max_tokens=400)

# Document store over the news table: `full_text` is exposed as the `data`
# column and `title` is selected alongside it; chunks are produced by
# `splitter` and indexed through `retriever`.
# NOTE(review): `url` is not selected here, so code that looks for a "url" on
# retrieved documents will likely never find one — confirm whether it should
# be passed through (e.g. as document metadata).
document_store_instance = document_store.DocumentStore(
    docs=news_table.select(data=pw.this.full_text, title=pw.this.title),
    splitter=splitter,
    retriever_factory=retriever
)
# Hugging Face question-answering pipeline using the
# "distilbert-base-uncased-distilled-squad" checkpoint.
# Run on the first CUDA GPU when one is available and fall back to the CPU
# (device=-1) otherwise; a hard-coded device=0 fails on hosts without a GPU.
qa_pipeline = pipeline(
    task="question-answering",
    model="distilbert-base-uncased-distilled-squad",
    device=0 if torch.cuda.is_available() else -1,
)
class HFQAWrapper:
    """Callable adapter around a Hugging Face question-answering pipeline.

    Calling an instance with a question and a list of retrieved documents
    joins the documents into a single context string, runs the pipeline on
    it and returns the extracted answer text.

    Args:
        pipeline: Callable invoked with ``{"question": ..., "context": ...}``
            that returns a mapping containing an ``"answer"`` entry.
        model_name: Stored on ``self.model``; not otherwise used by this class.
    """

    # Returned whenever there is nothing to run the pipeline on.
    _NO_ANSWER = "No information found."

    def __init__(self, pipeline, model_name="distilbert-base-uncased-distilled-squad"):
        self.pipeline = pipeline
        self.model = model_name

    def __call__(self, prompt: str, context_docs: list = None):
        """Answer *prompt* using the text of *context_docs*.

        Args:
            prompt: The question to answer.
            context_docs: Retrieved documents. A dict entry contributes its
                ``"text"`` with HTML markup stripped, followed by a
                ``(Source: <url>)`` line when it has a non-empty ``"url"``.
                Any other entry is converted with ``str()``.

        Returns:
            The pipeline's ``"answer"``, or ``"No information found."`` when
            there are no documents, none of them yields any text, or the
            pipeline result has no ``"answer"`` entry.
        """
        if not context_docs:
            return self._NO_ANSWER

        context_texts = []
        for doc in context_docs:
            if isinstance(doc, dict):
                # A missing or null "text" is treated as empty so the HTML
                # parser is never handed None.
                text = doc.get("text") or ""
                url = doc.get("url") or ""
            else:
                text, url = str(doc), ""
            if not text:
                continue
            # BeautifulSoup is imported once at module level; strip markup
            # and keep only the visible text.
            clean_text = BeautifulSoup(text, "html.parser").get_text(separator=" ", strip=True)
            if not clean_text:
                continue
            if url:
                clean_text += f"\n(Source: {url})"
            context_texts.append(clean_text)

        # Combine the retrieved documents into one context string.
        context = "\n\n".join(context_texts)
        if not context.strip():
            return self._NO_ANSWER
        result = self.pipeline({"question": prompt, "context": context})
        return result.get("answer", self._NO_ANSWER)
# Wrap the Hugging Face pipeline so it can be handed to the RAG answerer as
# its `llm`.
# NOTE(review): Pathway's question answerers are normally given a Pathway
# chat/UDF object as `llm`; confirm that a plain Python callable such as
# HFQAWrapper is actually accepted and invoked with (prompt, context_docs).
llm_model=HFQAWrapper(qa_pipeline)
# Adaptive RAG answerer over the document store.
# NOTE(review): presumably retrieval starts with `n_starting_documents` and
# grows by `factor` for up to `max_iterations` rounds — confirm against the
# AdaptiveRAGQuestionAnswerer documentation.
rag=AdaptiveRAGQuestionAnswerer(
    llm=llm_model,
    indexer=document_store_instance,
    context_processor=SimpleContextProcessor(),
    n_starting_documents=3,
    factor=2,
    max_iterations=3
)
def ask(query: str):
    """Run *query* through the RAG pipeline and print the resulting table.

    A one-row query table is built with the prompt, no filters and
    ``return_context_docs`` enabled, passed to ``rag.answer_query``, and the
    response table is printed with ``pw.debug.compute_and_print``. Nothing
    is returned.
    """
    single_query = {
        "prompt": [query],
        "filters": [None],
        "return_context_docs": [True],
    }
    answers = rag.answer_query(pw.debug.table_from_pandas(pd.DataFrame(single_query)))
    pw.debug.compute_and_print(answers)
# Example queries run against the indexed headlines; each call prints its
# response table.
# NOTE(review): "acheivment?/" looks like a typo for "achievement?". It is
# left untouched because the string is the literal prompt sent to the pipeline.
ask("latest medical field acheivment?/")
ask("uno news")



Fetching 0 files: 0it [00:00, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 0 files: 0it [00:00, ?it/s]

Device set to use cuda:0


            | result
            | result
