In [6]:
import requests
from newspaper import Article

# -------- CONFIG --------
import os

# The Serper key is a secret: read it from the environment instead of storing it
# in the notebook (set SERPER_API_KEY before starting Jupyter). Any key that was
# previously saved in this cell should be treated as leaked and rotated.
SERPER_API_KEY = os.environ.get("SERPER_API_KEY", "")

# Generate endpoint of the Ollama server used by all LLM calls below.
OLLAMA_API_URL = "http://localhost:11434/api/generate"

In [7]:
import re 

In [171]:
pip install playwright


Note: you may need to restart the kernel to use updated packages.


In [172]:
!pip install selenium




In [8]:
def extract_article_text(url):
    """Fetch and parse the article at ``url`` using ``Article``.

    Returns a string made of a ``Source: <url>`` header, a blank line, and
    then either the parsed article text or a bracketed failure note. Errors
    raised while downloading or parsing are reported in the returned text
    instead of being raised.
    """
    header = f"Source: {url}\n\n"
    try:
        page = Article(url)
        page.download()
        page.parse()
        body = f"{page.text}"
    except Exception as err:
        body = f"[Failed to extract: {err}]"
    return header + body

In [9]:
def ask_llama3_locally(prompt, model="llama3", timeout=(10, 600)):
    """Send ``prompt`` to the Ollama generate endpoint and return the reply text.

    Args:
        prompt: Full prompt text to send.
        model: Name of the Ollama model to query (``"llama3"`` by default).
        timeout: Timeout forwarded to ``requests.post`` — seconds, or a
            ``(connect, read)`` tuple. Without one, a stalled server would
            block the notebook indefinitely.

    Returns:
        The ``"response"`` field of the JSON reply on HTTP 200 (or
        ``"[No response found]"`` if the field is missing). On any other
        status code or on an exception, a bracketed error string is returned
        instead of raising.
    """
    try:
        headers = {
            "Content-Type": "application/json",
            "Accept": "application/json"
        }
        response = requests.post(
            OLLAMA_API_URL,
            json={"model": model, "prompt": prompt, "stream": False},
            headers=headers,
            # NOTE(review): certificate checks are disabled; this only matters
            # when OLLAMA_API_URL is an https:// endpoint — confirm it is still needed.
            verify=False,
            timeout=timeout,
        )
        if response.status_code == 200:
            return response.json().get("response", "[No response found]")
        return f"[HTTP {response.status_code}] Error from Llama3"
    except Exception as e:
        return f"[❌ Error calling Llama3: {e}]"

In [10]:
from bs4 import BeautifulSoup

def clean_html(html):
    """Return the visible text of an HTML document as one space-separated string.

    Every text node is stripped; empty nodes and nodes whose parent tag is in
    the blacklist (scripts, styles, head metadata, ...) are dropped. Each kept
    fragment is followed by a single space, so a non-empty result ends with a
    trailing space.
    """
    soup = BeautifulSoup(html, "html.parser")
    # `string=True` is the current name of this filter; `text=True` is the
    # deprecated spelling. extract_readable_text in this notebook uses it too.
    texts = soup.find_all(string=True)

    blacklist = {
        '[document]', 'noscript', 'header', 'html', 'meta', 'head', 'input', 'script', 'style'
    }

    fragments = []
    for t in texts:
        if t.parent.name in blacklist:
            continue
        content = t.strip()
        if content:
            fragments.append(content)

    # Join once instead of growing a string inside the loop.
    return ''.join(f'{content} ' for content in fragments)


In [11]:
def is_url_trusted_by_ai(url):
    """Ask the local llama3 model whether ``url`` is a trustworthy source.

    The model is told to answer only ``yes`` or ``no``. The verdict is taken
    from the first standalone ``yes``/``no`` word in the reply, so markdown
    decoration such as ``**yes**`` is accepted, while a ``no`` answer that
    merely contains the letters "yes" inside another word (e.g. "eyes") is
    not mistaken for approval.

    Returns:
        True only when the model's first yes/no word is ``yes``. False on a
        negative or unparseable reply, a non-200 status, or any exception.
    """
    prompt = f"""You are a trustworthiness evaluator.

Decide whether the following website is generally considered a **professional and trustworthy source of information** for serious research, financial news, or due diligence.

Do **not** include websites that:
- Are editable by the public (like Wikipedia, Reddit, Quora)
- Are known for unverified or user-generated content

Only reply with `yes` or `no`.

### Website:
{url}

### Is this source trusted?
"""

    try:
        headers = {
            "Content-Type": "application/json",
            "Accept": "application/json"
        }
        response = requests.post(
            OLLAMA_API_URL,
            json={"model": "llama3", "prompt": prompt, "stream": False},
            headers=headers,
            verify=False,
            timeout=(10, 120),  # never hang the whole search on one verdict
        )
        if response.status_code != 200:
            return False

        verdict = response.json().get("response", "").strip().lower()
        print(f"🤖 AI verdict for {url}: {verdict}")
        # First whole-word yes/no decides; a plain substring test would also
        # accept "yes" embedded in longer words of a negative answer.
        decision = re.search(r"(?<![a-z])(yes|no)(?![a-z])", verdict)
        return decision is not None and decision.group(1) == "yes"
    except Exception as e:
        print(f"[AI trust check failed for {url}]: {e}")
        return False


In [12]:
def web_search_serper(query, max_results=4, timeout=15):
    """Search Google through the Serper API and return AI-approved result URLs.

    Organic results are checked in order with ``is_url_trusted_by_ai``; the
    scan stops as soon as ``max_results`` URLs have been accepted.

    Args:
        query: Search query string.
        max_results: Maximum number of URLs to return; values <= 0 yield ``[]``
            without calling the API.
        timeout: Timeout (seconds, or a ``(connect, read)`` tuple) for the
            Serper HTTP request.

    Returns:
        List of accepted URLs, possibly empty. A failed request, a non-200
        status, or an unreadable body is reported with ``print`` and yields ``[]``.
    """
    if max_results <= 0:
        return []

    headers = {
        "X-API-KEY": SERPER_API_KEY
    }
    json_data = {
        "q": query
    }
    try:
        response = requests.post(
            "https://google.serper.dev/search",
            headers=headers,
            json=json_data,
            timeout=timeout,
        )
    except requests.RequestException as e:
        # Treat network failures like an HTTP error instead of crashing the cell.
        print(f"❌ Serper request failed: {e}")
        return []

    if response.status_code != 200:
        print(f"❌ Serper API error: {response.status_code}")
        return []

    try:
        data = response.json()
    except ValueError as e:
        print(f"❌ Serper returned an unreadable response: {e}")
        return []

    results = data.get("organic", [])
    urls = []
    for item in results:
        url = item.get("link")
        if not url:
            # Skip results without a link rather than raising KeyError.
            continue
        print(f"🔍 Evaluating source: {url}")
        if is_url_trusted_by_ai(url):
            urls.append(url)
            if len(urls) >= max_results:
                break
    return urls


In [13]:
def classify_question_type(query):
    """Ask the local llama3 model whether ``query`` needs a short or detailed answer.

    Returns:
        ``"short"`` when the model's reply contains the word "short";
        ``"detailed"`` otherwise, including on a non-200 status or when the
        request fails.
    """
    classification_prompt = f"""You are a classification assistant.

Decide if the following question requires a short, direct answer (like a number, name, or fact), or a detailed, multi-paragraph summary.

Respond with only one word: `short` or `detailed`.

### Question:
{query}

### Answer Style:"""

    # Send to LLaMA3
    try:
        headers = {
            "Content-Type": "application/json",
            "Accept": "application/json"
        }
        response = requests.post(
            OLLAMA_API_URL,
            json={"model": "llama3", "prompt": classification_prompt, "stream": False},
            headers=headers,
            verify=False,
            timeout=(10, 120),  # a stalled model must not block the pipeline
        )
        if response.status_code == 200:
            style = response.json().get("response", "").strip().lower()
            return "short" if "short" in style else "detailed"
        return "detailed"
    except Exception:
        # Fall back to the detailed style on ordinary failures. A bare `except:`
        # would also swallow KeyboardInterrupt/SystemExit, making the cell
        # impossible to interrupt.
        return "detailed"


In [14]:
def display_final_answer(answer, query, urls):
    """Print the answer under an "Answer to" banner, followed by its sources.

    Args:
        answer: Text produced for the question (printed as-is).
        query: The question that was asked; shown in the banner.
        urls: Iterable of source URLs, printed one per line.
    """
    rule = "=" * 80
    print("\n" + rule)
    print(f"🧠 Answer to: {query}")
    print(rule)
    # The answer belongs under its banner; the earlier debug-prefixed print
    # placed it above the banner instead.
    print(answer)
    print("\n🔗 Sources Used:")
    for url in urls:
        print(f" - {url}")
    print(rule)


In [15]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.chrome.options import Options
import time

def extract_readable_text(url, render_wait=5):
    """Load ``url`` in headless Chrome and return the text found in the page.

    The page is opened with a desktop user-agent and with the
    ``navigator.webdriver`` flag hidden, given ``render_wait`` seconds for
    JavaScript to run, and then parsed with BeautifulSoup. Text nodes whose
    parent tag is blacklisted (scripts, styles, head metadata, ...) are
    dropped; every remaining node is stripped and followed by one space.

    NOTE(review): because fragments are stripped and joined with spaces, the
    result contains newlines only where a single text node has them
    internally, so line-oriented post-processing will mostly see very long
    lines — confirm that is intended.

    Args:
        url: Address of the page to render.
        render_wait: Seconds to sleep after navigation (5 by default).

    Returns:
        The concatenated text fragments as a single string.

    Raises:
        Whatever Selenium raises when the browser cannot start or navigate;
        the browser is shut down before the error propagates.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless=new")  # New headless mode is less detectable
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")

    # Fake being a real user
    chrome_options.add_argument(
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
    )

    # Optional: make webdriver undetectable
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)

    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.execute_cdp_cmd(
            "Page.addScriptToEvaluateOnNewDocument",
            {
                "source": """
                Object.defineProperty(navigator, 'webdriver', {
                  get: () => undefined
                })
            """
            }
        )

        driver.get(url)
        time.sleep(render_wait)  # let JS render

        html = driver.page_source
    finally:
        # Always close the browser: without this, a failed navigation leaves a
        # headless Chrome process running for every URL that errors out.
        driver.quit()

    soup = BeautifulSoup(html, "html.parser")
    texts = soup.find_all(string=True)

    blacklist = ['[document]', 'noscript', 'header', 'html', 'meta', 'head', 'input', 'script', 'style']

    return ''.join(
        '{} '.format(t.strip())
        for t in texts
        if t.parent.name not in blacklist
    )


In [16]:
import re

def smart_clean_text(text):
    """Drop boilerplate lines from ``text`` and return what is left.

    The input is split on newlines and each line is stripped. A line is
    discarded when it is shorter than five characters (which covers empty
    lines) or when it mentions, case-insensitively, any of: "cookie",
    "privacy", "consent", "sign up", "sign in". Surviving lines are joined
    back together with newlines.
    """
    noise_markers = ("cookie", "privacy", "consent", "sign up", "sign in")

    def is_worth_keeping(candidate):
        if len(candidate) < 5:
            return False
        lowered = candidate.lower()
        return not any(marker in lowered for marker in noise_markers)

    stripped_lines = (raw.strip() for raw in text.split("\n"))
    return "\n".join(entry for entry in stripped_lines if is_worth_keeping(entry))

# Usage

# Then build prompt


In [18]:
def web_rag_query(query):
    """Answer ``query`` from freshly searched web pages using the local llama3.

    Steps: search with ``web_search_serper``, drop blacklisted URLs, extract
    and clean each page's text, pick an answer style with
    ``classify_question_type``, then send one prompt containing all sources to
    ``ask_llama3_locally``.

    Pages that fail to load, or whose cleaned text is empty, are skipped so
    the model is never handed an empty source and such pages are not
    reported as having been used.

    Returns:
        ``(answer, urls)``. On success ``urls`` lists only the pages whose
        text was put into the prompt. When the search yields nothing, or no
        page yields usable text, ``answer`` is a bracketed message and
        ``urls`` is the filtered search result list.
    """
    print(f"🔍 Searching with Serper for: {query}")
    urls = web_search_serper(query)

    # --- Blacklist bad URLs ---
    blacklist = [
        "https://bitflyer.com/en-us/bitcoin-chart"
    ]

    # Remove blacklisted URLs
    urls = [url for url in urls if url not in blacklist]

    print(f"📎 URLs after blacklist filter: {urls}")
    if not urls:
        return "[❌ No articles found for this query.]", urls

    print(f"📄 Extracting content from {len(urls)} URLs...")
    sources = []
    successful_urls = []

    for url in urls:
        try:
            raw_text = extract_readable_text(url)
            cleaned_text = smart_clean_text(raw_text)
        except Exception as e:
            print(f"❌ Failed to extract {url}: {e}")
            continue

        if not cleaned_text.strip():
            # Nothing survived cleaning: an empty source adds no evidence and
            # must not be listed among the sources used.
            print(f"⚠️ No usable text extracted from {url}; skipping.")
            continue

        # Separate each source properly
        sources.append(f"Source: {url}\n---\n{cleaned_text}\n")
        successful_urls.append(url)

    if not sources:
        return "[❌ No accessible or relevant sources found.]", urls

    answer_style = classify_question_type(query)

    # Join sources with two newlines between them for clarity
    joined_sources = "\n\n".join(sources)

    if answer_style == "short":
        final_prompt = f"""You are a precise assistant. Give a short, factual answer based **only** on the provided sources.

### Question:
{query}

### Sources:
{joined_sources}

### Answer:
"""
    else:
        final_prompt = f"""You are an expert crypto analyst and news summarizer.

Please answer the following question using only the information from the provided sources.
Be clear, concise, and neutral in tone.

### Question:
{query}

### Sources:
{joined_sources}

### Summary:
"""

    print("🧠 Asking llama3 locally...")
    answer = ask_llama3_locally(final_prompt)
    return answer, successful_urls


In [181]:
pip install nest_asyncio


Note: you may need to restart the kernel to use updated packages.


In [19]:
# Smoke-test the pipeline end to end with a question that needs live data.
query = "What is the price of bitcoin today?"
final_answer, urls = web_rag_query(query)
# Print the answer as returned, without the leftover debug prefix.
print(final_answer)

🔍 Searching with Serper for: What is the price of bitcoin today?
🔍 Evaluating source: https://coinmarketcap.com/currencies/bitcoin/
🤖 AI verdict for https://coinmarketcap.com/currencies/bitcoin/: **yes**
🔍 Evaluating source: https://www.coindesk.com/price/bitcoin
🤖 AI verdict for https://www.coindesk.com/price/bitcoin: yes
🔍 Evaluating source: https://bitflyer.com/en-us/bitcoin-chart
🤖 AI verdict for https://bitflyer.com/en-us/bitcoin-chart: yes
🔍 Evaluating source: https://www.coinbase.com/price/bitcoin
🤖 AI verdict for https://www.coinbase.com/price/bitcoin: **no**
🔍 Evaluating source: https://crypto.com/price/bitcoin
🤖 AI verdict for https://crypto.com/price/bitcoin: no
🔍 Evaluating source: https://ca.finance.yahoo.com/quote/BTC-USD/
🤖 AI verdict for https://ca.finance.yahoo.com/quote/BTC-USD/: yes
📎 URLs after blacklist filter: ['https://coinmarketcap.com/currencies/bitcoin/', 'https://www.coindesk.com/price/bitcoin', 'https://ca.finance.yahoo.com/quote/BTC-USD/']
📄 Extracting cont

In [2]:
# Run the pipeline for one question and show the formatted result.
# This cell needs every definition cell to have been executed in the current
# kernel first (use Restart & Run All); otherwise web_rag_query fails with a
# NameError on the helper functions it calls.
question = "give me the exact price of bitcoin today ?"

final_answer, urls = web_rag_query(question)
display_final_answer(final_answer, question, urls)


🔍 Searching with Serper for: give me the exact price of bitcoin today ?


NameError: name 'web_search_serper' is not defined