In [None]:
# 📦 Step 1: Install required packages
!pip install -q langchain langchain_groq groq newspaper3k googlesearch-python

# 📚 Step 2: Import libraries
import os
import nltk
from googlesearch import search
from newspaper import Article
from langchain_groq import ChatGroq
from langchain.prompts import PromptTemplate
from urllib.parse import urlparse

# 🛠️ Step 3: Download tokenizer used by newspaper3k
# NOTE(review): nothing in the visible code calls article.nlp(); presumably
# the punkt tokenizer is only needed for that step -- confirm before
# removing this download.
nltk.download('punkt')

# 🔐 Step 4: Load the Groq API key without hard-coding it
# SECURITY: never paste a real key into this file. Any key that was
# previously committed in this cell must be treated as leaked and revoked.
from getpass import getpass

# Prefer a key already exported in the environment (e.g. by the shell or a
# notebook secrets manager).
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "").strip()
if not GROQ_API_KEY:
    # Prompt without echo so the key does not end up in saved cell output.
    GROQ_API_KEY = getpass("Enter your Groq API key: ").strip()
if not GROQ_API_KEY:
    raise RuntimeError(
        "No Groq API key provided; set the GROQ_API_KEY environment variable."
    )
# Keep the environment in sync so libraries that read it directly still work.
os.environ["GROQ_API_KEY"] = GROQ_API_KEY

# 🤖 Step 5: Build the Groq chat client used for every summary below
# A low temperature (0.2) is passed to the client together with the key
# loaded in the previous step.
# NOTE(review): confirm this model id is still served by Groq.
llm = ChatGroq(model_name="llama3-70b-8192", groq_api_key=GROQ_API_KEY, temperature=0.2)

# 💬 Step 6: Ask the user which topic to research
topic = input("📘 Enter your research topic:\n> ")

# 🔎 Step 7: Query Google and keep only hits that look like absolute URLs
print("🔍 Searching Google...")
search_results = []
for hit in search(topic, num_results=5):
    # Keep only entries that start with "http".
    if hit.startswith("http"):
        search_results.append(hit)

print("\n🔗 Top results found:\n")
for result_url in search_results:
    print(result_url)

# 🛡️ Step 8: Function to detect unsupported domains
# Registrable domains this script does not attempt to download from.
_UNSUPPORTED_DOMAINS = (
    "apsnet.org", "sciencedirect.com", "ieee.org", "dl.acm.org",
    "springerlink.com", "link.springer.com", "mdpi.com", "tandfonline.com",
)


def is_supported_url(url):
    """Return True unless *url* points at one of the unsupported domains.

    A URL is rejected when its host is exactly a listed domain or a
    subdomain of one (``apsjournals.apsnet.org`` matches ``apsnet.org``).
    Matching is done on the parsed hostname, so it is case-insensitive and
    ignores any port or credentials, and it is suffix-based so unrelated
    hosts that merely contain a listed name (``notieee.org``) are allowed.

    Args:
        url: The URL string to check.

    Returns:
        bool: False for unsupported hosts, True otherwise (including URLs
        with no host component).
    """
    # ``hostname`` is already lower-cased and stripped of port/userinfo;
    # it is None when the URL has no network location.
    host = (urlparse(url).hostname or "").rstrip(".")
    return not any(
        host == blocked or host.endswith("." + blocked)
        for blocked in _UNSUPPORTED_DOMAINS
    )

# 🧠 Step 9: Summarize supported articles
# The prompt template and the character budget do not change between URLs,
# so they are built once instead of on every loop iteration.
MAX_ARTICLE_CHARS = 4000  # Limit content to avoid token overload
summary_prompt = PromptTemplate.from_template(
    "Summarize the following article in a clear, academic tone:\n\n{content}\n\nSummary:"
)

# One human-readable entry per search result: a summary, a skip notice, or
# a failure notice.
summaries = []
for url in search_results:
    print(f"\n📰 Processing: {url}")

    if not is_supported_url(url):
        summaries.append(f"⚠️ Skipped unsupported URL (likely protected or dynamic): {url}\n")
        print("⚠️ Skipped unsupported URL.")
        continue

    try:
        article = Article(url)
        article.download()
        article.parse()
        content = article.text.strip()[:MAX_ARTICLE_CHARS]

        # Without extracted text the model would have nothing to summarize
        # and would invent content, so report the problem instead.
        if not content:
            summaries.append(f"⚠️ No article text could be extracted from {url}\n")
            print("⚠️ No article text extracted.")
            continue

        summary = llm.invoke(summary_prompt.format(content=content)).content
    except Exception as e:
        # Deliberate best-effort: one failing URL must not abort the rest.
        summaries.append(f"⚠️ Failed to process {url}:\n{e}\n")
        print("⚠️ Failed to process URL.")
    else:
        summaries.append(f"🔗 URL: {url}\n📝 Summary:\n{summary}\n")

# 📢 Step 10: Print every collected entry (summaries, skips and failures)
print("\n✅ Final Summaries:\n")
for entry in summaries:
    print(entry)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


📘 Enter your research topic:
> PLANT DISEASE DETECTION 
🔍 Searching Google...

🔗 Top results found:

https://journalofbigdata.springeropen.com/articles/10.1186/s40537-023-00863-9
https://www.nature.com/articles/s41598-023-34549-2
https://www.frontiersin.org/journals/plant-science/articles/10.3389/fpls.2024.1356260/full
https://apsjournals.apsnet.org/doi/10.1094/PDIS-03-15-0340-FE
https://plantmethods.biomedcentral.com/articles/10.1186/s13007-021-00722-9

📰 Processing: https://journalofbigdata.springeropen.com/articles/10.1186/s40537-023-00863-9

📰 Processing: https://www.nature.com/articles/s41598-023-34549-2

📰 Processing: https://www.frontiersin.org/journals/plant-science/articles/10.3389/fpls.2024.1356260/full

📰 Processing: https://apsjournals.apsnet.org/doi/10.1094/PDIS-03-15-0340-FE
⚠️ Skipped unsupported URL.

📰 Processing: https://plantmethods.biomedcentral.com/articles/10.1186/s13007-021-00722-9

✅ Final Summaries:

🔗 URL: https://journalofbigdata.springeropen.com/articles/10.