In [1]:
%%capture
!pip install langchain-community pypdf
!pip install -qU langchain-huggingface
!pip install -U langchain-google-genai
!pip install "unstructured[image]"
!pip install pillow opencv-python
!pip install tqdm
!pip install ddgs
!pip install -U langchain langchain-community langchain-core

In [2]:
import getpass
import os

# Turn on LangSmith tracing through its environment flag.
os.environ["LANGSMITH_TRACING"] = "true"
# Read the LangSmith API key interactively (getpass does not echo the input),
# so the key is typed at run time instead of being written into the notebook.
os.environ["LANGSMITH_API_KEY"] = getpass.getpass()


··········


In [None]:
# Smart loader: load documents from local files or URLs

In [3]:
import os, requests
from langchain_community.document_loaders import (
    PyPDFLoader,
    CSVLoader,
    WebBaseLoader,
)

def smart_loader(source: str):
    """Load documents from a local file path or an HTTP(S) URL.

    Remote PDF/CSV sources are downloaded to a uniquely named temporary file,
    parsed, and the temporary file is removed even when parsing fails. Any
    other URL is handed to ``WebBaseLoader``. Local paths must be PDF or CSV.
    Extensions are matched case-insensitively; for URLs both the full string
    and the URL path are checked, so ``.../paper.pdf?download=1`` is still
    recognised as a PDF.

    Args:
        source: Local file path, or a URL starting with ``http://`` or
            ``https://``.

    Returns:
        list: Documents produced by the matching loader. For downloaded
        files the ``source`` metadata entry is set to the original URL
        instead of the throw-away temporary path.

    Raises:
        ValueError: If a local path is neither a PDF nor a CSV.
        requests.HTTPError: If a remote PDF/CSV download returns an error
            status code.
    """
    # Local imports keep this cell self-contained (stdlib only).
    import tempfile
    from urllib.parse import urlparse

    def _loader_for(extension, path):
        # Only ".pdf" and ".csv" ever reach this helper.
        if extension == ".pdf":
            return PyPDFLoader(path)
        return CSVLoader(file_path=path)

    is_remote = source.startswith("http://") or source.startswith("https://")

    # Strings whose suffix decides the format. For URLs the path component is
    # checked as well, so query strings / fragments do not hide the extension.
    candidates = [source.lower()]
    if is_remote:
        candidates.append(urlparse(source).path.lower())

    extension = None
    for known_extension in (".pdf", ".csv"):
        if any(candidate.endswith(known_extension) for candidate in candidates):
            extension = known_extension
            break

    if not is_remote:
        if extension is None:
            raise ValueError(f"Unsupported file format: {source}")
        return list(_loader_for(extension, source).load())

    if extension is None:
        # Generic webpage
        return list(WebBaseLoader(source).load())

    # Fail fast on HTTP errors instead of writing an error page to disk and
    # letting the parser choke on it; the timeout avoids hanging forever.
    response = requests.get(source, timeout=60)
    response.raise_for_status()

    # A unique temp file avoids clobbering a real "temp.pdf"/"temp.csv" in the
    # working directory and collisions between concurrent runs.
    fd, temp_path = tempfile.mkstemp(suffix=extension)
    try:
        with os.fdopen(fd, "wb") as temp_file:
            temp_file.write(response.content)
        docs = list(_loader_for(extension, temp_path).load())
    finally:
        os.remove(temp_path)

    # The temp path is meaningless once the file is gone; record the real origin.
    for doc in docs:
        doc.metadata["source"] = source
    return docs




Chunking Function

In [4]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
def load_and_split_data(source: str, chunk_size: int = 1000, chunk_overlap: int = 400):
    """
    Fetch the documents behind `source` with smart_loader and cut them
    into overlapping text chunks.

    Args:
        source: File path or URL understood by smart_loader.
        chunk_size: Value passed to the splitter as `chunk_size`.
        chunk_overlap: Value passed to the splitter as `chunk_overlap`.

    Returns:
        Tuple `(all_docs, all_splits)`: the loaded documents and the chunks
        produced from them.
    """
    all_docs = smart_loader(source)

    # add_start_index=True is forwarded unchanged to the splitter.
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "add_start_index": True,
    }
    all_splits = RecursiveCharacterTextSplitter(**splitter_options).split_documents(all_docs)

    print(f"‚úÖ Loaded {len(all_docs)} documents, split into {len(all_splits)} chunks.")
    return all_docs, all_splits

Embedding Function

In [5]:
from langchain_huggingface import HuggingFaceEmbeddings
# Module-level default embedding model; generate_embeddings_from_source falls
# back to it when no model is passed in.
embedding_model = HuggingFaceEmbeddings(model_name="minishlab/potion-base-8M")
from tqdm import tqdm

# load_and_split_data is defined earlier in this notebook; that cell must be
# run before generate_embeddings_from_source below is called.

def generate_embeddings_from_source(source: str, embeddings_model=None):
    """
    Loads, splits, and embeds text from a file or URL.

    Chunks are embedded with `embed_documents` (the document-side API) in
    batches, rather than one `embed_query` call per chunk: models that encode
    queries and documents differently would otherwise produce query-style
    vectors for stored text, and batching avoids per-call overhead.

    Args:
        source: File path or URL accepted by load_and_split_data.
        embeddings_model: Object exposing `embed_documents(list[str])`.
            When None, the module-level `embedding_model` is used.

    Returns: (documents, splits, vectors) where `vectors[i]` is the embedding
    of `splits[i].page_content`.
    """
    # Step 1: Load and split the data
    all_docs, all_splits = load_and_split_data(source)

    # Step 2: Fall back to the shared model only when nothing was supplied.
    # An explicit None check avoids discarding a valid but falsy model object.
    embeddings = embedding_model if embeddings_model is None else embeddings_model

    # Step 3: Generate embeddings batch by batch, reporting progress per chunk.
    batch_size = 64
    all_vectors = []
    with tqdm(total=len(all_splits), desc="Generating embeddings") as progress:
        for start in range(0, len(all_splits), batch_size):
            texts = [doc.page_content for doc in all_splits[start:start + batch_size]]
            all_vectors.extend(embeddings.embed_documents(texts))
            progress.update(len(texts))

    print(f"\n‚úÖ Loaded {len(all_docs)} documents")
    print(f"‚úÖ Split into {len(all_splits)} chunks")
    print(f"‚úÖ Generated {len(all_vectors)} embeddings.")

    return all_docs, all_splits, all_vectors

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/278 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

./model.safetensors:   0%|          | 0.00/30.2M [00:00<?, ?B/s]

In [6]:
%%capture
!pip install -qU langchain-chroma

In [None]:
import chromadb
from langchain_chroma import Chroma
from datetime import datetime
from tqdm import tqdm

import os

# Shared embedding model and Chroma Cloud client; both are used as the default
# arguments of add_source_to_vector_db below. Note this re-creates the embedding
# model rather than reusing an instance from an earlier cell.
from langchain_huggingface import HuggingFaceEmbeddings
embedding_model = HuggingFaceEmbeddings(model_name="minishlab/potion-base-8M")

# Use your own credentials: export CHROMA_API_KEY and CHROMA_TENANT instead of
# pasting secrets into the notebook. The empty-string fallbacks match the
# previous placeholder values.
cloud_client = chromadb.CloudClient(
    api_key=os.environ.get("CHROMA_API_KEY", ""),
    tenant=os.environ.get("CHROMA_TENANT", ""),
)

# üß© Combine everything
def add_source_to_vector_db(
    source: str,
    collection_name: str,
    embeddings_model=embedding_model,
    client=cloud_client,
    batch_size: int = 300
):
    """
    Loads, splits, embeds, and uploads data from any source (PDF/local/URL) to Chroma Cloud.

    The chunks are embedded by the Chroma vector store itself (through
    `embedding_function`) when they are added, so this function only loads and
    splits the source; pre-computing vectors here would embed every chunk twice
    and throw the first result away.

    Args:
        source: File path or URL accepted by load_and_split_data.
        collection_name: Name of the Chroma collection to write to.
        embeddings_model: Embedding object handed to Chroma as `embedding_function`.
        client: Chroma client used to reach the collection.
        batch_size: Number of chunks sent per `add_documents` call; must be > 0.

    Returns:
        list: IDs returned by Chroma for the chunks that were uploaded.
        Batches that fail are reported and skipped, so the list may be shorter
        than the number of chunks.

    Raises:
        ValueError: If `batch_size` is not positive.
    """
    if batch_size <= 0:
        # A negative step would make range() empty and silently upload nothing.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    print(f"\nüìÑ Processing source: {source}")

    # Step 1: Load and split only; embedding happens inside the vector store.
    docs, splits = load_and_split_data(source)

    print(f"‚úÖ Loaded {len(docs)} documents | Split into {len(splits)} chunks")

    # Step 2: Connect or create Chroma collection
    vector_store = Chroma(
        client=client,
        collection_name=collection_name,
        embedding_function=embeddings_model,
    )

    # Step 3: Add chunks in batches. One timestamp per call keeps all chunks of
    # the same upload grouped under the same upload_date.
    upload_date = str(datetime.now())
    ids = []
    failed_batches = 0
    for i in tqdm(range(0, len(splits), batch_size), desc="Uploading to Chroma"):
        batch = splits[i:i+batch_size]
        for doc in batch:
            # setdefault: never overwrite metadata the loader already provided.
            doc.metadata.setdefault("source", source)
            doc.metadata.setdefault("upload_date", upload_date)
        try:
            ids.extend(vector_store.add_documents(documents=batch))
        except Exception as e:
            # Best effort: keep going with the remaining batches.
            failed_batches += 1
            print(f"‚ö†Ô∏è Error adding batch {i//batch_size + 1}: {e}")

    if failed_batches:
        # Do not claim success when part of the upload was lost.
        print(
            f"\nWARNING: uploaded {len(ids)} of {len(splits)} chunks from {source} "
            f"to {collection_name}; {failed_batches} batch(es) failed."
        )
    else:
        print(f"\n‚úÖ Successfully uploaded {len(ids)} chunks from {source} ‚Üí {collection_name}")

    return ids

In [None]:
import getpass
import os
from langchain_google_genai import ChatGoogleGenerativeAI 

# Use your own key. Only prompt when GOOGLE_API_KEY is not already set:
# assigning "" unconditionally would overwrite a key that is already
# configured in the environment with an empty value.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Google API key: ")

# Chat model shared by the ingestion workflow and the orchestrator agent.
model = ChatGoogleGenerativeAI(model="gemini-2.5-flash-lite")

In [9]:
from langchain.tools import tool
@tool("add_data_to_vector_db", return_direct=True)
def add_data_to_vector_db(sources: list[str], collection_name: str):
    """
    Takes a list of data sources (PDF paths or URLs) and a collection name,
    and adds them to ChromaDB.
    """
    # The docstring above is left verbatim: @tool presumably exposes it to the
    # LLM as the tool description, so rewording it could change agent behavior.

    # Guard clause: nothing can be stored without a target collection.
    if not collection_name:
        return "‚ùå Please specify a collection name."

    def _ingest_one(src):
        # One status line per source; a failure never aborts the other sources.
        try:
            ids = add_source_to_vector_db(src, collection_name)
            return f"‚úÖ Added {len(ids)} chunks from {src}"
        except Exception as e:
            return f"‚ùå Failed for {src}: {e}"

    return "\n".join(_ingest_one(src) for src in sources)


In [None]:
from langchain.tools import tool
import ast

def _literal_from_line(line: str):
    """Evaluate one reply line as a Python literal, tolerating a `name = ` prefix."""
    try:
        return ast.literal_eval(line)
    except (ValueError, SyntaxError):
        # The prompt mentions "variable declarations", so the model may answer
        # `sources = [...]`; retry on the right-hand side of the first "=".
        _, separator, value = line.partition("=")
        if not separator:
            raise
        return ast.literal_eval(value.strip())


def _parse_ingestion_reply(reply_text: str):
    """Extract `(sources, collection_name)` from the LLM reply; raise ValueError if malformed."""
    lines = []
    for raw_line in reply_text.splitlines():
        line = raw_line.strip()
        # Skip blank lines and markdown code fences the model may add anyway.
        if not line or line.startswith("```"):
            continue
        lines.append(line)

    if len(lines) < 2:
        raise ValueError("expected a sources list and a collection name on separate lines")

    sources = _literal_from_line(lines[0])
    collection_name = _literal_from_line(lines[1])

    if isinstance(sources, str):
        # A single source given as a bare string is treated as a one-item list.
        sources = [sources]
    if not isinstance(sources, (list, tuple)) or not all(isinstance(s, str) for s in sources):
        raise ValueError(f"sources must be a list of strings, got: {sources!r}")
    if not isinstance(collection_name, str):
        raise ValueError(f"collection name must be a string, got: {collection_name!r}")

    return list(sources), collection_name


# ---- Base ingestion tool ----
@tool("data_ingestion_workflow", return_direct=True)
def data_ingestion_workflow(
    external_sources: list[str] = None,
    external_collection: str = None
):
    """
    Data ingestion workflow.
    Modes:
    1. Interactive mode (default)
    2. External ingestion mode (when external_sources + external_collection are provided)
    """
    # The docstring above is left verbatim: @tool presumably exposes it to the
    # LLM as the tool description.
    #
    # Return value: a dict {"message", "result"} in external mode, or a status
    # string once the interactive session ends.

    #  MODE 2: sources and collection were supplied by another tool.
    if external_sources and external_collection:
        print("üì• Running in EXTERNAL INGESTION MODE...")
        print("Sources received from another tool:", external_sources)
        print("Collection name:", external_collection)

        result = add_data_to_vector_db.invoke({
            "sources": external_sources,
            "collection_name": external_collection
        })

        return {
            "message": "External ingestion completed.",
            "result": result
        }

    # MODE 1: read instructions from the user until they type done/exit/quit.
    print("üß† Starting interactive data ingestion session...")
    print("Type 'done' to stop.\n")

    prompt = """
    You are a structured input generator for data ingestion.

    Your task:
    - Read the user query carefully.
    - Extract two things:
        1. A Python list named `sources` containing all PDF paths or URLs mentioned by the user.
        2. A Python string named `collection_name` representing the collection name.

    Rules:
    - Return only two Python variable declarations in this exact format:
        ["/path/file.pdf", "https://arxiv.org/pdf/2307.12945.pdf"]
        "AI_Papers"
    - Do not include markdown.
    """

    llm = model

    while True:
        query = input("\nüó£Ô∏è Enter your instruction (or 'done' to finish):\n")
        if query.lower().strip() in ["done", "exit", "quit"]:
            print("‚úÖ Finished ingestion session.")
            break

        # Step 1: Ask LLM to extract structured input
        response = llm.invoke(prompt + "\nUser query:\n" + query)
        final_output_text = response.content if hasattr(response, "content") else str(response)

        print("\n--- LLM Output ---")
        print(final_output_text)

        try:
            # Step 2: Parse LLM output (tolerates fences and `name = value` lines)
            sources, collection_name = _parse_ingestion_reply(final_output_text)

            print("‚úÖ Parsed Sources:", sources)
            print("‚úÖ Parsed Collection Name:", collection_name)

            # Step 3: Add data to vector DB
            result = add_data_to_vector_db.invoke({
                "sources": sources,
                "collection_name": collection_name
            })
            print("\n--- Tool Result ---")
            print(result)

        except Exception as e:
            # A bad reply or a failed upload must not end the session.
            print(f"‚ùå Error parsing or invoking tool: {e}")
            continue

    return "üü¢ Ingestion session completed successfully."


In [11]:
%%capture
!pip install ddgs serper playwright beautifulsoup4 lxml aiohttp
!pip install python-docx
!playwright install
!pip install langchain

In [None]:
from langchain.tools import tool
from ddgs import DDGS
import requests
from docx import Document
from datetime import datetime

import os

# Use your own key: export SERPER_API_KEY in the environment instead of pasting
# it into the notebook. The empty-string fallback matches the old placeholder.
SERPER_API_KEY = os.environ.get("SERPER_API_KEY", "")


def ddg_text_search(query, max_results=15):
    """Run a DuckDuckGo text search and normalise the hits.

    Args:
        query: Search string passed to `DDGS().text`.
        max_results: Upper bound forwarded to the search call.

    Returns:
        list[dict]: One dict per hit with keys "title", "snippet", "link" and
        "source" (always "ddg"). The search is best-effort: if it fails, the
        hits collected so far (possibly none) are returned and a warning is
        printed.
    """
    results = []
    try:
        with DDGS() as ddgs:
            for r in ddgs.text(query, max_results=max_results):
                results.append({
                    "title": r.get("title"),
                    "snippet": r.get("body"),
                    "link": r.get("href"),
                    "source": "ddg"
                })
    except Exception as e:
        # `except Exception` (not a bare except) lets Ctrl-C interrupt the
        # search, and the message makes a failure visible instead of silent.
        print(f"WARNING: DuckDuckGo search failed: {e}")
    return results


def serper_text_search(query, max_results=15):
    """Run a Google search through the Serper API and normalise the hits.

    Args:
        query: Search string sent as "q".
        max_results: Number of results requested as "num".

    Returns:
        list[dict]: One dict per organic hit with keys "title", "snippet",
        "link" and "source" (always "google-serper"). Returns an empty list
        when no API key is configured or when the request fails; failures
        are reported with a printed warning.
    """
    try:
        if not SERPER_API_KEY:
            # Without a key the request cannot be authorised; skip the round trip.
            return []

        payload = {"q": query, "num": max_results}
        headers = {"X-API-KEY": SERPER_API_KEY, "Content-Type": "application/json"}

        # The timeout keeps a stalled connection from hanging the notebook;
        # raise_for_status surfaces HTTP errors instead of parsing error bodies.
        res = requests.post(
            "https://google.serper.dev/search",
            json=payload,
            headers=headers,
            timeout=30,
        )
        res.raise_for_status()
        organic = res.json().get("organic", [])

        return [{
            "title": r.get("title"),
            "snippet": r.get("snippet"),
            "link": r.get("link"),
            "source": "google-serper"
        } for r in organic]
    except Exception as e:
        # Best effort: report the failure and let the other engine's hits stand.
        print(f"WARNING: Serper search failed: {e}")
        return []



# ---------------------------------------------------------
def rank(results):
    """Return a new list of results ordered by engine preference.

    "google-serper" hits come first, then "ddg", then any other source.
    Results from the same source keep their relative order.
    """
    source_order = {"google-serper": 1, "ddg": 2}

    def _weight(result):
        # Unknown sources get weight 9 and therefore sort last.
        return source_order.get(result["source"], 9)

    ordered = list(results)
    ordered.sort(key=_weight)
    return ordered



def unified_search(query, max_results=15):
    """Search DuckDuckGo and Serper, merge the hits, and return the best ones.

    Hits are ordered with `rank`, then hits without a link and hits whose link
    was already seen are dropped, so the same URL is never returned (and later
    saved or ingested) twice.

    Args:
        query: Search string passed to both engines.
        max_results: Per-engine request size and cap on the merged list.

    Returns:
        list[dict]: At most `max_results` result dicts with unique, non-empty
        "link" values.
    """
    # TEXT SEARCH
    ddg_text = ddg_text_search(query, max_results)
    google_text = serper_text_search(query, max_results)

    # Combine, rank, then de-duplicate while keeping the ranked order.
    unique_results = []
    seen_links = set()
    for result in rank(ddg_text + google_text):
        link = result.get("link")
        if not link or link in seen_links:
            continue
        seen_links.add(link)
        unique_results.append(result)

    return unique_results[:max_results]



def save_links_to_docx(query, results, file_path):
    """Write a DOCX report for `query` and return the path it was saved to.

    The document gets a title heading, a generation timestamp, a "Links"
    heading, and one paragraph per entry in `results`. When `results` is
    empty, a single "No results found." paragraph is written instead.
    """
    doc = Document()

    # Header: title plus generation time.
    doc.add_heading(f"Top Search Links for: {query}", level=1)
    doc.add_paragraph(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")

    # Body: plain, unnumbered list of links.
    doc.add_heading("Links", level=2)
    paragraphs = results if results else ["No results found."]
    for paragraph_text in paragraphs:
        doc.add_paragraph(paragraph_text)

    doc.save(file_path)
    return file_path



def unified_search_and_export(query, max_results=15):
    """Thin pass-through to unified_search; despite the name, nothing is exported here."""
    return unified_search(query, max_results)

@tool("unified_web_search", return_direct=False)
def unified_web_search_tool(query: str, max_results: int = 15) -> dict:
    """
    Performs unified DuckDuckGo + Google Serper text search.
    Returns top links in a structured dict (no saving here).
    """
    # The docstring above is left verbatim: @tool presumably exposes it to the
    # LLM as the tool description.
    found = unified_search_and_export(query=query, max_results=max_results)

    # Keep only the URL of each hit.
    links = []
    for entry in found:
        links.append(entry["link"])

    return {"query": query, "count": len(links), "links": links}


@tool("save_links_to_docx", return_direct=True)
def save_links_to_docx_tool(query: str, links: list, file_path: str) -> str:
    """
    Saves a list of search links to a DOCX file.
    """
    # The docstring above is left verbatim: @tool presumably exposes it to the
    # LLM as the tool description.
    if links:
        path = save_links_to_docx(query, links, file_path)
        print("The Search Results are saved")
        return f"üìÑ Links saved successfully to: {path}"

    # Nothing to write: report it instead of creating an empty document.
    return "‚ùå No links provided to save."


In [None]:
@tool("interactive_search_and_save", return_direct=True)
def interactive_search_and_save() -> dict:
    """
    Fully interactive loop:
    - Ask the user what they want to search
    - Show results
    - Ask if they want to save (all or selected)
    - Save links to DOCX
    - Keep updating until user says 'done'
    """
    # The docstring above is left verbatim: @tool presumably exposes it to the
    # LLM as the tool description.
    #
    # Returns {"links": [...]} holding every link the user chose to save, in
    # the order saved and without duplicates. The return annotation is `dict`
    # because that is what the function has always returned.

    docx_path = "interactive_search_results.docx"
    all_saved_links = []

    def _remember(new_links):
        """Append links not saved yet, keeping order; return how many were added."""
        added = 0
        for link in new_links:
            if link not in all_saved_links:
                all_saved_links.append(link)
                added += 1
        return added

    while True:
        # Step 1: Ask for search query
        user_query = input("\nüîç Enter your search query (or type 'done' to exit): ").strip()
        # Accept the same exit words as the interactive ingestion loop.
        if user_query.lower() in ("done", "exit", "quit"):
            return {
                "links": all_saved_links
            }
        if not user_query:
            # Nothing typed: ask again instead of searching for an empty string.
            continue

        # Step 2: Search
        result = unified_web_search_tool.invoke({
            "query": user_query,
            "max_results": 15
        })

        print("\n--- Search Results ---")
        print(result)

        # The search tool returns a dict; its "links" entry is the URL list.
        links = result["links"]

        # Step 3: Ask user if they want to save
        save_choice = input(
            "\nüíæ Do you want to save links?\n"
            "(all / selected / no): "
        ).strip().lower()

        if save_choice == "no":
            print("Skipping save.")
            continue

        # Step 4: Save all links
        if save_choice == "all":
            _remember(links)
            save_links_to_docx("Combined Search Results", all_saved_links, docx_path)
            print(f"‚úîÔ∏è Saved all {len(all_saved_links)} total links.")
            continue

        # Step 5: Save selected links
        if save_choice == "selected":
            print("\nEnter indices of links you want to save (comma-separated):")
            for idx, link in enumerate(links):
                print(f"{idx}. {link}")

            selected_input = input("Your selection: ").strip()
            try:
                indices = [int(x.strip()) for x in selected_input.split(",")]
            except ValueError:
                # Only a malformed number is an "invalid selection".
                print("‚ùå Invalid selection. Skipping.")
                continue

            # Out-of-range indices are ignored rather than treated as errors.
            selected_links = [links[i] for i in indices if 0 <= i < len(links)]
            added = _remember(selected_links)
            try:
                save_links_to_docx("Combined Search Results", all_saved_links, docx_path)
            except Exception as e:
                # Keep the session alive, but name the real problem.
                print(f"WARNING: could not write {docx_path}: {e}")
            else:
                print(f"‚úîÔ∏è Saved selected links ({added} added).")

        else:
            print("‚ùå Invalid choice. Skipping.")


In [None]:
from langchain.tools import tool

@tool("search_to_ingestion_pipeline", return_direct=True)
def search_to_ingestion_pipeline(
    collection_name: str = "search_results",
    run_rag: bool = True
) -> dict:
    """
    Runs the complete workflow:
    1. Executes interactive_search_and_save() to let the user search & select links.
    2. Takes the returned list of links.
    3. Passes those links into data_ingestion_workflow() using external ingestion mode.
    4. Returns final ingestion result.

    Args:
        collection_name (str): Vector DB collection to store search results.

    Returns:
        dict: {
            "links_ingested": [...],
            "collection_name": "...",
            "ingestion_result": ...
        }
    """
    # The docstring above is left verbatim: @tool presumably exposes it to the
    # LLM as the tool description. `run_rag=False` returns the selected links
    # without ingesting them.

    # Step 1: let the user search and pick links.
    search_output = interactive_search_and_save.invoke({})

    # Guard: the search tool must hand back a dict with a "links" entry.
    has_links = isinstance(search_output, dict) and "links" in search_output
    if not has_links:
        return {
            "error": "search tool did not return expected 'links' list",
            "raw_output": search_output
        }

    links = search_output["links"]

    # Guard: nothing selected means nothing to ingest.
    if not links:
        return {
            "message": "No links selected by user, nothing to ingest.",
            "links_ingested": []
        }

    # Guard: the caller asked to skip ingestion.
    if not run_rag:
        return {
            "message": "RAG ingestion skipped because run_rag=False.",
            "links_selected": links,
            "analysis_status": "skipped"
        }

    # Step 2: ingest the selected links through the external ingestion mode.
    ingestion_payload = {
        "external_sources": links,
        "external_collection": collection_name
    }
    ingestion_result = data_ingestion_workflow.invoke(ingestion_payload)

    return {
        "links_ingested": links,
        "collection_name": collection_name,
        "ingestion_result": ingestion_result
    }


In [None]:
from langchain.agents import create_agent
from langchain_core.language_models.chat_models import BaseChatModel


# Tools the orchestrator may call: direct ingestion of user-supplied data, and
# the search-then-ingest pipeline.
tools = [data_ingestion_workflow ,search_to_ingestion_pipeline]
# System prompt: tells the agent which of the two tools to pick for a request.
prompt = (
          """
              You are an orchestrator agent.

              If the user wants to upload or provide their own data,
              use the tool: data_ingestion_workflow.

              If the user wants to search or collect data from the web,
              use the tool: search_to_ingestion_pipeline.

              Ask for clarification if needed.
          """

)

# Build the agent from the shared chat model, the tool list and the system prompt.
orchestrator  = create_agent(model, tools, system_prompt=prompt)

In [None]:
# Example run: send one user message through the orchestrator and print the
# raw response object it returns.
response = orchestrator.invoke({
    "messages": [{"role": "user", "content": "I want to upload my CSV files"}]
})

print(response)

üß† Starting interactive data ingestion session...
Type 'done' to stop.

