In [6]:
from langchain_openai import ChatOpenAI, OpenAIEmbeddings



In [7]:
import os
import json
import pandas as pd
from pathlib import Path
from dotenv import load_dotenv
from PIL import Image, ImageDraw, ImageFont
from fpdf import FPDF
from docx import Document as WordDoc
from pptx import Presentation

from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import CharacterTextSplitter
from langchain.docstore.document import Document
from langchain.agents import Tool, initialize_agent, AgentType
from langchain_community.document_loaders import (
    PyPDFLoader,
    TextLoader,
    Docx2txtLoader,
    CSVLoader,
    UnstructuredExcelLoader,
    UnstructuredMarkdownLoader
)


# Read a local .env file (if one exists) so its variables are visible through
# os.getenv below.
load_dotenv()

# Fail fast with an actionable message when the key is absent. The previous
# `os.environ[...] = os.getenv(...)` self-assignment changed nothing when the
# key was set and raised an opaque TypeError when it was not.
if os.getenv("OPENAI_API_KEY") is None:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it or add it to a .env file."
    )

def extract_text_from_pptx(file_path):
    """Collect the text of every shape in a PowerPoint file.

    The deck at ``file_path`` is opened with ``Presentation`` and walked slide
    by slide, shape by shape. Each shape that exposes a ``text`` attribute
    contributes that text followed by a newline.

    Returns:
        One string holding all collected text; empty when no shape has text.
    """
    deck = Presentation(file_path)
    pieces = [
        shape.text + "\n"
        for slide in deck.slides
        for shape in slide.shapes
        if hasattr(shape, "text")  # shapes without a text attribute are skipped
    ]
    return "".join(pieces)

# === Step 1: LangChain Data Ingestion ===
def load_documents(folder="data"):
    """Load every supported file found directly inside ``folder``.

    The lower-cased file extension selects the loader: PDF, TXT, DOCX, CSV,
    XLS/XLSX and Markdown go through their LangChain loaders, while PPTX text
    is pulled out with ``extract_text_from_pptx`` and wrapped in a single
    ``Document`` whose ``source`` metadata is the bare file name.

    Loading is best-effort: an unrecognised extension is reported and skipped,
    and any exception raised while reading a file is printed and that file is
    skipped.

    Returns:
        A list with the documents from all files that loaded successfully.
    """
    # Extension -> reader(path, file_name). Lambdas keep the loader lookup
    # lazy, so a loader is only touched when a matching file is present.
    readers = {
        ".pdf": lambda p, _name: PyPDFLoader(p).load(),
        ".txt": lambda p, _name: TextLoader(p).load(),
        ".docx": lambda p, _name: Docx2txtLoader(p).load(),
        ".csv": lambda p, _name: CSVLoader(p).load(),
        ".xls": lambda p, _name: UnstructuredExcelLoader(p).load(),
        ".xlsx": lambda p, _name: UnstructuredExcelLoader(p).load(),
        ".md": lambda p, _name: UnstructuredMarkdownLoader(p).load(),
        ".pptx": lambda p, name: [
            Document(
                page_content=extract_text_from_pptx(p),
                metadata={"source": name},
            )
        ],
    }

    collected = []
    for entry in os.listdir(folder):
        full_path = os.path.join(folder, entry)
        suffix = os.path.splitext(entry)[1].lower()
        reader = readers.get(suffix)

        if reader is None:
            print(f"Skipping unsupported file: {entry}")
            continue

        try:
            collected.extend(reader(full_path, entry))
        except Exception as exc:
            # One unreadable file must not abort the whole ingestion run.
            print(f"Failed to load {entry}: {str(exc)}")

    return collected

# === Step 2: Build vectorstore ===
def setup_vectorstore():
    """Build a FAISS vector store from the files in the ``data`` folder.

    Documents are loaded with ``load_documents``, split by a
    ``CharacterTextSplitter`` (chunk size 1000, overlap 100) and embedded with
    ``OpenAIEmbeddings``.

    Returns:
        The FAISS store created from the chunks; it is not written to disk
        here.
    """
    source_docs = load_documents("data")
    pieces = CharacterTextSplitter(
        chunk_size=1000, chunk_overlap=100
    ).split_documents(source_docs)
    return FAISS.from_documents(pieces, OpenAIEmbeddings())

# === Step 3: Agent Tools ===
def document_search(query: str) -> str:
    """Return the content of stored documents relevant to ``query``.

    The FAISS index is read from the ``vectorstore`` directory (resolved
    against the current working directory) on every call. If that directory
    does not exist, the index is first rebuilt with ``setup_vectorstore`` and
    saved there.

    Returns:
        The page contents of the retrieved documents, separated by blank lines.
    """
    index_dir = Path("vectorstore").resolve()

    if not index_dir.exists():
        print("Vectorstore missing! Rebuilding")
        setup_vectorstore().save_local(str(index_dir))

    # NOTE(review): the allow_dangerous_deserialization flag suggests loading
    # involves unpickling; only point this at an index built locally --
    # confirm against the LangChain FAISS documentation.
    store = FAISS.load_local(
        str(index_dir),
        OpenAIEmbeddings(),
        allow_dangerous_deserialization=True,
    )
    hits = store.as_retriever().get_relevant_documents(query)

    return "\n\n".join(hit.page_content for hit in hits)

def stats_lookup(query: str) -> str:
    """Search the spreadsheets and JSON files in ``data`` for ``query``.

    Matching is a case-insensitive substring test on the query with
    surrounding whitespace removed:

    * ``.xlsx`` files are read with ``pd.read_excel``; a row matches when any
      of its cell values (converted to text) contains the query. Column labels
      are not part of the searched text.
    * ``.json`` / ``.jsonl`` files are scanned line by line; every line that
      contains the query is returned with surrounding whitespace stripped.

    File extensions are compared case-insensitively and files are visited in
    sorted name order so the result is deterministic.

    Args:
        query: Text to look for.

    Returns:
        The matches joined by ``"\\n---\\n"``, or ``"No matching stats found."``
        when nothing matches or the query is blank.
    """
    # Normalise once instead of once per row / per line.
    needle = query.strip().lower()
    if not needle:
        # A blank needle is a substring of everything and would dump all data.
        return "No matching stats found."

    output = []
    for file in sorted(os.listdir("data")):
        path = os.path.join("data", file)
        lowered_name = file.lower()

        if lowered_name.endswith(".xlsx"):
            df = pd.read_excel(path)
            # Compare cell values only. Rendering each row with to_string()
            # also put the column labels into the haystack, so a query equal
            # to a column name matched every row.
            cells = df.astype(str)
            hits = cells.apply(
                lambda col: col.str.lower().str.contains(needle, regex=False)
            ).any(axis=1)
            matches = df[hits]
            if not matches.empty:
                output.append(matches.to_string())
        elif lowered_name.endswith((".json", ".jsonl")):
            with open(path, "r", encoding="utf-8") as f:
                output.extend(
                    line.strip() for line in f if needle in line.lower()
                )

    return "\n---\n".join(output) if output else "No matching stats found."

def pick_template(request: str) -> str:
    """Choose writing instructions based on keywords in ``request``.

    The request is lower-cased and checked against the keyword groups below
    in order; the first group with a keyword present wins. When no keyword is
    found, the general report instructions are returned.
    """
    lowered = request.lower()

    # Order matters: earlier rules take precedence over later ones.
    rules = (
        (("grant",), "Use grant format: intro, need, goals, impact, budget."),
        (("blog",), "6 paragraphs: hook, story, stats, quote, mission, close."),
        (
            ("slide", "presentation"),
            "Make 5-7 slide outline with bullets from interviews/data.",
        ),
        (("social",), "Write 5 captions using emojis and hashtags."),
    )

    for keywords, template in rules:
        if any(word in lowered for word in keywords):
            return template

    return "Use general report style: summary, findings, insights."

# === Step 4: Output Writer ===
def _to_latin1(text):
    """Return ``text`` with characters outside Latin-1 replaced by ``?``."""
    return text.encode("latin-1", "replace").decode("latin-1")


def save_output(text, name="CAFBrain_Agent_Output", outdir="Output"):
    """Write ``text`` to ``outdir`` as TXT, DOCX, PDF, JSON, PPTX and PNG files.

    All files share the base name ``name``. ``outdir`` is created when it does
    not exist. Formats are written in the order listed; an exception in one
    writer propagates and the remaining formats are not produced.

    Args:
        text: The content to save.
        name: Base file name without extension.
        outdir: Destination directory.

    Notes:
        * PDF: the core "Arial" font only covers Latin-1, so characters
          outside that range (curly quotes, emoji, ...) are written as ``?``
          instead of aborting the save.
        * PPTX: one slide per blank-line-separated section; empty sections
          are skipped.
        * PNG: one text line every 20 px on a 1200 px wide canvas whose height
          grows with the number of lines (minimum 600 px). Lines wider than
          the canvas are still cut off on the right.
    """
    os.makedirs(outdir, exist_ok=True)
    base = f"{outdir}/{name}"
    lines = text.split("\n")

    # --- Plain text ---
    with open(f"{base}.txt", "w", encoding="utf-8") as f:
        f.write(text)

    # --- Word document ---
    doc = WordDoc()
    doc.add_heading("CAFBrain Agent Output", 0)
    doc.add_paragraph(text)
    doc.save(f"{base}.docx")

    # --- PDF ---
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    for line in lines:
        # Start every line at the left margin: some FPDF versions leave the
        # cursor at the right edge after multi_cell, which leaves no room for
        # the next full-width cell.
        pdf.set_x(pdf.l_margin)
        pdf.multi_cell(0, 10, _to_latin1(line))
    pdf.output(f"{base}.pdf")

    # --- JSON ---
    with open(f"{base}.json", "w", encoding="utf-8") as f:
        json.dump({"output": text}, f, indent=2)

    # --- PowerPoint ---
    prs = Presentation()
    title = prs.slides.add_slide(prs.slide_layouts[0])
    title.shapes.title.text = "CAFBrain Presentation"
    title.placeholders[1].text = "Generated by Agent"
    for section in text.split("\n\n"):
        body = section.strip()
        if not body:
            # Runs of blank lines would otherwise yield empty slides.
            continue
        slide = prs.slides.add_slide(prs.slide_layouts[1])
        slide.shapes.title.text = "Slide"
        slide.placeholders[1].text = body
    prs.save(f"{base}.pptx")

    # --- PNG ---
    line_step = 20
    top_margin = 20
    # Size the canvas to the text; a fixed 600 px height clipped everything
    # after roughly 29 lines.
    height = max(600, 2 * top_margin + line_step * len(lines))
    img = Image.new("RGB", (1200, height), color=(255, 255, 255))
    draw = ImageDraw.Draw(img)
    font = ImageFont.load_default()
    y = top_margin
    for line in lines:
        content = line.strip()
        try:
            draw.text((40, y), content, fill=(0, 0, 0), font=font)
        except UnicodeEncodeError:
            # Older Pillow releases ship a Latin-1-only bitmap default font.
            draw.text((40, y), _to_latin1(content), fill=(0, 0, 0), font=font)
        y += line_step
    img.save(f"{base}.png")

# === Step 5: CLI Runner ===
if __name__ == "__main__":
    print("Initializing CAFBrain Agent...")

    # Build and persist the FAISS index on the first run; reuse it afterwards.
    vectorstore_path = Path("vectorstore").resolve()
    if not vectorstore_path.exists():
        print("🔧 No vectorstore found. Creating FAISS index...")
        vs = setup_vectorstore()
        vs.save_local(str(vectorstore_path))
    else:
        print(" Vectorstore already exists.")

    tools = [
        Tool(name="DocumentSearch", func=document_search, description="Search internal documents"),
        Tool(name="StatLookup", func=stats_lookup, description="Lookup data from spreadsheets or JSON"),
        Tool(name="TemplatePicker", func=pick_template, description="Select template based on task")
    ]

    llm = ChatOpenAI(temperature=0.3, model="gpt-4")
    agent = initialize_agent(
        tools=tools,
        llm=llm,
        agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
        verbose=True,
        handle_parsing_errors=True
    )

    # The prompt and the agent call belong inside the guard: at module level
    # they would run on import, where `agent` is never defined.
    user_input = input("\n Ask CAFBrain Agent:\n> ").strip()

    # Requests shorter than 20 characters (including empty input) are too
    # vague to act on, so ask for more detail instead of calling the agent.
    if len(user_input) < 20:
        print("Please provide a more detailed request (e.g., 'Generate a grant using Montgomery County data')")
    else:
        try:
            response = agent.run(user_input)
            print("\nFinal Output:\n", response)
            save_output(response)
            print("\nOutput saved in /Output folder.")
        except Exception as e:
            # Top-level boundary: report the failure instead of a traceback.
            print("An error occurred while processing the request:")
            print(e)



Initializing CAFBrain Agent...
 Vectorstore already exists.


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThis statement seems to be more of a fact rather than a question. However, if we are to provide more information or data on this, we could look up statistics on global hunger.
Action: StatLookup
Action Input: Global hunger statistics[0m
Observation: [33;1m[1;3mNo matching stats found.[0m
Thought:[32;1m[1;3mThere are no available statistics on global hunger in the current database. Perhaps I could find some relevant documents that discuss the issue.
Action: DocumentSearch
Action Input: Global hunger[0m
Observation: [36;1m[1;3mFood insecurity in DC
Hunger affects individuals the world over, on every continent 
and in every country. The capital region of the United States is 
no exception. Nearly 1.5 million people in our area experienced 
some level of food insecurity last year, meaning they weren’t 
always sure where their next meal would come from. 
In CAFB’