In [None]:
# Run this cell (may take a minute)
!pip install --upgrade pip
!pip install reportlab pypdf sentence-transformers faiss-cpu neo4j python-dotenv


In [None]:
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
# Where the generated sample report is written.
pdf_path = "/content/finance_report.pdf"

# Report body. The extraction rules used later in the notebook are tuned to this
# exact layout (numbered department headers, bulleted project/employee lines).
content = """
FinTech Global — Q2 2025 Financial Overview

Departments:
1. Investment Research — led by Sarah Khan
   - Projects:
     * AI Credit Scoring — Budget: $5,000,000
     * Portfolio Optimization — Budget: $4,000,000
   - Employees:
     * Ali Raza — Data Analyst
     * Hina Tariq — Junior Analyst

2. Risk Management — led by Fatima Noor
   - Projects:
     * Sustainable Investments — Budget: $8,000,000
   - Employees:
     * Bilal Ahmed — Compliance Officer
     * Ayesha Malik — Financial Analyst

Company Notes:
FinTech Global continues to expand its AI investments and expects higher recurring revenue from services.
"""

# One letter-sized canvas with a single text object whose cursor starts at (50, 750).
c = canvas.Canvas(pdf_path, pagesize=letter)
text = c.beginText(50, 750)
text.setFont("Helvetica", 11)

# Emit each source line as its own PDF text line (surrounding blank lines trimmed).
report_lines = content.strip().split("\n")
for report_line in report_lines:
    text.textLine(report_line)

# Draw the accumulated text onto the page, then save the canvas to pdf_path.
c.drawText(text)
c.save()

print("✅ Dummy PDF written to:", pdf_path)


In [None]:
from pypdf import PdfReader
# Open the report PDF and collect the extracted text of every page.
pdf_path = "/content/finance_report.pdf"
reader = PdfReader(pdf_path)

# A falsy extract_text() result is replaced by "" so the join below always gets strings.
pages = []
for page in reader.pages:
    pages.append(page.extract_text() or "")

# Pages are joined with newlines into the single string used by later cells.
full_text = "\n".join(pages)

print("✅ Extracted text (preview):\n")
print(full_text[:1000])


In [None]:
import re, pprint

text = full_text

# Rule-based extraction tailored to the dummy PDF's layout.
# Deliberately simple; a production pipeline would use NER (spaCy) or LLM-based extraction.

# Department header, e.g. "1. Investment Research — led by Sarah Khan"
DEPT_HEADER_RE = re.compile(r'^\d+\.\s*(.+?)\s*—\s*led by\s*(.+)$', flags=re.I)
# Project bullet, e.g. "* AI Credit Scoring — Budget: $5,000,000"
PROJECT_LINE_RE = re.compile(r'^[\*\-\•]\s*(.+?)\s*—\s*Budget:\s*\$?([\d,]+)')
# Employee bullet, e.g. "* Ali Raza — Data Analyst"
EMPLOYEE_LINE_RE = re.compile(r'^[\*\-\•]\s*(.+?)\s*—\s*(.+)$')

# department name -> {"head": str, "projects": [...], "employees": [...]}
data = {}
current_dept = None

# Work on stripped, non-empty lines only.
lines = [stripped for stripped in (raw.strip() for raw in text.splitlines()) if stripped]
for ln in lines:
    header = DEPT_HEADER_RE.match(ln)
    if header:
        # A new department starts; following bullets are attributed to it.
        current_dept = header.group(1).strip()
        data[current_dept] = {
            "head": header.group(2).strip(),
            "projects": [],
            "employees": [],
        }
        continue

    # Bullets seen before any department header have nowhere to go.
    if not current_dept:
        continue

    # Projects are tested first: a project bullet also satisfies the employee pattern.
    project = PROJECT_LINE_RE.match(ln)
    if project:
        data[current_dept]["projects"].append({
            "name": project.group(1).strip(),
            "budget": int(project.group(2).replace(",", "")),
        })
        continue

    # Any remaining "<bullet> X — Y" line is recorded as an employee and role.
    employee = EMPLOYEE_LINE_RE.match(ln)
    if employee:
        data[current_dept]["employees"].append({
            "name": employee.group(1).strip(),
            "role": employee.group(2).strip(),
        })

pprint.pprint(data)


In [None]:
from neo4j import GraphDatabase

# ← REPLACE these with your AuraDB (or local) credentials
NEO4J_URI = "bolt://localhost:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "password"

# Driver shared by the rest of the notebook.
# The auth tuple must use NEO4J_PASSWORD, the constant defined above; the previous
# reference to an undefined NEO4J_PASS raised NameError before any connection was made.
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

def run_cypher(cypher, params=None):
    """Run one Cypher statement and return its records as a list of dicts.

    Args:
        cypher: Cypher query text.
        params: Optional mapping of query parameters; a falsy value is sent as ``{}``.

    Returns:
        A list containing ``record.data()`` for every record of the result, in order.
    """
    query_params = params or {}
    rows = []
    # A fresh session per call; the context manager closes it on exit.
    with driver.session() as db_session:
        for record in db_session.run(cypher, query_params):
            rows.append(record.data())
    return rows

# Clear DB (CAUTION: removes all nodes) — comment out if you don't want this
print("Clearing DB (if any) ...")
run_cypher("MATCH (n) DETACH DELETE n")

# Root of the graph: a single Company node.
COMPANY = "FinTech Global"
run_cypher("MERGE (c:Company {name:$company})", {"company": COMPANY})

# Walk the extracted structure and MERGE each department with its head, staff and projects.
for dept, info in data.items():
    # Department node, attached to the company.
    dept_params = {"dept": dept, "company": COMPANY}
    run_cypher("""
    MERGE (d:Department {name:$dept})
    WITH d
    MATCH (c:Company {name:$company})
    MERGE (c)-[:HAS_DEPARTMENT]->(d)
    """, dept_params)

    # Department head: an Employee node carrying a "Head of <dept>" title.
    head_name = info.get("head")
    if head_name:
        head_params = {"head_name": head_name, "title": f"Head of {dept}", "dept": dept}
        run_cypher("""
        MERGE (h:Employee {name:$head_name})
        SET h.title = $title
        WITH h
        MATCH (d:Department {name:$dept})
        MERGE (h)-[:HEADS]->(d)
        """, head_params)

    # Staff: Employee nodes linked to the department with WORKS_IN.
    for member in info.get("employees", []):
        member_params = {"ename": member["name"], "erole": member["role"], "dept": dept}
        run_cypher("""
        MERGE (e:Employee {name:$ename})
        SET e.role = $erole
        WITH e
        MATCH (d:Department {name:$dept})
        MERGE (e)-[:WORKS_IN]->(d)
        """, member_params)

    # Projects: Project nodes with a budget, linked from the department with MANAGES.
    for project in info.get("projects", []):
        project_params = {"pname": project["name"], "pbudget": project["budget"], "dept": dept}
        run_cypher("""
        MERGE (p:Project {name:$pname})
        SET p.budget = $pbudget
        WITH p
        MATCH (d:Department {name:$dept})
        MERGE (d)-[:MANAGES]->(p)
        """, project_params)

print("✅ Graph populated in Neo4j.")


In [None]:
# Who reports to the head of Investment Research? (Employees in that dept)
q1 = """
MATCH (h:Employee {title: 'Head of Investment Research'})-[:HEADS]->(d:Department)<-[:WORKS_IN]-(e:Employee)
RETURN e.name AS employee
"""

# Which department manages 'AI Credit Scoring'?
q2 = """
MATCH (d:Department)-[:MANAGES]->(p:Project {name: 'AI Credit Scoring'})
RETURN d.name AS department
"""

# List projects under company
q3 = """
MATCH (c:Company {name: $company})-[:HAS_DEPARTMENT]->(d:Department)-[:MANAGES]->(p:Project)
RETURN p.name AS project, p.budget AS budget
"""

# Run the three queries in order, printing a heading followed by the raw rows.
for heading, query, query_params in (
    ("Who reports to Head of Investment Research?", q1, None),
    ("Which department manages AI Credit Scoring?", q2, None),
    ("Projects under company:", q3, {"company": COMPANY}),
):
    print(heading)
    print(run_cypher(query, query_params))


In [None]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import pickle
from math import ceil

# 1) Chunk the text (simple fixed-size splitter)
def naive_chunks(text, chunk_size=800):
    """Split ``text`` into consecutive chunks of whitespace-separated words.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk.

    Returns:
        A list of strings, each holding up to ``chunk_size`` words joined by single
        spaces. Text without any words yields an empty list.
    """
    words = text.split()
    starts = range(0, len(words), chunk_size)
    return [" ".join(words[start:start + chunk_size]) for start in starts]

# Chunk the extracted report text; 120 words per chunk because the document is short.
chunks = naive_chunks(full_text, chunk_size=120)
print("Number of text chunks:", len(chunks))

# 2) Embed every chunk with the MiniLM sentence encoder.
model = SentenceTransformer("all-MiniLM-L6-v2")
embs = model.encode(chunks, show_progress_bar=True, convert_to_numpy=True)

# 3) Flat L2 index whose dimensionality is taken from the embedding matrix.
d = embs.shape[1]
index = faiss.IndexFlatL2(d)
index.add(embs)
print("FAISS index built. n_items =", index.ntotal)

# Persist the index and the chunk list so a later cell can reload them.
faiss.write_index(index, "/content/faiss_index.bin")
with open("/content/chunks.pkl", "wb") as chunk_file:
    pickle.dump(chunks, chunk_file)
print("Saved FAISS index and chunks.")


In [None]:
import pickle, faiss, numpy as np
from sentence_transformers import SentenceTransformer

# Reload the persisted FAISS index and chunk list from disk.
index = faiss.read_index("/content/faiss_index.bin")
with open("/content/chunks.pkl", "rb") as chunk_file:
    # pickle.load must only be used on trusted files; this one is written by the
    # indexing cell of this notebook.
    chunks = pickle.load(chunk_file)

# Encoder used for queries; same model name as the one used to embed the chunks.
smodel = SentenceTransformer("all-MiniLM-L6-v2")

def retrieve_docs(query, top_k=3):
    """Return the stored text chunks that the FAISS index ranks closest to ``query``.

    Args:
        query: Natural-language search string; embedded with ``smodel``.
        top_k: Maximum number of chunks to return.

    Returns:
        A list of at most ``top_k`` chunk strings, in the order returned by
        ``index.search``. Fewer are returned when the index cannot supply
        ``top_k`` neighbours.
    """
    qvec = smodel.encode([query], convert_to_numpy=True)
    _distances, neighbor_ids = index.search(qvec, top_k)
    # FAISS fills the id slots it cannot satisfy (index smaller than top_k) with -1.
    # Checking only `idx < len(chunks)` would let -1 through, and chunks[-1] would
    # then return the last chunk as a bogus, duplicated hit; require a non-negative id.
    return [chunks[idx] for idx in neighbor_ids[0] if 0 <= idx < len(chunks)]

# Smoke-test the retriever with a sample query and show the returned chunks.
print("Example retrieval for 'projects budgets':")
print(retrieve_docs(query="projects and their budgets", top_k=3))


In [None]:
# 🔹 Groq-Based LLM Integration for Finance Graph-RAG
!pip install langchain-groq --quiet

from langchain_groq import ChatGroq
import os

# Supply your Groq API key through the GROQ_API_KEY environment variable.
# setdefault() installs the placeholder only when the variable is missing, so a real
# key that is already configured in the environment is never overwritten.
os.environ.setdefault("GROQ_API_KEY", "your_groq_api_key_here")


# Initialize Groq LLM
llm = ChatGroq(model="llama-3.1-8b-instant", api_key=os.environ["GROQ_API_KEY"])

def llm_answer_groq(context, question):
    """Ask the module-level Groq chat model to answer ``question`` using ``context``.

    Args:
        context: Retrieved document text placed under the "Context:" heading.
        question: The question placed under the "Question:" heading.

    Returns:
        The ``content`` attribute of the response returned by ``llm.invoke``.
    """
    # Single-turn prompt: context first, then the question and a brevity instruction.
    prompt = f"Context:\n{context}\n\nQuestion: {question}\nAnswer clearly and concisely."
    return llm.invoke(prompt).content

# Example usage: retrieve chunks for one phrasing, then ask the LLM to list the budgets.
docs = retrieve_docs("What are the budgets for the projects?", top_k=3)
context = "\n\n".join(docs)
answer = llm_answer_groq(context, "List the projects and their budgets.")
print(answer)


In [None]:
def answer_nl(question):
    """Answer a natural-language question from the graph or from the documents.

    Routing is a keyword heuristic on the lower-cased question:

    * Relationship-style questions (containing "who reports", "who report", "head of",
      "works in", "reports to", "which department" or "manages") are answered from
      Neo4j through ``run_cypher``.
    * Everything else retrieves the top-3 chunks with ``retrieve_docs`` and asks the
      LLM through ``llm_answer_groq``.

    Args:
        question: The user's question.

    Returns:
        A string: the answer, or a short explanatory message when nothing in the
        graph matched or the LLM call failed.
    """
    ql = question.lower()
    graph_triggers = (
        "who reports", "who report", "head of", "works in",
        "reports to", "which department", "manages",
    )

    # ---- Document route -------------------------------------------------------
    if not any(trigger in ql for trigger in graph_triggers):
        docs = retrieve_docs(question, top_k=3)
        context = "\n\n".join(docs)
        try:
            # The notebook defines llm_answer_groq; no OpenAI helper exists.
            return llm_answer_groq(context, question)
        except Exception as e:
            # Best-effort fallback: still hand back the retrieved context, but name
            # the failure so configuration mistakes are not silently hidden.
            return (
                f"No LLM key found or LLM error ({type(e).__name__}: {e}). "
                f"Returning context:\n\n{context[:1000]}"
            )

    # ---- Graph route ----------------------------------------------------------
    # e.g. "Who reports to the Head of Investment Research?"
    if "who reports" in ql and "head of" in ql:
        dept_part = ql.split("head of")[-1].strip().strip("?.! ")
        # Compare case-insensitively: re-capitalising the question text with
        # str.title() would break on names such as "AI Research".
        cy = """
        MATCH (h:Employee)-[:HEADS]->(d:Department)<-[:WORKS_IN]-(e:Employee)
        WHERE toLower(h.title) = $title
        RETURN e.name AS employee
        """
        rows = run_cypher(cy, {"title": f"head of {dept_part}"})
        names = [r["employee"] for r in rows]
        return ", ".join(names) if names else "No matching employees found."

    # e.g. "Which department manages AI Credit Scoring project?" or
    #      "Which department manages the project AI Credit Scoring?"
    if "which department" in ql and "manages" in ql:
        m = re.search(r'manages\s+(.+?)[\s?.!]*$', ql)
        if m:
            # Drop an optional leading "the " / "project " and a trailing " project".
            pname = re.sub(r'^(?:the\s+)?(?:project\s+)?', '', m.group(1))
            pname = re.sub(r'\s+project$', '', pname).strip()
            if pname:
                cy = """
                MATCH (d:Department)-[:MANAGES]->(p:Project)
                WHERE toLower(p.name) = $pname
                RETURN d.name AS department
                """
                rows = run_cypher(cy, {"pname": pname})
                if rows:
                    return ", ".join(r["department"] for r in rows)
                return "No department found."

    # Fallback: look for any node name mentioned verbatim (case-insensitively) in the
    # question and summarise that node's relationships.
    all_nodes = run_cypher("MATCH (n) RETURN labels(n) as labels, n.name as name")
    names = [r["name"] for r in all_nodes if r.get("name")]
    found = [n for n in names if n.lower() in ql]
    if found:
        # Summarise only the first matching node.
        nm = found[0]
        cy = "MATCH (n {name:$name})-[r]-(m) RETURN type(r) as rel, m.name as other"
        rows = run_cypher(cy, {"name": nm})
        if not rows:
            return "No relations found."
        return f"Relations for {nm}: " + ", ".join(f"{r['rel']} -> {r['other']}" for r in rows)

    return "Couldn't map question to graph. Try a different phrasing."

# Try combined QA: each sample question is passed through answer_nl and printed
# as a Q/A pair.
examples = [
    "Who reports to the Head of Investment Research?",
    "Which department manages AI Credit Scoring project?",
    "What are the budgets of the projects?",
    "What recommendations does the report give?",
]

for sample_question in examples:
    print("\nQ:", sample_question)
    print("A:", answer_nl(sample_question))
