In [1]:
# Set import path
import sys
sys.path.append("../finalRAGChatbot/modules")

# Reload on every notebook run
%load_ext autoreload
%autoreload 2


In [2]:
import sys
import os

# Add parent of 'modules' to sys.path
sys.path.append(os.path.abspath(".."))  # now it sees 'modules' as a sibling to 'notebooks'

from modules.ingest import load_and_split_docs, create_vectorstore
from modules.retriever import setup_rag_pipeline, answer_question
from modules.logging_db import create_logs_table, log_interaction


📂 Current working directory: c:\Users\mahmo\OneDrive\Bureau\PIDS_Code\Crypto-Fund-Due-Diligence-Automation\finalRAGChatbot\notebooks


In [3]:
# 1. Load and embed documents
splits = load_and_split_docs("../docs")
vectorstore = create_vectorstore(splits)

# 2. Setup the full RAG pipeline
setup_rag_pipeline(vectorstore)

# 3. Prepare logs table (creates SQLite table if missing)
create_logs_table()


  embedding = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


In [6]:
import uuid
session_id = str(uuid.uuid4())

# Ask a question that should be answered from the PDFs
question = "List me the Team members of coinbase venture ?"
answer, source = answer_question(question, session_id)

print("🔎 Source:", source)
print("💬 Answer:", answer)

# Log it
log_interaction(session_id, question, answer)


USER (hardcoded): List me the Team members of coinbase venture ?

GENERATING SEARCH QUERY.
EXA Search Results: 10 results found.
4
✅ First 500 chars of scraped text:
Aklil Ibssa serves as the Director of Corporate Development & Ventures at Coinbase, having joined the company pre-IPO as one of the initial hires in the team. Key contributions include leading seven acquisitions and approximately 40 investments, significantly enhancing Coinbase's standing as a foremost acquirer and investor in the crypto sector, totaling over 25 acquisitions and around 400 investments. Prior to Coinbase, Aklil held the position of Product Manager at LinkedIn, where involvement i...

Contains data needed: True
Classification: detailed
ASSISTANT: Based on the provided CONTEXT, there is no mention of team members for Coinbase Venture. The context only mentions Aklil Ibssa as the Director of Corporate Development & Ventures at Coinbase, but it does not provide information about any other team members.

The inf

In [7]:
import os
import pandas as pd
from dotenv import load_dotenv
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_ollama import OllamaLLM
from langchain.chains import create_retrieval_chain, create_history_aware_retriever
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.messages import HumanMessage, AIMessage
import sqlite3
import uuid

# -------------------------
# 2. Environment & Models
# -------------------------

In [8]:
load_dotenv(".env")

llm_local = OllamaLLM(
    model="llama3",
    base_url="http://127.0.0.1:11434"
)

embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
vectorstore = Chroma(
    persist_directory="db_rag",
    embedding_function=embedding_function,
    collection_name="rag-chatbot"
)

retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

  vectorstore = Chroma(


# -------------------------
# 3. Prompt Templates
# -------------------------

In [9]:
qa_prompt = ChatPromptTemplate.from_messages([
    ("system",
     "You are an assistant specialized in crypto due diligence. Use only the provided context."
     " If unsure, say: 'The information is not available in the document.'"
    ),
    ("system", "Context: {context}"),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{input}")
])

In [10]:
# -------------------------
# 4. RAG Chain Assembly
# -------------------------
question_answer_chain = create_stuff_documents_chain(llm_local, qa_prompt)

from langchain_core.prompts import ChatPromptTemplate as ContextPrompt
contextualize_q_prompt = ContextPrompt.from_messages([
    ("system", "Your task is to rewrite the user's question to make it self-contained."),
    MessagesPlaceholder("chat_history"),
    ("human", "{input}")
])

history_aware_retriever = create_history_aware_retriever(
    llm_local, retriever, contextualize_q_prompt
)

rag_chain_final = create_retrieval_chain(history_aware_retriever, question_answer_chain)

In [11]:
# -------------------------
# 5. SQLite for Sessions
# -------------------------
DB_NAME = "rag_app.db"

def get_db_connection():
    conn = sqlite3.connect(DB_NAME)
    conn.row_factory = sqlite3.Row
    return conn

def create_logs_table():
    conn = get_db_connection()
    conn.execute('''CREATE TABLE IF NOT EXISTS application_logs (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        session_id TEXT,
        user_query TEXT,
        gpt_response TEXT,
        model TEXT,
        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
    )''')
    conn.commit()
    conn.close()

create_logs_table()

def get_chat_history(session_id):
    conn = get_db_connection()
    cursor = conn.cursor()
    cursor.execute("SELECT user_query, gpt_response FROM application_logs WHERE session_id = ? ORDER BY created_at", (session_id,))
    messages = []
    for row in cursor.fetchall():
        messages.extend([
            HumanMessage(content=row['user_query']),
            AIMessage(content=row['gpt_response'])
        ])
    conn.close()
    return messages

def log_interaction(session_id, question, answer, model="llama3"):
    conn = get_db_connection()
    conn.execute("INSERT INTO application_logs (session_id, user_query, gpt_response, model) VALUES (?, ?, ?, ?)",
                 (session_id, question, answer, model))
    conn.commit()
    conn.close()


In [12]:
# -------------------------
# 6. Question CSV Loader & Routing
# -------------------------
def load_questions(csv_path):
    df = pd.read_csv(csv_path)
    def decide_source(row):
        if row["data_accessibility"] == "csv_only":
            return "csv"
        elif row["category_risk"] == "high" and row["severity"] in ["critical", "high"]:
            return "websearch"
        return "rag"
    df["source"] = df.apply(decide_source, axis=1)
    return df

In [13]:
# -------------------------
# 7. Answer Dispatcher
# -------------------------
def answer_question(question, session_id, source="rag"):
    chat_history = get_chat_history(session_id)
    
    if source == "rag":
        result = rag_chain_final.invoke({
            "input": question,
            "chat_history": chat_history
        })
        if "The information is not available in the document." not in result['answer']:
            return result['answer'], "rag"

    if source == "csv":
        return "[CSV answer stub]", "csv"

    from websearch.test import main
    return main(question), "web"

In [16]:
# -------------------------
# 8. Batch Run
# -------------------------
session_id = str(uuid.uuid4())
questions_df = load_questions("C:/Users/mahmo/OneDrive/Bureau/PIDS_Code/Crypto-Fund-Due-Diligence-Automation/funds_treat/notebooks/final_questions_metrics.csv")

results = []
for _, row in questions_df.iterrows():
    q = row['question']
    src = row['source']
    a, used = answer_question(q, session_id, source=src)
    log_interaction(session_id, q, a)
    results.append({"question": q, "answer": a, "source": used})

pd.DataFrame(results).to_csv("qa_results.csv", index=False)
print("✅ Answers saved to qa_results.csv")

KeyboardInterrupt: 

In [1]:
# FUND DUE DILIGENCE QA WORKSHOP NOTEBOOK
# =======================================
# Goal: Load 5 sample questions per category, answer each using RAG + Web + CSV context,
# show editable responses, enable user chat interaction, and produce category-wise scoring.
# This logic will later be deployed into a Django-based platform.

# --------------------------------------------------------
# 📦 SECTION 1: Setup - Imports, Paths, and Load Modules
# --------------------------------------------------------
import sys, os
import uuid
from dotenv import load_dotenv

# Add parent dir to path to access finalRAGChatbot.modules
sys.path.append(os.path.abspath(".."))

from modules.ingest import load_and_split_docs, create_vectorstore
from modules.retriever import setup_rag_pipeline, answer_question
from modules.logging_db import create_logs_table, log_interaction, get_chat_history
from modules.csv_query import load_all_csvs, find_csv_context
from modules.fund_name_resolver import extract_fund_name_from_question, extract_fund_name_from_text
from modules.scoring import compute_category_score, compute_overall_score

load_dotenv("../.env")
create_logs_table()

# --------------------------------------------------------
# 📁 SECTION 2: Load PDFs and Initialize Vector DB
# --------------------------------------------------------
splits = load_and_split_docs("../docs")
vectorstore = create_vectorstore(splits)
setup_rag_pipeline(vectorstore)

# --------------------------------------------------------
# 🧠 SECTION 3: Load Questions CSV and CSV Datasets
# --------------------------------------------------------
import pandas as pd

def get_top_priority_questions(df, top_n=5):
    """
    Select top N questions per category based on a weighted importance score.
    The formula prioritizes severity, strategic impact, and risk, while lowering score for accessible data.
    """
    def compute_priority(row):
        return (
            2 * row["severity"] + 
            2 * row["strategic_impact"] + 
            1.5 * row["category_risk"] - 
            1.2 * row["data_accessibility"]
        )

    df = df.copy()
    df["priority_score"] = df.apply(compute_priority, axis=1)

    top_questions = (
        df.sort_values("priority_score", ascending=False)
          .groupby("category")
          .head(top_n)
          .reset_index(drop=True)
    )
    return top_questions


# Sample 5 questions per category
questions_df = pd.read_csv("C:/Users/mahmo/OneDrive/Bureau/PIDS_Code/Crypto-Fund-Due-Diligence-Automation/funds_treat/notebooks/final_questions_metrics.csv")
sample_df = get_top_priority_questions(questions_df, top_n=5)


# --------------------------------------------------------
# 🤖 RISK SCORING FUNCTION - Called by Answer Pipeline
# --------------------------------------------------------
def extract_risk_score_from_answer(answer_text):
    """
    Ask the LLM to assign a RISK SCORE (0 = no risk, 10 = critical risk) to the provided answer.
    The score should reflect how concerning or alarming the answer is from an investment risk perspective.
    """

    from modules.llm_setup import llm_local
    from langchain_core.messages import HumanMessage

    system_prompt = (
        "You are acting as a senior investment due diligence analyst. Your task is to evaluate the following answer "
        "and assign a RISK SCORE between 0 and 10:\n\n"
        "- A score of 0 means the answer presents no investment concern at all.\n"
        "- A score of 5 means moderate or unclear risk is present.\n"
        "- A score of 10 means the answer indicates very high or critical investment risk.\n\n"
        "You must return a single number only, based strictly on the risk conveyed by the answer's content."
    )

    try:
        response = llm_local.invoke([
            HumanMessage(role="system", content=system_prompt),
            HumanMessage(role="user", content=answer_text)
        ])
        score_text = response.content.strip()
        return float(score_text)
    except:
        return 5.0  # neutral fallback


# --------------------------------------------------------
# 🧠 SECTION 4: Answer Sample Questions (RAG > Web > CSV)
# --------------------------------------------------------
datasets = load_all_csvs()
def run_sample_qa_workflow(fund_name, session_id):
    answered = []
    for _, row in sample_df.iterrows():
        question = row["question"]
        category = row["category"]

        # Try to resolve fund name if needed
        resolved_fund = extract_fund_name_from_question(question) or fund_name

        # Retrieve CSV context
        csv_context = find_csv_context(datasets, resolved_fund, category)

        # Use RAG and fallback to web+csv
        answer, source = answer_question(
            question, session_id, resolved_fund, datasets, category,
            metadata=row.to_dict(), csv_context=csv_context
        )

        # Placeholder: LLM must return an estimated risk_score in future implementation
        extracted_risk_score = extract_risk_score_from_answer(answer)  

        answered.append({
            "question": question,
            "category": category,
            "severity": row["severity"],
            "data_accessibility": row["data_accessibility"],
            "category_risk": row["category_risk"],
            "strategic_impact": row["strategic_impact"],
            "risk_score": extracted_risk_score,
            "answer": answer,
            "source": source
        })
        log_interaction(session_id, question, answer)

    return pd.DataFrame(answered)

# Run for a sample fund
session_id = str(uuid.uuid4())
FUND_NAME = "coinbase ventures"
answers_df = run_sample_qa_workflow(FUND_NAME, session_id)
answers_df.head()

# --------------------------------------------------------
# 📊 SECTION 5: Compute Score per Category + Global Score
# --------------------------------------------------------
def compute_category_score(df, category):
    """
    Compute category score based on LLM-evaluated risk_score (0-10), where lower is better.
    Score is (10 - risk_score) averaged per category.
    """
    try:
        return df[df["category"] == category]["risk_score"].apply(lambda x: max(0, 10 - float(x))).mean()
    except:
        return 0

def compute_overall_score(df):
    """
    Compute global investment score as mean of all category scores.
    """
    categories = df["category"].unique()
    if len(categories) == 0:
        return 0
    return sum(compute_category_score(df, cat) for cat in categories) / len(categories)

category_scores = answers_df["category"].unique()
category_scores = {
    category: compute_category_score(answers_df, category)
    for category in category_scores
}

overall_score = compute_overall_score(answers_df)

print("\n📈 Category Scores:")
print(category_scores)
print("\n💰 Overall Investment Score:", round(overall_score, 2))

# --------------------------------------------------------
# 💬 SECTION 6: Interact with the Chatbot About a Question
# --------------------------------------------------------
from modules.llm_setup import llm_local
from langchain_core.messages import HumanMessage, AIMessage

chat_history = get_chat_history(session_id)

user_input = "Can you explain more why this answer is risky?"
follow_up = llm_local.invoke([
    *chat_history,
    HumanMessage(content=user_input)
])

print("\n🔁 Follow-Up Answer:", follow_up.content)

# --------------------------------------------------------
# ✏️ SECTION 7: Edit a Question and Re-run Answer
# --------------------------------------------------------
edited_question = "Who exactly are the lead partners of coinbase ventures?"
edited_answer, edited_source = answer_question(
    edited_question, session_id, FUND_NAME, datasets, category="team")

print("\n✏️ Updated Answer:", edited_answer)
print("🔎 Source:", edited_source)


📂 Current working directory: c:\Users\mahmo\OneDrive\Bureau\PIDS_Code\Crypto-Fund-Due-Diligence-Automation\finalRAGChatbot\notebooks


  embedding = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


USER (hardcoded): How will you adapt to upcoming regulations like the EU's MiCA or the U.S. Executive Order on crypto?

GENERATING SEARCH QUERY.
EXA Search Results: 9 results found.
7
✅ First 500 chars of scraped text:
**An audio summary of this article is available in the player below. Scroll to keep reading. ****Listen and subscribe to Womble Perspectives wherever you get your podcasts.**


The Council of the European Union has approved the world's first major rules aimed at regulating the cryptocurrency industry, the [Markets in Crypto-Assets (MiCA) Regulation](https://www.esma.europa.eu/esmas-activities/digital-finance-and-innovation/markets-crypto-assets-regulation-mica#:~:text=The%20Markets%20in%20Crypto...

Contains data needed: Based on the article, it seems that the EU has taken a significant step towards regulating cryptocurrencies with the introduction of the Markets in Crypto-Assets (MiCA) regulation. This regulation aims to provide a harmonized framework for crypto-asset s

AttributeError: 'str' object has no attribute 'content'