In [16]:
# 1. INSTALL LIBRARIES
!pip install -q -U langchain-core langchain-community langchain-groq pypdf chromadb tiktoken langchain-text-splitters langchain-classic sentence-transformers

import os
import shutil
import uuid
import base64
import re
from google.colab import files
from IPython.display import display, Markdown, Image

# LangChain Imports
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_groq import ChatGroq
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_classic.chains import RetrievalQA
from langchain_core.prompts import PromptTemplate

# 2. SETUP GROQ API KEY
# SECURITY: never hardcode an API key in a notebook or source file. The key is
# read from the GROQ_API_KEY environment variable instead; set it in a private
# cell (os.environ["GROQ_API_KEY"] = "...") or through your platform's secret
# store before running this cell. Any key that was ever committed in this file
# must be treated as leaked and revoked.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
if not GROQ_API_KEY:
    print("WARNING: GROQ_API_KEY is not set; calls to the Groq API are expected to fail.")

def _extract_mermaid_code(full_text):
    """Return the 'graph TD' Mermaid source found in *full_text*, or None."""
    # Prefer a fenced block: it has an explicit end, so prose after the diagram
    # (for example a "[53]" citation) cannot be swallowed into the code.
    fenced = re.search(r'```(?:mermaid)?\s*(graph\s+TD.*?)```', full_text, re.DOTALL)
    if fenced:
        return fenced.group(1).strip()

    # No closing fence: fall back to everything from 'graph TD' to the last ']'.
    unfenced = re.search(r'(graph\s+TD.*\])', full_text, re.DOTALL)
    if unfenced:
        return unfenced.group(1).strip()
    return None


def render_mermaid(full_text):
    """Extract a 'graph TD' Mermaid diagram from *full_text* and display it.

    The diagram source is base64-encoded into a mermaid.ink image URL and shown
    with IPython's ``display(Image(url=...))``. When no diagram is found, a
    notice is printed instead.

    Args:
        full_text: Assistant answer that may contain Mermaid code, either
            inside a ``` fence or as bare text.

    Returns:
        None. Output is produced through ``print`` and ``display`` only.
    """
    code = _extract_mermaid_code(full_text)
    if code is None:
        print("Assistant provided code, but it didn't match the 'graph TD' format for visualization.")
        return

    # Best-effort rendering: a failed visualization must not end the chat
    # session, so any error is reported and swallowed here.
    try:
        # URL-safe base64 is required because the payload becomes part of the
        # URL path; the standard alphabet's '+' and '/' would corrupt it.
        base64_string = base64.urlsafe_b64encode(code.encode("utf-8")).decode("ascii")
        print("\n--- Visual Flowchart ---")
        display(Image(url="https://mermaid.ink/img/" + base64_string))
    except Exception as e:
        print(f"Visualization Error: {e}")

def _build_qa_chain(file_path, persist_dir):
    """Index the PDF at *file_path* and return a RetrievalQA chain over it.

    Returns None when splitting the loaded document produced no chunks.
    """
    # 4. PROCESS PDF
    loader = PyPDFLoader(file_path)
    docs = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=150)
    splits = text_splitter.split_documents(docs)
    if not splits:
        # Nothing to index: a retriever over zero chunks could not answer anything.
        return None

    # 5. LOCAL EMBEDDINGS (Deviation from Baseline)
    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings, persist_directory=persist_dir)

    # 6. LLM SETUP (Groq)
    llm = ChatGroq(temperature=0, groq_api_key=GROQ_API_KEY, model_name="llama-3.3-70b-versatile")

    template = """You are an AI Research Assistant.
    Answer ONLY using the provided context. If the answer is not in the context, say you don't know.

    IF the user asks for a flowchart:
    1. Briefly explain the steps in text.
    2. Provide the Mermaid JS code starting with 'graph TD'.

    Context: {context}
    Question: {question}
    Answer:"""

    QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

    # 7. RAG PIPELINE
    return RetrievalQA.from_chain_type(
        llm,
        retriever=vectorstore.as_retriever(search_kwargs={"k": 5}),
        chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
    )


def _chat_loop(qa_chain):
    """Read questions from stdin and print answers until the user types 'exit'."""
    while True:
        try:
            query = input("\nYour Question: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed input or an interrupted cell ends the session like 'exit'.
            print("\nSession ended.")
            break

        if not query:
            # Do not spend an LLM call on an empty question.
            continue
        if query.lower() == 'exit':
            break

        # Per-question boundary: report the failure and keep the session alive.
        try:
            response = qa_chain.invoke(query)
            result = response['result']
            display(Markdown(f"**Assistant:** {result}"))

            # Same pattern render_mermaid looks for, so "graph  TD" or
            # "graph\nTD" is not silently skipped.
            if re.search(r'graph\s+TD', result):
                render_mermaid(result)

        except Exception as e:
            print(f"Error: {e}")


def run_research_assistant():
    """Run the interactive PDF question-answering session.

    Asks for a PDF upload, indexes the first uploaded file into a temporary
    Chroma directory, then answers questions until the user types 'exit'.
    The temporary directory is removed however the session ends, including
    on errors and interrupts.

    Returns:
        None.
    """
    print("--- AI Research Assistant Project (MatSoc) ---")
    db_id = str(uuid.uuid4())[:8]
    persist_dir = f"./chroma_db_{db_id}"

    # 3. UPLOAD PDF
    print("\nStep 1: Upload your Research Paper (PDF)")
    uploaded = files.upload()
    if not uploaded:
        return
    # Only the first uploaded file is used.
    file_path = next(iter(uploaded))

    try:
        print("Step 2: Processing document...")
        qa_chain = _build_qa_chain(file_path, persist_dir)
        if qa_chain is None:
            print("No text could be extracted from the uploaded PDF; nothing to index.")
            return

        # 8. CHAT INTERFACE
        print("\n--- System Ready! ---")
        _chat_loop(qa_chain)
    finally:
        # Always drop the on-disk vector store, not only on a clean 'exit'.
        shutil.rmtree(persist_dir, ignore_errors=True)

# RUN
# Guarded so that importing this file does not start the interactive session.
# In a notebook cell __name__ is "__main__", so the call still runs there.
if __name__ == "__main__":
    run_research_assistant()

--- AI Research Assistant Project (MatSoc) ---

Step 1: Upload your Research Paper (PDF)


Saving DuanZhang_Separation_TASLP.pdf to DuanZhang_Separation_TASLP.pdf
Step 2: Processing document...

--- System Ready! ---

Your Question: Give me a flowchart of this paper.


**Assistant:** The steps in the flowchart are as follows: 
1. Calculate the SDR curve of the mixed signal for both the piccolo source and the voice source.
2. Compare the calculated SDR curves with the oracle lines, which represent the theoretical upper bounds of single-channel source separation performance.
3. Use the AHS model to calculate the minus log-likelihood of the piccolo signal and the voice signal.
4. Evaluate the performance of the algorithm by comparing the MPE results with the true MPE results and the results from [53].

Here is the Mermaid JS code for the flowchart:
```graph TD
    A[Calculate SDR curve of mixed signal] --> B[Compare with oracle lines]
    B --> C[Calculate minus log-likelihood using AHS model]
    C --> D[Evaluate performance by comparing MPE results]
    D --> E[Compare with true MPE results and [53]]
```


--- Visual Flowchart ---



Your Question: mermaid diagram.


**Assistant:** To create a flowchart for the provided context, we first need to identify the steps involved in the process. The context discusses the separation of harmonic instrumental sources and singing voices using the AHS (Amplitude Harmonic Structure) model. The steps can be briefly explained as follows:

1. Generate a mixed signal by adding the two sources (e.g., piccolo and voice, or oboe and euphonium) with equal energy or a specified energy ratio.
2. Learn the AHS models from the mixed signal for each source.
3. Use the learned AHS models to separate the sources from the mixed signal.
4. Evaluate the performance of the separation using metrics such as SDR (Signal-to-Distortion Ratio).

Here is the Mermaid JS code for the flowchart:
```mermaid
graph TD
    A[Generate Mixed Signal] --> B[Learn AHS Models]
    B --> C[Separate Sources]
    C --> D[Evaluate Performance]
    D --> E[Output Results]
```


--- Visual Flowchart ---



Your Question: bnanan


**Assistant:** I don't know.


Your Question: g


**Assistant:** I don't know. There is no question provided.


Your Question: exit
