In [1]:
from pathlib import Path
import os
import re
from typing import List

from llama_index.core import (
    VectorStoreIndex,
    Settings,
    Document
)
from llama_index.core import (
    VectorStoreIndex,
    Settings,
    Document
)
from llama_index.core.prompts import PromptTemplate
from llama_index.core.chat_engine import SimpleChatEngine
from llama_index.core.prompts import PromptTemplate, MessageRole

from llama_index.llms.ollama import Ollama
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.readers.base import BaseReader
from llama_index.readers.file.markdown import MarkdownReader

class ObsidianProcessor:
    """Clean Obsidian-flavoured markdown and split it into chunk-sized documents.

    Args:
        chunk_size: Chunk size handed to the ``SentenceSplitter``.
        chunk_overlap: Overlap between consecutive chunks, also handed to the
            ``SentenceSplitter``.
    """

    # ``[[target]]`` or ``[[target|alias]]``. Group 1 is the text shown to the
    # reader: the alias when one is given, otherwise the link target.
    _WIKILINK_RE = re.compile(r'\[\[(?:[^\]|]+\|)?([^\]]+)\]\]')
    # URL pattern, compiled once instead of being re-parsed on every call.
    _URL_RE = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )

    def __init__(self, chunk_size: int = 512, chunk_overlap: int = 50):
        self.node_parser = SentenceSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap
        )

    def clean_text(self, text: str) -> str:
        """Strip Obsidian link markup, URLs and blank lines from ``text``.

        ``[[Note]]`` becomes ``Note`` and ``[[Note|shown text]]`` becomes
        ``shown text``. URLs are deleted outright, and lines that are empty or
        whitespace-only are dropped.

        Returns:
            The cleaned text; an empty string when nothing is left.
        """
        # Unwrap internal links, keeping only the displayed text.
        text = self._WIKILINK_RE.sub(r'\1', text)
        # Remove URLs.
        text = self._URL_RE.sub('', text)
        # Remove empty lines.
        return '\n'.join(line for line in text.split('\n') if line.strip())

    # Annotations are quoted so they are not evaluated when the class is defined.
    def process_documents(self, documents: "List[Document]") -> "List[Document]":
        """Clean each document, split the survivors, and return one document per chunk.

        Documents whose text is blank before or after cleaning are skipped.
        The remaining input documents are updated in place with their cleaned
        text before being split. The returned documents are built from the
        chunk text only; no other field of the source document is passed on.
        """
        cleaned_docs = []
        for doc in documents:
            if not doc.text.strip():  # Skip empty documents
                continue
            cleaned_text = self.clean_text(doc.text)
            if not cleaned_text:  # Nothing left after cleaning
                continue
            doc.text = cleaned_text
            cleaned_docs.append(doc)

        # Convert nodes back to documents
        nodes = self.node_parser.get_nodes_from_documents(cleaned_docs)
        return [Document(text=node.text) for node in nodes]

class MyObsidianReader(BaseReader):
    """Load every ``.md`` file found under a directory tree.

    Args:
        input_dir: Root directory to walk.
    """

    # Directory name that is never descended into, in addition to dot-directories.
    _SKIPPED_DIR = "Images_Media"

    def __init__(self, input_dir: str):
        self.input_dir = Path(input_dir)

    def my_load_data(self):
        """Walk ``input_dir`` and return the documents parsed from each ``.md`` file.

        Directories named ``Images_Media`` and directories whose name starts
        with ``.`` are pruned from the walk.
        """
        markdown_reader = MarkdownReader()  # one parser instance for the whole walk
        docs = []
        for dirpath, dirnames, filenames in os.walk(self.input_dir):
            # Prune in place: os.walk only skips directories removed from this list.
            dirnames[:] = [
                d for d in dirnames
                if d != self._SKIPPED_DIR and not d.startswith(".")
            ]

            for filename in filenames:
                if filename.endswith(".md"):
                    docs.extend(markdown_reader.load_data(Path(dirpath) / filename))
        return docs

    def load_data(self, *args, **load_kwargs):
        """Standard reader entry point; delegates to :meth:`my_load_data`.

        Positional and keyword arguments are accepted for signature
        compatibility and ignored, since the directory is fixed at construction.
        """
        return self.my_load_data()


class PersonalObsidianChat:
    """Chat, query and search helpers built on top of a single index.

    The constructor derives a retriever and a query engine from ``index``,
    both limited to the top 3 matches. ``chat`` and ``query`` return a
    message string in every case (errors included); ``search_notes`` returns
    a list and never raises.
    """

    _EMPTY_QUERY_MESSAGE = "Please provide a valid query."

    def __init__(self, index):
        self.index = index
        self.retriever = self.index.as_retriever(
            similarity_top_k=3
        )

        # Create a more focused query engine
        self.query_engine = self.index.as_query_engine(
            similarity_top_k=3,
            response_mode="tree_summarize",  # Changed response mode
            streaming=False
        )

    @staticmethod
    def _is_blank(query) -> bool:
        """Return True for ``None``, an empty string, or whitespace only."""
        return not query or query.isspace()

    @staticmethod
    def _result_text(result):
        """Return the text of a retrieval result, wrapped in ``.node`` or not."""
        return result.node.text if hasattr(result, 'node') else result.text

    def chat(self, query: str) -> str:
        """Answer ``query`` using retrieved note excerpts as explicit context.

        The retrieved excerpts are embedded into a prompt that is then sent to
        the query engine.

        NOTE(review): the query engine is itself configured with
        ``similarity_top_k``, so it presumably retrieves again for the composed
        prompt -- confirm this double retrieval is intended.

        Returns:
            The answer text, or a short message when the query is blank,
            nothing is retrieved, the answer is empty, or an error occurs.
        """
        try:
            # First check if the query is empty or just whitespace
            if self._is_blank(query):
                return self._EMPTY_QUERY_MESSAGE

            print(f"Processing chat query: {query}")  # Debug print

            # Get relevant context
            nodes = self.retriever.retrieve(query)
            if not nodes:
                return "No relevant information found in your notes."

            # Build context string
            context = "\n".join(
                f"Context {position}: {self._result_text(node)}"
                for position, node in enumerate(nodes, 1)
            )

            # Form complete query with context
            complete_query = f"""
            Question: {query}
            Based on the following context from notes:
            {context}

            Please provide a focused answer specifically addressing the question.
            """

            response = self.query_engine.query(complete_query)
            # Mirror query(): never hand back an empty or missing answer.
            if not response or not response.response:
                return "No relevant information found in your notes."
            return response.response

        except Exception as e:
            return f"Error in chat: {str(e)}"

    def query(self, query: str) -> str:
        """Send ``query`` straight to the query engine.

        Returns:
            The answer text, or a short message when the query is blank, the
            answer is empty, or an error occurs.
        """
        try:
            if self._is_blank(query):
                return self._EMPTY_QUERY_MESSAGE

            print(f"Processing direct query: {query}")  # Debug print

            response = self.query_engine.query(query)
            if not response or not response.response:
                return "No relevant information found."

            return response.response

        except Exception as e:
            return f"Error in query: {str(e)}"

    def search_notes(self, query: str) -> list:
        """Return the raw retrieval results for ``query``.

        Returns:
            A list of the retriever's results, or an empty list when the query
            is blank, the retriever is missing, nothing matches, or an error
            occurs (the traceback is printed in that case).
        """
        print("=== Debug: Starting search_notes method ===")  # Debug print
        try:
            print(f"Debug: Query received: '{query}'")

            # Same blank-input rule as chat() and query().
            if self._is_blank(query):
                print("Debug: Empty query, nothing to search")
                return []

            if not self.retriever:
                print("Debug: Retriever is not initialized")
                return []

            print("Debug: Calling retriever.retrieve()")
            results = self.retriever.retrieve(query)

            print(f"Debug: Retriever returned {len(results) if results else 0} results")

            if not results:
                print("Debug: No results found")
                return []

            print("Debug: Processing results")
            for result in results:
                if hasattr(result, 'node'):
                    print(f"Debug: Found NodeWithScore object, score: {result.score}")
                else:
                    print("Debug: Found regular Node object")
            # Every result is kept; the loop above only reports what was found.
            processed_results = list(results)

            print(f"Debug: Returning {len(processed_results)} processed results")
            return processed_results

        except Exception as e:
            print(f"Debug: Error occurred in search_notes: {str(e)}")
            import traceback
            print("Debug: Full traceback:")
            print(traceback.format_exc())
            return []

def init_llm(
    model: str = "tinydolphin",
    request_timeout: float = 120.0,
    temperature: float = 0.7,
    context_window: int = 2048,
):
    """Build the Ollama LLM client.

    Called with no arguments this produces the same configuration as before;
    each setting can be overridden by keyword.

    Args:
        model: Name of the Ollama model to use.
        request_timeout: Request timeout passed to the client.
        temperature: Sampling temperature passed to the client.
        context_window: Context window size passed to the client.

    Returns:
        The ``Ollama`` instance, or ``None`` when construction raises (the
        error is printed rather than propagated).
    """
    try:
        return Ollama(
            model=model,
            request_timeout=request_timeout,
            temperature=temperature,
            context_window=context_window,
        )
    except Exception as e:
        print(f"Error initializing LLM: {e}")
        return None

def create_enhanced_index(documents: List[Document]):
    """Clean and chunk ``documents``, then build a vector index from the chunks.

    Returns:
        The ``VectorStoreIndex``, or ``None`` when no chunk survives
        processing or when any step raises (a message is printed either way).
    """
    try:
        chunks = ObsidianProcessor(
            chunk_size=512, chunk_overlap=50
        ).process_documents(documents)

        if chunks:
            return VectorStoreIndex.from_documents(chunks, show_progress=True)

        print("Warning: No documents to index")
    except Exception as e:
        print(f"Error creating index: {e}")
    # Reached on both the "nothing to index" and the error path.
    return None



In [2]:
from llama_index.core.prompts import ChatMessage, MessageRole

# Prompt that frames the assistant as a note-search helper. It exposes two
# template variables, {context} and {query}.
chat_prompt = PromptTemplate(
    template="".join([
        "You are an AI assistant helping to search and analyze personal notes from an Obsidian vault. ",
        "Use the following context from the notes to answer the question. ",
        "If you cannot find relevant information in the context, say 'I cannot find relevant information about that in your notes.'\n\n",
        "Context: {context}\n\n",
        "Human: {query}\n\n",
        "Assistant: ",
    ])
)

In [3]:
# Main execution
# Runs the whole pipeline at cell level: build the LLM, configure the global
# Settings object, load the vault, and build the vector index. The statements
# are order-dependent: Settings is populated before the index is created.

# Initialize LLM
# init_llm() returns None when construction fails; the value is assigned to
# Settings.llm below without a None check.
llm = init_llm()

# Set up global settings
Settings.llm = llm
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
# Same chunking values that create_enhanced_index() passes to ObsidianProcessor.
Settings.chunk_size = 512
Settings.chunk_overlap = 50

# Load and process documents
# NOTE(review): the vault path is hard-coded to one user's home directory;
# change it before running on another machine.
reader = MyObsidianReader(input_dir="/Users/cairo/Library/Mobile Documents/iCloud~md~obsidian/Documents")


documents = reader.my_load_data()

# Create index
# create_enhanced_index() returns None when there is nothing to index or when
# indexing fails, so `index` may be None after this line.
index = create_enhanced_index(documents)

# Create chat interface

# obsidian_chat = PersonalObsidianChat(index)
# response = obsidian_chat.chat("What are my notes about social perception?")
# print(response)


# # Interactive chat loop
# while True:
#     query = input("You: ")
#     if query.lower() in ['quit', 'exit']:
#         break
#     response = obsidian_chat.chat(query)
#     print(f"Bot: {response}")





Parsing nodes:   0%|          | 0/4872 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/784 [00:00<?, ?it/s]

In [4]:
def _prompt_menu_choice() -> str:
    """Print the menu and keep asking until the user enters 1, 2, 3 or 4."""
    print("\nOptions:")
    print("1. Chat")
    print("2. Query")
    print("3. Search notes")
    print("4. Exit")

    while True:
        choice = input("Choose an option (1-4): ").strip()
        print(f"Debug: User selected option: '{choice}'")

        if choice in ("1", "2", "3", "4"):
            return choice
        if choice:  # Only show error for non-empty invalid input
            print("Invalid option. Please enter 1, 2, 3, or 4.")


def _print_search_results(results, preview_chars: int = 300) -> None:
    """Print score and a text preview for each retrieval result."""
    for i, result in enumerate(results, 1):
        print(f"\nResult {i}:")
        if hasattr(result, 'node'):
            text = result.node.text
            score = result.score
        else:
            text = result.text
            score = getattr(result, 'score', 'N/A')

        print(f"Relevance Score: {score}")
        if text:
            # Only mark the preview as truncated when text was actually cut.
            ellipsis = "..." if len(text) > preview_chars else ""
            print(f"Content: {text[:preview_chars]}{ellipsis}")
        else:
            print("No content available")
        print("-" * 50)


def _run_search_mode(obsidian_chat) -> None:
    """Ask for one search term, run it, print the results, return to the menu.

    The prompt repeats only while the input is rejected (empty, or the
    literal "3"); entering 'back' returns without searching.
    """
    print("\nSearch Mode:")
    while True:
        search_term = input("Enter your search term (or 'back' to return to menu): ").strip()
        if search_term.lower() == 'back':
            return

        if not search_term:
            print("Search term cannot be empty. Please try again.")
            continue

        # NOTE(review): presumably guards against the menu choice being read
        # again as the search term -- confirm the original intent.
        if search_term == "3":
            print("Invalid search term. Please enter your actual search keywords.")
            continue

        print(f"Debug: Searching for: '{search_term}'")
        results = obsidian_chat.search_notes(search_term)

        if results:
            _print_search_results(results)
        else:
            print("No results found for your search term.")
        return  # one search per visit to search mode


def chat_interface():
    """Run the interactive console menu until the user chooses Exit.

    Builds a ``PersonalObsidianChat`` from the module-level ``index`` and
    dispatches each menu choice to chat, query or search.
    """
    print("Debug: Creating PersonalObsidianChat instance")
    obsidian_chat = PersonalObsidianChat(index)

    while True:
        choice = _prompt_menu_choice()

        if choice == "4":
            print("Goodbye!")
            break

        if choice == "3":
            _run_search_mode(obsidian_chat)

        elif choice == "1":
            chat_input = input("Enter your question: ").strip()
            if chat_input:
                response = obsidian_chat.chat(chat_input)
                print(f"\nResponse: {response}")

        elif choice == "2":
            query = input("Enter your query: ").strip()
            if query:
                response = obsidian_chat.query(query)
                print(f"\nResponse: {response}")

In [None]:

# Print a short status report for the objects built in the earlier cells,
# then start the interactive menu.
for _label, _value in (
    ("Number of documents loaded:", len(documents)),
    ("Index created successfully:", index is not None),
    ("LLM initialized:", llm is not None),
):
    print(_label, _value)

chat_interface()

Number of documents loaded: 2306
Index created successfully: True
LLM initialized: True
Debug: Creating PersonalObsidianChat instance

Options:
1. Chat
2. Query
3. Search notes
4. Exit
Debug: User selected option: '3'

Search Mode:
Debug: Searching for: 'social'
=== Debug: Starting search_notes method ===
Debug: Query received: 'social'
Debug: Calling retriever.retrieve()
Debug: Retriever returned 3 results
Debug: Processing results
Debug: Found NodeWithScore object, score: 0.7557319065455824
Debug: Found NodeWithScore object, score: 0.7555729386853011
Debug: Found NodeWithScore object, score: 0.7441187710480391
Debug: Returning 3 processed results

Result 1:
Relevance Score: 0.7557319065455824
Content: Social media项目
background/problem: Mkt这块，研究人们如何表达自己的人设的东西很多，比如我最近买了什么，人们会不会说，用什么方式来说，叫word of mouth。但研究别人如何来看待这些人设的很少。所以我这个项目是研究audience如何看待人们在socail media上发的东西。
我觉得线上和线下的一个主要的区别是，social media上一个人的身份是很多元的，比如一个人可以展现他是父亲，是球迷，是摇滚乐粉丝，但线下其实其他人很多时候看不到这种多元化的人设，比如我一个经常和我一起上课的同学，可能我只能看到他是学霸这一点，并