In [None]:
import getpass
import os

# Load environment variables from a .env file if one exists.
# NOTE(review): presumably this is how the API key for the OpenAI clients
# created in later cells is supplied — confirm against the project's .env.
from dotenv import load_dotenv
load_dotenv()

In [None]:
from langchain.document_loaders.base import BaseLoader
from langchain.schema import Document
import re
from typing import List, TypedDict

class CustomMarkdownLoader(BaseLoader):
    """Load one Markdown file as a single ``Document``, lifting frontmatter into metadata.

    A leading block delimited by ``---`` lines is treated as frontmatter and
    parsed as flat ``key: value`` pairs (nested YAML, lists and multi-line
    values are not interpreted). Everything after that block becomes the page
    content. The file's base name is always stored under ``metadata['source']``.
    """

    # The frontmatter must start on the very first line. Delimiter lines may
    # carry trailing blanks, the block may be empty, and the closing delimiter
    # may be the last line of the file.
    _FRONTMATTER_RE = re.compile(
        r'^---[ \t]*\n(.*?)^---[ \t]*$\n?(.*)',
        re.DOTALL | re.MULTILINE,
    )

    def __init__(self, file_path: str):
        """Remember the path of the Markdown file; nothing is read until ``load``."""
        self.file_path = file_path

    @staticmethod
    def _parse_frontmatter(frontmatter: str) -> dict:
        """Turn flat ``key: value`` lines into a dict of stripped strings."""
        metadata = {}
        for line in frontmatter.split('\n'):
            # Skip comment lines and lines without a separator so they do not
            # end up as bogus metadata keys.
            if line.lstrip().startswith('#') or ':' not in line:
                continue
            # Split on the first colon only, so values may themselves contain colons.
            key, value = line.split(':', 1)
            key = key.strip()
            if key:
                metadata[key] = value.strip()
        return metadata

    def load(self) -> List[Document]:
        """Read the file and return it as a one-element list of ``Document``.

        Returns:
            ``[Document]`` whose metadata holds the parsed frontmatter (if any)
            plus ``'source'``. With frontmatter, the page content is the
            stripped body after it; without, it is the file content as read.

        Raises:
            OSError: if the file cannot be opened.
            UnicodeDecodeError: if the file is not valid UTF-8.
        """
        # 'utf-8-sig' decodes plain UTF-8 unchanged but drops a leading
        # byte-order mark, which would otherwise hide the opening '---'.
        with open(self.file_path, 'r', encoding='utf-8-sig') as file:
            content = file.read()

        source = os.path.basename(self.file_path)

        # Extract frontmatter if it exists
        frontmatter_match = self._FRONTMATTER_RE.match(content)
        if not frontmatter_match:
            return [Document(page_content=content, metadata={'source': source})]

        metadata = self._parse_frontmatter(frontmatter_match.group(1))
        # 'source' is set last so it always wins over a frontmatter key of the same name.
        metadata['source'] = source
        content_body = frontmatter_match.group(2)
        return [Document(page_content=content_body.strip(), metadata=metadata)]


# Directory scanned for the Markdown files to index.
DATA_DIR = "./../data/translated/"

# os.listdir() returns entries in arbitrary order; sorting keeps the document
# order (and therefore the chunk order downstream) identical between runs.
documents = []
for filename in sorted(os.listdir(DATA_DIR)):
    if filename.endswith('.md'):
        loader = CustomMarkdownLoader(os.path.join(DATA_DIR, filename))
        documents.extend(loader.load())
print(f"Loaded {len(documents)} documents.")

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate

# Text splitting: 800-character chunks with a 100-character overlap.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=100,
    length_function=len,
)

chunks = text_splitter.split_documents(documents)

# Give every chunk a stable id of the form "<source>:<position within source>".
# Without explicit ids the store generates random ones, so each re-run of this
# cell would append a second copy of every chunk to the persisted collection.
chunk_ids = []
chunks_seen_per_source = {}
for chunk in chunks:
    source = chunk.metadata.get("source", "unknown")
    position = chunks_seen_per_source.get(source, 0)
    chunks_seen_per_source[source] = position + 1
    chunk_ids.append(f"{source}:{position}")

# Create vector store (persisted on disk under ./livre_db).
# NOTE(review): chunks left over from files that were removed or shortened stay
# in the collection; delete the directory to rebuild from scratch.
embeddings = OpenAIEmbeddings()
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    ids=chunk_ids,
    persist_directory="./livre_db"
)

llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.1)

In [None]:
# Prompt text for the answering step. It has two placeholders: {context} for the
# retrieved excerpts and {question} for the user's question.
template = """You are an expert assistant for exploring a book. Based on the provided excerpts, answer the question precisely.

INSTRUCTIONS:
- Quote relevant passages in quotation marks
- Indicate source chapter: (Chapter: "title")
- If info isn't in excerpts, say so clearly
- Synthesize if multiple excerpts address the topic

BOOK EXCERPTS:
{context}

QUESTION: {question}

STRUCTURED RESPONSE:"""

# Wrap the raw string as a PromptTemplate with both placeholders declared.
PROMPT = PromptTemplate(
    template=template,
    input_variables=["context", "question"]
)

In [None]:
class State(TypedDict):
    """State dictionary handed to the graph steps defined in this notebook."""
    # Question supplied by the caller when the graph is invoked.
    question: str
    # Documents returned by the similarity search in retrieve().
    context: List[Document]
    # Value produced by answer() from the language-model call.
    answer: str

In [None]:
def retrieve(state: State):
    """Look up the 20 stored chunks most similar to the question.

    Returns a partial state update holding them under ``"context"``.
    """
    query = state["question"]
    return {"context": vectorstore.similarity_search(query, k=20)}

def answer(state: State):
    """Answer the question from the retrieved excerpts.

    Joins the page content of every retrieved document with blank lines, fills
    the prompt template with that text and the question, and sends the result
    to the chat model.

    Returns:
        A partial state update ``{"answer": <str>}`` holding the text of the
        model's reply.
    """
    # NOTE(review): only page_content is forwarded, so the chapter the prompt
    # asks the model to cite is not visible to it — consider prefixing each
    # excerpt with its metadata.
    excerpts = "\n\n".join(doc.page_content for doc in state["context"])
    prompt = PROMPT.format(context=excerpts, question=state["question"])
    # `invoke` accepts a plain string prompt; calling the model object directly
    # is the deprecated interface. The reply is a message object, so keep only
    # its text to match `State.answer: str`.
    response = llm.invoke(prompt)
    return {"answer": response.content}

In [None]:
from langgraph.graph import START, StateGraph

# Wire the two steps into a linear graph: START -> retrieve -> answer.
graph_builder = StateGraph(State)
graph_builder.add_sequence([retrieve, answer])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

In [None]:
# Run the compiled graph once for a sample question.
initial_state = {"question": "Who is Shirone?"}
result = graph.invoke(initial_state)

# Report how many excerpts were retrieved, then the generated answer.
print(f"Context: {len(result['context'])}\n\n")
print(f"Answer: {result['answer']}")