In [10]:
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.schema.document import Document
from langchain_community.llms import Ollama
from langchain_community.document_loaders import PDFPlumberLoader
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain.prompts import PromptTemplate
from langchain_core.messages import HumanMessage, AIMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.chains.history_aware_retriever import create_history_aware_retriever

import os
import shutil
from dotenv import load_dotenv

# Load environment variables. Assumes that the project contains a .env file.
load_dotenv()
# ---- OpenAI API key (disabled) ----
# The LLM configured below is an Ollama model, so no OpenAI key is read by the
# cells shown here. To switch to OpenAI, import `openai`, uncomment the line
# below, and change "OPENAI_API_KEY" to the variable name used in your .env file.
# openai.api_key = os.environ['OPENAI_API_KEY']

# Directory where the Chroma vector store is persisted and later reopened.
# NOTE(review): the saved output of the indexing cell reports "chroma_new";
# confirm this value matches the store you intend to build and query.
CHROMA_PATH = "chroma"
# Folder that is scanned for source PDF files.
DATA_PATH = "data/books"
# LLM instance shared by the query-rewriting step and the answer chain.
cached_llm = Ollama(model="llama3")

# Earlier draft of the answer prompt. No cell visible here references it (the
# answer chain is built from `raw_prompt`); the name is kept so that any
# existing reference keeps working. Placeholders: {input} is the user's
# question, {context} is the text handed to the LLM alongside it.
aw_prompt = PromptTemplate.from_template(
    """ 
    <s>[INST] You are an AI assistant good at searching documents. If you do not have an answer from the provided information say so. [/INST] </s>
    [INST] {input}
           Context: {context}
           Answer:
    [/INST]
"""
)


# Prompt used by the answer chain. The leading instruction restricts the model
# to the supplied documents; {input} carries the user's question and {context}
# carries the document text passed in by the chain.
raw_prompt = PromptTemplate.from_template(
    "\n"
    "<s>[INST] You are an AI assistant skilled at searching documents. "
    "You should only provide answers based on the information contained in the provided documents. "
    "If the information is not available in the documents, state that you do not have an answer. "
    "[/INST] </s>\n"
    "[INST] {input}\n"
    "Context: {context}\n"
    "Answer:\n"
    "[/INST]\n"
)

In [7]:
# Embedding model shared by indexing (save_to_chroma) and querying
# (retrieve_t_chroma), so both sides embed text the same way.
embedding = FastEmbedEmbeddings()

def generate_data_store():
    """Run the indexing pipeline end to end.

    Loads the documents, splits them into chunks, and writes the chunks to
    the Chroma store, in that order.
    """
    save_to_chroma(split_text(load_documents()))


def load_documents():
    """Load the files under DATA_PATH that match the glob ``*.pdf``.

    Returns:
        Whatever ``DirectoryLoader.load()`` produces for those files.
    """
    pdf_loader = DirectoryLoader(DATA_PATH, glob="*.pdf")
    # Echo the loader object before loading starts.
    print(pdf_loader)
    return pdf_loader.load()


def split_text(
    documents: list[Document],
    chunk_size: int = 300,
    chunk_overlap: int = 100,
):
    """Split documents into overlapping character chunks and preview one.

    Args:
        documents: Documents to split (e.g. the result of load_documents()).
        chunk_size: Maximum chunk length, measured with ``len``. Defaults to
            the previously hard-coded 300.
        chunk_overlap: Number of characters shared between neighbouring
            chunks. Defaults to the previously hard-coded 100.

    Returns:
        The list of chunks produced by the splitter. May be empty when the
        input contains no text.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        # Stores each chunk's offset in its metadata under "start_index".
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

    # Print one chunk as a sanity check. The sample used to be chunks[10]
    # unconditionally, which raised IndexError for small corpora; fall back to
    # the last available chunk and skip the preview when there are none.
    if chunks:
        sample = chunks[min(10, len(chunks) - 1)]
        print(sample.page_content)
        print(sample.metadata)

    return chunks


def save_to_chroma(chunks: list[Document]):
    """Rebuild the Chroma store at CHROMA_PATH from ``chunks``.

    Any directory already present at CHROMA_PATH is deleted first, so the
    store only ever contains the chunks passed to this call.

    Args:
        chunks: Chunked documents to embed and store.
    """
    # Wipe the previous database directory, if there is one.
    already_there = os.path.exists(CHROMA_PATH)
    if already_there:
        shutil.rmtree(CHROMA_PATH)

    # Embed the chunks with the shared embedding model and write them out.
    vector_store = Chroma.from_documents(
        chunks,
        embedding,
        persist_directory=CHROMA_PATH,
    )
    vector_store.persist()

    chunk_count = len(chunks)
    print(f"Saved {chunk_count} chunks to {CHROMA_PATH}.")
    
# Build the vector store now. Note that save_to_chroma deletes any existing
# CHROMA_PATH directory before writing, so every run re-indexes from scratch.
generate_data_store()

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

<langchain_community.document_loaders.directory.DirectoryLoader object at 0x000002805C574500>
Split 1 documents into 698 chunks.
4

1.1 Performance Estimation: Generalization Performance vs. Model Selection . . . . .

4

1.2 Assumptions and Terminology . . . . . . . . . . . . . . . . . . . . . . . . . . . .

5

1.3 Resubstitution Validation and the Holdout Method . . . . . . . . . . . . . . . . . .

7

1.4 Stratiﬁcation .
{'source': 'data\\books\\Model Evaluation, Model Selection, and Algorithm in ML.pdf', 'start_index': 1915}
Saved 698 chunks to chroma_new.


In [11]:
# Conversation so far, as alternating HumanMessage / AIMessage objects.
# retrieve_t_chroma appends one pair per call.
chat_history = []


def retrieve_t_chroma(query):
    """Answer ``query`` from the persisted Chroma store, using prior turns.

    Opens the store at CHROMA_PATH, retrieves up to 20 chunks whose relevance
    score is at least 0.5, and asks the LLM to answer from them. The
    accumulated ``chat_history`` is passed to the chain so that follow-up
    questions can be rewritten into a standalone search query; on the first
    call the history is empty and the raw query is used for retrieval.

    Args:
        query: The user's question.

    Returns:
        The module-level ``chat_history`` list, after this turn's question
        and answer have been appended to it.
    """
    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding)
    retriever = db.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={
            "k": 20,
            "score_threshold": 0.5,
        },
    )

    # Prompt that turns (history + latest question) into a search query.
    retriever_prompt = ChatPromptTemplate.from_messages(
        [
            MessagesPlaceholder(variable_name="chat_history"),
            ("human", "{input}"),
            (
                "human",
                "Given the above conversation, generate a search query to lookup in order to get information relevant to the conversation",
            ),
        ]
    )

    history_aware_retriever = create_history_aware_retriever(
        llm=cached_llm, retriever=retriever, prompt=retriever_prompt
    )
    document_chain = create_stuff_documents_chain(cached_llm, raw_prompt)
    retrieval_chain = create_retrieval_chain(
        history_aware_retriever,
        document_chain,
    )

    # The history must be supplied explicitly: without a "chat_history" key the
    # history-aware retriever skips query rewriting and earlier turns are
    # ignored. A copy is passed so the appends below cannot affect the call.
    result = retrieval_chain.invoke(
        {"input": query, "chat_history": list(chat_history)}
    )
    answer = result["answer"]
    print("Answer **:: ", answer)

    chat_history.append(HumanMessage(content=query))
    chat_history.append(AIMessage(content=answer))

    # .get() keeps a chunk without a "source" entry from raising KeyError.
    sources = [
        {"source": doc.metadata.get("source"), "page_content": doc.page_content}
        for doc in result["context"]
    ]

    response_answer = {"answer": answer, "sources": sources}
    print(response_answer)
    print("!!!!!!!!!!!!!!!!!!")
    return chat_history

In [12]:
# Ask a first question. retrieve_t_chroma prints the answer and its sources;
# the value displayed for this cell is the chat history list it returns.
query = "what is model Evaluation?"
retrieve_t_chroma(query)

Answer **::  According to the provided documents, model evaluation refers to the process of estimating and assessing the performance of a machine learning model. This involves techniques for estimating the uncertainty of estimated model performance as well as the model's variance and stability. Model evaluation is an essential part of the machine learning pipeline, along with model selection and algorithm selection.

Model evaluation is used to assess the predictive performance of a model and provides insights into the model's generalization ability, bias-variance trade-off, and uncertainty estimates. It also enables comparison between different models or algorithms to determine which one performs best.

The document highlights several techniques for model evaluation, including k-fold cross-validation, holdout method, and various statistical tests. These techniques help ensure that the evaluated performance of a model is accurate and reliable, allowing researchers and practitioners to 

[HumanMessage(content='what is model Evaluation?'),
 AIMessage(content="According to the provided documents, model evaluation refers to the process of estimating and assessing the performance of a machine learning model. This involves techniques for estimating the uncertainty of estimated model performance as well as the model's variance and stability. Model evaluation is an essential part of the machine learning pipeline, along with model selection and algorithm selection.\n\nModel evaluation is used to assess the predictive performance of a model and provides insights into the model's generalization ability, bias-variance trade-off, and uncertainty estimates. It also enables comparison between different models or algorithms to determine which one performs best.\n\nThe document highlights several techniques for model evaluation, including k-fold cross-validation, holdout method, and various statistical tests. These techniques help ensure that the evaluated performance of a model is ac