In [1]:
!pip install -U langchain_mistralai
!pip install git+https://github.com/explodinggradients/ragas
!pip install "mistralai~=1.1.0"
!pip install langchain-huggingface
!pip install PyPDF2
!pip install nltk

Collecting langchain_mistralai
  Downloading langchain_mistralai-0.2.1-py3-none-any.whl.metadata (2.4 kB)
Collecting httpx-sse<1,>=0.3.1 (from langchain_mistralai)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain-core<0.4.0,>=0.3.15 (from langchain_mistralai)
  Downloading langchain_core-0.3.19-py3-none-any.whl.metadata (6.3 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain-core<0.4.0,>=0.3.15->langchain_mistralai)
  Using cached jsonpatch-1.33-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting langsmith<0.2.0,>=0.1.125 (from langchain-core<0.4.0,>=0.3.15->langchain_mistralai)
  Downloading langsmith-0.1.143-py3-none-any.whl.metadata (13 kB)
Collecting jsonpointer>=1.9 (from jsonpatch<2.0,>=1.33->langchain-core<0.4.0,>=0.3.15->langchain_mistralai)
  Using cached jsonpointer-3.0.0-py2.py3-none-any.whl.metadata (2.3 kB)
Collecting orjson<4.0.0,>=3.9.14 (from langsmith<0.2.0,>=0.1.125->langchain-core<0.4.0,>=0.3.15->langchain_mistralai)
  Using cached

In [4]:
# add parent forlder to python path
import sys
sys.path.append("..")
from chat_solution.utils import load_env
load_env()

import os
MISTRAL_API_KEY = "<<YOUR KEY>>"
os.environ["MISTRAL_API_KEY"] = MISTRAL_API_KEY

Loading environment variables from /Users/jean.machado@getyourguide.com/prj/rag-workshop/.env


In [5]:
import os
import pickle
import numpy as np
import pandas as pd
from typing import List
from mistralai import Mistral
from sentence_transformers import SentenceTransformer

def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract text from a PDF file."""
    text = ""
    with open(pdf_path, "rb") as file:
        pdf_reader = PyPDF2.PdfReader(file)
        for page in pdf_reader.pages:
            page_text = page.extract_text() or ""  # Handle cases where text extraction might fail
            page_text = page_text.replace("\t", " ")
            text += page_text
    return text


def create_text_chunks(text: str, chunk_size: int, overlap_size: int) -> list[str]:
    """Create overlapping text chunks from the extracted text."""
    chunks = []
    for i in range(0, len(text) - chunk_size + 1, chunk_size - overlap_size):
        chunks.append(text[i : i + chunk_size])
    return chunks


class EmbeddingModel:
    def __init__(self, model_name="all-MiniLM-L6-v2"):
        self.model = SentenceTransformer(model_name)

    def create_embedding(self, text: str) -> np.ndarray:
        """Create an embedding for the given text."""
        return self.model.encode(text)


if __name__ == "__main__":
    model = EmbeddingModel()

    def similarity(text1: str, text2: str) -> float:
        """Calculate the cosine similarity between two texts."""
        embedding1 = model.create_embedding(text1)
        embedding2 = model.create_embedding(text2)
        return float(np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2)))

class EmbeddingDatabase:
    def __init__(self, embedding_model: EmbeddingModel):
        self.embedding_model = embedding_model
        self.db = pd.DataFrame([], columns=["text", "text_embedding"])

        state_file = os.getenv("EMBEDDING_DB_HOME")
        if state_file and os.path.exists(state_file):
            self.load_state(state_file)

    def add_documents(self, documents: List[str]):
        """Add documents to the embedding database."""
        data = [
            {
                "text": doc,
                "text_embedding": self.embedding_model.create_embedding(doc),
            }
            for doc in documents
        ]
        df = pd.DataFrame(data)
        self.db = pd.concat([self.db, df], ignore_index=True)

    def retrieve(self, query: str, top_k: int = 5) -> List[str]:
        """Retrieve the top_k most similar documents for the given query."""
        query_embedding = self.embedding_model.create_embedding(query)
        temp = self.db.copy()
        
        temp["query_similarity"] = temp.apply(
            lambda row: self._compute_cosine_similarity(row["text_embedding"], query_embedding),
            axis=1,
        )

        sorted_df = temp.sort_values("query_similarity", ascending=False)
        return list(sorted_df["text"][:top_k])

    def _compute_cosine_similarity(self, vec1: np.ndarray, vec2: np.ndarray) -> float:
        """Compute cosine similarity between two vectors."""
        dot_product = np.dot(vec1, vec2)
        norm_vec1 = np.linalg.norm(vec1)
        norm_vec2 = np.linalg.norm(vec2)
        return dot_product / (norm_vec1 * norm_vec2)

    def save_state(self):
        """Save the current state of the database to a file."""
        state_file = os.getenv("EMBEDDING_DB_HOME")
        if state_file:
            with open(state_file, "wb") as f:
                pickle.dump(self.db, f)

    def load_state(self, state_file: str):
        """Load the database state from a file."""
        with open(state_file, "rb") as f:
            self.db = pickle.load(f)


class LargeLanguageModel(object):
    def __init__(self, model="mistral-small-latest"):
        self.model = model
        api_key = os.environ.get("MISTRAL_API_KEY", None)
        if api_key is None:
            raise Exception(
                f"`MISTRAL_API_KEY` is None. Please set it in your environment variables."
            )
        self.client = Mistral(api_key=api_key)

    def call(self, prompt):
        chat_response = self.client.chat.complete(
            model=self.model,
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                },
            ],
        )
        return chat_response.choices[0].message.content
        
class QuestionAnsweringRAG:
    def __init__(self, llm: LargeLanguageModel, embedding_db: EmbeddingDatabase):
        self.llm = llm
        self.embedding_db = embedding_db

    def _create_prompt(self, context: str, message: str) -> str:
        return f"""Answer the question only using the provided content.

        Context: {context}
        
        User Question: {message}

        Respond in a natural way. If the information cannot be found in the context, respond with "It is out of my pay grade" and be more rude.
        """

    def query(self, query: str) -> str:
        documents = self.embedding_db.retrieve(query)
        context = "\n".join(documents)
        prompt = self._create_prompt(context, query)
        
        return self.llm.call(prompt)

  from tqdm.autonotebook import tqdm, trange


In [10]:
text = extract_text_from_pdf("food_lab_green_chapter.pdf")
text_chunks = create_text_chunks(text, chunk_size=1000, overlap_size=200)


# Initialize embedding model and database
model = EmbeddingModel()
db = EmbeddingDatabase(model)

# Add text chunks to the database and save the state
db.add_documents(text_chunks)
db.save_state()

FileNotFoundError: [Errno 2] No such file or directory: 'food_lab_green_chapter.pdf'

In [7]:
print(db.retrieve("How do you pick a green?"))

NameError: name 'db' is not defined

In [127]:
# User input and response handling
query1 = "what is up?"
query2 = "How do you pick a green?"
response = rag.query(query2)
print(response)

To pick a green, look for the shade that catches your eye. It's that simple!


In [168]:
# Convert each text chunk to a LangChain Document
from langchain.schema import Document

langchain_docs = [
    Document(page_content=text, metadata={"source": f"chunk_{i+1}"})
    for i, text in enumerate(text_chunks)
]

print(len(langchain_docs))
# Display documents with metadata
#for doc in langchain_docs:
#    print(doc.page_content, doc.metadata)

180


In [143]:
from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness
from langchain_mistralai import ChatMistralAI
from ragas.llms import LangchainLLMWrapper

evaluator_llm = LangchainLLMWrapper(ChatMistralAI(model="mistral-large-latest"))
metrics = [LLMContextRecall(), FactualCorrectness(), Faithfulness()]


In [144]:
from langchain_huggingface import HuggingFaceEmbeddings
from ragas.embeddings import LangchainEmbeddingsWrapper

generator_llm = LangchainLLMWrapper(ChatMistralAI(model="mistral-large-latest"))
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
generator_embeddings = LangchainEmbeddingsWrapper(embedding_model)

In [145]:
from ragas.testset import TestsetGenerator

generator = TestsetGenerator(llm=generator_llm, embedding_model=generator_embeddings)
dataset = generator.generate_with_langchain_docs(langchain_docs, testset_size=10)

#testset = generator.generategenerate_with_langchain_docs(langchain_docs[:10], test_size=10, 
#                                                 distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25})


In [128]:
## Prepared questions dataset from huggingface
from datasets import load_dataset
dataset = load_dataset("atitaarora/food_lab_green_qna", split="train")
len(dataset)

34

In [129]:
## Sample question data
dataset[2]

{'query': '{"question": "If the dark green leaves of chicory are removed and discarded, then how might this affect the overall texture and flavor of the salad?"}',
 'reference_contexts': ['ENDIVE AND CHICORY\nSALAD\nWITH GRAPEFRUIT,\nCRANBERRIES, AND FIG AND\nPUMPKIN SEED VINAIGRETTE\nSERVES 4\n1 head chicory, dark green leaves removed and\ndiscarded, pale white and yellow sections washed, spun\ndry, and torn into 2-inch pieces\n2 Belgian endives, bottoms trimmed, separated into'],
 'reference_answer': '{"answer": "Removing and discarding the dark green leaves of chicory results in a salad with a milder flavor and a more tender texture, as the pale white and yellow sections are less bitter and more delicate."}',
 'new_code': 'true',
 'node_metadata': {'excerpt_keywords': 'Keywords: Endive, Chicory, Salad, Grapefruit, Cranberries, Fig, Pumpkin Seed, Vinaigrette',
  'page_number': 90,
  'source_file_name': '/tmp/food_lab_green_chapter.pdf'},
 'question_type': 'CONDITIONAL',
 'metadata': 

In [None]:
#from ragas import evaluate
#results = evaluate(dataset=dataset, metrics=metrics, llm=evaluator_llm)

In [147]:
## In case if its needed
os.environ["OPENAI_API_KEY"] = "sk-XXXX"

In [155]:
## Preparation of Eval dataset for RAGAS (https://docs.ragas.io/en/stable/concepts/components/eval_sample/?h=singleturnsample#example)
##for ragas dataset needs to be in the designated format 
from ragas import EvaluationDataset, SingleTurnSample
from ragas.metrics import Faithfulness
from datasets import load_dataset
from ragas import evaluate
import time

samples = []
eval_size = 5

for i in range(eval_size):
    entry = dataset[i]
    
    # Perform the query with a delay to limit to 1 request per second
    user_query = entry['query']
    response = rag.query(user_query)
    
    sample = SingleTurnSample(
        user_input=user_query,
        reference=entry['reference_answer'],
        response=response,
        retrieved_contexts=db.retrieve(user_query),
    )
    samples.append(sample)
    
    # Wait for 1-2 second before proceeding to the next iteration as we are limited by Mistral API
    time.sleep(2)


In [157]:
import pandas as pd
df = pd.DataFrame(samples)

# Display the DataFrame as a table
print(df)

                                                   0  \
0  (user_input, {"question": "If all vegetables a...   
1  (user_input, {"question": "How might the combi...   
2  (user_input, {"question": "If the dark green l...   
3  (user_input, {"question": "What type of leaves...   
4  (user_input, {"question": "What ingredient is ...   

                                                   1  \
0  (retrieved_contexts, [ want to use plenty ofwa...   
1  (retrieved_contexts, [a dry cooking method, th...   
2  (retrieved_contexts, [ vigorously before using...   
3  (retrieved_contexts, [omatoes, toasted nuts, a...   
4  (retrieved_contexts, [aseddressings tend to be...   

                            2  \
0  (reference_contexts, None)   
1  (reference_contexts, None)   
2  (reference_contexts, None)   
3  (reference_contexts, None)   
4  (reference_contexts, None)   

                                                   3                        4  \
0  (response, If all vegetables are not the sa

In [163]:
## Actual Evaluation
from ragas.metrics import LLMContextPrecisionWithReference
from ragas.metrics import NonLLMContextRecall
from ragas.metrics import LLMContextRecall
from ragas.metrics import Faithfulness
from ragas.metrics import ResponseRelevancy

eval_dataset = EvaluationDataset(samples=samples)

faithfulness = Faithfulness()
context_precision = LLMContextPrecisionWithReference()
context_recall = NonLLMContextRecall()
llm_context_recall = LLMContextRecall()
answer_relevancy = ResponseRelevancy()

eval_results = evaluate(
        dataset=eval_dataset,
        metrics=[
                faithfulness,
                answer_relevancy,
                #context_recall, #This metric [non_llm_context_recall] that is used requires the following additional columns ['reference_contexts'] to be present in the dataset.
                llm_context_recall,
                context_precision,
        ],
       #llm=evaluator_llm
       raise_exceptions=False 
    )
#eval_results = evaluate(
#    dataset=eval_dataset,
#    metrics=[metric],
#llm=evaluator_llm
#)

Evaluating:   0%|          | 0/20 [00:00<?, ?it/s]

No statements were generated from the answer.


In [164]:
evaluation_result_df = eval_results.to_pandas()
evaluation_result_df.iloc[:5]

Unnamed: 0,user_input,retrieved_contexts,response,reference,faithfulness,answer_relevancy,context_recall,llm_context_precision_with_reference
0,"{""question"": ""If all vegetables are trimmed to...",[ want to use plenty ofwater—it retains its te...,"If all vegetables are not the same type, they ...","{""answer"": ""The natural diversity of vegetable...",0.5,0.92057,1.0,1.0
1,"{""question"": ""How might the combination of bee...","[a dry cooking method, they barely lose any ju...","""Beets bring a sweet and earthy flavor, while ...","{""answer"": ""The combination of beets, olive oi...",0.636364,0.9255,1.0,1.0
2,"{""question"": ""If the dark green leaves of chic...",[ vigorously before using.KNIFE SKILLS:How to ...,If the dark green leaves of chicory are remove...,"{""answer"": ""Removing and discarding the dark g...",0.5,0.0,1.0,0.7
3,"{""question"": ""What type of leaves are not reco...","[omatoes, toasted nuts, and herbs?Probably not...","""It's not my place to tell you that.""","{""answer"": ""Darker green leaves are not recomm...",,0.0,0.0,0.0
4,"{""question"": ""What ingredient is not recommend...",[aseddressings tend to be thicker and creamier...,It is out of my pay grade.,"{""answer"": ""The shake-it-in-a-jar method is no...",0.0,0.0,0.0,0.0
