In [12]:
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from sklearn.metrics.pairwise import cosine_similarity

In [13]:
from tqdm.notebook import tqdm
import pandas as pd
from typing import Optional, List, Tuple
from datasets import Dataset
import matplotlib.pyplot as plt

pd.set_option(
    "display.max_colwidth", None
)  # This will be helpful when visualizing retriever outputs

In [14]:
from langchain.docstore.document import Document as LangchainDocument

def extract_text(pdf_files):
    """
    Function to extract the text from a PDF file

    Args:
        pdf_file (file): The PDF files to extract the text from

    Returns:
        text (str): The extracted text from the PDF file
    """

    # Initialize the raw text variable
    text = ""

    # Iterate over the documents
    for pdf_file in pdf_files:

        # Read the PDF file
        pdf_reader = PdfReader(pdf_file)

        # Extract the text from the PDF pages and add it to the raw text variable
        for page in pdf_reader.pages:
            text += page.extract_text()
    
    return text

raw_text = extract_text(pdf_files=["../data/data.pdf"])

RAW_KNOWLEDGE_BASE = [
    LangchainDocument(page_content=raw_text, metadata={"source":"food imcompatibility"})
]


In [15]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

MARKDOWN_SEPARATORS = [
    "\n#{1,6} ",
    "```\n",
    "\n\\*\\*\\*+\n",
    "\n---+\n",
    "\n___+\n",
    "\n\n",
    "\n",
    " ",
    "",
]

In [16]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer

EMBEDDING_MODEL_NAME = "thenlper/gte-small"

def split_documents(
    chunk_size: int,
    knowledge_base: List[LangchainDocument],
    tokenizer_name: Optional[str] = EMBEDDING_MODEL_NAME,
) -> List[LangchainDocument]:
    """
    Split documents into chunks of maximum size `chunk_size` tokens and return a list of documents.
    """
    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        AutoTokenizer.from_pretrained(tokenizer_name),
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 10),
        add_start_index=True,
        strip_whitespace=True,
        separators=MARKDOWN_SEPARATORS,
    )

    docs_processed = []
    for doc in knowledge_base:
        docs_processed += text_splitter.split_documents([doc])

    # Remove duplicates
    unique_texts = {}
    docs_processed_unique = []
    for doc in docs_processed:
        if doc.page_content not in unique_texts:
            unique_texts[doc.page_content] = True
            docs_processed_unique.append(doc)

    return docs_processed_unique

docs_processed = split_documents(
    512,  # We choose a chunk size adapted to our model
    RAW_KNOWLEDGE_BASE,
    tokenizer_name=EMBEDDING_MODEL_NAME,
)

# Let's visualize the chunk sizes we would have in tokens from a common model
# from transformers import AutoTokenizer

# tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME)
# lengths = [len(tokenizer.encode(doc.page_content)) for doc in tqdm(docs_processed)]
# fig = pd.Series(lengths).hist()
# plt.title("Distribution of document lengths in the knowledge base (in count of tokens)")
# plt.show()

In [17]:
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy

embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    # multi_process=True,
    # model_kwargs={"device": "cuda"},
    encode_kwargs={"normalize_embeddings": True},  # Set `True` for cosine similarity
)

KNOWLEDGE_VECTOR_DATABASE = FAISS.from_documents(
    docs_processed, embedding_model, distance_strategy=DistanceStrategy.COSINE
)

In [18]:
retrieved_docs = KNOWLEDGE_VECTOR_DATABASE.similarity_search(query="鲫鱼最好不和什么一起吃？", k=5)
print(
    "\n==================================Top document=================================="
)
print(retrieved_docs[0].page_content)
print("==================================Metadata==================================")
print(retrieved_docs[0].metadata)


猪⾁
　　 （1）不宜⻝⽤未摘除甲状腺的猪⾁
　　 （2）服降压药和降⾎脂药时不宜多⻝
　　 （3）禁忌⻝⽤猪油渣
　　 （4）⼩⼉不宜多⻝
　　 （5）不宜在刚屠后煮⻝
　　 （6）未剔除肾上腺和病变的淋巴结时不宜⻝⽤
　　 （7）⽼⼈不宜多⻝瘦⾁
　　 （8）⻝⽤前不宜⽤热⽔浸泡
　　 （9）在烧煮过程中忌加冷⽔
　　 （10）不宜多⻝煎炸咸⾁
　　 （11）不宜多⻝加硝腌制之猪⾁
　　 （12）不宜多⻝午餐⾁
　　 （13）不宜多⻝肥⾁
　　 （14）忌与鹌鹑同⻝,同⻝令⼈⾯⿊
　　 （15）忌与鸽⾁、鲫⻥、虾同⻝,同⻝令⼈滞⽓
　　 （16）忌与荞⻨同⻝,同⻝令⼈落⽑发
　　 （17）忌与菱⻆、⻩⾖、蕨菜、桔梗、乌梅、百合、巴⾖、⼤⻩、⻩连、苍术、芜荽
同⻝
　　 （18）忌与⽜⾁、驴⾁（易致腹泻）、⽺肝同⻝。
　　 （19）服磺胺类药物时不宜多⻝
　　猪肝
　　 （1）忌与荞⻨、⻩⾖、⾖腐同⻝,同⻝发痼疾
　　 （2）忌与⻥⾁同⻝,否则令⼈伤神　　 （3）忌与雀⾁、⼭鸡、鹌鹑⾁同⻝
　　猪⾎
　　（1）忌⻩⾖,同⻝令⼈⽓滞
　　 （2）忌地⻩、何⾸乌
　　⽺⾁
　　 （1）不宜多⻝烤⽺⾁串
　　 （2）不宜⻝⽤反复剩热或冻藏加温的⽺⾁
{'source': 'food imcompatibility', 'start_index': 7324}


# Reader

In [19]:
from openai import OpenAI
import os
from dotenv import load_dotenv
load_dotenv(override=True)
openai_api_key = os.getenv("OPENAI_API_KEY")

client = OpenAI(
    base_url="http://127.0.0.1:8080/v1",
    api_key=openai_api_key
)


In [20]:
retrieved_docs_text = [
    doc.page_content for doc in retrieved_docs
]
context = "\nExtracted documents:\n"
context += "".join(
    [f"Document {str(i)}:::\n" + doc for i, doc in enumerate(retrieved_docs_text)]
)
query = "鲫鱼最好不和什么一起吃？"


RAG_PROMPT_TEMPLATE = [
    {
        "role": "system",
        "content": """Using the information contained in the context,
give a comprehensive answer to the question.
Respond only to the question asked, response should be concise and relevant to the question.
Provide the number of the source document when relevant.
If the answer cannot be deduced from the context, do not give an answer.""",
    },
    {
        "role": "user",
        "content": """Context:
{context}
---
Now here is the question you need to answer.

Question: {question}""".format(context=context, question=query),
    },
]

completion = client.chat.completions.create(
    model="LLaMA_CPP",
    messages=RAG_PROMPT_TEMPLATE
)
print(completion.choices[0].message)

ChatCompletionMessage(content='鲫鱼最好不和“海味”一起吃。因为海味中含有鞣酸，如果与含有鞣酸的水果（如苹果）一起食用，可能会引起腹痛、恶心、呕吐等不适。<|eot_id|>', role='assistant', function_call=None, tool_calls=None)


In [22]:
completion.choices[0].message.content

'鲫鱼最好不和“海味”一起吃。因为海味中含有鞣酸，如果与含有鞣酸的水果（如苹果）一起食用，可能会引起腹痛、恶心、呕吐等不适。<|eot_id|>'