In [4]:
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
from tqdm.notebook import tqdm
import pandas as pd
from typing import Optional, List, Tuple
from datasets import Dataset
import matplotlib.pyplot as plt

# Show full cell contents (no column-width truncation) so that long retriever
# outputs stay readable when displayed as DataFrames.
pd.set_option("display.max_colwidth", None)

In [13]:
from langchain.docstore.document import Document as LangchainDocument


def extract_text(pdf_files):
    """
    Extract and concatenate the text of every page of the given PDF files.

    Args:
        pdf_files (iterable or str): The PDF sources to read. Each item is
            handed unchanged to `PdfReader`. A single string is treated as one
            path instead of being iterated character by character.

    Returns:
        text (str): The text of all pages, in file order and then page order,
            joined without any separator. Empty string if no file is given.
    """

    # A bare path would otherwise be looped over one character at a time
    if isinstance(pdf_files, str):
        pdf_files = [pdf_files]

    # Collect per-page text and join once at the end instead of growing a
    # string with repeated concatenation
    page_texts = []

    # Iterate over the documents
    for pdf_file in pdf_files:

        # Read the PDF file
        pdf_reader = PdfReader(pdf_file)

        for page in pdf_reader.pages:
            # A page without extractable text contributes nothing; `or ""`
            # keeps a None result from breaking the concatenation
            page_texts.append(page.extract_text() or "")

    return "".join(page_texts)


# Extract the text of the source PDF once; the path is relative to the
# notebook's working directory.
raw_text = extract_text(pdf_files=["../data/data.pdf"])

# The whole PDF becomes a single LangchainDocument carrying a "source" label
# in its metadata.
# NOTE(review): the label is spelled "imcompatibility". It is runtime data
# (it shows up in the metadata of retrieved chunks), so it is left as is here.
RAW_KNOWLEDGE_BASE = [
    LangchainDocument(
        page_content=raw_text, metadata={"source": "food imcompatibility"}
    )
]

In [14]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Separators handed to the recursive text splitter, listed from the coarsest
# boundary to the finest.
# Several entries are written as regular expressions (`#{1,6}`, `\*\*\*+`,
# `---+`, `___+`). They only behave as patterns if the splitter is configured
# to treat separators as regexes; otherwise they are presumably matched as
# literal text — TODO confirm against the splitter configuration.
MARKDOWN_SEPARATORS = [
    "\n#{1,6} ",  # newline followed by 1-6 '#' and a space (as a regex)
    "```\n",  # three backticks followed by a newline
    "\n\\*\\*\\*+\n",  # a line of three or more '*' (as a regex)
    "\n---+\n",  # a line of three or more '-' (as a regex)
    "\n___+\n",  # a line of three or more '_' (as a regex)
    "\n\n",  # blank line
    "\n",  # single newline
    " ",  # single space
    "",  # empty string: last entry in the list
]

In [15]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer

# Model identifier used both as the default tokenizer for chunking and as the
# embedding model name.
EMBEDDING_MODEL_NAME = "thenlper/gte-small"


def split_documents(
    chunk_size: int,
    knowledge_base: List[LangchainDocument],
    tokenizer_name: Optional[str] = EMBEDDING_MODEL_NAME,
) -> List[LangchainDocument]:
    """
    Split documents into chunks of maximum size `chunk_size` tokens and return
    the unique chunks.

    Args:
        chunk_size: Maximum chunk length, measured with the tokenizer named by
            `tokenizer_name`. The overlap between chunks is a tenth of it.
        knowledge_base: The documents to split.
        tokenizer_name: Name or path given to `AutoTokenizer.from_pretrained`.
            `None` falls back to `EMBEDDING_MODEL_NAME`.

    Returns:
        The chunks in the order they were produced, keeping only the first
        chunk for any given `page_content`.
    """
    # The annotation allows None, but the tokenizer loader needs a real name
    if tokenizer_name is None:
        tokenizer_name = EMBEDDING_MODEL_NAME

    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        AutoTokenizer.from_pretrained(tokenizer_name),
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 10),
        add_start_index=True,
        strip_whitespace=True,
        separators=MARKDOWN_SEPARATORS,
        # MARKDOWN_SEPARATORS contains regex patterns (e.g. "\n#{1,6} ").
        # Without this flag the splitter escapes them and looks for the
        # literal characters, so the markdown boundaries never match.
        is_separator_regex=True,
    )

    # Drop chunks whose text was already seen, keeping first-seen order
    seen_texts = set()
    docs_processed_unique = []
    for chunk in text_splitter.split_documents(knowledge_base):
        if chunk.page_content in seen_texts:
            continue
        seen_texts.add(chunk.page_content)
        docs_processed_unique.append(chunk)

    return docs_processed_unique


# Chunk the raw knowledge base, counting tokens with the embedding model's
# own tokenizer.
docs_processed = split_documents(
    512,  # We choose a chunk size adapted to our model
    RAW_KNOWLEDGE_BASE,
    tokenizer_name=EMBEDDING_MODEL_NAME,
)

# Let's visualize the chunk sizes we would have in tokens from a common model
# from transformers import AutoTokenizer

# tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME)
# lengths = [len(tokenizer.encode(doc.page_content)) for doc in tqdm(docs_processed)]
# fig = pd.Series(lengths).hist()
# plt.title("Distribution of document lengths in the knowledge base (in count of tokens)")
# plt.show()

In [27]:
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy

# Embedding model used to vectorise the chunks; embeddings are normalised so
# they suit the cosine distance strategy chosen for the index below.
embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    # multi_process=True,
    # model_kwargs={"device": "cuda"},
    encode_kwargs={"normalize_embeddings": True},  # Set `True` for cosine similarity
)

# Embed every chunk and build the FAISS store from them.
# NOTE(review): the recorded run of this cell ended with KeyboardInterrupt,
# so the store used further down presumably comes from an earlier run or
# from the copy saved on disk — confirm before relying on it.
KNOWLEDGE_VECTOR_DATABASE = FAISS.from_documents(
    docs_processed, embedding_model, distance_strategy=DistanceStrategy.COSINE
)

KeyboardInterrupt: 

In [19]:
# export the knowledge vector database to disk
KNOWLEDGE_VECTOR_DATABASE.save_local("../data/knowledge_vector_database")
KNOWLEDGE_VECTOR_DATABASE = FAISS.load_local(
    "../data/knowledge_vector_database",
    embeddings=embedding_model,
    allow_dangerous_deserialization=True,
)

In [29]:
# Rebinds `embedding_model` to a copy of the model loaded from a local
# directory instead of by model name.
# NOTE(review): this redefines a name that earlier cells already use, and the
# execution counter shows it ran out of notebook order. On a fresh
# Restart & Run All the save/load cell above would not see this version —
# confirm which definition is intended and keep only one.
embedding_model = HuggingFaceEmbeddings(
    model_name="../model/gte-small",
    # multi_process=True,
    # model_kwargs={"device": "cuda"},
    encode_kwargs={"normalize_embeddings": True},  # Set `True` for cosine similarity
)

In [20]:
# Retrieve the 5 chunks closest to the question, then print the content and
# metadata of the top-ranked one as a sanity check.
# NOTE(review): the same question literal is repeated in the reader cell;
# keep the two in sync.
retrieved_docs = KNOWLEDGE_VECTOR_DATABASE.similarity_search(query="鲫鱼最好不和什么一起吃？", k=5)
print(
    "\n==================================Top document=================================="
)
print(retrieved_docs[0].page_content)
print("==================================Metadata==================================")
print(retrieved_docs[0].metadata)


猪⾁
　　 （1）不宜⻝⽤未摘除甲状腺的猪⾁
　　 （2）服降压药和降⾎脂药时不宜多⻝
　　 （3）禁忌⻝⽤猪油渣
　　 （4）⼩⼉不宜多⻝
　　 （5）不宜在刚屠后煮⻝
　　 （6）未剔除肾上腺和病变的淋巴结时不宜⻝⽤
　　 （7）⽼⼈不宜多⻝瘦⾁
　　 （8）⻝⽤前不宜⽤热⽔浸泡
　　 （9）在烧煮过程中忌加冷⽔
　　 （10）不宜多⻝煎炸咸⾁
　　 （11）不宜多⻝加硝腌制之猪⾁
　　 （12）不宜多⻝午餐⾁
　　 （13）不宜多⻝肥⾁
　　 （14）忌与鹌鹑同⻝,同⻝令⼈⾯⿊
　　 （15）忌与鸽⾁、鲫⻥、虾同⻝,同⻝令⼈滞⽓
　　 （16）忌与荞⻨同⻝,同⻝令⼈落⽑发
　　 （17）忌与菱⻆、⻩⾖、蕨菜、桔梗、乌梅、百合、巴⾖、⼤⻩、⻩连、苍术、芜荽
同⻝
　　 （18）忌与⽜⾁、驴⾁（易致腹泻）、⽺肝同⻝。
　　 （19）服磺胺类药物时不宜多⻝
　　猪肝
　　 （1）忌与荞⻨、⻩⾖、⾖腐同⻝,同⻝发痼疾
　　 （2）忌与⻥⾁同⻝,否则令⼈伤神　　 （3）忌与雀⾁、⼭鸡、鹌鹑⾁同⻝
　　猪⾎
　　（1）忌⻩⾖,同⻝令⼈⽓滞
　　 （2）忌地⻩、何⾸乌
　　⽺⾁
　　 （1）不宜多⻝烤⽺⾁串
　　 （2）不宜⻝⽤反复剩热或冻藏加温的⽺⾁
{'source': 'food imcompatibility', 'start_index': 7324}


# Reader

In [1]:
from openai import OpenAI
import os
from dotenv import load_dotenv

load_dotenv(override=True)
openai_api_key = os.getenv("OPENAI_API_KEY")

client = OpenAI(base_url="http://127.0.0.1:8080/v1", api_key=openai_api_key)

In [2]:
# Build the prompt context from the chunks returned by the retrieval cell.
# NOTE: `retrieved_docs` must already exist in the kernel. The recorded run of
# this cell failed with NameError because the retrieval cell had not been
# executed in that kernel session.
retrieved_docs_text = [doc.page_content for doc in retrieved_docs]
context = "\nExtracted documents:\n"
# Put every "Document i:::" header on its own line. Joining with an empty
# string made the last line of one chunk run directly into the next header,
# which blurs the document numbers the system prompt asks the model to cite.
context += "\n".join(
    f"Document {i}:::\n{doc}" for i, doc in enumerate(retrieved_docs_text)
)
# Must be the same question that was used for retrieval.
query = "鲫鱼最好不和什么一起吃？"


# Chat messages sent to the model: a fixed system instruction plus a user
# message that is already filled in with the context and the question.
RAG_PROMPT_TEMPLATE = [
    {
        "role": "system",
        "content": """Using the information contained in the context,
give a comprehensive answer to the question.
Respond only to the question asked, response should be concise and relevant to the question.
Provide the number of the source document when relevant.
If the answer cannot be deduced from the context, do not give an answer.""",
    },
    {
        "role": "user",
        "content": """Context:
{context}
---
Now here is the question you need to answer.

Question: {question}""".format(
            context=context, question=query
        ),
    },
]

# temperature=0.0 is passed to the chat-completions endpoint configured on
# `client`; the returned message object is printed in full.
completion = client.chat.completions.create(
    model="LLaMA_CPP", messages=RAG_PROMPT_TEMPLATE, temperature=0.0
)
print(completion.choices[0].message)

NameError: name 'retrieved_docs' is not defined

In [26]:
# Display only the text of the reply (the previous cell printed the whole
# message object). The recorded output still ends with a raw "<|eot_id|>"
# marker.
completion.choices[0].message.content

'鲫鱼最好不和“海味”一起吃。因为海味中含有鞣酸，如果与含有鞣酸的水果（如苹果）一起食用，可能会引起腹痛、恶心、呕吐等不适。<|eot_id|>'

## hg作者api

In [5]:
from llama_cpp import Llama

# Try to load a local model in-process with llama_cpp and run one chat turn.
# NOTE(review): the recorded output shows this fails. The loader rejects the
# file ("invalid magic characters 'MZqF'") and Llama raises ValueError. The
# path points at a `.llamafile`, which is presumably not a bare GGUF weights
# file — confirm and point `model_path` at a GGUF file instead.
# NOTE(review): the name `llm` is rebound to a different kind of object in
# later cells.
llm = Llama(
    model_path="../model/mistral-7b-instruct-v0.2.Q4_0.llamafile", chat_format="llama-2"
)  # Set chat_format according to the model you are using
llm.create_chat_completion(
    messages=[
        {"role": "system", "content": "You are a story writing assistant."},
        {"role": "user", "content": "Write a story about llamas."},
    ]
)

gguf_init_from_file: invalid magic characters 'MZqF'
llama_model_load: error loading model: llama_model_loader: failed to load model from ../model/mistral-7b-instruct-v0.2.Q4_0.llamafile

llama_load_model_from_file: failed to load model


ValueError: Failed to load model from file: ../model/mistral-7b-instruct-v0.2.Q4_0.llamafile

# Agent

In [2]:
from langchain_community.utilities.sql_database import SQLDatabase

# Open the ingredients SQLite database (path relative to the notebook's
# working directory). The agent below runs its SQL against this connection.
db = SQLDatabase.from_uri("sqlite:///../data/ingredients.db")

In [4]:
import os

from langchain_community.agent_toolkits import create_sql_agent
from langchain_community.chat_models import ChatOpenAI


# Read the API key from the environment. In this cell it is only used by the
# commented-out local-server configuration; the active ChatOpenAI below is
# not given it explicitly.
openai_api_key = os.getenv("OPENAI_API_KEY")


# llm = ChatOpenAI(
#     base_url="http://127.0.0.1:8080/v1",
#     api_key=openai_api_key,
# )

# Chat model driving the agent; temperature 0.0 is requested and the model
# name is left at the library default.
llm = ChatOpenAI(
    # model="gpt-3.5-turbo",
    temperature=0.0,
)

# SQL agent that lets the model call database tools against `db`.
# SECURITY / NOTE(review): the agent executes model-written SQL, including
# statements that modify data. In the recorded run it issued
#   DELETE FROM ingredients WHERE name IN ('茄子', '猪肉') AND amount = 500
# which deletes rows whose amount is exactly 500 instead of subtracting the
# used quantity. Confirm that this is the intended effect of the prompt, and
# consider running against a copy of the database.
agent_executor = create_sql_agent(llm, db=db, agent_type="openai-tools", verbose=True)
# agent_executor.invoke(
#     {
#         "input": "insert a new ingredient squid into the ingredients table, amount = 100.0",
#         # "input": "increase the amount of eggplant in the ingredients table to 100.0",
#     }
# )

# Recipe text substituted into the prompt. The leading spaces on the
# continuation lines are part of the string and reach the model (they are
# visible in the recorded 'input' value).
dish = """菜名：茄子炒肉
            食材和用量：
            茄子：500g
            猪肉：500g
            """

# Ask the agent to remove from the database the ingredients, and their
# amounts, that the recipe uses.
agent_executor.invoke(
    {
        "input": """根据菜谱，从数据库中移除被使用的材料以及相应的数量。
                    菜谱：{dish}""".format(
            dish=dish
        ),
        # "input": "increase the amount of eggplant in the ingredients table to 100.0",
    }
)

# agent_executor.invoke(
#     {
#         "input": "在数据库中插入玉米, 数量 = 100.0克",
#         # "input": "increase the amount of eggplant in the ingredients table to 100.0",
#     }
# )



[1m> Entering new SQL Agent Executor chain...[0m
[32;1m[1;3m
Invoking: `sql_db_list_tables` with `{}`


[0m[38;5;200m[1;3mingredients[0m[32;1m[1;3m
Invoking: `sql_db_schema` with `{'table_names': 'ingredients'}`


[0m[33;1m[1;3m
CREATE TABLE ingredients (
	id INTEGER, 
	name TEXT, 
	amount REAL, 
	PRIMARY KEY (id)
)

/*
3 rows from ingredients table:
id	name	amount
1	面粉	2.5
2	糖	1.5
3	鸡蛋	3.0
*/[0m[32;1m[1;3m
Invoking: `sql_db_query` with `{'query': "DELETE FROM ingredients WHERE name IN ('茄子', '猪肉') AND amount = 500"}`
responded: The `ingredients` table contains information about ingredients with columns `id`, `name`, and `amount`. Here are some sample rows from the table:

- **id**: 1, **name**: 面粉, **amount**: 2.5
- **id**: 2, **name**: 糖, **amount**: 1.5
- **id**: 3, **name**: 鸡蛋, **amount**: 3.0

I will now construct a query to remove the used ingredients and their respective amounts based on the recipe provided.

[0m[36;1m[1;3m[0m[32;1m[1;3mThe query to remo

{'input': '根据菜谱，从数据库中移除被使用的材料以及相应的数量。\n                    菜谱：菜名：茄子炒肉\n            食材和用量：\n            茄子：500g\n            猪肉：500g\n            ',
 'output': 'The query to remove the used ingredients and their respective amounts based on the recipe "茄子炒肉" has been executed successfully.'}

In [25]:
from langchain_community.agent_toolkits import create_sql_agent
from langchain_community.chat_models import ChatOpenAI

# Point ChatOpenAI at the OpenAI-compatible server on localhost:8080 and send
# a single greeting as a connectivity check.
# NOTE(review): `openai_api_key` is defined in an earlier cell and must
# already exist in the kernel. This also rebinds `llm`, replacing the model
# the SQL agent cell created.
llm = ChatOpenAI(
    base_url="http://127.0.0.1:8080/v1",
    api_key=openai_api_key,
)

# The reply is stored in `ai_msg` but not displayed by this cell.
ai_msg = llm.invoke(
    [
        (
            "system",
            "You are a helpful assistant.",
        ),
        ("human", "你好"),
    ]
)

# llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
# agent_executor = create_sql_agent(llm, db=db, agent_type="openai-tools", verbose=True)

In [16]:
from langchain_community.llms.llamafile import Llamafile

# Query a Llamafile server through LangChain using the default connection
# settings.
# NOTE(review): the recorded run failed with ConnectionError because no
# server was reachable at http://localhost:8080. Start the server before
# running this cell.
llamafile = Llamafile()

llamafile.invoke("你好，你叫什么名字？")

ConnectionError: Could not connect to Llamafile server. Please make sure that a server is running at http://localhost:8080.