加载txt文件，建立本地知识库

In [27]:
%pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting argparse (from -r requirements.txt (line 6))
  Using cached https://pypi.tuna.tsinghua.edu.cn/packages/f2/94/3af39d34be01a24a6e65433d19e107099374224905f1e0cc6bbe1fd22a2f/argparse-1.4.0-py2.py3-none-any.whl (23 kB)
Collecting llama-cpp-python (from -r requirements.txt (line 16))
  Using cached https://pypi.tuna.tsinghua.edu.cn/packages/29/69/b73ae145d6f40683656f537b8526ca27e8348c7ff9af9c014a6a723fda5f/llama_cpp_python-0.1.44.tar.gz (1.1 MB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: llama-cpp-python
  Building wheel for llama-cpp-python (pyproject.toml) ... [?25ldone
[?25h  Created wheel for llama-cpp-python: filename=llama_cpp_python-0.1.44-cp311-cp311-macosx_13_0_arm64.whl size=158925 sha256=4ed35a2a31509e02814fc3b674ae14e208608273f7326798b35a570ae42b93

In [28]:
#######################
# 做一些初始化等准备工作  #
#######################

from dotenv import load_dotenv
import os
import sys

load_dotenv()  # 加载.env文件

openapi_key = os.getenv("OPENAI_API_KEY")  # 确保openapi_key已设置
print(f"OPENAI_API_KEY: {openapi_key}")

if not openapi_key:
    print("[ERROR] OPENAI_API_KEY not set")
    sys.exit(1)

OPENAI_API_KEY: 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


In [29]:
#################
# 进行文档分词处理 #
#################

from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import TextLoader
from langchain.text_splitter import SpacyTextSplitter


# 初始化目录加载器
# loader = DirectoryLoader(path='./txt/out', glob="**/*.txt", loader_cls=TextLoader)
loader = DirectoryLoader(path='./txt', glob="**/*.txt", loader_cls=TextLoader)

# 加载文档
documents = loader.load()

# 初始化分词器
text_splitter = SpacyTextSplitter(pipeline='zh_core_web_sm',chunk_size=1000,chunk_overlap=0)

# 文档分块
split_docs = text_splitter.split_documents(documents)

In [30]:
print(len(split_docs))

296


In [31]:
###################################################
# 1. 调用open ai的embedding模型，将分词向量化
# 2. 保存分词结果到 chroma_db
###################################################

import os
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# TODO: 这里可以切换成其他的embedding模型，比如 LlamaCppEmbeddings, HuggingFaceHubEmbeddings
embeddings = OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_API_KEY"))

# 保存分词结果到向量数据库Chroma
db = Chroma.from_documents(documents=split_docs, embedding=embeddings, persist_directory='./chroma_db')
db.persist()
db = None

Using embedded DuckDB with persistence: data will be stored in: ./chroma_db


In [None]:


# from langchain.llms import OpenAI
# from langchain.chains.question_answering import load_qa_chain
# import os
# from langchain.embeddings.openai import OpenAIEmbeddings
# from langchain.vectorstores import Chroma

# embeddings = OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_API_KEY"))

# # 初始化llm
# llm = OpenAI(temperature=0, openai_api_key=os.getenv("OPENAI_API_KEY"))
# chain = load_qa_chain(llm=llm, chain_type='stuff')

# persist_directory = './chroma_db'
# db = Chroma(persist_directory=persist_directory, embedding_function=embeddings)

# query = "汉朝有几位皇帝？"
# docs = db.similarity_search(query, 1)

In [32]:


from langchain.llms import LlamaCpp
from langchain.chains.question_answering import load_qa_chain
import os
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma

embeddings = OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_API_KEY"))

# 初始化 llama.cpp 模型
llm = LlamaCpp(model_path="/Users/zhiwen/code/llama.cpp/zh-models/7B/ggml-model-q4_0.bin")

chain = load_qa_chain(llm=llm, chain_type='stuff')

# chroma加载上述embedding后的暂存的数据
db = Chroma(persist_directory='./chroma_db', embedding_function=embeddings)

# 示例的prompt
query = "如何处理需要不同账户与端口登录的不同机器？"

docs = db.similarity_search(query, 1)

NameError: Could not load Llama model from path: ./llama.cpp/zh-models/7B/ggml-model-q4_0.bin

In [None]:
print(docs[0].page_content)

In [None]:
chain.run(input_documents=docs, question=query)