In [None]:
# 1.导入相关依赖 (Import necessary dependencies)
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter
import os
import dotenv
dotenv.load_dotenv()

# 2. Define the document loader for the source text file (read as UTF-8).
loader = TextLoader(file_path='./asset/load/09-ai1.txt', encoding="utf-8")

# 3. Load the document.
documents = loader.load()

# 4. Define the text splitter (chunk_size=1000, no overlap between chunks).
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)

# 5. Split the loaded document into chunks.
docs = text_splitter.split_documents(documents)

# 6. Define the embedding model.
# The API key is expected in the environment (dotenv.load_dotenv() runs at the
# top of this cell). Fail early with a readable message instead of re-assigning
# os.environ from os.getenv(): that is a no-op when the key is set and raises
# "TypeError: str expected, not NoneType" when it is not.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to your .env file or export it "
        "before running this cell."
    )

embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

In [None]:
# Build the FAISS vector database from the split chunks and the embedding model.
db = FAISS.from_documents(documents=docs, embedding=embeddings)

# Get a retriever backed by the vector database (default search settings).
retriever = db.as_retriever()

# Run the retrieval. The results get their own name so that `docs` (the chunks
# this cell indexes) is not overwritten; otherwise re-running this cell would
# rebuild the index from the previous query's results instead of the corpus.
retrieved_docs = retriever.invoke(input="深度学习是什么?")

print(len(retrieved_docs))

for doc in retrieved_docs:
    print(f"----{doc}")

In [None]:
# 1.导入相关依赖 (Import necessary dependencies)
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_core.documents import Document

# 2. Define the documents: one Document per policy statement.
# The texts live in a single list so adding or removing an entry is a one-line
# change and the list cannot drift out of sync with separately numbered variables.
policy_texts = [
    "经济复苏: 美国经济正在从疫情中强劲复苏，失业率降至历史低点。!",
    "基础设施: 政府将投资1万亿美元用于修复道路、桥梁和宽带网络。",
    "气候变化: 承诺到2030年将温室气体排放量减少50%。",
    "医疗保健: 降低处方药的价格，扩大医疗保险覆盖范围。",
    "教育: 提供免费的社区大学教育。",
    "科技: 增加对半导体产业的投资以减少对外围供应链的依赖。",
    "外交政策: 继续支持乌克兰对抗俄罗斯的侵略。",
    "枪支管制: 呼吁国会通过更严格的枪支管制法律。",
    "移民改革: 提出全面的移民改革方案。",
    "社会正义: 承诺解决系统性种族歧视问题。",
]

documents = [Document(page_content=text) for text in policy_texts]

# 3. Create the embedding model used by the vector store.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# 4. Vectorize the documents, add them to the vector database index, and get
#    the vector database object.
db = FAISS.from_documents(documents, embeddings)

# The default retriever uses similarity search.

In [None]:
# Get a retriever from the vector database; "k" sets the number of documents
# returned per query.
retriever = db.as_retriever(
    search_kwargs={"k": 4},
)

docs = retriever.invoke("经济政策")

# Print every hit's text under a 1-based result number.
for rank, doc in enumerate(docs, start=1):
    print(f"\n结果 {rank}:\n{doc.page_content}\n")

3、结合大模型的使用 (3. Combining the Use of Large Models)

示例1: 不使用RAG技术 (Example 1: Without using RAG technology)
[A single-line input field, likely for code or text]

示例2: 通过FAISS构建一个可搜索的向量索引数据库, 并结合RAG技术让LLM去回答问题。
(Example 2: Use FAISS to build a searchable vector index database, and combine it with RAG technology to let the LLM answer questions.)

In [None]:
from langchain_openai import ChatOpenAI
import os
import dotenv
dotenv.load_dotenv()

# Set environment variables.
# This cell takes its key from OPENAI_API_KEY1 and exports it as OPENAI_API_KEY.
# NOTE(review): the other cells read OPENAI_API_KEY directly - confirm the "1"
# suffix is intentional and not a typo.
api_key = os.getenv('OPENAI_API_KEY1')
if not api_key:
    # Assigning None to os.environ raises an opaque
    # "TypeError: str expected, not NoneType"; fail with a readable message instead.
    raise RuntimeError(
        "OPENAI_API_KEY1 is not set; add it to your .env file or export it "
        "before running this cell."
    )
os.environ['OPENAI_API_KEY'] = api_key

# OPENAI_BASE_URL needs no re-export: when it is defined it is already present
# in the process environment, and when it is not defined there is nothing to
# set (writing os.getenv()'s None back into os.environ would raise TypeError).

# Create the chat model instance.
llm = ChatOpenAI(model='gpt-4o-mini')

# Invoke the model directly, without any retrieved context.
response = llm.invoke("北京有什么著名的建筑？")
print(response.content)

情况2：使用RAG给LLM灌输上下文数据 (Case 2: Use RAG to feed context data to the LLM)


In [None]:
# 1. Import all required packages.
# PromptTemplate is imported from langchain_core (already used elsewhere in this
# notebook) rather than the legacy `langchain.prompts` re-export path.
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI,OpenAIEmbeddings
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.vectorstores import FAISS
import os
import dotenv

dotenv.load_dotenv()

# 2. Create the custom prompt template. It has two placeholders: {context}
#    (the retrieved text) and {question}.
prompt_template = """请使用以下提供的文本内容来回答问题，仅使用提供的文本信息。如果文本中没有相关信息，请回答"抱歉，提供的文本中没有这个信息"。

文本内容:
{context}

问题: {question}

回答:
"""

prompt = PromptTemplate.from_template(prompt_template)

# 3. Initialize the models.
# Fail early with a readable message when the key is missing; re-assigning
# os.environ from os.getenv() is a no-op when the key is set and raises
# "TypeError: str expected, not NoneType" when it is not.
if not os.getenv('OPENAI_API_KEY'):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to your .env file or export it "
        "before running this cell."
    )

llm = ChatOpenAI(
    model='gpt-4o-mini',
    temperature=0
)

embedding_model = OpenAIEmbeddings(model='text-embedding-3-large')

# 4. Load the document.
loader = TextLoader(
    "./asset/load/10-test_doc.txt",
    encoding="utf-8"
)
documents = loader.load()

# 5. Split the document into chunks (chunk_size=1000, chunk_overlap=100).
text_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
)
texts = text_splitter.split_documents(documents)

# 6. Create the vector store from the chunks.
vectorstore = FAISS.from_documents(
    documents=texts,
    embedding=embedding_model
)

# 7. Get the retriever.
retriever = vectorstore.as_retriever()

# 8. Retrieve the chunks relevant to the question. The question is defined once
#    so retrieval and generation cannot drift apart.
question = "北京有什么著名的建筑？"
docs = retriever.invoke(question)

# Pass only the chunk text to the prompt. Formatting the Document list itself
# would embed its Python repr (metadata, escaped newlines) into the prompt.
context = "\n\n".join(doc.page_content for doc in docs)

# 9. Create the Runnable chain: prompt -> LLM.
chain = prompt | llm

# 10. Ask the question.
result = chain.invoke(
    input={"question": question, "context": context}
)
print(f"\n回答:\n{result.content}")