# In [7]:
# https://www.zhihu.com/question/637421964/answer/1927760890482122798

import os
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from pydantic import SecretStr
from dotenv import load_dotenv


def load_api_key():
    """Return the DeepSeek API key read from the environment.

    ``load_dotenv()`` is called first, then ``DEEPSEEK_API_KEY`` is looked
    up in the process environment.

    Returns:
        The non-empty value of ``DEEPSEEK_API_KEY``.

    Raises:
        ValueError: If ``DEEPSEEK_API_KEY`` is unset or empty.
    """
    load_dotenv()
    key = os.environ.get('DEEPSEEK_API_KEY')
    if key:
        return key
    raise ValueError('DEEPSEEK_API_KEY is not set')


# Configuration. The API key comes from load_api_key(); both paths are
# relative to the current working directory.
DEEPSEEK_API_KEY = load_api_key()
DOC_PATH = r"../data/knowledge-base/alphabet_10K_2022.pdf"
CHROMA_PATH = r"../data/knowledge-base/chromaDB_alphabet_10K_2022"

# ----- Data Indexing Process -----
# Load the PDF and split it into 500-character chunks with a 50-character
# overlap between neighbouring chunks.
loader = PyPDFLoader(DOC_PATH)
pages = loader.load()
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
)
text_chunks = text_splitter.split_documents(pages)

# NOTE(review): 'deepseek-reasoner' is the same model id this file passes to
# ChatOpenAI, i.e. a chat model. An embeddings endpoint will presumably
# reject it -- confirm which embedding model/provider is actually available.
# NOTE(review): the recorded run of this cell ended with a ConnectTimeout
# while fetching cl100k_base.tiktoken from openaipublic.blob.core.windows.net.
# It looks like OpenAIEmbeddings needs that tokenizer file before it can
# embed anything, so this requires network access to that host or a
# pre-populated tiktoken cache -- confirm.
embeddings = OpenAIEmbeddings(
    openai_api_key=DEEPSEEK_API_KEY,
    openai_api_base="https://api.deepseek.com/v1",
    model='deepseek-reasoner'  # or 'deepseek-embedding-2'
)

# Open the persisted collection first. Chroma.from_documents() on an
# already-populated persist_directory adds a fresh copy of every chunk, so
# re-running this cell would duplicate the index (and pay for the embedding
# calls again). Only index when the store is still empty; an existing but
# empty directory (e.g. left behind by a failed run) is therefore rebuilt.
# Delete CHROMA_PATH to force a rebuild after changing the PDF or the
# chunking parameters.
db_chroma = Chroma(
    persist_directory=CHROMA_PATH,
    embedding_function=embeddings,
)
if not db_chroma.get(limit=1)["ids"]:
    db_chroma = Chroma.from_documents(
        documents=text_chunks,
        embedding=embeddings,
        persist_directory=CHROMA_PATH,
    )

# ----- Retrieval and Generation Process -----
query = 'what are the top risks mentioned in the document?'

# Fetch the 5 chunks most similar to the query; each hit is a
# (document, score) pair and only the document text is used below.
doc_chroma = db_chroma.similarity_search_with_score(query, k=5)
context_text = "\n\n".join(doc.page_content for doc, _score in doc_chroma)

PROMPT_TEMPLATE = """
Answer the question based only on the following context:
{context}
Answer the question based on the above context: {question}.
Provide a detailed answer.
Don’t justify your answers.
Don’t give information not mentioned in the CONTEXT INFORMATION.
Do not say "according to the context" or "mentioned in the context" or similar.
"""
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)

# Build chat messages rather than a flattened string: format() renders the
# prompt as text with a "Human: " role prefix, which would then be sent to
# the model as part of the user message.
prompt = prompt_template.format_messages(
    context=context_text,
    question=query,
)

chat_model = ChatOpenAI(
    model='deepseek-reasoner',
    openai_api_key=SecretStr(DEEPSEEK_API_KEY),
    openai_api_base="https://api.deepseek.com/v1"
)
# The answer text is kept in response_text; nothing is printed here.
response_text = chat_model.invoke(prompt).content



# Output recorded when running the cell above (the run failed):
# ConnectTimeout: HTTPSConnectionPool(host='openaipublic.blob.core.windows.net', port=443): Max retries exceeded with url: /encodings/cl100k_base.tiktoken (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x0000015386711CD0>, 'Connection to openaipublic.blob.core.windows.net timed out. (connect timeout=None)'))