## AI Agent智能应用从0到1定制开发 
## AI Agent Intelligent Application Custom Development from 0 to 1
******
- 此代码为网课《AI Agent智能应用从0到1定制开发》的配套代码，需要注意本套代码建议与网课适配配合食用。
- This is the companion code for the online course "AI Agent Intelligent Application Custom Development from 0 to 1"; it is best used alongside the course videos.
- 需要注意由于课程开发周期的原因，langchain版本跨越了3个大版本，部分代码会与视频演示有差别!
- Note that due to the course development cycle, the langchain version spans 3 major releases and some of the code will differ from the video demo!
- 课程地址：https://coding.imooc.com/class/822.html
- Course address: https://coding.imooc.com/class/822.html

### 从环境变量中读取密钥
### Read the key from the environment variable
- 注意：尽量将你的OpenAI Key存储在类似.env文件中，而不是明文暴露在代码里，这是一种基本的安全措施
- Note: Store your OpenAI key in a file such as `.env` rather than hard-coding it in plain text in your code; this is a basic security measure.
******

In [1]:

import os
from dotenv import load_dotenv
# Load environment variables from the openai.env file.
load_dotenv("asset/openai.env")

# Read the OpenAI credentials from the environment.
api_key = os.getenv("OPENAI_API_KEY")
api_base = os.getenv("OPENAI_API_BASE")

# Fail early with a clear message: os.getenv returns None for a missing
# variable, and assigning None into os.environ raises an opaque TypeError.
if not api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to asset/openai.env or export it "
        "in the environment."
    )
os.environ["OPENAI_API_KEY"] = api_key

# A custom API base is optional; only export it when one is configured.
if api_base:
    os.environ["OPENAI_API_BASE"] = api_base

### ChatDoc: 和文件聊天
### ChatDoc: Chat with files
*****

In [2]:
#导入必须的包
# Import the required packages
from langchain_community.document_loaders import UnstructuredExcelLoader,Docx2txtLoader,PyPDFLoader
from langchain_text_splitters import  CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
#导入聊天所需的模块
# Import the required packages for chat
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate


#定义chatdoc
# Define the ChatDoc class
class ChatDoc():
    """Answer questions about a single document via retrieval plus a chat model.

    Typical use: assign a file path to ``doc``, call ``splitSentences()`` to
    load and chunk the file, then call ``chatWithDoc(question)``.
    """

    def __init__(self):
        self.doc = None  # path of the document to chat with
        self.splitText = []  # chunks produced by splitSentences()
        self._db = None  # vector store cached by embeddingAndVectorDB()
        self._db_chunks = None  # the chunk list the cached store was built from
        self.template = [
            ("system","你是一个处理文档的秘书,你从不说自己是一个大模型或者AI助手,你会根据下面提供的上下文内容来继续回答问题.\n 上下文内容\n {context} \n"),
            ("human","你好！"),
            ("ai","你好"),
            ("human","{question}"),
        ]
        self.prompt = ChatPromptTemplate.from_messages(self.template)

    def getFile(self):
        """Load ``self.doc`` with the loader that matches its file extension.

        Supported extensions (matched case-insensitively) are ``docx``,
        ``pdf`` and ``xlsx``.

        Returns:
            The value returned by the loader's ``load()``, or ``None`` when no
            path is set, the extension is unsupported, or loading fails. A
            message is printed in each of those failure cases.
        """
        if not self.doc:
            # Nothing to load; avoid an AttributeError on None below.
            print("No document path set")
            return None
        doc = str(self.doc)

        loaders = {
            "docx": Docx2txtLoader,
            "pdf": PyPDFLoader,
            "xlsx": UnstructuredExcelLoader,
        }
        # Lower-case so that e.g. "REPORT.DOCX" is recognised as well.
        file_extension = doc.rsplit(".", 1)[-1].lower()
        loader_class = loaders.get(file_extension)
        if loader_class is None:
            print(f"Unsupported file extension: {file_extension}")
            return None

        try:
            return loader_class(doc).load()
        except Exception as e:
            # Best effort: report the problem and let callers see None.
            print(f"Error loading {file_extension} files:{e}")
            return None

    # Function to process the document
    def splitSentences(self):
        """Load the document and store its chunks in ``self.splitText``.

        When the document cannot be loaded, ``self.splitText`` is left
        unchanged.
        """
        full_text = self.getFile()  # get the content of the document
        if full_text is None:
            return
        # Split the document
        text_split = CharacterTextSplitter(
            chunk_size=150,
            chunk_overlap=20,
        )
        self.splitText = text_split.split_documents(full_text)

    # Embedding and Vector DB
    def embeddingAndVectorDB(self):
        """Return a Chroma vector store built from ``self.splitText``.

        The store is built once per chunk list and reused, so the chunks are
        not re-embedded for every question. Assigning a new list to
        ``self.splitText`` (as ``splitSentences()`` does) triggers a rebuild.
        """
        # NOTE(review): chunks from a previously indexed document may remain
        # in Chroma's default collection after a rebuild - confirm.
        if self._db is None or self._db_chunks is not self.splitText:
            self._db = Chroma.from_documents(
                documents=self.splitText,
                embedding=OpenAIEmbeddings(),
            )
            self._db_chunks = self.splitText
        return self._db

    # Ask and find relevant text blocks
    def askAndFindFiles(self, question):
        """Return the chunks the retriever finds for ``question``.

        Uses a similarity-score-threshold retriever configured with a
        threshold of 0.5 and ``k=1``.
        """
        db = self.embeddingAndVectorDB()
        #retriever = db.as_retriever(search_type="mmr")
        retriever = db.as_retriever(
            search_type="similarity_score_threshold",
            search_kwargs={"score_threshold": .5, "k": 1},
        )
        return retriever.invoke(input=question)

    # Chat with the document using natural language
    def chatWithDoc(self, question):
        """Answer ``question`` using the retrieved chunks as context.

        Returns:
            The chat model's response to the formatted prompt.
        """
        context = self.askAndFindFiles(question)
        _content = "".join(chunk.page_content for chunk in context)

        messages = self.prompt.format_messages(context=_content, question=question)
        chat = ChatOpenAI(
            model="gpt-4",
            temperature=0,
        )
        return chat.invoke(messages)

# Example usage: point ChatDoc at a .docx file, load and chunk it, then ask a question.
chat_doc = ChatDoc()
chat_doc.doc = "asset/example/fake.docx"
# Load the file and split it into chunks (stored on chat_doc.splitText).
chat_doc.splitSentences()
# Ask "Where is the company's registered address?"; the call returns the chat model's reply.
chat_doc.chatWithDoc("公司注册地址是哪里？")


AIMessage(content='公司的注册地址是江苏省南京市雨花台区软件大道101号。', response_metadata={'token_usage': {'completion_tokens': 25, 'prompt_tokens': 202, 'total_tokens': 227, 'completion_tokens_details': {'reasoning_tokens': 0}}, 'model_name': 'gpt-4', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-cd0bb004-f735-4252-8f79-deaac1971811-0')