## 初始化

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import sys
sys.path.append('../')

### 读取环境变量文件和工具config配置

In [None]:
from dotenv import load_dotenv

load_dotenv('../config/.env', override=True)

import os
from modelscope.utils.config import Config

# Paths of the LLM and tool configuration files are read from the environment.
model_cfg_file = os.getenv('LLM_CONFIG_FILE')
tool_cfg_file = os.getenv('TOOL_CONFIG_FILE')

print(model_cfg_file)
print(tool_cfg_file)

# os.getenv returns None for an unset variable; fail here with a readable
# message instead of handing None (or an empty string) to Config.from_file.
if not model_cfg_file or not tool_cfg_file:
    raise RuntimeError(
        'LLM_CONFIG_FILE and TOOL_CONFIG_FILE must both be set '
        '(expected in ../config/.env)')

model_cfg = Config.from_file(model_cfg_file)
tool_cfg = Config.from_file(tool_cfg_file)

### 中枢大模型启动

In [None]:
from modelscope_agent.llm import LocalLLM
from modelscope.hub.api import HubApi
from modelscope_agent.agent import AgentExecutor


# Name of the model entry; it matches the single key of model_cfg below.
model_name = 'modelscope-agent-qwen-7b'
# NOTE: this rebinds model_cfg, replacing the Config object loaded from
# LLM_CONFIG_FILE earlier in the notebook with a hard-coded entry.
# NOTE(review): confirm that overriding the file-based config is intentional.
model_cfg = {
    'modelscope-agent-qwen-7b':{
        'model_id': 'damo/MSAgent-Qwen-7B',
        'model_revision': 'v1.0.1',
        'use_raw_generation_config': True,
        'custom_chat': True
    }
}


# Instantiate the local LLM from the entry above.
llm = LocalLLM(model_name, model_cfg)

# The agent combines the LLM with the tool configuration loaded from TOOL_CONFIG_FILE.
agent = AgentExecutor(llm, tool_cfg)

### 向量库配置

In [None]:
from langchain.embeddings import ModelScopeEmbeddings, DashScopeEmbeddings

from langchain.vectorstores import FAISS
import faiss

# Vector store embedding configuration: open-source ModelScope embedding model
model_id = 'damo/nlp_corom_sentence-embedding_chinese-base'
embeddings = ModelScopeEmbeddings(model_id=model_id)

# Alternative configuration: embeddings provided by Alibaba Cloud DashScope
#embeddings = DashScopeEmbeddings(model="text-embedding-v1")

### 文档库的切分处理

In [None]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
import re
from typing import List
from modelscope.pipelines import pipeline


# ref: https://github.com/chatchat-space/langchain-ChatGLM/blob/master/textsplitter/ali_text_splitter.py
class AliTextSplitter(CharacterTextSplitter):
    """Text splitter that segments documents with a ModelScope model.

    The ``document-segmentation`` pipeline is created lazily on the first
    ``split_text`` call and then reused, so splitting several documents with
    the same splitter constructs the pipeline only once instead of once per
    call.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Filled in by _get_segmenter() on first use.
        self._segmenter = None

    def _get_segmenter(self):
        """Return the cached segmentation pipeline, building it if needed."""
        if self._segmenter is None:
            self._segmenter = pipeline(
                task="document-segmentation",
                model='damo/nlp_bert_document-segmentation_chinese-base',
                device="cpu")
        return self._segmenter

    def split_text(self, text: str) -> List[str]:
        """Split ``text`` into segments.

        Args:
            text: The document text passed to the segmentation pipeline.

        Returns:
            The non-empty pieces obtained by splitting the pipeline's
            ``"text"`` output on a newline followed by a tab.
        """
        result = self._get_segmenter()(documents=text)
        return [segment for segment in result["text"].split("\n\t") if segment]
    
# Files to import into the document store
filepaths = ['tmp/ms.txt']

def load_file(filepaths, sentence_size=100):
    """Load the ``.txt`` files in ``filepaths`` and split them into documents.

    Args:
        filepaths: Iterable of file path strings. Paths whose name does not
            end with ".txt" (case-insensitive) are skipped.
        sentence_size: Not used by this implementation; kept so existing
            callers that pass it keep working.

    Returns:
        A list with the split documents of every loaded file, in input order.
    """
    splitter = AliTextSplitter()
    chunks = []
    txt_paths = (path for path in filepaths if path.lower().endswith(".txt"))
    for path in txt_paths:
        # Let the loader detect the file encoding rather than assuming one.
        text_loader = TextLoader(path, autodetect_encoding=True)
        chunks.extend(text_loader.load_and_split(splitter))
    return chunks
# Load and split every file listed in filepaths.
docs = load_file(filepaths)

In [None]:

# Build the FAISS index from the split documents using the configured embeddings
vector_store = FAISS.from_documents(docs, embeddings)

# Number of documents requested from the similarity search per query
top_k = 3
def search_query_wrapper(query):
    """Prefix ``query`` with the passages most similar to it in the vector store.

    The top ``top_k`` documents returned by ``vector_store.similarity_search``
    are numbered from 1 and placed under a "Web search results" heading,
    followed by the original query after a ``<|user|>:`` marker.

    Args:
        query: The user question to search for and to embed in the prompt.

    Returns:
        The assembled prompt string.
    """
    hits = vector_store.similarity_search(query, k=top_k)

    numbered = []
    for rank, hit in enumerate(hits, start=1):
        numbered.append(f'[{rank}] {hit.page_content}')
    context = '\n'.join(numbered)

    return f'Web search results: \n{context.strip()}\n\n<|user|>:{query}'

In [None]:
# Raw user question (Chinese): whether ModelScope models can be used without a network connection.
query_without_search = 'ModelScope模型可以在不联网使用吗？'
# Alternative example query, kept commented out:
# query = '生成一个xxx的图片，用最强大的基模型'

# Same question with the retrieved passages prepended by search_query_wrapper.
query_with_search = search_query_wrapper(query_without_search)

# Show the assembled prompt for inspection.
print(query_with_search)

In [None]:
# Baseline run: reset the agent, then ask the raw question without retrieved context.
agent.reset()
# NOTE(review): remote=True is passed here but not in the run that uses
# query_with_search; confirm whether that difference is intentional.
agent.run(query_without_search, remote=True)

In [None]:
# Retrieval-augmented run: reset the agent, then ask the question with the retrieved passages prepended.
agent.reset()
agent.run(query_with_search)