In [1]:
!pip install pypdf langchain unstructured transformers_stream_generator
!pip install modelscope  nltk pydantic  tiktoken  llama-index

Looking in indexes: https://mirrors.aliyun.com/pypi/simple
Collecting pypdf
  Downloading https://mirrors.aliyun.com/pypi/packages/29/10/055b649e914ad8c5d07113c22805014988825abbeff007b0e89255b481fa/pypdf-3.17.4-py3-none-any.whl (278 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.2/278.2 kB[0m [31m634.7 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting langchain
  Downloading https://mirrors.aliyun.com/pypi/packages/23/98/c70fac0f1b3193ced86013b563119c27c68ac26b684815f407555224108d/langchain-0.1.0-py3-none-any.whl (797 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m798.0/798.0 kB[0m [31m651.7 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting unstructured
  Downloading https://mirrors.aliyun.com/pypi/packages/7e/46/0f1105b77dcabc9cacb8e0767b3ed68b2078da3d52c44b7799def7403443/unstructured-0.12.0-py3-none-any.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m650.6 kB/s[

In [4]:
!mkdir -p /root/nltk_data/tokenizers
!mkdir -p /root/nltk_data/taggers
!cp /mnt/workspace/punkt.zip /root/nltk_data/tokenizers
!cp /mnt/workspace/averaged_perceptron_tagger.zip /root/nltk_data/taggers
!cd /root/nltk_data/tokenizers; unzip punkt.zip;
!cd /root/nltk_data/taggers; unzip averaged_perceptron_tagger.zip;

!mkdir -p /mnt/workspace/custom_data
!mv /mnt/workspace/xianjiaoda.md /mnt/workspace/custom_data

!cd /mnt/workspace

Archive:  punkt.zip
   creating: punkt/
  inflating: punkt/greek.pickle      
  inflating: punkt/estonian.pickle   
  inflating: punkt/turkish.pickle    
  inflating: punkt/polish.pickle     
   creating: punkt/PY3/
  inflating: punkt/PY3/greek.pickle  
  inflating: punkt/PY3/estonian.pickle  
  inflating: punkt/PY3/turkish.pickle  
  inflating: punkt/PY3/polish.pickle  
  inflating: punkt/PY3/russian.pickle  
  inflating: punkt/PY3/czech.pickle  
  inflating: punkt/PY3/portuguese.pickle  
  inflating: punkt/PY3/README        
  inflating: punkt/PY3/dutch.pickle  
  inflating: punkt/PY3/norwegian.pickle  
  inflating: punkt/PY3/slovene.pickle  
  inflating: punkt/PY3/english.pickle  
  inflating: punkt/PY3/danish.pickle  
  inflating: punkt/PY3/finnish.pickle  
  inflating: punkt/PY3/swedish.pickle  
  inflating: punkt/PY3/spanish.pickle  
  inflating: punkt/PY3/german.pickle  
  inflating: punkt/PY3/italian.pickle  
  inflating: punkt/PY3/french.pickle  
  inflating: punkt/russian.pic

In [5]:
import os
from abc import ABC
from typing import Any, List, Optional, Dict, cast

import torch
from langchain_core.language_models.llms import LLM
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from modelscope import AutoModelForCausalLM, AutoTokenizer
from llama_index import GPTVectorStoreIndex, SimpleDirectoryReader
from llama_index import ServiceContext
from llama_index.embeddings.base import BaseEmbedding
from llama_index import set_global_service_context
from langchain_core.retrievers import BaseRetriever
from langchain_core.callbacks import CallbackManagerForRetrieverRun
from langchain_core.documents import Document
from llama_index.retrievers import VectorIndexRetriever


# 大模型配置
llm_name = "Qwen/Qwen-1_8B-Chat"  # 大模型
llm_revision = "master"  # 大模型的版本

# embedding模型配置
embedding_model = "damo/nlp_gte_sentence-embedding_chinese-small"

# 知识库的原始文件路径
knowledge_doc_file_dir = "/mnt/workspace/custom_data/"
knowledge_doc_file_path = knowledge_doc_file_dir + "xianjiaoda.md"

'''
!pip install pypdf langchain unstructured transformers_stream_generator
!pip install modelscope  nltk pydantic  tiktoken  llama-index

!mkdir -p /root/nltk_data/tokenizers
!mkdir -p /root/nltk_data/taggers
!cp /mnt/workspace/punkt.zip /root/nltk_data/tokenizers
!cp /mnt/workspace/averaged_perceptron_tagger.zip /root/nltk_data/taggers
!cd /root/nltk_data/tokenizers; unzip punkt.zip;
!cd /root/nltk_data/taggers; unzip averaged_perceptron_tagger.zip;

!mkdir -p /mnt/workspace/custom_data
!mv /mnt/workspace/xianjiaoda.md /mnt/workspace/custom_data

!cd /mnt/workspace
'''


# 基于llamaIndex的BaseEmbedding封装我们自己的embedding class，以便能够使用modelscope中的embedding模型
class ModelScopeEmbeddings4LlamaIndex(BaseEmbedding, ABC):
    embed: Any = None
    model_id: str = "damo/nlp_gte_sentence-embedding_chinese-small"

    def __init__(
            self,
            model_id: str,
            **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)
        try:
            from modelscope.models import Model
            from modelscope.pipelines import pipeline
            from modelscope.utils.constant import Tasks
            # 使用modelscope的embedding模型（包含下载）
            self.embed = pipeline(Tasks.sentence_embedding, model=self.model_id)

        except ImportError as e:
            raise ValueError(
                "Could not import some python packages." "Please install it with `pip install modelscope`."
            ) from e

    def _get_query_embedding(self, query: str) -> List[float]:
        text = query.replace("\n", " ")
        inputs = {"source_sentence": [text]}
        return self.embed(input=inputs)['text_embedding'][0]

    def _get_text_embedding(self, text: str) -> List[float]:
        text = text.replace("\n", " ")
        inputs = {"source_sentence": [text]}
        return self.embed(input=inputs)['text_embedding'][0]

    def _get_text_embeddings(self, texts: List[str]) -> List[List[float]]:
        texts = list(map(lambda x: x.replace("\n", " "), texts))
        inputs = {"source_sentence": texts}
        return self.embed(input=inputs)['text_embedding']

    async def _aget_query_embedding(self, query: str) -> List[float]:
        return self._get_query_embedding(query)


# 为langchain封装llamaIndex的Retriever（langchain自带的LlamaIndexRetriever的接口与llamaIndex当下的定义不兼容）
class LlamaIndexRetriever(BaseRetriever):
    index: Any
    """LlamaIndex index to query."""

    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
    ) -> List[Document]:
        """Get documents relevant for a query."""
        try:
            from llama_index.indices.base import BaseIndex
            from llama_index.response.schema import Response
        except ImportError:
            raise ImportError(
                "You need to install `pip install llama-index` to use this retriever."
            )
        index = cast(BaseIndex, self.index)
        retriever = VectorIndexRetriever(index=index)
        print('@@@ query=', query)

        response = index.as_query_engine().query(query)
        response = cast(Response, response)
        # parse source nodes
        docs = []
        for source_node in response.source_nodes:
            print('@@@@ source=', source_node)
            metadata = source_node.metadata or {}
            docs.append(
                Document(page_content=source_node.get_text(), metadata=metadata)
            )
        return docs

def torch_gc():
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    DEVICE = "cuda"
    DEVICE_ID = "0"
    CUDA_DEVICE = f"{DEVICE}:{DEVICE_ID}" if DEVICE_ID else DEVICE
    a = torch.Tensor([1, 2])
    a = a.cuda()
    print(a)

    if torch.cuda.is_available():
        with torch.cuda.device(CUDA_DEVICE):
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()


# 定义全局需要的资源
tokenizer = AutoTokenizer.from_pretrained(llm_name, revision=llm_revision, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(llm_name, revision=llm_revision, device_map="auto",
                                             trust_remote_code=True, fp16=True).eval()


# 基于langchain封装modelscope的LLM，我们可以通过langchain使用modelscope上的所有LLM
class QianWenChatLLM(LLM):
    max_length = 10000
    temperature: float = 0.01
    top_p = 0.9

    def __init__(self):
        super().__init__()

    @property
    def _llm_type(self):
        return "ChatLLM"

    def _call(
            self,
            prompt: str,
            stop: Optional[List[str]] = None,
            run_manager=None,
            **kwargs: Any,
    ) -> str:
        print(prompt)
        response, history = model.chat(tokenizer, prompt, history=None)
        torch_gc()
        return response


# STEP1: 创建通义千问的chat llm
qwllm = QianWenChatLLM()
print('STEP1: qianwen LLM created')

# STEP2: 加载知识库文档 并 创建知识库的向量db
print('STEP2: reading docs ...')
# 创建embedding model，并配置到llamaIndex的context中
embeddings = ModelScopeEmbeddings4LlamaIndex(model_id=embedding_model)
service_context = ServiceContext.from_defaults(embed_model=embeddings, llm=None)
set_global_service_context(service_context)     # 全局配置，可能不是一种好的实践

llamaIndex_docs = SimpleDirectoryReader(knowledge_doc_file_dir).load_data()
llamaIndex_index = GPTVectorStoreIndex.from_documents(llamaIndex_docs, chunk_size=512)
retriever = LlamaIndexRetriever(index=llamaIndex_index)
print(' 2.2 reading doc done, vec db created.')

# STEP3: 创建chat template
prompt_template = """请基于```内的内容回答问题。"
```
{context}
```
我的问题是：{question}。
"""
prompt = ChatPromptTemplate.from_template(template=prompt_template)
print('STEP3: chat prompt template created.')

# STEP4: 创建RAG chain以支持问答
chain = (
        {"context": retriever, "question": RunnablePassthrough()}
        | prompt
        | qwllm
        | StrOutputParser()
)
chain.invoke('西安交大的校训是什么？')
# chain.invoke('魔搭社区有哪些模型?')
# chain.invoke('modelscope是什么?')
# chain.invoke('萧峰和乔峰是什么关系?')


2024-01-16 17:06:05,058 - modelscope - INFO - PyTorch version 2.1.0+cu118 Found.
2024-01-16 17:06:05,060 - modelscope - INFO - TensorFlow version 2.14.0 Found.
2024-01-16 17:06:05,061 - modelscope - INFO - Loading ast index from /mnt/workspace/.cache/modelscope/ast_indexer
2024-01-16 17:06:05,061 - modelscope - INFO - No valid ast index found from /mnt/workspace/.cache/modelscope/ast_indexer, generating ast index from prebuilt!
2024-01-16 17:06:05,105 - modelscope - INFO - Loading done! Current index file version is 1.10.0, with md5 44f0b88effe82ceea94a98cf99709694 and a total number of 946 components indexed
  from .autonotebook import tqdm as notebook_tqdm
2024-01-16 17:06:06.173594: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-01-16 17:06:06.206324: E tensorflo

STEP1: qianwen LLM created
STEP2: reading docs ...


Downloading: 100%|██████████| 772/772 [00:00<00:00, 5.78MB/s]
Downloading: 100%|██████████| 2.02k/2.02k [00:00<00:00, 15.1MB/s]
Downloading: 100%|██████████| 60.7k/60.7k [00:00<00:00, 5.56MB/s]
Downloading: 100%|██████████| 57.7M/57.7M [00:00<00:00, 229MB/s]
Downloading: 100%|██████████| 15.3k/15.3k [00:00<00:00, 5.51MB/s]
Downloading: 100%|██████████| 125/125 [00:00<00:00, 978kB/s]
Downloading: 100%|██████████| 291k/291k [00:00<00:00, 27.8MB/s]
Downloading: 100%|██████████| 425/425 [00:00<00:00, 1.81MB/s]
Downloading: 100%|██████████| 68.4k/68.4k [00:00<00:00, 6.29MB/s]
2024-01-16 17:09:10,871 - modelscope - INFO - initiate model from /mnt/workspace/.cache/modelscope/damo/nlp_gte_sentence-embedding_chinese-small
2024-01-16 17:09:10,871 - modelscope - INFO - initiate model from location /mnt/workspace/.cache/modelscope/damo/nlp_gte_sentence-embedding_chinese-small.
2024-01-16 17:09:10,873 - modelscope - INFO - initialize model from /mnt/workspace/.cache/modelscope/damo/nlp_gte_sentence

LLM is explicitly disabled. Using MockLLM.




 2.2 reading doc done, vec db created.
STEP3: chat prompt template created.
@@@ query= 西安交大的校训是什么？
@@@@ source= Node ID: 28f7450e-0d60-49b1-a93a-5da17846d3d1
Text: 西安交通大学是我国最早兴办、享誉海内外的著名高等学府，是教育部直属重点大学。西迁以来，一代代交大人扎根西部、服务国家，为西部发展
和国家建设作出了卓越贡献，以实际行动铸就了第一批纳入中国共产党人精神谱系的西迁精神。2017年12月，习近平总书记对学校15位老教授来信作出
重要指示。在2018年新年贺词中，习近平总书记再次提到“西安交大西迁的老教授”。2020年4月22日，习近平总书记来校考察并发表重要讲话，强
调西迁精神的核心是爱国主义，精髓是听党指挥跟党走，与党和国家、与民族和人民同呼吸、共命运，勉励师生在新时代创造属于我们这代人的历史功绩，给全
校师生以巨大关怀和极大鼓舞，为学校新时代建设中国特色世界一流大学提供了根本遵循和行动指南。
十九世纪末，甲午战败，民族危难。近代著名实业家、教育...
Score:  0.916

@@@@ source= Node ID: c600566c-65ed-4b98-9d3a-ef2992892215
Text: 2000年国务院决定将西安交通大学、西安医科大学、陕西财经学院三校合并，组建新的西安交通大学。
学校是“七五”“八五”重点建设单位，首批进入国家“211”和“985”工程建设学校。2017 年入选国家一流大学建设名单 A
类建设高校，2022 年入选国家第二轮“双一流”建设高校，8 个学科入选“双一流”建设学科。据 ESI 公布的数据，截至 2023 年 5
月，学校 17 个学科进入世界学术机构前 1%，5 个学科进入前 1‰，其中工程学进入前万分之一。  学校是涵盖理、工、医、经、管、文、法、
哲、艺、教育、交叉等11个学科门类的综合性研究型大学，设有32个学院（部、中心）、9个本科书院和3所直属附属医院。现有在编教工6635人，其
中专任教师3789人。师资队伍中入选院士、杰青等国...
Score:  0.874

Human: 请基于```内的内容回答问题。"
```
[Docume



tensor([1., 2.], device='cuda:0')


'西安交通大学校训是：“求实学、务实业”'