In [None]:
!pip install llama-index llama-index-llms-huggingface ipywidgets
!pip install transformers -U

In [2]:
import logging
import sys

# Send INFO-and-above log records to stdout so they show up in the cell output.
# `force=True` replaces any handlers already attached to the root logger, so
# exactly one stdout handler is installed; adding a second StreamHandler on top
# of basicConfig made every record print twice.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)


from IPython.display import Markdown, display
import torch
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core.prompts import PromptTemplate
from modelscope import snapshot_download
from llama_index.core.base.embeddings.base import BaseEmbedding, Embedding
from abc import ABC
from typing import Any, List, Optional, Dict, cast
from llama_index.core import (
    VectorStoreIndex,
    ServiceContext,
    set_global_service_context,
    Settings,
    SimpleDirectoryReader,
)

2024-02-21 13:49:53,743 - modelscope - INFO - PyTorch version 2.1.2+cu121 Found.
2024-02-21 13:49:53,745 - modelscope - INFO - TensorFlow version 2.14.0 Found.
2024-02-21 13:49:53,746 - modelscope - INFO - Loading ast index from /mnt/workspace/.cache/modelscope/ast_indexer
2024-02-21 13:49:53,746 - modelscope - INFO - No valid ast index found from /mnt/workspace/.cache/modelscope/ast_indexer, generating ast index from prebuilt!
2024-02-21 13:49:53,803 - modelscope - INFO - Loading done! Current index file version is 1.12.0, with md5 509123dba36c5e70a95f6780df348471 and a total number of 964 components indexed


In [3]:
# ModelScope model id of the chat model (this is the Qwen1.5 4B chat checkpoint).
qwen2_4B_CHAT = "qwen/Qwen1.5-4B-Chat"

# Fetch the model files through ModelScope; the returned value is passed below
# as both the tokenizer and the model location.
selected_model = snapshot_download(qwen2_4B_CHAT)

SYSTEM_PROMPT = """You are a helpful AI assistant.
"""

# Qwen1.5 chat models expect the ChatML layout (<|im_start|>role ... <|im_end|>),
# not the Llama-2 style [INST]/<<SYS>> markers, so wrap each query in ChatML.
# The system prompt is stripped so its trailing newline does not end up inside
# the system turn.
query_wrapper_prompt = PromptTemplate(
    "<|im_start|>system\n" + SYSTEM_PROMPT.strip() + "<|im_end|>\n"
    "<|im_start|>user\n{query_str}<|im_end|>\n"
    "<|im_start|>assistant\n"
)

llm = HuggingFaceLLM(
    context_window=4096,
    max_new_tokens=2048,
    # Greedy decoding. `temperature` only takes effect when sampling, so it is
    # left out rather than passed alongside do_sample=False.
    generate_kwargs={"do_sample": False},
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name=selected_model,
    model_name=selected_model,
    device_map="auto",
    # change these settings below depending on your GPU
    model_kwargs={"torch_dtype": torch.float16},
)

Downloading: 100%|██████████| 662/662 [00:00<00:00, 6.94MB/s]
Downloading: 100%|██████████| 51.0/51.0 [00:00<00:00, 586kB/s]
Downloading: 100%|██████████| 178/178 [00:00<00:00, 2.13MB/s]
Downloading: 100%|██████████| 1.59M/1.59M [00:00<00:00, 27.9MB/s]
Downloading: 100%|█████████▉| 3.72G/3.72G [00:08<00:00, 449MB/s]
Downloading: 100%|█████████▉| 3.64G/3.64G [00:11<00:00, 336MB/s]
Downloading: 100%|██████████| 38.7k/38.7k [00:00<00:00, 40.0MB/s]
Downloading: 100%|██████████| 4.13k/4.13k [00:00<00:00, 5.90MB/s]
Downloading: 100%|██████████| 6.70M/6.70M [00:00<00:00, 121MB/s]
Downloading: 100%|██████████| 1.13k/1.13k [00:00<00:00, 12.4MB/s]
Downloading: 100%|██████████| 2.65M/2.65M [00:00<00:00, 91.6MB/s]


INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
!mkdir -p 'data/xianjiaoda/'
!wget 'https://modelscope.oss-cn-beijing.aliyuncs.com/resource/rag/xianjiaoda.md' -O 'data/xianjiaoda/xianjiaoda.md'

--2024-02-21 13:51:01--  https://modelscope.oss-cn-beijing.aliyuncs.com/resource/rag/xianjiaoda.md
正在解析主机 modelscope.oss-cn-beijing.aliyuncs.com (modelscope.oss-cn-beijing.aliyuncs.com)... 8.131.208.119
正在连接 modelscope.oss-cn-beijing.aliyuncs.com (modelscope.oss-cn-beijing.aliyuncs.com)|8.131.208.119|:443... 已连接。
已发出 HTTP 请求，正在等待回应... 200 OK
长度： 13228 (13K) [text/markdown]
正在保存至: ‘data/xianjiaoda/xianjiaoda.md’


2024-02-21 13:51:01 (31.7 MB/s) - 已保存 ‘data/xianjiaoda/xianjiaoda.md’ [13228/13228])



In [None]:
# Load every file in the data directory into llama-index documents.
# The relative path matches the location the wget cell writes to, instead of
# an absolute path that only exists on one particular machine.
documents = SimpleDirectoryReader("data/xianjiaoda/").load_data()
# Show how many documents were loaded rather than dumping their full text.
len(documents)

In [6]:
embedding_model = "damo/nlp_gte_sentence-embedding_chinese-base"


class ModelScopeEmbeddings4LlamaIndex(BaseEmbedding, ABC):
    """Embedding backend for llama-index backed by a ModelScope pipeline.

    The pipeline is built once in ``__init__`` for the ``sentence_embedding``
    task and is then called with ``{"source_sentence": [...]}``; the
    ``'text_embedding'`` entry of its result is converted to plain lists.
    """

    # ModelScope pipeline object created in __init__ and called to embed text.
    embed: Any = None
    # ModelScope model id the pipeline is created from.
    model_id: str = "damo/nlp_gte_sentence-embedding_chinese-base"

    def __init__(
            self,
            model_id: str,
            **kwargs: Any,
    ) -> None:
        """Create the embedding pipeline for ``model_id``.

        Args:
            model_id: ModelScope model id passed to the pipeline as ``model``.
            **kwargs: Forwarded unchanged to ``BaseEmbedding.__init__``.

        Raises:
            ValueError: If an ImportError occurs while importing ModelScope
                or while constructing the pipeline.
        """
        # Forward model_id so the instance really uses the requested model;
        # without this the argument was ignored and the class default was
        # always loaded.
        super().__init__(model_id=model_id, **kwargs)
        try:
            from modelscope.pipelines import pipeline
            from modelscope.utils.constant import Tasks
            # Use ModelScope's embedding model (this includes downloading it).
            self.embed = pipeline(Tasks.sentence_embedding, model=self.model_id)

        except ImportError as e:
            raise ValueError(
                "Could not import some python packages. "
                "Please install it with `pip install modelscope`."
            ) from e

    def _get_query_embedding(self, query: str) -> List[float]:
        """Embed a query string; handled exactly like a single text."""
        return self._get_text_embedding(query)

    def _get_text_embedding(self, text: str) -> List[float]:
        """Embed one text and return its vector as a list.

        Newlines are replaced with spaces before the text is sent to the
        pipeline as a one-element batch; the first result row is returned.
        """
        text = text.replace("\n", " ")
        inputs = {"source_sentence": [text]}
        return self.embed(input=inputs)['text_embedding'][0].tolist()

    def _get_text_embeddings(self, texts: List[str]) -> List[List[float]]:
        """Embed a batch of texts in a single pipeline call.

        Returns an empty list for an empty batch without calling the pipeline.
        """
        if not texts:
            return []
        cleaned = [text.replace("\n", " ") for text in texts]
        inputs = {"source_sentence": cleaned}
        return self.embed(input=inputs)['text_embedding'].tolist()

    async def _aget_query_embedding(self, query: str) -> List[float]:
        """Async variant of the query embedding; delegates to the sync method."""
        return self._get_query_embedding(query)


In [7]:
# Instantiate the ModelScope embedding wrapper for the chosen embedding model.
embeddings = ModelScopeEmbeddings4LlamaIndex(model_id=embedding_model)

# Register the LLM and the embedding model globally. `Settings` supersedes the
# deprecated ServiceContext / set_global_service_context API of llama-index,
# which no longer works in newer releases.
Settings.llm = llm
Settings.embed_model = embeddings

# Embed the loaded documents and build a vector index over them.
index = VectorStoreIndex.from_documents(documents)

INFO:datasets:PyTorch version 2.1.2+cu121 available.
PyTorch version 2.1.2+cu121 available.
INFO:datasets:TensorFlow version 2.14.0 available.
TensorFlow version 2.14.0 available.


Downloading: 100%|██████████| 917/917 [00:00<00:00, 6.18MB/s]
Downloading: 100%|██████████| 2.29k/2.29k [00:00<00:00, 23.5MB/s]
Downloading: 100%|██████████| 60.7k/60.7k [00:00<00:00, 26.3MB/s]
Downloading: 100%|██████████| 195M/195M [00:00<00:00, 383MB/s] 
Downloading: 100%|██████████| 11.4k/11.4k [00:00<00:00, 40.4MB/s]
Downloading: 100%|██████████| 125/125 [00:00<00:00, 684kB/s]
Downloading: 100%|██████████| 429k/429k [00:00<00:00, 20.8MB/s]
Downloading: 100%|██████████| 366/366 [00:00<00:00, 4.25MB/s]
2024-02-21 13:51:15,095 - modelscope - INFO - initiate model from /mnt/workspace/.cache/modelscope/damo/nlp_gte_sentence-embedding_chinese-base
2024-02-21 13:51:15,096 - modelscope - INFO - initiate model from location /mnt/workspace/.cache/modelscope/damo/nlp_gte_sentence-embedding_chinese-base.
2024-02-21 13:51:15,096 - modelscope - INFO - initialize model from /mnt/workspace/.cache/modelscope/damo/nlp_gte_sentence-embedding_chinese-base
  return self.fget.__get__(instance, owner)()

In [8]:
# Build a query engine from the vector index using the default arguments.
# (For more detailed output, raise the logging level to DEBUG where logging is configured.)
query_engine = index.as_query_engine()

In [10]:
# Ask the query engine a question about the indexed document and print the answer.
question = "西安交大是由哪几个学校合并的?"
response = query_engine.query(question)
print(response)

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


2000年国务院决定将西安交通大学、西安医科大学、陕西财经学院三校合并，组建新的西安交通大学
