# R4: 模型高效服務
- 向量資料庫
- 量化服務

In [None]:
!pip install langchain_core langchain_chroma langchain_community sentence_transformers

In [9]:
from langchain_core.documents import Document
from langchain_chroma import Chroma
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

import re
import chromadb
from pprint import pprint

import pandas as pd
from sentence_transformers import SentenceTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.metrics.pairwise import cosine_similarity

import torch
from transformers import BitsAndBytesConfig
from transformers import LlamaForCausalLM

  from tqdm.autonotebook import tqdm, trange


## 向量資料庫的基本操作
Chroma 是用於建立具有嵌入向量（vector embedding）的 AI 應用程式的向量資料庫。它們可以表示文字、圖像，很快還可以表示音訊和視訊。

### 建立DB
集合（collection）是您儲存嵌入、文件和任何其他元資料的地方。您可以建立一個具有以下名稱的集合（相當於關聯式資料庫 MySQL 裡面的資料表名稱）

In [10]:
# Create a Chroma client whose data is stored under the "document_store" path,
# so that it can be re-opened later from the same path.
chroma_client = chromadb.PersistentClient(path="document_store")
# Fetch the collection named "collection_name", creating it if it does not exist yet.
collection = chroma_client.get_or_create_collection(name="collection_name")

### 匯入資料
這裡的 documents 是你的數據內容；元數據（Metadata）是關於數據的組織、數據域及其關係的信息，簡言之，元數據就是關於數據的數據，可以是你自己定義的章節等內容；ids 是每筆文件的唯一識別碼。

In [11]:
# Insert four sample documents. The three lists are parallel: the i-th metadata
# dict and the i-th id belong to the i-th document. No embeddings are passed
# here, so the collection computes them itself (presumably with Chroma's
# default embedding model -- see the model download in the cell output).
collection.add(
    documents=[
        "This is a document about pineapple",
        "This is a document about oranges",
        "This is a document about mango",
        "This is a document about apple",
    ],
    metadatas=[{"chapter": "1", "verse": "a"},
          {"chapter": "1", "verse": "a"},
          {"chapter": "2", "verse": "a"},
          {"chapter": "2", "verse": "a"}],
    ids=["id1", "id2", "id3", "id4"]
)
# Dump everything stored in the collection to confirm the insert.
pprint(collection.get())

/root/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:00<00:00, 83.5MiB/s]


{'data': None,
 'documents': ['This is a document about pineapple',
               'This is a document about oranges',
               'This is a document about mango',
               'This is a document about apple'],
 'embeddings': None,
 'ids': ['id1', 'id2', 'id3', 'id4'],
 'included': ['metadatas', 'documents'],
 'metadatas': [{'chapter': '1', 'verse': 'a'},
               {'chapter': '1', 'verse': 'a'},
               {'chapter': '2', 'verse': 'a'},
               {'chapter': '2', 'verse': 'a'}],
 'uris': None}


#### 讀取DB
讀取先前保存好的db，當document龐大時不用每次都重新轉embedding

In [12]:
# Open the same on-disk store through a second client to show that the
# previously added documents can be loaded without being re-embedded.
client2 = chromadb.PersistentClient(path="document_store")
collection2 = client2.get_or_create_collection(name="collection_name")
# The listing should match what was inserted through the first client.
pprint(collection2.get())

{'data': None,
 'documents': ['This is a document about pineapple',
               'This is a document about oranges',
               'This is a document about mango',
               'This is a document about apple'],
 'embeddings': None,
 'ids': ['id1', 'id2', 'id3', 'id4'],
 'included': ['metadatas', 'documents'],
 'metadatas': [{'chapter': '1', 'verse': 'a'},
               {'chapter': '1', 'verse': 'a'},
               {'chapter': '2', 'verse': 'a'},
               {'chapter': '2', 'verse': 'a'}],
 'uris': None}


#### 檢索資料
根據問題檢索文檔的相似度

In [14]:
# Similarity search: rank the stored documents against a free-text query.
# The result lists are nested (one inner list per query text) and ordered by
# increasing distance.
results = collection2.query(
    query_texts=["This is a query document about Hawaii"], # Chroma will embed this for you
    n_results=4 # how many results to return
)
pprint(results)

{'data': None,
 'distances': [[1.0404008937271816,
                1.1399504747618734,
                1.2430800215233073,
                1.3259602282234741]],
 'documents': [['This is a document about pineapple',
                'This is a document about mango',
                'This is a document about oranges',
                'This is a document about apple']],
 'embeddings': None,
 'ids': [['id1', 'id3', 'id2', 'id4']],
 'included': ['metadatas', 'documents', 'distances'],
 'metadatas': [[{'chapter': '1', 'verse': 'a'},
                {'chapter': '2', 'verse': 'a'},
                {'chapter': '1', 'verse': 'a'},
                {'chapter': '2', 'verse': 'a'}]],
 'uris': None}


#### 新增資料
因應營運需要，可以在既有的資料庫中持續新增新文檔

In [15]:
# Append two more documents to the existing collection; they are tagged with
# chapter "3" / verse "b" so later cells can filter on (and delete by) metadata.
collection2.add(
    documents=["This is a document about plum",
          "This is a document about cherry"],
    metadatas=[{"chapter": "3", "verse": "b"},
          {"chapter": "3", "verse": "b"}],
    ids=["id5", "id6"]
)

In [16]:
# List the collection again: it should now contain id1..id6.
collection2.get()

{'ids': ['id1', 'id2', 'id3', 'id4', 'id5', 'id6'],
 'embeddings': None,
 'metadatas': [{'chapter': '1', 'verse': 'a'},
  {'chapter': '1', 'verse': 'a'},
  {'chapter': '2', 'verse': 'a'},
  {'chapter': '2', 'verse': 'a'},
  {'chapter': '3', 'verse': 'b'},
  {'chapter': '3', 'verse': 'b'}],
 'documents': ['This is a document about pineapple',
  'This is a document about oranges',
  'This is a document about mango',
  'This is a document about apple',
  'This is a document about plum',
  'This is a document about cherry'],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents']}

#### 檢索特定範圍的資料

In [17]:
# Filter by metadata: only documents whose "verse" equals "a" are candidates.
# n_results is an upper bound; fewer rows come back when fewer documents match.
collection2.query(
    query_texts=["This is a query document about Hawaii"],
    n_results=10,
    where={"verse": "a"}
)



{'ids': [['id1', 'id3', 'id2', 'id4']],
 'distances': [[1.0404008937271816,
   1.1399504747618734,
   1.2430800215233073,
   1.3259602282234741]],
 'metadatas': [[{'chapter': '1', 'verse': 'a'},
   {'chapter': '2', 'verse': 'a'},
   {'chapter': '1', 'verse': 'a'},
   {'chapter': '2', 'verse': 'a'}]],
 'embeddings': None,
 'documents': [['This is a document about pineapple',
   'This is a document about mango',
   'This is a document about oranges',
   'This is a document about apple']],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents', 'distances']}

In [18]:
# Filter by document content: only documents whose text contains the
# substring "p" are candidates for the similarity ranking.
collection2.query(
    query_texts=["This is a query document about hawaii"],
    n_results=10,
    where_document={"$contains":"p"}
)



{'ids': [['id1', 'id5', 'id4']],
 'distances': [[1.0404008937271816, 1.2933018376352365, 1.3259602282234741]],
 'metadatas': [[{'chapter': '1', 'verse': 'a'},
   {'chapter': '3', 'verse': 'b'},
   {'chapter': '2', 'verse': 'a'}]],
 'embeddings': None,
 'documents': [['This is a document about pineapple',
   'This is a document about plum',
   'This is a document about apple']],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents', 'distances']}

#### 刪除文檔

In [19]:
# Delete by metadata filter instead of by id.
collection2.delete(
    where={"verse": {"$eq": "b"}}, # matches documents whose metadata field "verse" equals "b"
)

In [20]:
# List the collection again: the two verse "b" documents (id5, id6) should be gone.
collection2.get()

{'ids': ['id1', 'id2', 'id3', 'id4'],
 'embeddings': None,
 'metadatas': [{'chapter': '1', 'verse': 'a'},
  {'chapter': '1', 'verse': 'a'},
  {'chapter': '2', 'verse': 'a'},
  {'chapter': '2', 'verse': 'a'}],
 'documents': ['This is a document about pineapple',
  'This is a document about oranges',
  'This is a document about mango',
  'This is a document about apple'],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents']}

## 量化服務
- Embeddings may be challenging to scale up, which leads to expensive solutions and high latencies. Currently, many state-of-the-art models produce embeddings with 1024 dimensions, each of which is encoded in float32, i.e., they require 4 bytes per dimension. To perform retrieval over 50 million vectors, you would therefore need around 200GB of memory. This tends to require complex and costly solutions at scale.

#### Sample code

In [21]:
# Load a small sentence-embedding model.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode the same two sentences twice: once with the default precision ...
corpus = ["I am driving to the lake.", "It is a beautiful day."]
embeddings = model.encode(corpus)

# ... and once quantized to binary, to compare size and dtype in the next cells.
binary_embeddings = model.encode(corpus, precision="binary")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [22]:
# Footprint of the default-precision embeddings: array shape, total bytes, element type.
print(embeddings.shape)
print(embeddings.nbytes)
print(embeddings.dtype)

(2, 384)
3072
float32


In [23]:
# Same statistics for the binary embeddings, for comparison with the cell above.
print(binary_embeddings.shape)
print(binary_embeddings.nbytes)
print(binary_embeddings.dtype)

(2, 48)
96
int8


#### Text classification example

In [26]:
# Load the IMDB training split directly from the Hugging Face Hub as a DataFrame
# (columns: text, label).
df = pd.read_parquet('https://huggingface.co/datasets/stanfordnlp/imdb/resolve/main/plain_text/train-00000-of-00001.parquet')
# Display the DataFrame as the cell output.
df

Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0
...,...,...
24995,A hit at the time but now better categorised a...,1
24996,I love this movie like no other. Another time ...,1
24997,This film and it's sequel Barry Mckenzie holds...,1
24998,'The Adventures Of Barry McKenzie' started lif...,1


In [27]:
# (Re)load the embedding model so this section can run on its own; it is the
# same checkpoint that the sample-code section loads.
model = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
# Quantization helper from the same package as the model; imported here so the
# cell is self-contained.
from sentence_transformers.quantization import quantize_embeddings

# Embed every review once as float32 vectors (the expensive step).
corpus = df['text'].tolist()
embeddings = model.encode(corpus, show_progress_bar=True)

# Derive the binary embeddings from the float32 ones instead of running the
# model over the whole corpus a second time. `encode(..., precision="binary")`
# applies this same quantization step to the float embeddings it computes, so
# the result is the same while the encoding cost is halved.
binary_embeddings = quantize_embeddings(embeddings, precision="binary")

In [None]:
# Linear classifier used to compare both embedding variants; max_iter is raised
# to 1000 and random_state is fixed so repeated runs are comparable.
clf = LogisticRegression(max_iter=1000, random_state=0)

In [None]:
# 5-fold cross-validated accuracy on the float32 embeddings (baseline);
# n_jobs=-1 uses all available cores, and train scores are also returned.
cross_validate(clf, embeddings, df['label'].tolist(), scoring='accuracy', cv=5, n_jobs=-1, return_train_score=True)

{'fit_time': array([0.36668587, 0.3979466 , 0.42268515, 0.41765976, 0.37822866]),
 'score_time': array([0.01199913, 0.01700163, 0.00935125, 0.0099144 , 0.01199985]),
 'test_score': array([0.806 , 0.801 , 0.8006, 0.7982, 0.7984]),
 'train_score': array([0.81995, 0.8187 , 0.82145, 0.82275, 0.82205])}

In [None]:
# Same evaluation on the binary embeddings, to measure the accuracy cost of quantization.
cross_validate(clf, binary_embeddings, df['label'].tolist(), scoring='accuracy', cv=5, n_jobs=-1, return_train_score=True)

{'fit_time': array([0.05052805, 0.04852581, 0.05151486, 0.06106687, 0.05105639]),
 'score_time': array([0.00199914, 0.00200224, 0.00199938, 0.00199938, 0.00201774]),
 'test_score': array([0.6598, 0.6604, 0.637 , 0.6506, 0.6476]),
 'train_score': array([0.65905, 0.6562 , 0.66315, 0.6591 , 0.66025])}

### 將 Embedding 量化並放入向量資料庫

In [None]:
# Initialise the embedding model (embeddings are L2-normalised).
embedding_func = HuggingFaceEmbeddings(
    model_name="infgrad/stella-base-zh-v3-1792d",
    encode_kwargs={"normalize_embeddings": True})

# Convert two news headlines about the same product launch into vectors.
a = embedding_func.embed_query('突襲式發表！蘋果推 2 款 M3 MacBook Air，強調 AI 、遊戲效能皆強化')
b = embedding_func.embed_query('蘋果最新M3版MacBook Air突襲登場！6亮點下放1技術不漲價 M2版還降3000元')

# Compute their cosine similarity (returned as a 1x1 array).
cosine_similarity([a], [b])

  warn_deprecated(
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of BertModel were not initialized from the model checkpoint at infgrad/stella-base-zh-v3-1792d and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


array([[0.90356266]])

In [None]:
# Initialise the same embedding model, this time asking for binary-quantized
# output. Note that `normalize_embeddings` is no longer passed here, and that
# this rebinds `embedding_func`, so later cells use the binary variant.
embedding_func = HuggingFaceEmbeddings(
    model_name="infgrad/stella-base-zh-v3-1792d",
    encode_kwargs={"precision":"binary"})

# Convert the same two headlines into vectors.
a = embedding_func.embed_query('突襲式發表！蘋果推 2 款 M3 MacBook Air，強調 AI 、遊戲效能皆強化')
b = embedding_func.embed_query('蘋果最新M3版MacBook Air突襲登場！6亮點下放1技術不漲價 M2版還降3000元')

# Compute the similarity.
# NOTE(review): the binary embeddings appear to be bit-packed int8 values (see
# the shape/dtype printed in the sample-code section), so cosine similarity here
# is taken over packed bytes rather than individual bits; Hamming distance is
# the usual metric for binary embeddings -- confirm this is the intended comparison.
cosine_similarity([a], [b])

Some weights of BertModel were not initialized from the model checkpoint at infgrad/stella-base-zh-v3-1792d and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


array([[0.72331135]])

In [None]:
url = "https://www.bnext.com.tw/article/76864/what-is-the-meaning-of-llm"

# Download the article and wrap it in LangChain Document objects.
loader = WebBaseLoader(url)
news_docs = loader.load()
# Remove every newline that is followed by a run of whitespace (layout padding
# left over from the HTML); a newline directly followed by text is kept.
# The pattern is a raw string so that `\s` reaches the regex engine unchanged
# instead of being an invalid string escape (a SyntaxWarning on Python 3.12+).
news_docs[0].page_content = re.sub(r'\n\s+', '', news_docs[0].page_content)

# Split the page into chunks of at most 200 characters, with 20 characters of
# overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=20)
texts_chunks = text_splitter.split_documents(news_docs)
pprint(texts_chunks)

[Document(page_content='LLM是什麼？跟AI的關聯為何？大型語言模型要面對什麼挑戰？一文看懂|數位時代 BusinessNextABOUT US廣告合作內容授權新聞最新新聞', metadata={'source': 'https://www.bnext.com.tw/article/76864/what-is-the-meaning-of-llm', 'title': 'LLM是什麼？跟AI的關聯為何？大型語言模型要面對什麼挑戰？一文看懂|數位時代 BusinessNext', 'description': 'LLM（大型語言模型）是一種深度學習模型，它能從大量的文章、影音、書籍中學習單詞和句子之間的關係，然後回答問題、翻譯、生成文本。', 'language': 'zh-Hant-TW'}),
 Document(page_content='熱門圖解前端科技產業應用數位生活服務消費企業職場時事焦點AI與大數據5G通訊電動車／交通科技物聯網區塊鏈能源環保醫療生技半導體與電子產業資訊安全智慧製造雲端運算與服務智慧城市遊戲／電競3C生活影音／新媒體教育／人文金融科技新零售服務創新創新創業商業經營行銷與MARTECH職場／工作術程式開發深度專題\n影音新聞\n專家觀點社群未來商務創業小聚Web3+活動\n課程\n雜誌登入\n/\n註冊熱門\n新聞\n專題', metadata={'source': 'https://www.bnext.com.tw/article/76864/what-is-the-meaning-of-llm', 'title': 'LLM是什麼？跟AI的關聯為何？大型語言模型要面對什麼挑戰？一文看懂|數位時代 BusinessNext', 'description': 'LLM（大型語言模型）是一種深度學習模型，它能從大量的文章、影音、書籍中學習單詞和句子之間的關係，然後回答問題、翻譯、生成文本。', 'language': 'zh-Hant-TW'}),
 Document(page_content='雜誌登入\n/\n註冊熱門\n新聞\n專題\n影音\n活動2023.09.27\n|\nAI與大數據LLM是什麼？跟AI的關聯為何？大型語言模型要面對什麼挑戰？一文看懂', metadata={'source': 'http

In [None]:
# Embed the chunks and load them into a Chroma vector store.
# NOTE(review): `embedding_func` is whatever the last executed cell bound it to;
# when the notebook is run top to bottom that is the binary-precision variant,
# which presumably explains the very large distance in the output -- confirm.
db = Chroma.from_documents(texts_chunks, embedding_func)

# Query the store; each hit is a (Document, score) pair.
query = "什麼是 LLM 模型？"
docs = db.similarity_search_with_score(query)
# Show the first hit.
docs[0]

(Document(page_content='LLM（大型語言模型）是什麼？', metadata={'description': 'LLM（大型語言模型）是一種深度學習模型，它能從大量的文章、影音、書籍中學習單詞和句子之間的關係，然後回答問題、翻譯、生成文本。', 'language': 'zh-Hant-TW', 'source': 'https://www.bnext.com.tw/article/76864/what-is-the-meaning-of-llm', 'title': 'LLM是什麼？跟AI的關聯為何？大型語言模型要面對什麼挑戰？一文看懂|數位時代 BusinessNext'}),
 800762.0)

## 總結
- 量化雖然能加速，但也會掉精準度，值不值得就看專案的需求
- 也因此後面有發展出許多其他量化的技術，嘗試在加速的同時不要掉太多效度