<a href="https://colab.research.google.com/github/niikun/learning_gemini/blob/main/Embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Embedding


In [1]:
!pip install -q -U google-generativeai

In [2]:
from google.colab import userdata
import google.generativeai as genai
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)

In [3]:
# for m in genai.list_models():
#     # Convert the model object's attributes to a dictionary
#     model_dict = {key: getattr(m, key) for key in dir(m) if not key.startswith('_')}

#     # Remove unexpected keys
#     if 'max_temperature' in model_dict:
#         del model_dict['max_temperature']

#     # Print the model information
#     print(model_dict)

## text-embedding-004の使い方

In [4]:
def embedding(texts):
    return genai.embed_content(
        model="models/text-embedding-004",
        content=texts,
    )["embedding"]

In [6]:
texts = ["Hello,Nippon"]
embeds_1 = embedding(texts)
print(embeds_1,"\n",len(embeds_1[0]))

[[-0.0011010753, 0.029711585, 0.0056836004, 0.030616648, 0.026661484, 0.04196647, 0.06396211, 0.0031853358, 0.022116935, 0.052044354, -0.0331204, 0.0055429162, 0.0041644867, 0.0084454, -0.09033817, -0.04253521, 0.04968971, 0.028258754, -0.06471178, 0.0044787843, 0.013444823, 0.03048044, 0.011999661, 0.018809153, -0.024157504, 0.06812055, -0.056437507, -0.00027996162, -0.02849936, -0.004328436, -0.07434978, 0.00974244, 0.016949635, -0.010299489, 4.481605e-05, 0.044382885, -0.010417502, 0.052939367, 0.037198167, -0.08997924, 0.010272823, 0.053499475, 0.029246897, 0.02567525, -0.029034816, 0.012084723, 0.037862536, 0.07812992, -0.033583466, 0.009323488, 0.03501291, -0.016606484, -0.06576129, 0.019133031, 0.017816208, -0.020673962, -0.039589483, -0.010243201, 0.038810212, -0.0047679157, -0.039988328, -0.03159898, 0.024442703, 0.01908776, -0.018697776, -0.033458047, -0.023508132, -0.013641903, -0.039826427, -0.0142673, 0.009675121, -0.023106126, -0.032626573, 0.024609124, -0.05005463, 0.034

In [7]:
texts_2 = ["hello,Japan"]
embeds_2 = embedding(texts_2)
print(embeds_2,"\n",len(embeds_2[0]))

[[-0.0022049171, 0.02491195, -0.0054116887, 0.012948, 0.022547016, 0.051785946, 0.07655057, -0.005147262, 0.034436747, 0.067665845, -0.0491255, -0.013027362, -0.002657021, 0.015858492, -0.08021673, -0.04504765, 0.04028893, 0.019017411, -0.05390016, -0.018217292, 0.007851342, 0.019549489, -0.0033197312, 0.015685247, -0.036957845, 0.072617956, -0.06558123, -0.00097276265, -0.01132729, -0.011787266, -0.07457305, 0.008067304, -0.006868727, -0.0248161, -0.018601041, 0.032325596, -0.03286583, 0.05288702, 0.02239743, -0.10360042, -0.00036237988, 0.05070189, 0.04499737, 0.01688809, -0.018904628, 0.0074999346, 0.046586003, 0.0697927, -0.024939258, 0.00086454535, 0.035230707, -0.012320776, -0.07471922, -0.008389674, 0.009265163, 0.023592573, -0.035499536, -0.005504258, 0.043939933, 0.00223121, -0.037334453, -0.024649153, 0.02993828, -0.01165099, -0.0088941185, -0.042561542, -0.026687117, -0.012340887, -0.029648287, -0.03620713, 0.005757219, -0.0260175, -0.04411481, 0.0327304, -0.05706533, 0.0116

In [8]:
import numpy as np
embeds_1 = np.array(embeds_1)
embeds_2 = np.array(embeds_2)
np.dot(embeds_1, embeds_2.T)

array([[0.93593453]])

## text-embedding-004の近傍探索  
「近傍探索」は、入力テキストと最も関連性の高いものを対象テキストから探すタスクです。  
今回は、Meta社が開発したオープンソースの効率的な類似性検索ライブラリ「Faiss」を使って近傍探索を行います。

### Faissのパッケージのインストール

In [9]:
!pip install faiss-cpu



### 入力テキストと対象テキストの準備と埋め込みへの変換

In [None]:
in_texts = ["I am glad it did not rain today."]

target_texts = [
    "What is your favorite food?",
    "Where do you live?",
    "Morning trains are crowded.",
    "It is a nice weather today.",
    "the economy is bad lately."
]

In [None]:
in_embeds = embedding(in_texts)
target_embeds = embedding(target_texts)

### numpy に変換

In [None]:
import numpy as np
in_embeds = np.array(in_embeds).astype("float32")
target_embeds = np.array(target_embeds).astype("float32")

In [None]:
len(in_embeds[0]),len(target_embeds[0])

In [None]:
in_embeds[0][:10]

### Faissのインデックス生成

- IndexFlatL2（L2ノルム）  
ユークリッド距離を使用してベクトルの距離を計算する最も基本的なインデックス
- IndexFlatIP（コサイン類似度）  
内積を使用してベクトル間の類似度を計算するインデックス
- IndexIVFFlat（高速化アルゴリズム）  
高次元ベクトルをクラスタリングすることによって、ベクトルの検索を高速化するインデックス

In [None]:
import faiss
index = faiss.IndexFlatL2(len(in_embeds[0]))

In [None]:
index.add(target_embeds)
index

#### index.search()の引数  
- in_embeds:  
入力テキストの埋め込みベクトル（numpy配列）
- k:  
返される最近傍ベクトルの数  

#### index.search()の戻り値  
-  distances:  
最近傍ベクトルの距離
- indices:  
最近傍ベクトルのインデックス

In [None]:
distances,indices = index.search(in_embeds,1)
print(distances)
print(indices)
print(target_texts[indices[0][0]])

## bge-m3の使い方  
bge-m3は、多機能、多言語、多粒度の特徴を持つ埋め込みモデル。  
日本語の埋め込みベンチマークでも高スコアをマークしています
[BAAI/bge-m3](https://huggingface.co/BAAI/bge-m3)
https://huggingface.co/BAAI/bge-m3

In [10]:
!pip install FlagEmbedding



In [11]:
from FlagEmbedding import BGEM3FlagModel
model = BGEM3FlagModel("BAAI/bge-m3",use_fp16=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

imgs/.DS_Store:   0%|          | 0.00/6.15k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

imgs/bm25.jpg:   0%|          | 0.00/132k [00:00<?, ?B/s]

colbert_linear.pt:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/15.8k [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

imgs/long.jpg:   0%|          | 0.00/485k [00:00<?, ?B/s]

imgs/miracl.jpg:   0%|          | 0.00/576k [00:00<?, ?B/s]

imgs/nqa.jpg:   0%|          | 0.00/158k [00:00<?, ?B/s]

imgs/mkqa.jpg:   0%|          | 0.00/608k [00:00<?, ?B/s]

imgs/others.webp:   0%|          | 0.00/21.0k [00:00<?, ?B/s]

long.jpg:   0%|          | 0.00/127k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

onnx/config.json:   0%|          | 0.00/698 [00:00<?, ?B/s]

onnx/Constant_7_attr__value:   0%|          | 0.00/65.6k [00:00<?, ?B/s]

onnx/special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

model.onnx:   0%|          | 0.00/725k [00:00<?, ?B/s]

model.onnx_data:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

onnx/tokenizer_config.json:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

sparse_linear.pt:   0%|          | 0.00/3.52k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

In [12]:
def embedding(texts):
    return model.encode(texts)["dense_vecs"]

In [14]:
texts = ["おなかがすいた"]
embeds = embedding(texts)
print(embeds)
print(len(embeds[0]))

[[ 0.05038613  0.02036831 -0.03186885 ... -0.00965181 -0.01776708
  -0.00504627]]
1024


### bge-m3の近傍探索

In [16]:
!pip install faiss-cpu



In [29]:
in_text = [
    "好きなご飯はカレーです",
]

target_texts = [
    "好きな食べ物は何ですか？",
    "どこに住んでいますか？",
    "朝の電車は混みますね",
    "天気はいいです",
    "経済は悪いです"
]


In [30]:
in_embeds = embedding(in_text)
target_embeds = embedding(target_texts)

In [31]:
import numpy as np
in_embeds = np.array(embeds).astype("float32")
target_embeds = np.array(target_embeds).astype("float32")

In [32]:
import faiss
index = faiss.IndexFlatL2(len(in_embeds[0]))

In [33]:
index.add(target_embeds)
index

<faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x787061475da0> >

In [34]:
distances,indices = index.search(in_embeds,1)
print(distances)
print(indices)
print(target_texts[indices[0][0]])

[[0.9063513]]
[[3]]
天気はいいです
