# 用 Redis 存取 Embedding

In [1]:
# One-time environment setup: uncomment and run to install the listed packages.
# NOTE(review): scipy is uninstalled and reinstalled at a pinned 1.12.0 --
# presumably for compatibility with the installed gensim; confirm before
# changing the pin.
# !pip install gensim
# !pip install nltk
# !pip uninstall scipy -y
# !pip install scipy==1.12.0
# !pip install jieba

In [2]:
# Show the gensim and scipy versions reported by `pip list`.
!pip list | grep gensim
!pip list | grep scipy

gensim                                   4.3.2
scipy                                    1.12.0


In [3]:
import time
import json
import redis
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from nltk import download

import util

# Make sure NLTK's 'stopwords' package is available locally before it is used
# below; the call returns True and skips the download when the package is
# already up to date.
download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/changluo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## 1. Word2vec 生成 Embedding

In [4]:
# Toy corpus: four short English sentences used to train the demo model.
corpus = [
    "This is the first sentence for our word2vec example.",
    "Here is another sentence.",
    "Word2Vec is a great tool for word embeddings.",
    "This example is meant to show how to generate word vectors."
]

# English stop words from NLTK, kept in a set for fast membership tests
# when filtering tokens.
stop_words = set(stopwords.words('english'))
# Display how many stop words were loaded.
len(stop_words)

179

In [5]:
# Tokenize every sentence with gensim's simple_preprocess and drop the
# English stop words, giving one token list per sentence.
processed_corpus = [
    list(filter(lambda token: token not in stop_words, simple_preprocess(sentence)))
    for sentence in corpus
]
processed_corpus

[['first', 'sentence', 'word', 'vec', 'example'],
 ['another', 'sentence'],
 ['word', 'vec', 'great', 'tool', 'word', 'embeddings'],
 ['example', 'meant', 'show', 'generate', 'word', 'vectors']]

In [6]:
# Train a small Word2Vec model on the tokenized corpus.
#   vector_size=10 -> every word gets a 10-dimensional embedding
#   window=5       -> context window size passed to gensim
#   min_count=1    -> keep words that occur only once (the corpus is tiny)
#   seed=1         -> gensim's default seed, written out explicitly
#   workers=1      -> single training thread; the gensim documentation notes
#                     that a fully reproducible run requires workers=1,
#                     because multi-threaded training introduces ordering
#                     jitter from OS thread scheduling
# Reproducibility across interpreter launches additionally requires a fixed
# PYTHONHASHSEED environment variable (see the gensim Word2Vec docs).
model = Word2Vec(
    sentences=processed_corpus,
    vector_size=10,
    window=5,
    min_count=1,
    seed=1,
    workers=1,
)
model

<gensim.models.word2vec.Word2Vec at 0x7fd04c34c3d0>

In [7]:
# The trained embeddings live on the model's `wv` attribute (a KeyedVectors
# object); it is indexed by word in the cells below.
word_vectors = model.wv
word_vectors

<gensim.models.keyedvectors.KeyedVectors at 0x7fd04c34f730>

In [8]:
# Look up the embedding of the token 'word' and check its shape, which
# matches the vector_size the model was trained with.
vector = word_vectors['word']
vector.shape

(10,)

In [9]:
# Display the raw embedding values for 'word' (a NumPy array).
vector

array([-0.00534929,  0.00237317,  0.05104667,  0.09009523, -0.09301735,
       -0.07117365,  0.06458575,  0.08973929, -0.05016771, -0.03763947],
      dtype=float32)

In [10]:
# Key -> embedding pairs that will be stored in Redis.
# Flatten the tokenized sentences into one token list (duplicates included);
# building the dict afterwards keeps a single entry per distinct word.
words = [token for sentence in processed_corpus for token in sentence]
dct = {token: word_vectors[token] for token in words}
len(dct)

13

## 2. 数据类型转换

Word2Vec 返回的 Embedding 是 `numpy.ndarray` 类型。为了将它存入 Redis，我们先用 `tolist()` 把它转成 list，再建一个将 list 转换成 str 的 pipeline。

In [11]:
# Create a RedisHandler instance (project helper from util.py).
# It is bound to `rh` -- the same name the store/fetch cells use -- so the
# short name `r` is not shared between the Redis handler and the conversion
# results that are assigned to `r` further down.
rh = util.RedisHandler()

# Check that the Redis server is reachable.
rh.ping()

True

In [12]:
# Before storing a list in Redis, serialize it to a str with json.dumps.
# `.tolist()` first turns the NumPy embedding into a plain Python list.
my_list = dct['first'].tolist()
lst_str = json.dumps(my_list)
lst_str

'[0.0234837643802166, -0.045190323144197464, 0.08388731628656387, -0.09858163446187973, 0.06764640659093857, 0.02914416790008545, -0.04932831600308418, 0.043981872498989105, -0.0173957459628582, 0.06711383908987045]'

In [13]:
# When reading back, json.loads turns the str into a list again.
lst = json.loads(lst_str)
type(lst)

list

In [14]:
# util.py defines a Cast class that performs the type conversions shown above.
cast = util.Cast()

# list to str
r = cast.list2str(my_list)
type(r), r

(str,
 '[0.0234837643802166, -0.045190323144197464, 0.08388731628656387, -0.09858163446187973, 0.06764640659093857, 0.02914416790008545, -0.04932831600308418, 0.043981872498989105, -0.0173957459628582, 0.06711383908987045]')

In [15]:
# str to list: convert the string produced by cast.list2str back into a list.
# The result gets its own name instead of overwriting `r`, so re-running this
# cell does not feed an already-converted list back into str2list.
restored_list = cast.str2list(r)
type(restored_list), restored_list

(list,
 [0.0234837643802166,
  -0.045190323144197464,
  0.08388731628656387,
  -0.09858163446187973,
  0.06764640659093857,
  0.02914416790008545,
  -0.04932831600308418,
  0.043981872498989105,
  -0.0173957459628582,
  0.06711383908987045])

## 3. 存取 Embedding

现在，我们有了能将 list 转换成 str 的 `Cast` 类，还有能将 KV 对送入 Redis 的 `RedisHandler` 类。下面，我们综合两个类的功能，实现 Embedding 的存取。

In [16]:
# Helpers: `rh` talks to Redis, `cast` converts between list and str.
rh = util.RedisHandler()
cast = util.Cast()

# Store: serialize each embedding and write it under its word as the key,
# collecting the value returned by every write.
write_results = [
    rh.set(word, cast.list2str(embedding.tolist()))
    for word, embedding in dct.items()
]

# Count the attempted writes and those whose return value was truthy.
cnt = len(write_results)
success_num = sum(1 for ok in write_results if ok)

success_rate = success_num / cnt
print(f'success_rate: {success_rate:.2f}')

success_rate: 1.00


In [17]:
# Fetch: read every stored key back and convert the str value into a list.
fetch_dict = {}
for k in dct:
    raw_value = rh.get(k)
    if not raw_value:
        # A falsy value is reported as a failed fetch and the key is skipped.
        print(f'Can not fetch key: {k}')
        continue
    fetch_dict[k] = cast.str2list(raw_value)

# Compare against the original embeddings (converted to plain lists).
expected = {word: embedding.tolist() for word, embedding in dct.items()}
fetch_dict == expected

True