# Word2Vec

用 Word2Vec 生成中英文词向量

In [1]:
# !pip install gensim
# !pip install nltk
# !pip uninstall scipy -y
# !pip install scipy==1.12.0
# !pip install jieba

In [2]:
!pip list | grep gensim
!pip list | grep scipy

gensim                        4.3.0
scipy                         1.10.0


In [3]:
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from nltk import download

download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/changluo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## 1. 英文词向量

In [4]:
corpus = [
    "This is the first sentence for our word2vec example.",
    "Here is another sentence.",
    "Word2Vec is a great tool for word embeddings.",
    "This example is meant to show how to generate word vectors."
]

stop_words = set(stopwords.words('english'))
len(stop_words)

179

In [5]:
simple_preprocess('generate word vectors')

['generate', 'word', 'vectors']

In [6]:
processed_corpus = [
    [word for word in simple_preprocess(doc) if word not in stop_words]
    for doc in corpus
]
processed_corpus

[['first', 'sentence', 'word', 'vec', 'example'],
 ['another', 'sentence'],
 ['word', 'vec', 'great', 'tool', 'word', 'embeddings'],
 ['example', 'meant', 'show', 'generate', 'word', 'vectors']]

In [7]:
model = Word2Vec(sentences=processed_corpus, vector_size=100, window=5, min_count=1, workers=4)
model

<gensim.models.word2vec.Word2Vec at 0x14b64db10>

In [8]:
word_vectors = model.wv
word_vectors

<gensim.models.keyedvectors.KeyedVectors at 0x14b64e5f0>

In [9]:
vector = word_vectors['word']
vector.shape

(100,)

In [10]:
vector

array([-5.3634081e-04,  2.3680906e-04,  5.1020449e-03,  9.0093454e-03,
       -9.3017817e-03, -7.1157864e-03,  6.4582159e-03,  8.9736823e-03,
       -5.0170030e-03, -3.7624310e-03,  7.3798303e-03, -1.5341062e-03,
       -4.5350916e-03,  6.5543107e-03, -4.8589921e-03, -1.8168911e-03,
        2.8776051e-03,  9.9337066e-04, -8.2866373e-03, -9.4499886e-03,
        7.3106494e-03,  5.0695864e-03,  6.7569995e-03,  7.6134264e-04,
        6.3525443e-03, -3.4063507e-03, -9.4513345e-04,  5.7697804e-03,
       -7.5229201e-03, -3.9355783e-03, -7.5105214e-03, -9.3124068e-04,
        9.5370552e-03, -7.3194657e-03, -2.3354974e-03, -1.9379142e-03,
        8.0775162e-03, -5.9302971e-03,  4.5453926e-05, -4.7538397e-03,
       -9.6045341e-03,  5.0078020e-03, -8.7599270e-03, -4.3908102e-03,
       -3.4420791e-05, -2.9533822e-04, -7.6611647e-03,  9.6145393e-03,
        4.9814614e-03,  9.2332652e-03, -8.1577078e-03,  4.4955891e-03,
       -4.1383887e-03,  8.2338450e-04,  8.4972316e-03, -4.4634552e-03,
      

In [11]:
similar_words = word_vectors.most_similar('word')
similar_words

[('tool', 0.21618759632110596),
 ('first', 0.09310852736234665),
 ('meant', 0.0929148867726326),
 ('another', 0.07966356724500656),
 ('great', 0.06283358484506607),
 ('embeddings', 0.027093391865491867),
 ('show', 0.016160279512405396),
 ('example', -0.010845641605556011),
 ('vectors', -0.02772490680217743),
 ('vec', -0.052069298923015594)]

In [12]:
type(word_vectors)

gensim.models.keyedvectors.KeyedVectors

## 2. 中文词向量

In [13]:
import re
import jieba

In [14]:
with open('./data/红楼梦.txt', 'r') as f:
    content = f.read()
len(content)

858628

In [15]:
content[:300]

'第1章 甄士隐梦幻识通灵 贾雨村风尘怀闺秀\n\u3000\u3000此开卷第一回也。\n\u3000\u3000作者自云：因曾历过一番梦幻之后，故将真事隐去，而借通灵说撰此《石头记》一书也，故曰“甄士隐”云云。但书中所记何事何人？自己又云：“今风尘碌碌，一事无成，忽念及当日所有之女子，一一细考较去，觉其行止见识皆出我之上。我堂堂须眉，诚不若彼裙钗，我实愧则有馀，悔又无益，大无可如何之日也。当此日，欲将已往所赖天思祖德，锦衣纨裤之时，饫甘餍肥之日，背父兄教育之恩，负师友规训之德，以致今日一技无成，半生潦倒之罪，编述一集，以告天下。知我之负罪固多，然闺阁中历历有人，万不可因我之不肖，自护己短，一并使其泯灭也。所以蓬牖茅椽，绳床瓦灶，并不足'

In [16]:
# 删除 \n \u3000 \u3000
pattern = re.compile(r'(\n|\u3000|\u3000)', re.IGNORECASE)
content = pattern.sub('', content)

# 对句子分段
sentences = re.split('。|！|？', content)
len(sentences), sentences[:5]

(35077,
 ['第1章 甄士隐梦幻识通灵 贾雨村风尘怀闺秀此开卷第一回也',
  '作者自云：因曾历过一番梦幻之后，故将真事隐去，而借通灵说撰此《石头记》一书也，故曰“甄士隐”云云',
  '但书中所记何事何人',
  '自己又云：“今风尘碌碌，一事无成，忽念及当日所有之女子，一一细考较去，觉其行止见识皆出我之上',
  '我堂堂须眉，诚不若彼裙钗，我实愧则有馀，悔又无益，大无可如何之日也'])

In [17]:
# 加载中文停用词
with open('./data/cn_stopwords.txt') as f:
    cn_stop_words = f.read()
cn_stop_words = cn_stop_words.split('\n')
len(cn_stop_words)

749

In [18]:
cn_processed_corpus = [
    [word for word in jieba.lcut(text) if word not in cn_stop_words]
    for text in sentences
]
cn_processed_corpus[:1]

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/0v/110wmd1964s9xk3hg_ty7hnh0000gn/T/jieba.cache
Loading model cost 0.253 seconds.
Prefix dict has been built successfully.


[['章', '甄士隐', '梦幻', '识通灵', '贾雨村', '风尘', '怀', '闺秀', '开卷', '第一回']]

In [19]:
cn_model = Word2Vec(sentences=cn_processed_corpus, vector_size=100, window=15, min_count=1, workers=4)
cn_model

<gensim.models.word2vec.Word2Vec at 0x14b6c5690>

In [20]:
cn_word_vectors = cn_model.wv
cn_vector = cn_word_vectors['林黛玉']
cn_vector.shape

(100,)

In [21]:
cn_vector

array([ 0.03187357,  0.08712577,  0.06776608,  0.06925862, -0.16397369,
       -0.3272053 ,  0.09465191,  0.45383644,  0.00816236, -0.10721069,
        0.00467131, -0.20834783, -0.0743042 ,  0.11238264,  0.13771193,
       -0.11834833,  0.05911426, -0.129058  , -0.17230403, -0.43833685,
        0.10324208,  0.10583433,  0.12489247, -0.01912691,  0.11316622,
       -0.03308889, -0.15794286, -0.01048738, -0.21774493,  0.03137633,
        0.04212022,  0.05329863,  0.01756241, -0.14765704, -0.14253245,
        0.2794736 ,  0.01149549, -0.14126092, -0.07886385, -0.3664317 ,
        0.05706621, -0.23403077, -0.15466402, -0.05464713,  0.06577917,
       -0.00273063, -0.23880409, -0.00428622,  0.0086732 ,  0.2836198 ,
        0.03997937, -0.23093075, -0.19872925, -0.07947168, -0.23022132,
        0.1292211 ,  0.07332551,  0.11955866, -0.12997931,  0.14047755,
       -0.03745651,  0.15746516, -0.13795659,  0.0033019 , -0.14843433,
        0.31583366,  0.09175596,  0.11569478, -0.26650828,  0.36

In [22]:
cn_similar_words = cn_word_vectors.most_similar('林黛玉')
cn_similar_words

[('大观园', 0.998341977596283),
 ('寺', 0.9974862337112427),
 ('三间', 0.9965772032737732),
 ('围着', 0.9965060353279114),
 ('亭', 0.9964518547058105),
 ('箸', 0.9964424967765808),
 ('悲', 0.9963930249214172),
 ('园中', 0.9962874054908752),
 ('未', 0.9962827563285828),
 ('铁槛', 0.9962796568870544)]

这里有一个小插曲，因为 scipy 更新了 `scipy.linalg.triu` 函数，导致 gensim 在今天这个时点（2024 年 6 月 16日）crash 了。

> The scipy.linalg functions tri, triu & tril are deprecated and will be removed in SciPy 1.13. Users are recommended to use the NumPy versions of these functions with identical names.
>
> Source: [SciPy 1.11.0 Release Notes](https://scipy.github.io/devdocs/release/1.11.0-notes.html#deprecated-features)

其实 `gensim` 的代码库针对这个问题已经修改了 [[issues 3525]](https://github.com/piskvorky/gensim/issues/3525)，但是没有发布到 release，所以至今依然存在这个问题。

这个问题过段时间肯定就好了。如果你所处的时空跟我一样，依旧没有修复，可以考虑用以下两种解决方法：

1. 将 Scipy 回退到 `1.12.0`（推荐，已实践）：

```bash
   pip uninstall scipy -y
   pip install scipy==1.12.0
```

2. 从 [gensim](https://github.com/piskvorky/gensim) 代码库下载最新代码，然后从源代码构建 Package：

```bash
   git clone https://github.com/piskvorky/gensim.git
   cd gensim
   pip uninstall gensim -y
   pip install -e .
```