# 存储词向量

用 word2vec 获取单词的词向量，然后用句子里单词的平均词向量代表句子。

In [1]:
import re
import collections
import json
import jieba
import torch
import numpy as np
import pandas as pd
from ast import literal_eval

from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

In [2]:
# Notebook configuration. All paths are relative to the working directory.
CN_BOOK_PATH = './data/honglou.txt'        # corpus text passed to get_sentences
CN_STOP_WORDS = './data/cn_stopwords.txt'  # stop-word file passed to get_sentences
DATAFILE_PATH = './data/embedding.csv'     # CSV the word embeddings are written to and read from
MIN_FREQ = 10                              # frequency threshold used when selecting words to export

In [3]:
# Load stop words
def load_stop_words(stop_words_path, encoding='utf-8'):
    """Read a stop-word file and return its lines as a list of strings.

    Args:
        stop_words_path: Path of a text file with one stop word per line.
        encoding: Text encoding of the file. Defaults to UTF-8 so that the
            result does not depend on the platform's locale encoding.

    Returns:
        list[str]: The file content split on ``'\\n'``. A trailing newline in
        the file yields a final empty string, exactly as ``str.split`` does.
    """
    # An explicit encoding keeps Chinese text readable on systems whose
    # default encoding is not UTF-8 (e.g. Windows with a legacy code page).
    with open(stop_words_path, 'r', encoding=encoding) as f:
        stop_words = f.read()
    return stop_words.split('\n')

## 1. 对语料的一些预处理

In [4]:
# Extract tokenised sentences from "Dream of the Red Chamber"
def get_sentences(book_path, cn_stop_words_path):
    """Split a Chinese text file into sentences and tokenise each one.

    Args:
        book_path: Path of the UTF-8 text file to read.
        cn_stop_words_path: Path of the stop-word file, loaded through
            ``load_stop_words``.

    Returns:
        list[list[str]]: One list of tokens per sentence, with stop words
        removed. A sentence that is empty or consists only of stop words is
        kept as an empty list, so positions match the raw sentence split.
    """
    # Read as UTF-8 explicitly instead of relying on the locale default.
    with open(book_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Strip newlines and ideographic (full-width) spaces.
    content = re.sub(r'[\n\u3000]', '', content)

    # A set gives constant-time membership tests; the filter below runs once
    # per token of the whole book.
    cn_stop_words = set(load_stop_words(cn_stop_words_path))

    # Split on Chinese sentence-ending punctuation.
    sentences = re.split(r'[。！？]', content)

    # Tokenise with jieba and drop stop words.
    return [
        [word for word in jieba.cut(text) if word not in cn_stop_words]
        for text in sentences
    ]

In [5]:
# Tokenise the whole book into per-sentence word lists and display the sentence count.
sentences = get_sentences(book_path=CN_BOOK_PATH,
                       cn_stop_words_path=CN_STOP_WORDS)
len(sentences)

Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/0v/110wmd1964s9xk3hg_ty7hnh0000gn/T/jieba.cache
Loading model cost 0.294 seconds.
Prefix dict has been built successfully.


35077

## 2. 获取词嵌入和句子嵌入

In [6]:
# Train a Word2Vec model with 100-dimensional vectors on the tokenised sentences.
# NOTE(review): min_count=1 presumably keeps every token in the vocabulary, and
# training with workers=4 may not be bit-for-bit reproducible between runs —
# confirm against the gensim documentation.
cn_model = Word2Vec(sentences=sentences, vector_size=100, window=15, min_count=1, workers=4)
cn_model

<gensim.models.word2vec.Word2Vec at 0x13d1b5540>

In [7]:
# Look up the embedding of a single word
def get_word_embedding(word):
    """Return the vector stored for ``word`` in ``cn_model.wv``.

    Returns ``None`` when the word is not contained in ``cn_model.wv``.
    """
    keyed_vectors = cn_model.wv
    return keyed_vectors[word] if word in keyed_vectors else None

# Show the embedding of one word as a quick sanity check.
get_word_embedding(word='林黛玉')

array([ 0.02695482,  0.07779247,  0.07012616,  0.06487115, -0.15949781,
       -0.3419014 ,  0.10270573,  0.4820606 ,  0.00289474, -0.09030233,
       -0.0143318 , -0.21964096, -0.06298142,  0.15229826,  0.14050776,
       -0.13950881,  0.07108366, -0.11627953, -0.16915977, -0.4733254 ,
        0.11839887,  0.10822443,  0.10420915, -0.03805748,  0.12504137,
       -0.02423785, -0.21124965, -0.04285785, -0.2179474 ,  0.04692608,
        0.03653614,  0.05885538, -0.00092662, -0.14675492, -0.14369257,
        0.2934998 , -0.00702528, -0.14393175, -0.08159426, -0.3762011 ,
        0.05363935, -0.25379714, -0.15995392, -0.05318958,  0.05903193,
       -0.0099874 , -0.2335613 , -0.00736681,  0.00442902,  0.282669  ,
        0.05990946, -0.22081053, -0.17153227, -0.09689675, -0.21542862,
        0.13948385,  0.07004401,  0.10408314, -0.12259302,  0.1600684 ,
       -0.02643811,  0.14391859, -0.13720137,  0.01075041, -0.18874075,
        0.34105572,  0.11198212,  0.12553063, -0.2942361 ,  0.39

In [8]:
# Compute a sentence embedding
def get_sentence_embedding(sentence: list):
    """Return the mean of the word vectors of ``sentence``.

    Args:
        sentence: List of tokens.

    Returns:
        numpy.ndarray | None: Element-wise mean of the vectors found in
        ``cn_model.wv`` for every token, or ``None`` when the sentence is
        empty or contains a token that ``cn_model.wv`` does not know.
    """
    words = cn_model.wv
    # An empty sentence has no vectors to average; without this guard the
    # "all words known" check passes vacuously and the mean becomes NaN.
    if not sentence or any(word not in words for word in sentence):
        return None
    # Average the vectors directly with NumPy (no list/tensor round trip).
    return np.mean([words[word] for word in sentence], axis=0)

In [9]:
# Peek at one tokenised sentence.
sentences[2]

['但书中', '所记', '何事', '何人']

In [10]:
# Compute and display the embedding of that sentence.
sentence_embedding = get_sentence_embedding(sentence=sentences[2])
sentence_embedding

array([ 0.01920669,  0.04407325,  0.02712532,  0.03531424, -0.06799279,
       -0.15175235,  0.04094258,  0.20957285, -0.01188125, -0.07008227,
        0.01283324, -0.10395701, -0.0303293 ,  0.05464818,  0.04932523,
       -0.04905527,  0.02437358, -0.05570764, -0.06707565, -0.20514601,
        0.04664106,  0.04197637,  0.05018541, -0.02218731,  0.04988867,
       -0.02249959, -0.07519446, -0.0163099 , -0.10285886,  0.01710362,
        0.03293168,  0.00995623,  0.01883415, -0.06185373, -0.05676237,
        0.13762921, -0.01974768, -0.05732157, -0.04990767, -0.15983196,
        0.03456043, -0.11234295, -0.07592264, -0.01036044,  0.02235567,
       -0.02994039, -0.10262044,  0.00975732, -0.00993088,  0.11616881,
        0.02437238, -0.09928151, -0.08067535, -0.03674751, -0.09661898,
        0.0503144 ,  0.02380429,  0.04341634, -0.05078467,  0.07080653,
       -0.02259982,  0.05594487, -0.02838944,  0.00968448, -0.08074763,
        0.14024888,  0.04144536,  0.06442392, -0.13777986,  0.15

## 3. 将词嵌入存入 csv

In [11]:
# Count how often each token occurs across all sentences
corpus = [token for sentence in sentences for token in sentence]
ctr = collections.Counter(corpus)

# Keep only tokens that occur more than MIN_FREQ times
# (the comparison is strict: exactly MIN_FREQ occurrences is not enough)
n_corpus = [token for token in ctr if ctr[token] > MIN_FREQ]
len(n_corpus)

3234

In [12]:
# Map every retained word to its embedding
ebd_dict = {token: get_word_embedding(token) for token in n_corpus}

len(ebd_dict)

3234

In [13]:
# Serialise an embedding as a string
def ebd2str(embedding):
    """Return ``embedding.tolist()`` encoded as a JSON string."""
    values = embedding.tolist()
    return json.dumps(values)

In [14]:
# Build a two-column DataFrame (word, embedding) from ebd_dict and display it.
data = {
    'word': ebd_dict.keys(),
    'embedding': ebd_dict.values(),
}

df = pd.DataFrame(data)
df

Unnamed: 0,word,embedding
0,章,"[-0.06583424, -0.35617286, 0.7112745, -0.21317..."
1,贾雨村,"[0.06584845, 0.06028839, 0.06956228, 0.0681048..."
2,一番,"[0.15242763, 0.3560388, 0.235145, 0.27399004, ..."
3,之后,"[0.11055881, 0.22398143, 0.18437295, 0.1915483..."
4,说,"[0.5745651, 0.7350919, 0.18694074, 0.64299136,..."
...,...,...
3229,腰门,"[0.016465, 0.05250697, 0.04141871, 0.04207318,..."
3230,主上,"[0.07417383, 0.026597852, 0.040637456, 0.02408..."
3231,赵堂官,"[0.018719776, 0.079556175, 0.027746059, 0.0569..."
3232,甄宝玉,"[0.045802593, 0.17701188, 0.07884574, 0.139127..."


In [15]:
# Convert every embedding in the `embedding` column to a JSON string.
# Values that are already strings are left untouched, so re-running this cell
# does not fail on the column it converted on the previous run.
df['embedding'] = df['embedding'].apply(
    lambda ebd: ebd if isinstance(ebd, str) else ebd2str(ebd)
)

In [16]:
# Confirm that the stored value is now a str
ebd = df.loc[df['word'] == '林黛玉', 'embedding'].iloc[0]
type(ebd)

str

In [17]:
# Write the frame to DATAFILE_PATH without the index column.
df.to_csv(DATAFILE_PATH, index=False)

## 4. 从 csv 中读取词嵌入

In [18]:
# Read the CSV back; the embedding column still holds strings at this point.
rdf = pd.read_csv(DATAFILE_PATH)
# Display `rdf` here to inspect the raw frame if needed.

In [19]:
# Parse each stored string back into a NumPy array.
# json.loads is the inverse of the json.dumps call used when writing, so it
# also accepts the NaN / Infinity tokens that json.dumps can emit.
# Bracket indexing is used because attribute assignment only works for
# column names that are valid, non-clashing Python identifiers.
rdf['embedding'] = rdf['embedding'].apply(lambda text: np.array(json.loads(text)))
rdf

Unnamed: 0,word,embedding
0,章,"[-0.06583423912525177, -0.3561728596687317, 0...."
1,贾雨村,"[0.0658484473824501, 0.06028838828206062, 0.06..."
2,一番,"[0.15242762863636017, 0.35603880882263184, 0.2..."
3,之后,"[0.11055880784988403, 0.22398142516613007, 0.1..."
4,说,"[0.5745651125907898, 0.7350919246673584, 0.186..."
...,...,...
3229,腰门,"[0.016465000808238983, 0.05250696837902069, 0...."
3230,主上,"[0.07417383044958115, 0.026597851887345314, 0...."
3231,赵堂官,"[0.01871977560222149, 0.0795561745762825, 0.02..."
3232,甄宝玉,"[0.04580259323120117, 0.1770118772983551, 0.07..."


In [20]:
# Confirm that the parsed value is a NumPy array.
type(rdf.iloc[0]['embedding'])

numpy.ndarray

## 5. 整合成函数

将以上功能整合成一个函数，并写入 `util.py`

In [21]:
# Rebuild the frame from ebd_dict; the column named `embedding` holds the word vectors.
data = {
    'word': ebd_dict.keys(),
    'embedding': ebd_dict.values(),
}

df = pd.DataFrame(data)
df

Unnamed: 0,word,embedding
0,章,"[-0.06583424, -0.35617286, 0.7112745, -0.21317..."
1,贾雨村,"[0.06584845, 0.06028839, 0.06956228, 0.0681048..."
2,一番,"[0.15242763, 0.3560388, 0.235145, 0.27399004, ..."
3,之后,"[0.11055881, 0.22398143, 0.18437295, 0.1915483..."
4,说,"[0.5745651, 0.7350919, 0.18694074, 0.64299136,..."
...,...,...
3229,腰门,"[0.016465, 0.05250697, 0.04141871, 0.04207318,..."
3230,主上,"[0.07417383, 0.026597852, 0.040637456, 0.02408..."
3231,赵堂官,"[0.018719776, 0.079556175, 0.027746059, 0.0569..."
3232,甄宝玉,"[0.045802593, 0.17701188, 0.07884574, 0.139127..."


In [22]:
# `ebd_cols` names the columns that hold embeddings
def embedding_df_to_csv(df, csv_path, ebd_cols: list):
    """Write a DataFrame that contains embedding columns to a CSV file.

    Each value in the columns listed in ``ebd_cols`` is stored as a JSON
    string. The caller's DataFrame is left unchanged.

    Args:
        df: DataFrame to save.
        csv_path: Destination path of the CSV file.
        ebd_cols: Names of the columns whose values are embeddings, either
            Python lists or objects with a ``tolist()`` method (e.g. NumPy
            arrays).
    """
    def ebd2str(embedding):
        # Lists are JSON-serialisable as they are; anything else is converted
        # through tolist() first.
        values = embedding if isinstance(embedding, list) else embedding.tolist()
        return json.dumps(values)

    # Convert on a copy so the caller's frame keeps its original embeddings.
    out = df.copy()
    for col in ebd_cols:
        out[col] = out[col].apply(ebd2str)

    out.to_csv(csv_path, index=False)

In [23]:
# Save the frame through the helper defined in this notebook.
embedding_df_to_csv(df,
                    csv_path=DATAFILE_PATH,
                    ebd_cols=['embedding'])

In [24]:
def read_embedding_csv(csv_path, ebd_cols: list):
    """Read a CSV file with embedding columns into a DataFrame.

    Args:
        csv_path: Path of the CSV file to read.
        ebd_cols: Names of the columns whose cells hold serialised
            embeddings; each cell is turned into a ``numpy.ndarray``.

    Returns:
        pandas.DataFrame: The loaded frame with the ``ebd_cols`` columns
        converted to NumPy arrays.
    """
    def str2ebd(text):
        # JSON is tried first, so NaN / Infinity tokens in the cell are
        # accepted. Text that is not valid JSON falls back to literal_eval,
        # so cells holding Python literals (e.g. tuples) still load.
        try:
            values = json.loads(text)
        except ValueError:
            values = literal_eval(text)
        return np.array(values)

    df = pd.read_csv(csv_path)
    for col in ebd_cols:
        df[col] = df[col].apply(str2ebd)

    return df

In [25]:
# Load the CSV back through the helper and display the result.
read_embedding_csv(csv_path=DATAFILE_PATH,
                   ebd_cols=['embedding'])

Unnamed: 0,word,embedding
0,章,"[-0.06583423912525177, -0.3561728596687317, 0...."
1,贾雨村,"[0.0658484473824501, 0.06028838828206062, 0.06..."
2,一番,"[0.15242762863636017, 0.35603880882263184, 0.2..."
3,之后,"[0.11055880784988403, 0.22398142516613007, 0.1..."
4,说,"[0.5745651125907898, 0.7350919246673584, 0.186..."
...,...,...
3229,腰门,"[0.016465000808238983, 0.05250696837902069, 0...."
3230,主上,"[0.07417383044958115, 0.026597851887345314, 0...."
3231,赵堂官,"[0.01871977560222149, 0.0795561745762825, 0.02..."
3232,甄宝玉,"[0.04580259323120117, 0.1770118772983551, 0.07..."


试着改为通过 `util.py` 模块调用这两个函数

In [26]:
import util

In [27]:
# Rebuild the (word, embedding) frame from ebd_dict before saving it through util.
data = {
    'word': ebd_dict.keys(),
    'embedding': ebd_dict.values(),
}

df = pd.DataFrame(data)
# Display `df` here to inspect the frame if needed.

In [28]:
# Save the frame with the util module's embedding_df_to_csv.
util.embedding_df_to_csv(df,
                         csv_path=DATAFILE_PATH,
                         ebd_cols=['embedding'])

In [29]:
# Load the CSV with the util module's read_embedding_csv and display the result.
util.read_embedding_csv(csv_path=DATAFILE_PATH,
                        ebd_cols=['embedding'])

Unnamed: 0,word,embedding
0,章,"[-0.06583423912525177, -0.3561728596687317, 0...."
1,贾雨村,"[0.0658484473824501, 0.06028838828206062, 0.06..."
2,一番,"[0.15242762863636017, 0.35603880882263184, 0.2..."
3,之后,"[0.11055880784988403, 0.22398142516613007, 0.1..."
4,说,"[0.5745651125907898, 0.7350919246673584, 0.186..."
...,...,...
3229,腰门,"[0.016465000808238983, 0.05250696837902069, 0...."
3230,主上,"[0.07417383044958115, 0.026597851887345314, 0...."
3231,赵堂官,"[0.01871977560222149, 0.0795561745762825, 0.02..."
3232,甄宝玉,"[0.04580259323120117, 0.1770118772983551, 0.07..."
