# 存储词向量

用 word2vec 获取单词的词向量，然后用句子里单词的平均词向量代表句子。

In [1]:
import re
import collections
import json
import jieba
import torch
import numpy as np
import pandas as pd
from ast import literal_eval

from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

In [2]:
# Plain-text corpus that get_sentences reads and tokenizes
CN_BOOK_PATH = './data/honglou.txt'
# Stop-word file; load_stop_words splits its content on '\n'
CN_STOP_WORDS = './data/cn_stopwords.txt'
# CSV file the word embeddings are written to and read back from
DATAFILE_PATH = './data/embedding.csv'
# Frequency threshold: only words occurring strictly more than this many times are exported
MIN_FREQ = 10

In [3]:
# Load the stop-word list
def load_stop_words(stop_words_path):
    """Read a stop-word file and return its lines as a list of strings.

    The file is decoded explicitly as UTF-8. Without an explicit encoding,
    `open` falls back to the platform's locale encoding, which can fail or
    produce mojibake for Chinese text on non-UTF-8 systems.

    Args:
        stop_words_path: Path to a text file with one stop word per line.

    Returns:
        list[str]: The file content split on '\n'. A trailing newline in the
        file yields a final empty string, exactly as before.
    """
    with open(stop_words_path, 'r', encoding='utf-8') as f:
        stop_words = f.read()
    return stop_words.split('\n')

## 1. 对语料的一些预处理

In [4]:
# Extract tokenized sentences from the book
def get_sentences(book_path, cn_stop_words_path):
    """Split a Chinese text file into sentences and tokenize each one.

    Steps: read the file as UTF-8, strip newlines and ideographic spaces,
    split on the sentence terminators 。！？, tokenize every sentence with
    jieba, and drop tokens that appear in the stop-word list.

    Args:
        book_path: Path to the UTF-8 text file containing the corpus.
        cn_stop_words_path: Path to the stop-word file (see load_stop_words).

    Returns:
        list[list[str]]: One token list per sentence. Sentences that end up
        with no tokens (e.g. the empty tail after the final terminator) are
        kept as empty lists, so the sentence count is unchanged.
    """
    # Explicit encoding: the default depends on the platform locale
    with open(book_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Remove newlines and full-width (ideographic, U+3000) spaces
    content = re.sub(r'[\n\u3000]', '', content)

    # A set gives O(1) membership tests; the list was scanned once per token
    cn_stop_words = set(load_stop_words(cn_stop_words_path))

    # Split into sentences on the Chinese full stop, exclamation and question marks
    sentences = re.split('。|！|？', content)

    # Tokenize and filter out stop words
    return [
        [word for word in jieba.cut(text) if word not in cn_stop_words]
        for text in sentences
    ]

In [5]:
# Tokenize the whole book; the cell then displays how many sentences were produced
sentences = get_sentences(book_path=CN_BOOK_PATH,
                       cn_stop_words_path=CN_STOP_WORDS)
len(sentences)

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/97/m67m_56s0dq5k20t3vp4_pgh0000gn/T/jieba.cache
Loading model cost 0.612 seconds.
Prefix dict has been built successfully.


35077

## 2. 获取词嵌入和句子嵌入

In [6]:
# Train a Word2Vec model on the tokenized sentences (vector_size=100, window=15, 4 workers).
# min_count=1 presumably keeps even words that occur only once — see the gensim docs.
# NOTE(review): no seed is passed — confirm whether run-to-run reproducibility matters here.
cn_model = Word2Vec(sentences=sentences, vector_size=100, window=15, min_count=1, workers=4)
cn_model

<gensim.models.word2vec.Word2Vec at 0x17a6ddf00>

In [7]:
# Look up the embedding of a single word
def get_word_embedding(word):
    """Return the vector stored for `word` in the global `cn_model`.

    Args:
        word: The token to look up.

    Returns:
        The value of `cn_model.wv[word]`, or None when `word` is not in
        `cn_model.wv`.
    """
    vectors = cn_model.wv
    return vectors[word] if word in vectors else None

get_word_embedding(word='林黛玉')

array([ 0.07251878,  0.15770395,  0.11379204,  0.06604948, -0.15489732,
       -0.29318848,  0.10615966,  0.40679982, -0.03119876, -0.06020098,
        0.00895631, -0.23698795, -0.01784477,  0.13235734,  0.17417051,
       -0.12999234,  0.12181527, -0.11276755, -0.18719095, -0.44675317,
        0.16567987,  0.14572555,  0.18636416, -0.03581078,  0.05338819,
       -0.04896234, -0.22351736, -0.00616824, -0.22950584,  0.03151248,
        0.08740475,  0.06269028, -0.01241498, -0.15811025, -0.145261  ,
        0.28001964,  0.02062484, -0.18381305, -0.16281833, -0.34243762,
        0.11386459, -0.20567004, -0.11124558, -0.027125  ,  0.10988442,
       -0.06218803, -0.19698441,  0.03841772,  0.01851665,  0.25603873,
        0.09005591, -0.25230888, -0.15981318, -0.08463471, -0.22542712,
        0.09570424, -0.00809338,  0.03964585, -0.14593245,  0.160187  ,
       -0.02926883,  0.15132795, -0.18317954,  0.06496811, -0.19175184,
        0.293342  ,  0.05851482,  0.1363457 , -0.26319325,  0.35

In [8]:
# Compute a sentence embedding
def get_sentence_embedding(sentence: list):
    """Represent a sentence by the mean of its word vectors.

    Vectors are looked up in the global `cn_model`.

    Args:
        sentence: List of tokens.

    Returns:
        The element-wise mean of the word vectors as a numpy array, or None
        when the sentence is empty or contains a word that is not in
        `cn_model.wv`.
    """
    words = cn_model.wv
    # An empty sentence has nothing to average. Previously it slipped through
    # `all([])` and produced the mean of an empty tensor (NaN) instead of None.
    if not sentence or any(word not in words for word in sentence):
        return None
    # Average directly in numpy; no ndarray -> list -> torch -> ndarray round-trip.
    # Results may differ from the torch path only by float rounding.
    return np.mean([words[word] for word in sentence], axis=0)

In [9]:
sentences[2]

['但书中', '所记', '何事', '何人']

In [10]:
sentence_embedding = get_sentence_embedding(sentence=sentences[2])
sentence_embedding

array([ 0.037882  ,  0.07169044,  0.05131721,  0.03170025, -0.0725347 ,
       -0.13424191,  0.04338332,  0.18211511, -0.02519005, -0.05061328,
        0.01525364, -0.11491416, -0.01498895,  0.05756009,  0.06506534,
       -0.05236512,  0.04888804, -0.05529148, -0.08138166, -0.19626214,
        0.06844007,  0.06120624,  0.08241642, -0.01825217,  0.01898634,
       -0.02958478, -0.08497117, -0.00987194, -0.10439227,  0.00978291,
        0.04725308,  0.01796812,  0.00746861, -0.06670398, -0.05820407,
        0.12952685, -0.00574111, -0.07867193, -0.07742932, -0.15093225,
        0.05866503, -0.09438916, -0.0519472 , -0.00358284,  0.04341889,
       -0.0457468 , -0.08929627,  0.02641489, -0.00192572,  0.11063823,
        0.03389947, -0.11098844, -0.07778414, -0.034293  , -0.10359886,
        0.04242608, -0.00479957,  0.02337768, -0.06247374,  0.07830533,
       -0.02165234,  0.06178435, -0.06249453,  0.03456859, -0.08073604,
        0.1255085 ,  0.01946511,  0.06318392, -0.12430546,  0.14

## 3. 将词嵌入存入 csv

In [11]:
# Count how often each token occurs across all sentences
corpus = [w for text in sentences for w in text]
ctr = collections.Counter(corpus)

# Drop low-frequency words: keep only those occurring more than MIN_FREQ times
# NOTE(review): the comparison is a strict `>`, so a word seen exactly MIN_FREQ
# times is dropped — confirm this is intended
n_corpus = [k for k, v in ctr.items() if v > MIN_FREQ]
len(n_corpus)

3234

In [12]:
# Map every retained word to its embedding
ebd_dict = {token: get_word_embedding(token) for token in n_corpus}

len(ebd_dict)

3234

In [13]:
# Serialize an embedding to a string
def ebd2str(embedding):
    """Encode an embedding as a JSON array string.

    Args:
        embedding: A plain list, or any object exposing `.tolist()`
            (e.g. a numpy array).

    Returns:
        str: `json.dumps` of the embedding's values.

    Raises:
        AttributeError: If `embedding` is neither a list nor has `.tolist()`.
    """
    # Lists are already JSON-serializable; everything else is converted first
    values = embedding if isinstance(embedding, list) else embedding.tolist()
    return json.dumps(values)

In [14]:
# Assemble a two-column table from ebd_dict: its keys go in `word`, its values in `embedding`
data = {
    'word': ebd_dict.keys(),
    'embedding': ebd_dict.values(),
}

df = pd.DataFrame(data)
df

Unnamed: 0,word,embedding
0,章,"[0.24359591, 0.11027567, 0.9609027, -0.1914261..."
1,贾雨村,"[0.09581466, 0.12258382, 0.12087647, 0.0613169..."
2,一番,"[0.29162875, 0.5331246, 0.39700887, 0.24120261..."
3,之后,"[0.21557285, 0.37488845, 0.29100847, 0.1664445..."
4,说,"[0.56785476, 0.9324802, 0.5415122, 0.49521267,..."
...,...,...
3229,腰门,"[0.03943501, 0.06789011, 0.064401105, 0.037343..."
3230,主上,"[0.084934786, 0.028853638, 0.061858777, 0.0039..."
3231,赵堂官,"[0.041808877, 0.09019277, 0.051837936, 0.04455..."
3232,甄宝玉,"[0.10016797, 0.25801668, 0.14713632, 0.1226646..."


In [15]:
# Convert every embedding in the `embedding` column to a str
# NOTE(review): this overwrites the column in place, so re-running this cell
# would call ebd2str on strings rather than arrays
df['embedding'] = df['embedding'].apply(ebd2str)

In [16]:
# Check that the stored value really is a str now
ebd = df[df['word'] == '林黛玉']['embedding'].iloc[0]
type(ebd)

str

In [17]:
df.to_csv(DATAFILE_PATH, index=False)

## 4. 从 csv 中读取词嵌入

In [18]:
rdf = pd.read_csv(DATAFILE_PATH)
# rdf

In [19]:
# Parse each stored string with literal_eval, then wrap the result in a numpy array
rdf.embedding = rdf.embedding.apply(literal_eval).apply(lambda e: np.array(e))
rdf

Unnamed: 0,word,embedding
0,章,"[0.2435959130525589, 0.11027567088603973, 0.96..."
1,贾雨村,"[0.09581466019153595, 0.12258382141590118, 0.1..."
2,一番,"[0.29162874817848206, 0.5331246256828308, 0.39..."
3,之后,"[0.21557284891605377, 0.37488844990730286, 0.2..."
4,说,"[0.5678547620773315, 0.9324802160263062, 0.541..."
...,...,...
3229,腰门,"[0.0394350104033947, 0.06789010763168335, 0.06..."
3230,主上,"[0.0849347859621048, 0.028853638097643852, 0.0..."
3231,赵堂官,"[0.041808877140283585, 0.0901927724480629, 0.0..."
3232,甄宝玉,"[0.10016796737909317, 0.2580166757106781, 0.14..."


In [20]:
type(rdf.iloc[0]['embedding'])

numpy.ndarray

## 5. 整合成函数

将以上功能整合成两个函数 `embedding_df_to_csv` 和 `read_embedding_csv`，并写入 `util.py`。

In [21]:
# Assume the column named `embedding` is the one used to store the word vectors
data = {
    'word': ebd_dict.keys(),
    'embedding': ebd_dict.values(),
}

df = pd.DataFrame(data)
df

Unnamed: 0,word,embedding
0,章,"[0.24359591, 0.11027567, 0.9609027, -0.1914261..."
1,贾雨村,"[0.09581466, 0.12258382, 0.12087647, 0.0613169..."
2,一番,"[0.29162875, 0.5331246, 0.39700887, 0.24120261..."
3,之后,"[0.21557285, 0.37488845, 0.29100847, 0.1664445..."
4,说,"[0.56785476, 0.9324802, 0.5415122, 0.49521267,..."
...,...,...
3229,腰门,"[0.03943501, 0.06789011, 0.064401105, 0.037343..."
3230,主上,"[0.084934786, 0.028853638, 0.061858777, 0.0039..."
3231,赵堂官,"[0.041808877, 0.09019277, 0.051837936, 0.04455..."
3232,甄宝玉,"[0.10016797, 0.25801668, 0.14713632, 0.1226646..."


In [22]:
# The ebd_cols parameter names the columns that hold embeddings
def embedding_df_to_csv(df, csv_path, ebd_cols: list):
    """Write a DataFrame that contains embedding columns to a CSV file.

    Every cell in the columns listed in `ebd_cols` is serialized to a JSON
    array string before writing. The DataFrame passed in is left untouched.

    Args:
        df: DataFrame to save.
        csv_path: Destination path of the CSV file.
        ebd_cols: Names of the columns whose cells are embeddings (plain
            lists or objects exposing `.tolist()`, e.g. numpy arrays).

    Returns:
        None. The CSV is written without the index column.
    """
    def ebd2str(embedding):
        # Lists are already JSON-serializable. Previously a list input left
        # `ebd` unassigned and raised UnboundLocalError.
        values = embedding if isinstance(embedding, list) else embedding.tolist()
        return json.dumps(values)

    # Serialize on a copy so the caller's frame keeps its original objects
    # and the function can be called again on the same frame
    out = df.copy()
    for col in ebd_cols:
        out[col] = out[col].apply(ebd2str)

    out.to_csv(csv_path, index=False)

In [23]:
embedding_df_to_csv(df,
                    csv_path=DATAFILE_PATH,
                    ebd_cols=['embedding'])

In [24]:
def read_embedding_csv(csv_path, ebd_cols: list):
    """Read a CSV file that contains embedding columns into a DataFrame.

    Cells of the columns listed in `ebd_cols` are parsed from their string
    form into numpy arrays.

    Args:
        csv_path: Path of the CSV file to read.
        ebd_cols: Names of the columns that hold serialized embeddings.

    Returns:
        pandas.DataFrame: The loaded table with each `ebd_cols` cell
        converted to a numpy array.
    """
    def str2ebd(text):
        # The writer uses json.dumps, so json.loads is the matching parser;
        # it also accepts tokens such as NaN that literal_eval rejects.
        try:
            values = json.loads(text)
        except (TypeError, ValueError):
            # Fall back for cells in Python-literal form (e.g. tuples); this
            # also keeps literal_eval's original errors for invalid input
            values = literal_eval(text)
        return np.array(values)

    df = pd.read_csv(csv_path)
    for col in ebd_cols:
        df[col] = df[col].apply(str2ebd)

    return df

In [25]:
read_embedding_csv(csv_path=DATAFILE_PATH,
                   ebd_cols=['embedding'])

Unnamed: 0,word,embedding
0,章,"[0.2435959130525589, 0.11027567088603973, 0.96..."
1,贾雨村,"[0.09581466019153595, 0.12258382141590118, 0.1..."
2,一番,"[0.29162874817848206, 0.5331246256828308, 0.39..."
3,之后,"[0.21557284891605377, 0.37488844990730286, 0.2..."
4,说,"[0.5678547620773315, 0.9324802160263062, 0.541..."
...,...,...
3229,腰门,"[0.0394350104033947, 0.06789010763168335, 0.06..."
3230,主上,"[0.0849347859621048, 0.028853638097643852, 0.0..."
3231,赵堂官,"[0.041808877140283585, 0.0901927724480629, 0.0..."
3232,甄宝玉,"[0.10016796737909317, 0.2580166757106781, 0.14..."


下面试着通过 `util.py` 调用这两个函数。

In [26]:
import util

In [27]:
# Rebuild the word/embedding table from ebd_dict
data = {
    'word': ebd_dict.keys(),
    'embedding': ebd_dict.values(),
}

df = pd.DataFrame(data)
# df

In [28]:
util.embedding_df_to_csv(df,
                         csv_path=DATAFILE_PATH,
                         ebd_cols=['embedding'])

In [29]:
util.read_embedding_csv(csv_path=DATAFILE_PATH,
                        ebd_cols=['embedding'])

Unnamed: 0,word,embedding
0,章,"[0.2435959130525589, 0.11027567088603973, 0.96..."
1,贾雨村,"[0.09581466019153595, 0.12258382141590118, 0.1..."
2,一番,"[0.29162874817848206, 0.5331246256828308, 0.39..."
3,之后,"[0.21557284891605377, 0.37488844990730286, 0.2..."
4,说,"[0.5678547620773315, 0.9324802160263062, 0.541..."
...,...,...
3229,腰门,"[0.0394350104033947, 0.06789010763168335, 0.06..."
3230,主上,"[0.0849347859621048, 0.028853638097643852, 0.0..."
3231,赵堂官,"[0.041808877140283585, 0.0901927724480629, 0.0..."
3232,甄宝玉,"[0.10016796737909317, 0.2580166757106781, 0.14..."
