# 开源词嵌入向量实验

1. tencent ailab
    - 读取腾讯词典，导入切词工具
    - 将预训练词向量导入模型词嵌入层
    
2. GloVe（Stanford NLP）
    - 读取词典导入切词工具
    - 将预训练词向量导入模型词嵌入层

# 1. Tencent AI Lab

下载数据：[https://ai.tencent.com/ailab/nlp/embedding.html](https://ai.tencent.com/ailab/nlp/embedding.html)

Tencent AI Lab Embedding Corpus for Chinese Words and Phrases


## 1.1 读取腾讯词典，导入切词工具

1. 生成词典

生成词典文件`tencent.bin`

In [1]:
from tqdm import tqdm

def gendict(inputFile, ouputFile):
    """Extract the vocabulary column of a word2vec-style text embedding file.

    The input must start with a ``"<vocab_size> <vector_size>"`` header line
    followed by one ``"<word> <v1> <v2> ..."`` entry per line. Every word
    whose bytes form valid UTF-8 is appended to ``ouputFile``, one word per
    line, encoded as UTF-8.

    Args:
        inputFile: Path of the text embedding file to read.
        ouputFile: Path of the word-list file. It is opened in binary
            *append* mode, so running this twice duplicates the entries.
    """
    # ISO-8859-1 maps every byte to exactly one code point, so reading never
    # fails on a malformed byte sequence; each word is turned back into its
    # raw bytes and decoded as UTF-8 individually below.
    with open(inputFile, "r", encoding='ISO-8859-1') as src, \
            open(ouputFile, 'ab') as dst:
        header = src.readline()
        # Unpacking into two names also checks that the header has two ints.
        vocab_size, _vector_size = map(int, header.split())
        for _ in tqdm(range(vocab_size)):
            line = src.readline()
            if not line:
                # The file ended before vocab_size entries were read.
                break
            # Split on a literal space only: str.split() without arguments
            # would also split on '\x85' / '\xa0', which occur inside the
            # ISO-8859-1 view of multi-byte UTF-8 characters.
            raw_word = line.rstrip('\n').split(' ')[0]
            if not raw_word:
                continue
            try:
                word = raw_word.encode('ISO-8859-1').decode('utf8')
            except UnicodeDecodeError:
                # Not valid UTF-8: skip the entry (best effort), but let
                # genuine I/O errors from the write below propagate.
                continue
            dst.write((word + '\n').encode('utf8'))


# Location of the Tencent embedding text file and of the word list to write.
inputfile = 'E:\\Desktop\\nlp\\Tencent_AILab_ChineseEmbedding.txt'
outputfile = 'E:\\Desktop\\nlp\\tencent.bin'
# The generation step is left disabled in this cell; uncomment to run it.
#gendict(inputfile, outputfile)

2. 读取词典

将词典文件 `tencent.bin` 导入分词工具。
- jieba


In [2]:
import jieba
# Baseline: segment a sample sentence before any custom word list is added.
jieba.lcut('我今天吃了西红柿炒面，隔壁的人也是因吹斯听的人。')

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\Hongwen\AppData\Local\Temp\jieba.cache
Loading model cost 0.787 seconds.
Prefix dict has been built succesfully.


['我',
 '今天',
 '吃',
 '了',
 '西红柿',
 '炒面',
 '，',
 '隔壁',
 '的',
 '人',
 '也',
 '是',
 '因吹斯',
 '听',
 '的',
 '人',
 '。']

In [3]:
import jieba
from tqdm import tqdm
# Register a single word before the bulk load.
# NOTE(review): presumably this forces jieba to build its default dictionary
# up front so the progress bar below measures only the bulk load - confirm.
jieba.add_word('开始')

def load_userdict(f):
    """Register every line of a word-list file as a jieba word.

    Args:
        f: Path of a UTF-8 text file holding one word per line.
    """
    # Read inside a context manager so the handle is always closed; the
    # path parameter is no longer rebound to the file object.
    with open(f, 'r', encoding='utf8') as dict_file:
        lines = dict_file.readlines()
    # tqdm receives the list (not the file) so it can display a total.
    for line in tqdm(lines):
        word = line.strip('\n')
        if word:  # a blank line carries no word to register
            jieba.add_word(word)

# Feed the generated Tencent word list into jieba.
load_userdict('E:\\Desktop\\nlp\\tencent.bin')

100%|█████████████████████████████████████████████████████████████████████| 9046401/9046401 [04:05<00:00, 36854.88it/s]


In [4]:
# Segment the same sample sentence again, now with the Tencent words added.
jieba.lcut('我今天吃了西红柿炒面，隔壁的人也是因吹斯听的人。')

['我', '今天', '吃了', '西红柿炒面', '，', '隔壁', '的人', '也是', '因吹斯听', '的人', '。']

## 1.2 将读取到的词向量载入模型embedding层

- **embeddingFile:** `Tencent_AILab_ChineseEmbedding.txt`
- **word2id:** GenData.ch2id
- **embeddingSize:** 200



In [5]:
from tqdm import tqdm

def loadEmbedding(embeddingFile, word2id, embeddingSize):
    """Build an embedding matrix for ``word2id`` from a word2vec-style text file.

    The file must start with a ``"<vocab_size> <vector_size>"`` header line
    followed by one ``"<word> <v1> ... <vN>"`` entry per line. Rows whose
    word never occurs in the file keep their random initialisation, drawn
    uniformly from [-0.25, 0.25).

    Args:
        embeddingFile: Path of the text embedding file.
        word2id: Mapping from word to row index in the returned matrix.
        embeddingSize: Expected vector width. The width actually used is the
            one declared in the file header; a warning is emitted when the
            two disagree.

    Returns:
        ``numpy.ndarray`` of shape ``(len(word2id), vector_size)``.

    Raises:
        ValueError: If the header is malformed, or a matched entry's vector
            does not have ``vector_size`` components.
    """
    import warnings

    # ISO-8859-1 never fails to decode; each word is converted back to raw
    # bytes and decoded as UTF-8 on its own further down.
    with open(embeddingFile, "r", encoding='ISO-8859-1') as f:
        header = f.readline()
        vocab_size, vector_size = map(int, header.split())
        if vector_size != embeddingSize:
            warnings.warn(
                'embeddingSize=%s differs from the vector size %s declared '
                'in the file header; using the header value'
                % (embeddingSize, vector_size))
        initW = np.random.uniform(-0.25, 0.25, (len(word2id), vector_size))
        count = 0
        print('loadding embedding data from tencent ailab ...')
        for _ in tqdm(range(vocab_size)):
            line = f.readline()
            if not line:
                # The file ended before vocab_size entries were read.
                break
            # Split on a literal space only: str.split() without arguments
            # would also split on '\x85' / '\xa0', which occur inside the
            # ISO-8859-1 view of multi-byte UTF-8 characters.
            fields = line.split(' ')
            word = fields[0]
            try:
                word = word.encode('ISO-8859-1').decode('utf8')
            except UnicodeDecodeError:
                pass  # keep the ISO-8859-1 form, as before
            if word not in word2id:
                continue
            try:
                vector = np.array([float(v) for v in fields[1:]])
            except ValueError:
                # A non-numeric field after the first token.
                # NOTE(review): presumably an entry whose word itself
                # contains spaces, so it is not this word's vector - confirm.
                continue
            initW[word2id[word]] = vector
            count += 1
        print(count)
        return initW

In [6]:
import tensorflow as tf
import numpy as np
# 包含数据处理函数
from utils import GenData

    
# Build the vocabulary with the project helper and load matching vectors.
data = GenData('cmn.txt','jieba',200)
weights = loadEmbedding('E:\\Desktop\\nlp\\Tencent_AILab_ChineseEmbedding.txt', data.ch2id, 200)
print(weights.shape)


tf.reset_default_graph()
# Take the variable's shape from the loaded matrix so the two cannot drift
# apart if the vocabulary or the embedding file changes.
embedding = tf.get_variable('embedding', weights.shape)

with tf.Session() as sess:
    # tf.initialize_all_variables() is deprecated; this is the replacement
    # named by TensorFlow's own deprecation notice.
    sess.run(tf.global_variables_initializer())
    # Overwrite the freshly initialised variable with the pretrained weights.
    sess.run(embedding.assign(weights))
    print(sess.run(embedding[0]))

loadding embedding data from tencent ailab ...


100%|█████████████████████████████████████████████████████████████████████| 8825658/8825658 [02:28<00:00, 59376.01it/s]


239
(248, 200)
Instructions for updating:
Use `tf.global_variables_initializer` instead.
[ 0.21811423 -0.09034403 -0.02901633  0.15603746 -0.07284238  0.09341685
  0.2026991  -0.071945   -0.04600149  0.18145561  0.179964   -0.15550348
 -0.16100383 -0.15658209 -0.13439766 -0.23189929  0.19315624 -0.04714389
 -0.00332712  0.1696456   0.20265692  0.18811639 -0.20272435  0.03150773
  0.22014199 -0.1266679  -0.02821003 -0.01831185  0.11978181 -0.22922224
 -0.00780576  0.04416671 -0.22243042 -0.22626986  0.15837246 -0.22912757
  0.22339202 -0.06263614 -0.1612309   0.10037225 -0.09283496 -0.0659591
 -0.09785461 -0.07583669  0.21064714 -0.22052026  0.21821363  0.0054501
 -0.19645277  0.18829118  0.0121442  -0.17428291  0.03204966 -0.16647246
 -0.09764255  0.01065319 -0.12923214 -0.24241629  0.00064717 -0.07459965
  0.10835728 -0.18958226 -0.17542696 -0.07144472  0.24785972  0.14723393
 -0.13104634 -0.11659891 -0.09243217 -0.23772074  0.20870206  0.214679
  0.2485326   0.21340017 -0.16217211 -0

# 2. GloVe（Stanford NLP）

下载数据：[https://nlp.stanford.edu/projects/glove/](https://nlp.stanford.edu/projects/glove/)

## Introduction

GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.

## 2.1 读取 GloVe 词典，导入切词工具

1. 生成词典

生成词典文件`google.bin`

In [7]:
from tqdm import tqdm

def gendict(inputFile, ouputFile):
    """Extract the vocabulary column of a header-less text embedding file.

    Every input line is expected to look like ``"<word> <v1> <v2> ..."``.
    Each word whose bytes form valid UTF-8 is appended to ``ouputFile``,
    one word per line, encoded as UTF-8.

    Args:
        inputFile: Path of the text embedding file to read.
        ouputFile: Path of the word-list file. It is opened in binary
            *append* mode, so running this twice duplicates the entries.
    """
    # ISO-8859-1 never fails to decode; each word is converted back to raw
    # bytes and decoded as UTF-8 on its own below.
    with open(inputFile, "r", encoding='ISO-8859-1') as src, \
            open(ouputFile, 'ab') as dst:
        # Stream the file line by line: only the first token of each line is
        # needed, so the whole file is never held in memory. The progress
        # bar therefore shows a running count instead of a percentage.
        for line in tqdm(src):
            # Literal-space split: str.split() without arguments would also
            # split on '\x85' / '\xa0' found in the ISO-8859-1 view of
            # multi-byte UTF-8 characters.
            raw_word = line.rstrip('\n').split(' ')[0]
            if not raw_word:
                continue
            try:
                word = raw_word.encode('ISO-8859-1').decode('utf8')
            except UnicodeDecodeError:
                # Not valid UTF-8: skip the entry (best effort), but let
                # genuine I/O errors from the write below propagate.
                continue
            dst.write((word + '\n').encode('utf8'))

# Write the vocabulary of the GloVe text file to google.bin.
gendict('E:\\Desktop\\nlp\\glove.840B.300d.txt', 'E:\\Desktop\\nlp\\google.bin')

100%|█████████████████████████████████████████████████████████████████████| 2196017/2196017 [01:06<00:00, 32984.01it/s]


2. 读取词典

将词典文件 `google.bin` 导入分词工具。
- jieba


In [9]:
import jieba


def load_userdict(f):
    """Register every line of a word-list file as a jieba word.

    Args:
        f: Path of a UTF-8 text file holding one word per line.
    """
    # Read inside a context manager so the handle is always closed; the
    # path parameter is no longer rebound to the file object.
    with open(f, 'r', encoding='utf8') as dict_file:
        lines = dict_file.readlines()
    print('load userdict to jieba dict ...')
    # tqdm receives the list (not the file) so it can display a total.
    for line in tqdm(lines):
        word = line.strip('\n')
        if word:  # a blank line carries no word to register
            jieba.add_word(word)

# Feed the word list generated from the GloVe file into jieba.
load_userdict('E:\\Desktop\\nlp\\google.bin')

# Segment a short English sentence to inspect the result.
print(jieba.lcut('Hi, my name is sunhongwen.'))

load userdict to jieba dict ...


100%|█████████████████████████████████████████████████████████████████████| 2196017/2196017 [02:05<00:00, 17502.50it/s]


['Hi', ',', ' ', 'my', ' ', 'name', ' ', 'is', ' ', 'sun', 'hongwen', '.']


In [10]:
# Segment a longer English sentence and drop the tokens that are exactly one
# space. The backslash continuations sit inside the string literal, so the
# leading indentation of each continued line is part of the text passed to
# jieba.
print([char for char in jieba.lcut(
        'If a person has not had a chance \
        to acquire his target language by \
        the time he\'s an adult, he\'s unlikely \
        to be able to reach native speaker level \
        in that language.') if char != ' '])

['If', 'a', 'person', 'has', 'not', 'had', 'a', 'chance', 'to', 'acquire', 'his', 'target', 'language', 'by', 'the', 'time', 'he', "'", 's', 'an', 'adult', ',', 'he', "'", 's', 'unlikely', 'to', 'be', 'able', 'to', 'reach', 'native', 'speaker', 'level', 'in', 'that', 'language.']


## 2.2 将读取到的词向量载入模型embedding层

- **embeddingFile:** `glove.840B.300d.txt`
- **word2id:** GenData.en2id
- **embeddingSize:** 300

In [11]:
import numpy as np
from tqdm import tqdm
# 包含数据处理函数

def loadEmbedding(embeddingFile, word2id, embeddingSize):
    """Build an embedding matrix for ``word2id`` from a header-less text file.

    Every line is expected to look like ``"<word> <v1> ... <vN>"``. Rows
    whose word never occurs in the file keep their random initialisation,
    drawn uniformly from [-0.25, 0.25).

    Args:
        embeddingFile: Path of the text embedding file.
        word2id: Mapping from word to row index in the returned matrix.
        embeddingSize: Number of components per vector; sets the width of
            the returned matrix.

    Returns:
        ``numpy.ndarray`` of shape ``(len(word2id), embeddingSize)``.

    Raises:
        ValueError: If a matched entry's vector does not have
            ``embeddingSize`` components.
    """
    initW = np.random.uniform(-0.25, 0.25, (len(word2id), embeddingSize))
    count = 0
    # ISO-8859-1 never fails to decode; each word is converted back to raw
    # bytes and decoded as UTF-8 on its own further down.
    with open(embeddingFile, "r", encoding='ISO-8859-1') as f:
        # Stream the file instead of calling readlines(): only lines whose
        # word is in word2id are parsed, so nothing else needs to stay in
        # memory. The progress bar shows a running count, not a percentage.
        for line in tqdm(f):
            # Literal-space split: str.split() without arguments would also
            # split on '\x85' / '\xa0' found in the ISO-8859-1 view of
            # multi-byte UTF-8 characters.
            fields = line.split(' ')
            word = fields[0]
            try:
                word = word.encode('ISO-8859-1').decode('utf8')
            except UnicodeDecodeError:
                pass  # keep the ISO-8859-1 form, as before
            if word not in word2id:
                continue
            try:
                vector = np.array([float(v) for v in fields[1:]])
            except ValueError:
                # A non-numeric field after the first token.
                # NOTE(review): glove.840B.300d reportedly contains a few
                # tokens with embedded spaces (e.g. ". . ."), whose first
                # piece can collide with a vocabulary word - confirm. Such a
                # line is not this word's vector, so it is skipped.
                continue
            initW[word2id[word]] = vector
            count += 1
    print(count)
    return initW

In [None]:
from utils import GenData


def main():
    """Print the English vocabulary and the shape of its GloVe-initialised matrix."""
    corpus = GenData('cmn.txt', 'jieba', 200)
    english_vocab = corpus.en2id
    print(english_vocab)
    glove_path = 'E:\\Desktop\\nlp\\glove.840B.300d.txt'
    embedding_matrix = loadEmbedding(glove_path, english_vocab, 300)
    print(embedding_matrix.shape)

# Run the GloVe loading demo.
main()

{'<PAD>': 0, '<UNK>': 1, '!': 2, "'": 3, ',': 4, '.': 5, '3': 6, '30.': 7, ':': 8, '?': 9, 'All': 10, 'Am': 11, 'Ask': 12, 'Back': 13, 'Be': 14, 'Birds': 15, 'Call': 16, 'Can': 17, 'Catch': 18, 'Cheers': 19, 'Come': 20, 'Cuff': 21, 'DJ.': 22, 'Definitely': 23, 'Do': 24, 'Dogs': 25, 'Don': 26, 'Drive': 27, 'Excuse': 28, 'Feel': 29, 'Fill': 30, 'Follow': 31, 'Get': 32, 'Go': 33, 'God': 34, 'Good': 35, 'Goodbye': 36, 'Grab': 37, 'Hands': 38, 'Hang': 39, 'Have': 40, 'He': 41, 'Hello': 42, 'Help': 43, 'Hey': 44, 'Hi.': 45, 'Hold': 46, 'Hop': 47, 'How': 48, 'Hug': 49, 'Humor': 50, 'Hurry': 51, 'I': 52, 'Is': 53, 'It': 54, 'Join': 55, 'Keep': 56, 'Kiss': 57, 'Leave': 58, 'Let': 59, 'Lie': 60, 'Listen': 61, 'Look': 62, 'Move': 63, 'No': 64, 'OK.': 65, 'Of': 66, 'Oh': 67, 'Open': 68, 'Perfect': 69, 'Read': 70, 'Really': 71, 'Run': 72, 'See': 73, 'She': 74, 'Shut': 75, 'Sit': 76, 'Skip': 77, 'Slow': 78, 'Stand': 79, 'Stay': 80, 'Stop': 81, 'Take': 82, 'They': 83, 'Tom': 84, 'Tom.': 85, 'Trust': 

if you are
