<a href="https://colab.research.google.com/github/Xw-Jia/2019_SaveYourself_GetOffer/blob/master/%E4%B9%A6%E7%9B%AE%E6%95%B4%E7%90%86/Python%E6%B7%B1%E5%BA%A6%E5%AD%A6%E4%B9%A0/Part2%E6%B7%B1%E5%BA%A6%E5%AD%A6%E4%B9%A0%E5%AE%9E%E8%B7%B5/%E4%BB%A3%E7%A0%81/6_1_one_hot_encoder_of_word_or_char.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import Keras and echo the installed version as the cell output, so readers
# can see which release the examples below were run against.
import keras
keras.__version__

Using TensorFlow backend.


'2.2.4'

In [0]:
'''
word级别one-hot编码
'''
import numpy as np

# Input data: one list entry per "sample". Here each sample is a single
# sentence, but it could just as well be an entire document.
samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# Step 1: build the vocabulary, mapping each distinct token to an integer id.
# Tokenization is a plain whitespace `split`; a real pipeline would also strip
# punctuation and special characters first.
# Ids start at 1, so index 0 is never attributed to any word.
token_index = {}
for sentence in samples:
    for token in sentence.split():
        # `setdefault` only inserts when the token is new, so an existing
        # token keeps the id it was first given.
        token_index.setdefault(token, len(token_index) + 1)

# Step 2: vectorize the samples.
# Only the first `max_length` words of each sample are encoded.
max_length = 10

# One slot per id, plus the unused slot 0.
vocab_size = max(token_index.values()) + 1

# Result tensor, shape (number of samples, max_length, vocab_size).
results = np.zeros((len(samples), max_length, vocab_size))
for row, sentence in enumerate(samples):
    for col, token in enumerate(sentence.split()[:max_length]):
        results[row, col, token_index.get(token)] = 1.

In [0]:
# Character-level one-hot encoding.
import string

samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# Vocabulary: every printable ASCII character, numbered from 1 upwards
# (index 0 is left unassigned).
characters = string.printable
token_index = {char: position for position, char in enumerate(characters, start=1)}

# Only the first `max_length` characters of each sample are encoded.
max_length = 50

# One slot per character id, plus the unused slot 0.
alphabet_size = max(token_index.values()) + 1

# Result tensor, shape (number of samples, max_length, alphabet_size).
results = np.zeros((len(samples), max_length, alphabet_size))
for row, text in enumerate(samples):
    for col, char in enumerate(text[:max_length]):
        results[row, col, token_index.get(char)] = 1.

In [5]:
'''
Keras内置函数，可以实现word或者char的one-hot
'''

# Word-level one-hot encoding using Keras' built-in text utilities.
from keras.preprocessing.text import Tokenizer

samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# Create a tokenizer configured to consider only the 1000 most common words,
# then build its word index from the samples.
tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(samples)

# Recover the word index that was computed during fitting.
word_index = tokenizer.word_index

# Turn each string into a list of integer word indices.
sequences = tokenizer.texts_to_sequences(samples)

# The one-hot binary representation can also be requested directly;
# vectorization modes other than 'binary' are supported as well.
one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')

unique_token_count = len(word_index)
print('Found %s unique tokens.' % unique_token_count)

Found 9 unique tokens.


one-hot编码的一种变体：
> one-hot散列技巧（hashing trick）：适用于词表中token数量过多的情况。
它不再为每个单词显式分配一个index并保存在字典中，而是将单词散列编码为固定长度的向量。优点在于：无需维护显式的单词索引，节省内存。

> 缺点在于：可能出现散列冲突（hash collision），即两个不同的单词可能得到相同的hash值。

In [0]:
'''
使用hash one-hot  word level
'''

# Word-level one-hot encoding with the hashing trick: no explicit word index
# is kept; each word is hashed straight to a slot in a fixed-size vector.
import zlib

samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# Each word is stored as a vector of size `dimensionality`.
# If the real vocabulary is close to (or larger than) this size, many words
# will share a slot (hash collisions) and the encoding loses accuracy.
dimensionality = 1000
# Only the first `max_length` words of each sample are encoded.
max_length = 10

# Result tensor, shape (number of samples, max_length, dimensionality).
results = np.zeros((len(samples), max_length, dimensionality))
for i, sample in enumerate(samples):
    for j, word in enumerate(sample.split()[:max_length]):
        # Map the word to an index in [0, dimensionality - 1].
        # The built-in `hash()` is deliberately avoided: for `str` it is
        # salted per interpreter process (PYTHONHASHSEED), so the same word
        # would land in a different slot after every kernel restart.
        # CRC32 over the UTF-8 bytes is stable across runs and machines, and
        # in Python 3 it is always a non-negative integer, so no `abs()`.
        index = zlib.crc32(word.encode('utf-8')) % dimensionality
        results[i, j, index] = 1.