### 单词级的one-hot编码

In [4]:
import numpy as np

samples = ['The cat sat on the mat.','The dog ate my homework.']

token_index = {} #构建数据中所有标记的索引

for sample in samples:
    for word in sample.split():
        if word not in token_index:
            token_index[word] = len(token_index) + 1

token_index #为每一个单词指定索引
max_lenght = 10

results = np.zeros(shape=(len(samples),
                          max_lenght,
                          max(token_index.values())+1))
results
for i,sample in enumerate(samples):
    for j, word in list(enumerate(sample.split()))[:max_lenght]:
        index = token_index.get(word)
        results[i,j,index] = 1.
results

array([[[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

       [[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0

### 字符级别的one-hot编码

In [13]:
import string

samples = ['The cat sat on the mat.','The dog ate my homework.']
charactors = string.printable

token_index = dict(zip(range(1,len(charactors)+1),charactors))

token_index
max_lenght = 50

results = np.zeros((len(charactors),
                  max_lenght,
                  max(token_index.keys())+1))

for i,sample in enumerate(samples):
    for j,charactor in enumerate(sample):
        index = token_index.get(charactor)
        results[i,j,index] = 1.
        
results

array([[[1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0.

### 用Keras实现单词级的one-hot编码

In [19]:
from keras.preprocessing.text import Tokenizer
samples = ['The cat sat on the mat.','The dog ate my homework.']

tokenizer = Tokenizer(num_words=1000) #常见一个分词器，设置只考虑前1000个常用单词
tokenizer.fit_on_texts(samples) #构建单词索引

swquences = tokenizer.texts_to_sequences(samples) 
one_hot_results = tokenizer.texts_to_matrix(samples,mode='binary')

work_index = tokenizer.word_index #找回单词索引
# print('Found %s unique tokens.' %len(work_index))
work_index

{'the': 1,
 'cat': 2,
 'sat': 3,
 'on': 4,
 'mat': 5,
 'dog': 6,
 'ate': 7,
 'my': 8,
 'homework': 9}

### 使用散列单词级one-hot编码

In [22]:
samples = ['The cat sat on the mat.','The dog ate my homework.']

dimensionality = 1000 #
max_lenght = 10

results = np.zeros((len(samples),
                  max_lenght,
                   dimensionality))

for i,sample in enumerate(samples):
    for j,word in enumerate(samples):
        index = abs(hash(word)) % dimensionality #将单词散列为0~1000范围内的随机整数索引
        results[i,j,index] = 1.
        
results

array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]])

## 使用词嵌入

### 将一个Embedding层实例化

In [24]:
from keras.layers import Embedding

embedding_layer = Embedding(1000,64) # Embedding层至少需要两个参数：标记的个数和嵌入的维度


<keras.layers.embeddings.Embedding at 0x1250b2278>

#### 加载IMDB，准备Embedding层

In [13]:
from keras.datasets import imdb
from keras import preprocessing

max_features = 10000
maxlen = 20

# 将数据加载为整数列表
(x_train,y_train),(x_test,y_test) = imdb.load_data(num_words=max_features)

# 将整数列表转换成形状为(sample,maxlen)的二维整数张量
x_train = preprocessing.sequence.pad_sequences(x_train,maxlen=maxlen)
x_test = preprocessing.sequence.pad_sequences(x_test,maxlen=maxlen)
x_train

array([[  65,   16,   38, ...,   19,  178,   32],
       [  23,    4, 1690, ...,   16,  145,   95],
       [1352,   13,  191, ...,    7,  129,  113],
       ...,
       [  11, 1818, 7561, ...,    4, 3586,    2],
       [  92,  401,  728, ...,   12,    9,   23],
       [ 764,   40,    4, ...,  204,  131,    9]], dtype=int32)

### 在IMDB数据上使用Embedding层和分类器

In [16]:
from keras.models import Sequential
from keras.layers import Flatten,Dense,Embedding

model = Sequential()
model.add(Embedding(1954,8,input_length=maxlen)) 
model.add(Flatten())

model.add(Dense(1,activation='sigmoid')) #添加分类器
model.compile(optimizer='rmsprop',loss='binary_crossentropy',metrics=['acc'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 20, 8)             15632     
_________________________________________________________________
flatten_6 (Flatten)          (None, 160)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 161       
Total params: 15,793
Trainable params: 15,793
Non-trainable params: 0
_________________________________________________________________


In [None]:
history = model.fit(x_train,y_train,
                    epochs=10,
                    batch_size=32,
                    validation_split=0.2)

## 从原始文本到词嵌入
### 下载IMDB数据的原始文本

In [25]:
import os 

imdb_dir = '/Users/liuhuan/Downloads/aclImdb'
train_dir = os.path.join(imdb_dir,'train')

labels = []
texts = []

for label_type in ['neg','pos']:
    dir_name = os.path.join(train_dir,label_type)
    for fname in os.listdir(dir_name):
        if fname[-4:] == '.txt':
            f = open(os.path.join(dir_name,fname))
            texts.append(f.read())
            f.close()
            if label_type == 'neg':
                labels.append(0)
            else:
                labels.append(1)

### 对IMDB原始数据的文本进行分词

In [26]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

maxlen = 100
training_sample = 200
validation_samples = 10000
max_words = 10000

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts=texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences,maxlen)

labels = np.asarray(labels)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:',labels.shape)

indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

x_train = data[:training_sample]
y_train = labels[:training_sample]
x_val = data[training_sample: training_sample + validation_samples]
y_val = labels[training_sample: training_sample + validation_samples]

SyntaxError: invalid syntax (<ipython-input-26-54720d46fe60>, line 2)