In [0]:
# Source tutorial this notebook follows:
# https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html

from google.colab import drive
import os
# Mount Google Drive at /content/gdrive; the dataset and GloVe paths used by
# the following cells all live underneath this mount point.
drive.mount('/content/gdrive')

In [0]:

# Root folder of the newsgroup corpus on the mounted Drive.
# (A single-argument os.path.join() returns its argument unchanged, so the
# path is assigned directly.)
TEXT_DATA_DIR = '/content/gdrive/My Drive/AI/keras-newsgroup'
print("TEXT_DATA_DIR: ", TEXT_DATA_DIR)

TEXT_DATA_DIR:  /content/gdrive/My Drive/AI/keras-newsgroup


In [0]:
import sys

# Walk TEXT_DATA_DIR: every sub-directory is one class (its name becomes the
# label name) and every file inside it whose name is all digits is one sample.
texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids

# Python 2's built-in open() has no `encoding` parameter, so the keyword is
# only passed on Python 3. latin-1 maps every byte to a code point, so
# decoding can never fail on odd bytes in the posts.
open_kwargs = {} if sys.version_info < (3,) else {'encoding': 'latin-1'}

for name in sorted(os.listdir(TEXT_DATA_DIR)):
    path = os.path.join(TEXT_DATA_DIR, name)
    if not os.path.isdir(path):
        continue
    label_id = len(labels_index)
    labels_index[name] = label_id
    for fname in sorted(os.listdir(path)):
        if not fname.isdigit():
            continue
        fpath = os.path.join(path, fname)
        # `with` guarantees the handle is closed even if read() raises.
        with open(fpath, **open_kwargs) as f:
            t = f.read()
        i = t.find('\n\n')  # skip header
        if 0 < i:
            # The slice starts AT the blank line, so the kept text still
            # begins with the two newline characters.
            t = t[i:]
        texts.append(t)
        labels.append(label_id)

print('Found %s texts.' % len(texts))
if not texts:
    # Fail with a clear message instead of an IndexError on texts[0] below.
    raise ValueError('No text samples found under %r' % TEXT_DATA_DIR)

# Quick sanity check: size and first lines of the first sample, plus all ids.
print('Len: ', len(texts[0]))
textArr = texts[0].split('\n')
print(textArr[0:10])
print('labels:', labels)

Found 400 texts.
Len:  11521
['', '', 'Archive-name: atheism/resources', 'Alt-atheism-archive-name: resources', 'Last-modified: 11 December 1992', 'Version: 1.0', '', '                              Atheist Resources', '', '                      Addresses of Atheist Organizations']
labels: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [0]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np

MAX_NB_WORDS = 10000       # only the most frequent words are kept in the sequences
MAX_SEQUENCE_LENGTH = 200  # every sample is padded/truncated to this many tokens
VALIDATION_SPLIT = 0.2     # fraction of samples held out for validation
SHUFFLE_SEED = 42          # fixed seed so the train/validation split is reproducible

# `num_words` is the Keras 2 name of the argument formerly called `nb_words`.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
# NOTE: `labels` is rebound here from the list of integer ids to a one-hot
# matrix, so re-running this cell requires re-running the loading cell first.
labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)
print('labels[0]', labels[0])

# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
print('indices: ', indices)

# Shuffle with a local, seeded generator: the split is the same on every run
# and the global NumPy random state is left untouched.
np.random.RandomState(SHUFFLE_SEED).shuffle(indices)
data = data[indices]
labels = labels[indices]

nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
print('data.shape[0]: ', data.shape[0])
print('VALIDATION_SPLIT: ', VALIDATION_SPLIT)
print('nb_validation_samples: ', nb_validation_samples)

# Use an explicit split index rather than negative slicing: with zero
# validation samples, data[:-0] would be empty and data[-0:] would be everything.
split_at = data.shape[0] - nb_validation_samples
x_train = data[:split_at]
y_train = labels[:split_at]
x_val = data[split_at:]
y_val = labels[split_at:]

print(x_train.shape)
print(y_train.shape)
print(x_val.shape)
print(y_val.shape)

Using TensorFlow backend.


Found 10356 unique tokens.
Shape of data tensor: (400, 200)
Shape of label tensor: (400, 2)
labels[0] [1. 0.]
indices:  [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197
 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215
 216



In [0]:
import io

GLOVE_DIR = os.path.join('/content/gdrive/My Drive/AI')
print("GLOVE_DIR: ", GLOVE_DIR)


# Build a word -> vector lookup from the GloVe text file. Each line holds a
# token followed by its whitespace-separated coefficients.
embeddings_index = {}
# io.open accepts `encoding` on both Python 2 and 3; the file is read as UTF-8
# explicitly instead of relying on the locale default. `with` closes the
# handle even if parsing a line raises.
with io.open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines rather than failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

GLOVE_DIR:  /content/gdrive/My Drive/AI
Found 400000 word vectors.


In [0]:
EMBEDDING_DIM = 100

# One row per tokenizer index (plus row 0), filled with the matching GloVe
# vector when there is one. Rows for words missing from embeddings_index
# keep their initial all-zero value.
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for token, row in word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        continue
    embedding_matrix[row] = vector

In [0]:
from keras.layers import Embedding

# Lookup layer initialised from embedding_matrix. trainable=False keeps the
# pre-trained vectors fixed while the rest of the network is fitted.
embedding_layer = Embedding(
    input_dim=len(word_index) + 1,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)




In [0]:
from keras.layers import Input, Dense, Conv1D, MaxPooling1D, Flatten
from keras.models import Model
from keras.layers import TimeDistributed


# Integer token ids in, class probabilities out. The tensor shape is printed
# after every stage to make the effect of each layer visible.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
print('sequence_input.shape: ', sequence_input.shape)

embedded_sequences = embedding_layer(sequence_input)
print('embedded_sequences.shape: ', embedded_sequences.shape)

# Two identical stages: Conv1D (128 filters, window 5) then max-pooling by 5.
x = embedded_sequences
for stage in range(2):
    x = Conv1D(128, 5, activation='relu')(x)
    print('x.shape: ', x.shape)
    x = MaxPooling1D(5)(x)
    print('m x.shape: ', x.shape)

# NOTE: a third Conv1D(128, 5) + MaxPooling1D(35) "global max pooling" stage
# was disabled (commented out) at this point in the original cell;
# presumably the sequence is already too short for it after two stages.

x = Flatten()(x)
print('f x.shape: ', x.shape)

x = Dense(128, activation='relu')(x)
print('d x.shape: ', x.shape)

# One softmax unit per label found while loading the corpus.
preds = Dense(len(labels_index), activation='softmax')(x)
print('preds.shape: ', preds.shape)

model = Model(inputs=sequence_input, outputs=preds)
model.summary()
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['acc'],
)



sequence_input.shape:  (?, 200)







embedded_sequences.shape:  (?, 200, 100)
x.shape:  (?, 196, 128)

m x.shape:  (?, 39, 128)
x.shape:  (?, 35, 128)
m x.shape:  (?, 7, 128)
f x.shape:  (?, ?)
d x.shape:  (?, 128)
preds.shape:  (?, 2)
Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 200)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 200, 100)          1035700   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 196, 128)          64128     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 39, 128)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 35, 128)           82048     
__________________

In [0]:
# Show one padded sample and its one-hot label from each split before training.
print('x_train: ', x_train[:1])
print('y_train: ', y_train[0])
print('x_val: ', x_val[:1])
print('y_val: ', y_val[0])

# Train for two epochs; fit() stays the last expression so the returned
# History object is displayed as the cell output.
model.fit(
    x=x_train,
    y=y_train,
    batch_size=32,
    epochs=2,
    validation_data=(x_val, y_val),
)

x_train:  [[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
y_train:  [0. 1.]
x_val:  [[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0  256  297   37   77  353   12  361    2  789    2 2425  191  172
  7674    3  257   43 3484 1580    2   16    1   32  160  257   87   75
   749    7   11   48  789  509   11   36   16 1884    7   33   48   43
  3484  186    2   16    1   32  160  257  157  791    2 1302  202    1
  1617    5    5   12  257   79    3  122 7675

<keras.callbacks.History at 0x7f1110e64f28>