https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html

In [33]:
import os
import sys
import numpy as np

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical


In [5]:
# Display the version of the Python interpreter backing this kernel.
sys.version_info

sys.version_info(major=3, minor=6, micro=7, releaselevel='final', serial=0)

In [37]:
# Notebook-wide configuration constants.

# Corpus root (relative to the working directory); the loading cell treats
# every sub-directory found here as one label.
TEXT_DATA_DIR = os.path.join('data_untracked', '20_newsgroup')
# Passed as `maxlen` to pad_sequences, i.e. the width of the data tensor.
MAX_SEQUENCE_LENGTH = 1000
# Passed as `num_words` to the Tokenizer (vocabulary cap by word frequency).
MAX_NUM_WORDS = 20000
# Not referenced by the cells visible here; presumably the embedding vector
# width used by the model further down -- TODO confirm.
EMBEDDING_DIM = 100
# Fraction of the samples held out as the validation set by the split cell.
VALIDATION_SPLIT = 0.2

In [7]:
# Load the corpus: every sub-directory of TEXT_DATA_DIR is one class, and every
# file inside it whose name consists only of digits is one sample.
texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids

# Python 2's built-in open() has no `encoding` argument, so the keyword is
# only supplied on Python 3. Decided once instead of once per file.
open_kwargs = {} if sys.version_info < (3,) else {'encoding': 'latin-1'}

# Both listings are sorted so that label ids and sample order do not depend
# on the order in which the filesystem returns directory entries.
for name in sorted(os.listdir(TEXT_DATA_DIR)):
    path = os.path.join(TEXT_DATA_DIR, name)
    if not os.path.isdir(path):
        continue
    # Ids are assigned in order of discovery: 0, 1, 2, ...
    label_id = len(labels_index)
    labels_index[name] = label_id
    for fname in sorted(os.listdir(path)):
        if not fname.isdigit():
            continue
        fpath = os.path.join(path, fname)
        # `with` guarantees the handle is closed even if read() raises;
        # the previous open()/close() pair leaked it in that case.
        with open(fpath, **open_kwargs) as f:
            t = f.read()
        i = t.find('\n\n')  # skip header
        # Only cut when a blank line exists past position 0; otherwise the
        # text is kept whole. The blank line itself stays in the sample.
        if 0 < i:
            t = t[i:]
        texts.append(t)
        labels.append(label_id)

print('Found %s texts.' % len(texts))

Found 19997 texts.


In [16]:
# Inspect the first loaded sample to check what the header stripping left.
# NOTE(review): this displays the entire document; a slice such as
# texts[0][:500] would keep the saved notebook output short.
texts[0]

'\n\nArchive-name: atheism/resources\nAlt-atheism-archive-name: resources\nLast-modified: 11 December 1992\nVersion: 1.0\n\n                              Atheist Resources\n\n                      Addresses of Atheist Organizations\n\n                                     USA\n\nFREEDOM FROM RELIGION FOUNDATION\n\nDarwin fish bumper stickers and assorted other atheist paraphernalia are\navailable from the Freedom From Religion Foundation in the US.\n\nWrite to:  FFRF, P.O. Box 750, Madison, WI 53701.\nTelephone: (608) 256-8900\n\nEVOLUTION DESIGNS\n\nEvolution Designs sell the "Darwin fish".  It\'s a fish symbol, like the ones\nChristians stick on their cars, but with feet and the word "Darwin" written\ninside.  The deluxe moulded 3D plastic fish is $4.95 postpaid in the US.\n\nWrite to:  Evolution Designs, 7119 Laurel Canyon #4, North Hollywood,\n           CA 91605.\n\nPeople in the San Francisco Bay area can get Darwin Fish from Lynn Gold --\ntry mailing <figmo@netcom.com>.  For net 

In [31]:
# Build the vocabulary from the whole corpus, then turn every text into a
# list of integer word ids.
# `num_words` caps the vocabulary used by texts_to_sequences; per the Keras
# docs only the most frequent words are kept -- word_index itself still
# holds every token seen during fitting.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

In [23]:
# Inspect the word ids produced for the first sample.
# NOTE(review): the saved output contains ids above MAX_NUM_WORDS (e.g. 40963,
# 59314) and this cell's execution count (23) predates the tokenizer cell
# (31), so the output was presumably produced by an earlier tokenizer without
# the num_words cap -- re-run after the tokenizer cell to refresh it.
sequences[0]

[1237,
 273,
 1213,
 1439,
 1071,
 1213,
 1237,
 273,
 1439,
 192,
 2515,
 348,
 2964,
 779,
 332,
 28,
 45,
 1628,
 1439,
 2516,
 3,
 1628,
 2144,
 780,
 937,
 29,
 441,
 2770,
 8854,
 4601,
 7969,
 11979,
 5,
 12806,
 75,
 1628,
 40963,
 19,
 229,
 29,
 1,
 937,
 29,
 441,
 2770,
 6,
 1,
 118,
 558,
 2,
 59314,
 90,
 106,
 482,
 3979,
 6602,
 5375,
 40964,
 1871,
 12260,
 1632,
 17687,
 1828,
 5101,
 1828,
 5101,
 788,
 1,
 8854,
 4601,
 96,
 4,
 4601,
 5455,
 64,
 1,
 751,
 563,
 1716,
 15,
 71,
 844,
 24,
 20,
 1971,
 5,
 1,
 389,
 8854,
 744,
 1023,
 1,
 7762,
 40965,
 1300,
 2912,
 4601,
 8,
 73,
 1698,
 36188,
 6,
 1,
 118,
 558,
 2,
 1828,
 5101,
 48170,
 16500,
 13447,
 73,
 1261,
 10982,
 170,
 59315,
 66,
 6,
 1,
 869,
 2235,
 2544,
 534,
 34,
 79,
 8854,
 4601,
 29,
 6603,
 3388,
 264,
 1505,
 59316,
 535,
 49,
 12,
 343,
 66,
 60,
 155,
 2,
 6603,
 1043,
 1,
 427,
 8,
 73,
 1698,
 618,
 4601,
 417,
 1628,
 632,
 11716,
 4602,
 814,
 1628,
 691,
 25836,
 3,
 1,
 467,
 2163,

In [34]:
# Turn the token sequences and label ids into the arrays used for training.
# word_index is the tokenizer's full word -> id mapping.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Bring every sequence to the common length MAX_SEQUENCE_LENGTH so the
# samples form one 2-D array.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# NOTE(review): `labels` is rebound from the list of ids to the encoded
# array, so re-running this cell on its own feeds the already-encoded array
# back into to_categorical. Re-run the loading cell first.
labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Found 174074 unique tokens.
Shape of data tensor: (19997, 1000)
Shape of label tensor: (19997, 20)


In [35]:
# Size of the tokenizer's full vocabulary (the same mapping the previous cell
# stored as word_index).
len(tokenizer.word_index)

174074

In [38]:
# split the data into a training set and a validation set
#
# Rows of `data` and `labels` are permuted together, then the last
# VALIDATION_SPLIT fraction of the permuted rows becomes the validation set.
#
# The permutation comes from a dedicated, fixed-seed generator so that a
# fresh Restart & Run All produces the same split every time and the global
# NumPy random state is left untouched.
# NOTE: `data` and `labels` are overwritten with their shuffled versions, so
# re-running this cell alone permutes the already-shuffled arrays again;
# re-run the padding cell first to get the original split back.
SPLIT_SEED = 42
indices = np.arange(data.shape[0])
np.random.RandomState(SPLIT_SEED).shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

# Slice at an explicit boundary instead of a negative offset: with zero
# validation samples `data[:-0]` is empty and `data[-0:]` is the whole array,
# which would silently swap the two sets. The clamp keeps a fraction above 1
# from turning the boundary into a negative index.
split_at = max(data.shape[0] - nb_validation_samples, 0)

x_train = data[:split_at]
y_train = labels[:split_at]
x_val = data[split_at:]
y_val = labels[split_at:]