In [1]:

from __future__ import print_function

import os
import sys
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.initializers import Constant


Using TensorFlow backend.


In [14]:
BASE_DIR = ''
GLOVE_DIR = os.path.join(BASE_DIR, 'data')
TEXT_DATA_DIR = os.path.join(BASE_DIR, '20_newsgroup')
MAX_SEQUENCE_LENGTH = 1000
MAX_NUM_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2

In [15]:

print('Indexing word vectors.')
embeddings_index = {}
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')) as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
embeddings_index[word] = coefs

Indexing word vectors.


In [16]:

print('Found %s word vectors.' % len(embeddings_index))

# second, prepare text samples and their labels
print('Processing text dataset')

texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = [] # list of label ids

Found 1 word vectors.
Processing text dataset


In [18]:
for name in sorted(os.listdir(TEXT_DATA_DIR)):
    path = os.path.join(TEXT_DATA_DIR, name)
    if os.path.isdir(path):
        label_id = len(labels_index)
        labels_index[name] = label_id
        for fname in sorted(os.listdir(path)):
            if fname.isdigit():
                fpath = os.path.join(path, fname)
                args = {} if sys.version_info < (3,) else {'encoding': 'latin-1'}
                with open(fpath, **args) as f:
                    t = f.read()
                    i = t.find('\n\n')  # skip header
                    if 0 < i:
                        t = t[i:]
                    texts.append(t)
                labels.append(label_id)

In [19]:
print('Found %s texts.' % len(texts))

Found 39994 texts.


In [20]:
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

In [34]:
word_index = tokenizer.word_index

In [35]:
word_index

{'the': 1,
 'to': 2,
 'of': 3,
 'a': 4,
 'and': 5,
 'in': 6,
 'i': 7,
 'is': 8,
 'that': 9,
 "'ax": 10,
 'it': 11,
 'for': 12,
 'you': 13,
 'this': 14,
 'on': 15,
 'be': 16,
 'not': 17,
 'have': 18,
 'are': 19,
 'with': 20,
 'as': 21,
 'or': 22,
 'if': 23,
 'but': 24,
 'was': 25,
 'edu': 26,
 'they': 27,
 '1': 28,
 'from': 29,
 'by': 30,
 'at': 31,
 'an': 32,
 'my': 33,
 'can': 34,
 'what': 35,
 'all': 36,
 '2': 37,
 'would': 38,
 'there': 39,
 'one': 40,
 'will': 41,
 'do': 42,
 'writes': 43,
 'about': 44,
 '0': 45,
 'we': 46,
 '3': 47,
 'so': 48,
 'com': 49,
 'no': 50,
 'he': 51,
 'has': 52,
 'your': 53,
 'article': 54,
 'any': 55,
 'm': 56,
 'me': 57,
 'some': 58,
 'x': 59,
 'who': 60,
 'which': 61,
 'out': 62,
 "'": 63,
 'like': 64,
 "don't": 65,
 'people': 66,
 'when': 67,
 'more': 68,
 'just': 69,
 'were': 70,
 'their': 71,
 'up': 72,
 '4': 73,
 'know': 74,
 'other': 75,
 'only': 76,
 'them': 77,
 '5': 78,
 'get': 79,
 'how': 80,
 'had': 81,
 'than': 82,
 'been': 83,
 'think': 84