## 20 Newsgroups Classification

Adapted from https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html

In [1]:
# Fail fast unless the Keras TensorFlow backend reports at least one GPU.
# NOTE(review): _get_available_gpus is a private (underscore-prefixed) backend
# helper - confirm it exists in the installed Keras version.
from keras import backend

available_gpus = backend.tensorflow_backend._get_available_gpus()
assert len(available_gpus) > 0

Using TensorFlow backend.


In [2]:
# Install spaCy and the spaCy word-vector package (uncomment and run the next two cells once)

In [3]:
#!pip3 install -U spacy

In [4]:
#!python3 -m spacy download en_vectors_web_lg

### Preparing the Text Data 

In [5]:
import os
import sys
import numpy as np
import random
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding, Flatten
from keras.models import Model
from keras.initializers import Constant

In [6]:
# Root directory that is scanned for one sub-directory per newsgroup.
TEXT_DATA_DIR = "data"

MAX_SEQUENCE_LENGTH = 1000 # every message is padded or truncated to this many tokens
MAX_NUM_WORDS = 20000      # vocabulary cap handed to the tokenizer (most frequently occurring words)
# Width of one word vector; assumed to match the vectors loaded from spaCy.
EMBEDDING_DIM = 300
# Fraction of all samples held out for validation.
VALIDATION_SPLIT = 0.2

In [7]:
import sys 
sys.version_info

sys.version_info(major=3, minor=5, micro=3, releaselevel='final', serial=0)

In [8]:
import os 

texts = []  # message bodies, one string per post
labels_index = {}  # maps newsgroup (directory) name to its numeric label id
labels = []  # label id of each entry in `texts`, in the same order

# Every sub-directory of TEXT_DATA_DIR is treated as one newsgroup. Sorting the
# listings keeps label ids and sample order stable between runs.
for name in sorted(os.listdir(TEXT_DATA_DIR)):
    path = os.path.join(TEXT_DATA_DIR, name)
    if not os.path.isdir(path):
        continue

    label_id = len(labels_index)
    labels_index[name] = label_id

    for fname in sorted(os.listdir(path)):
        # Only files whose name consists of digits are read.
        if not fname.isdigit():
            continue

        fpath = os.path.join(path, fname)
        # `with` guarantees the handle is closed even if read() raises.
        with open(fpath, encoding='latin-1') as f:
            t = f.read()

        i = t.find('\n\n')  # skip header: everything before the first blank line
        if 0 < i:
            t = t[i:]

        texts.append(t)
        labels.append(label_id)

print('Found %s texts.' % len(texts))

Found 19997 texts.


In [9]:
texts[3]

'\n\ndmn@kepler.unh.edu (...until kings become philosophers or philosophers become kings) writes:\n>      Recently, RAs have been ordered (and none have resisted or cared about\n> it apparently) to post a religious flyer entitled _The Soul Scroll: Thoughts\n> on religion, spirituality, and matters of the soul_ on the inside of bathroom\n> stall doors. (at my school, the University of New Hampshire) It is some sort\n> of newsletter assembled by a Hall Director somewhere on campus. It poses a\n> question about \'spirituality\' each issue, and solicits responses to be \n> included in the next \'issue.\' It\'s all pretty vague. I assume it\'s put out\n> by a Christian, but they\'re very careful not to mention Jesus or the bible.\n> I\'ve heard someone defend it, saying "Well it doesn\'t support any one religion.\n> " So what??? This is a STATE university, and as a strong supporter of the\n> separation of church and state, I was enraged.\n> \n>      What can I do about this?\n\nIt sounds to

In [10]:
labels_index

{'alt.atheism': 0,
 'comp.graphics': 1,
 'comp.os.ms-windows.misc': 2,
 'comp.sys.ibm.pc.hardware': 3,
 'comp.sys.mac.hardware': 4,
 'comp.windows.x': 5,
 'misc.forsale': 6,
 'rec.autos': 7,
 'rec.motorcycles': 8,
 'rec.sport.baseball': 9,
 'rec.sport.hockey': 10,
 'sci.crypt': 11,
 'sci.electronics': 12,
 'sci.med': 13,
 'sci.space': 14,
 'soc.religion.christian': 15,
 'talk.politics.guns': 16,
 'talk.politics.mideast': 17,
 'talk.politics.misc': 18,
 'talk.religion.misc': 19}

In [11]:
# Spot-check both ends of the label list. Directories are read in sorted
# order, so the first samples carry label 0 and the last ones the highest id.
print(labels[:5])
print(labels[-5:])

[0, 0, 0, 0, 0]
[19, 19, 19, 19, 19]


In [12]:
len(labels)

19997

### Launching Keras

#### Tokenizing 

In [13]:
from keras.preprocessing.text import Tokenizer

# Fit a word-level tokenizer on all messages, then encode every message as a
# list of integer word indices.
# Per the Keras Tokenizer documentation, num_words limits the indices emitted
# by texts_to_sequences to the most frequent words; word_index still contains
# every token seen during fitting.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

In [14]:
# `sequences` holds one list of integer word indices per message;
# print the (still unpadded) length of the first ten.
for i in range(10): 
    print(len(sequences[i]))

1528
5116
678
219
83
814
68
226
432
39


In [15]:
#word_index stores our created word tokens indices 
type(tokenizer.word_index)

dict

In [16]:
# Keep a handle on the tokenizer's word -> integer index mapping; the
# embedding matrix rows are later filled according to these indices.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 174074 unique tokens.


In [17]:
# Show five randomly chosen (word, index) pairs from word_index.
# The item list is materialised once: building it inside the loop would copy
# the whole vocabulary on every iteration.
word_index_items = list(word_index.items())
for i in range(5):
    print(random.choice(word_index_items))

('weds', 142473)
('dfko', 108396)
('pkit', 118941)
('labatts', 74824)
('subfile', 155732)


In [18]:
# Indices are ordered by word frequency: the most frequent word gets index 1.
for word in ['the', 'to', 'of', 'a', 'and', 'be']: 
    print((word, word_index[word]))

('the', 1)
('to', 2)
('of', 3)
('a', 4)
('and', 5)
('be', 16)


In [19]:
# The tokenizer also exposes word_counts: how often each word was seen while
# fitting on `texts`.
word_counts = tokenizer.word_counts
for word in ['the', 'to', 'of', 'a', 'and', 'be']: 
    print((word, word_counts[word]))

('the', 252472)
('to', 127265)
('of', 114002)
('a', 109464)
('and', 99868)
('be', 32688)


#### Padding Sequences

In [20]:
from keras.preprocessing.sequence import pad_sequences

# pad_sequences returns one row of exactly MAX_SEQUENCE_LENGTH indices per message.
# With the defaults used here (padding='pre', truncating='pre' per the Keras
# documentation) shorter messages are left-padded with 0 and longer messages
# lose their leading tokens.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
data[2]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,

In [21]:
# Convert the integer label ids into a one-hot 2D array (samples x classes).
# num_classes is pinned to the number of newsgroups so the label width always
# equals len(labels_index), even if the highest label id has no samples.
# The ndim guard keeps this cell safe to re-run: `labels` is rebound to the
# encoded array, and encoding it a second time would produce a wrong shape.
label_ids = np.asarray(labels)
if label_ids.ndim == 1:
    labels = to_categorical(label_ids, num_classes=len(labels_index))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Shape of data tensor: (19997, 1000)
Shape of label tensor: (19997, 20)


In [22]:
#label for text 10000
labels[10000]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
       0., 0., 0.], dtype=float32)

In [23]:
# Split the data into a training set and a validation set.
# A dedicated, seeded RandomState makes the shuffle reproducible on a fresh
# kernel without touching NumPy's global random state.
# NOTE: `data` and `labels` are rebound to their shuffled versions, so
# re-running this cell alone shuffles them again.
SPLIT_SEED = 42
indices = np.random.RandomState(SPLIT_SEED).permutation(data.shape[0])
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

# Slice at an explicit non-negative index. Negative slicing (data[:-n] /
# data[-n:]) would give an empty training set and put every sample into the
# validation set whenever n == 0.
split_at = data.shape[0] - nb_validation_samples
X_train = data[:split_at]
y_train = labels[:split_at]
X_test = data[split_at:]
y_test = labels[split_at:]

In [24]:
X_train.shape

(15998, 1000)

In [25]:
X_train[:10]

array([[    0,     0,     0, ...,  2726,  1791,    26],
       [    0,     0,     0, ...,   410,  1673,  1821],
       [    0,     0,     0, ...,    63,    63,    63],
       ...,
       [    0,     0,     0, ...,    53, 17120,  2050],
       [    0,     0,     0, ...,    17,  1147,   428],
       [    0,     0,     0, ...,    69,   209,  7318]], dtype=int32)

In [26]:
# Share of training messages shorter than MAX_SEQUENCE_LENGTH (~97%).
# A row whose first slot is 0 still carries padding, i.e. the message did not
# fill all 1000 positions.
starts_with_padding = X_train[:, 0] == 0
count = int(starts_with_padding.sum())

print(round(count / X_train.shape[0] * 100, 2), '% of first train messages < 1000 tokens')

97.17 % of first train messages < 1000 tokens


### Embedding Layer

In [27]:
import spacy 

nlp = spacy.load('en_vectors_web_lg')

In [28]:
# Map the text of every entry in the spaCy vocabulary to that entry's vector.
embeddings_index = {lexeme.text: lexeme.vector for lexeme in nlp.vocab}

print('Found %s word vectors.' % len(embeddings_index))

Found 1070925 word vectors.


In [29]:
import random

print(random.choice(list(embeddings_index.items())))

('Polytheist', array([ 1.5644e-01, -2.9517e-01, -4.1215e-01, -9.8563e-02,  6.0967e-01,
       -7.2976e-01,  4.7989e-01,  1.6789e-01,  2.4484e-01, -4.7923e-01,
        7.7935e-01,  3.2632e-01,  4.7726e-02, -2.6023e-02,  7.6045e-01,
       -1.1377e-01, -2.9051e-01, -1.7867e+00,  2.1130e-01,  6.5804e-01,
       -5.0218e-01, -1.2106e-01, -6.0476e-01,  5.1975e-01,  1.2457e-01,
        1.0774e-01,  3.7064e-01,  4.4262e-02,  1.8892e-01, -2.9232e-03,
        1.7389e-01,  1.0232e-01,  2.4751e-01, -3.0006e-01, -5.5702e-03,
       -1.1887e-01, -2.1895e-01, -6.3357e-01, -3.2615e-01, -2.0657e-01,
        1.8058e-01,  2.0180e-01, -1.0955e-01, -2.2905e-01, -2.7891e-01,
       -5.7444e-01,  5.7610e-01, -5.0193e-01, -2.6081e-01,  8.3600e-01,
       -1.4143e-01,  2.4012e-01, -6.0781e-01, -3.4914e-01, -4.4395e-02,
        2.5574e-01, -2.3381e-03, -1.4333e-01,  8.4756e-03,  4.5842e-01,
       -1.1005e-03, -6.1238e-01, -1.8476e-01, -2.8646e-01, -4.4901e-01,
       -3.6561e-02,  4.1014e-01, -2.0191e-01, -2.

**Let's remember our word_index**: <br>
('the', 1)
('to', 2)
('of', 3)
('a', 4)
('and', 5)
('be', 16)
... <br>

In [30]:
# Build the embedding matrix so that row k holds the vector of the word whose
# word_index value is k. Rows for words missing from embeddings_index (and any
# row that is never assigned) keep the all-zero initial value.

num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

for token, row in word_index.items():
    # Indices above the vocabulary cap have no row in the matrix.
    if row <= MAX_NUM_WORDS:
        vector = embeddings_index.get(token)
        if vector is not None:
            embedding_matrix[row] = vector

In [31]:
embeddings_index['the'][:10]

array([ 0.27204  , -0.06203  , -0.1884   ,  0.023225 , -0.018158 ,
        0.0067192, -0.13877  ,  0.17708  ,  0.17709  ,  2.5882   ],
      dtype=float32)

In [32]:
embedding_matrix[1][:10] # --> 'the'

array([ 0.27204001, -0.06203   , -0.1884    ,  0.023225  , -0.018158  ,
        0.0067192 , -0.13877   ,  0.17708001,  0.17709   ,  2.58820009])

In [33]:
embedding_matrix.shape

(20001, 300)

In [34]:
# Count the rows among the first 20000 rows of embedding_matrix that are
# entirely zero, i.e. indices for which no word vector was filled in.
# Row 0 is part of the slice and is never assigned by the fill loop
# (word_index values start at 1), so it is included in the count.
len(np.where(~embedding_matrix[:20000].any(axis=1))[0])

2105

#### Create Embedding Layer

In [35]:
# Frozen embedding layer: its weights are initialised from embedding_matrix
# and excluded from training (trainable=False).
pretrained_vectors = Constant(embedding_matrix)
embedding_layer = Embedding(input_dim=num_words,
                            output_dim=EMBEDDING_DIM,
                            embeddings_initializer=pretrained_vectors,
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

### Model 

In [36]:
# Symbolic model input: MAX_SEQUENCE_LENGTH int32 word indices per sample.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
# Replace every word index in the input by its embedding vector.
embedded_sequences = embedding_layer(sequence_input)

In [37]:
sequence_input

<tf.Tensor 'input_1:0' shape=(?, 1000) dtype=int32>

In [38]:
embedded_sequences

<tf.Tensor 'embedding_1/embedding_lookup/Identity:0' shape=(?, 1000, 300) dtype=float32>

In [39]:
# 1D convolutional classifier on top of the embedded sequences:
# three Conv1D layers (128 filters, kernel size 5, ReLU), max pooling with
# pool size 5 after the first two and global max pooling after the third.
x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = GlobalMaxPooling1D()(x)
x = Dense(128, activation='relu')(x)
# One softmax output unit per newsgroup.
preds = Dense(len(labels_index), activation='softmax')(x)

model = Model(sequence_input, preds)
# categorical_crossentropy pairs with the one-hot label arrays y_train / y_test.
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

# NOTE(review): X_test / y_test are passed as validation data, so they are the
# VALIDATION_SPLIT hold-out monitored every epoch, not a separate test set.
# The History object returned by fit() is not bound to a name here.
model.fit(X_train, y_train,
          batch_size=128,
          epochs=10,
          validation_data=(X_test, y_test))

Train on 15998 samples, validate on 3999 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fad48c416a0>

### Remarks 

- See GitHub issue https://github.com/keras-team/keras/issues/9104 (poor accuracy)
- Currently only single words are used as tokens, no windows or sequences --> possible extension 