# Trains and evaluate a simple MLP on the Reuters newswire topic classification task.

Dataset of 11,228 newswires from Reuters, labeled over 46 topics.

Returns 2 types data:

x_train and  x_test - list of sequences, which are lists of indexes (integers).

y_train and y_test - list of integer labels (0 to 45).

In [61]:
import numpy as np
import keras
from keras.datasets import reuters
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from termcolor import colored

In [62]:
def show_shapes(x_train, y_train, x_test, y_test, color='green'):
    print(colored('Training shape:', color, attrs=['bold']))
    print('  x_train.shape:', x_train.shape)
    print('  y_train.shape:', y_train.shape)
    print(colored('\nTesting shape:', color, attrs=['bold']))
    print('  x_test.shape:', x_test.shape)
    print('  y_test.shape:', y_test.shape)

In [63]:
def show_sample(x_train, y_train, idx=0, color='blue'):
    print(colored('x_train sample:', color, attrs=['bold']))
    print(x_train[idx])
    print(colored('\ny_train sample:', color, attrs=['bold']))
    print(y_train[idx])

In [64]:
(x_train, y_train), (x_test, y_test) = reuters.load_data()


In [65]:
show_shapes(x_train, y_train, x_test, y_test)
print('\n******************************\n')
show_sample(x_train, y_train, idx=1)

[1m[32mTraining shape:[0m
  x_train.shape: (8982,)
  y_train.shape: (8982,)
[1m[32m
Testing shape:[0m
  x_test.shape: (2246,)
  y_test.shape: (2246,)

******************************

[1m[34mx_train sample:[0m
[1, 3267, 699, 3434, 2295, 56, 16784, 7511, 9, 56, 3906, 1073, 81, 5, 1198, 57, 366, 737, 132, 20, 4093, 7, 19261, 49, 2295, 13415, 1037, 3267, 699, 3434, 8, 7, 10, 241, 16, 855, 129, 231, 783, 5, 4, 587, 2295, 13415, 30625, 775, 7, 48, 34, 191, 44, 35, 1795, 505, 17, 12]
[1m[34m
y_train sample:[0m
4


In [66]:
num_classes = np.max(y_train) + 1
print(num_classes, 'classes')

46 classes


In [67]:
from keras.preprocessing.text import Tokenizer
max_words = 1000
batch_size = 32
epochs = 5
print('Vectorizing sequence data...')

'''
Tokenizer - This class allows to vectorize a text corpus, by turning each text into a sequence of integers 
num_words: the maximum number of words to keep, based on word frequency. 
Only the most common `num_words` words will be kept.

'''

tokenizer = Tokenizer(num_words=max_words)

#Converts a list of sequences into a Numpy matrix using tokenizer.sequences_to_matrix
x_train = tokenizer.sequences_to_matrix(x_train, mode='binary')
x_test = tokenizer.sequences_to_matrix(x_test, mode='binary')

print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
print('\n******************************\n')
print('Convert class vector to binary class matrix '
      '(for use with categorical_crossentropy)')
print('\n******************************\n')
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)
print('\n******************************\n')
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)


Vectorizing sequence data...
x_train shape: (8982, 1000)
x_test shape: (2246, 1000)

******************************

Convert class vector to binary class matrix (for use with categorical_crossentropy)

******************************


******************************

y_train shape: (8982, 46)
y_test shape: (2246, 46)


In [68]:
print('Building model...')
model = Sequential()
model.add(Dense(512, input_shape=(max_words,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

Building model...


In [69]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [70]:

history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1)


Train on 8083 samples, validate on 899 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [71]:
#scoring the model
score = model.evaluate(x_test, y_test,
                       batch_size=batch_size, verbose=1)
print('Test score:', score[0])
print('Test accuracy:', score[1])

Test score: 0.029557424702765362
Test accuracy: 0.9918793022154913


In [72]:
# predictions
predictions = model.predict(x_test,batch_size=batch_size,verbose=1)



In [73]:
predictions

array([[4.5224665e-06, 6.6651855e-06, 3.9775696e-08, ..., 9.2698123e-08,
        1.8257314e-08, 1.0373899e-07],
       [4.3695550e-03, 2.7142791e-02, 1.1779716e-01, ..., 1.1422976e-05,
        2.1401420e-05, 1.8974653e-04],
       [5.2423376e-05, 9.9429178e-01, 9.1717673e-05, ..., 5.4077959e-06,
        1.0180253e-05, 7.7340273e-06],
       ...,
       [2.3582978e-05, 3.5150547e-04, 2.7706354e-05, ..., 9.9711579e-06,
        4.5883667e-06, 1.8667630e-05],
       [2.7315789e-03, 2.1219974e-02, 2.6843280e-03, ..., 1.2953189e-04,
        9.1713919e-05, 1.5116598e-04],
       [2.9209952e-04, 7.4542016e-01, 7.3268302e-03, ..., 7.7719640e-05,
        1.9456425e-05, 7.4511729e-05]], dtype=float32)

In [74]:
pred = predictions.flatten()

In [75]:
pred = pred.astype(int)

In [76]:
pred

array([0, 0, 0, ..., 0, 0, 0])

In [77]:
#---> used flatten since data has to be one dimensional
import pandas as pd 
df_predictions = pd.DataFrame({"True_Values" : y_test.flatten().astype(int),
                               "Predictions":predictions.flatten().astype(int)})

In [78]:
df_predictions.head()

Unnamed: 0,Predictions,True_Values
0,0,0
1,0,0
2,0,0
3,0,1
4,0,0
