In [25]:
from keras.datasets import reuters
# Load the Reuters newswire dataset as (data, labels) pairs for the train and test splits.
# num_words=10000 caps the vocabulary size (see the Keras `reuters.load_data` docs); it
# matches the default `dimension` of 10000 used by vectorize_sequences.
(train_data, train_labels), (test_data, test_labels) = reuters.load_data(num_words=10000)


In [44]:
# Inspect the raw training samples: each one is a list of integer word indices.
print(train_data)


[list([1, 2, 2, 8, 43, 10, 447, 5, 25, 207, 270, 5, 3095, 111, 16, 369, 186, 90, 67, 7, 89, 5, 19, 102, 6, 19, 124, 15, 90, 67, 84, 22, 482, 26, 7, 48, 4, 49, 8, 864, 39, 209, 154, 6, 151, 6, 83, 11, 15, 22, 155, 11, 15, 7, 48, 9, 4579, 1005, 504, 6, 258, 6, 272, 11, 15, 22, 134, 44, 11, 15, 16, 8, 197, 1245, 90, 67, 52, 29, 209, 30, 32, 132, 6, 109, 15, 17, 12])
 list([1, 3267, 699, 3434, 2295, 56, 2, 7511, 9, 56, 3906, 1073, 81, 5, 1198, 57, 366, 737, 132, 20, 4093, 7, 2, 49, 2295, 2, 1037, 3267, 699, 3434, 8, 7, 10, 241, 16, 855, 129, 231, 783, 5, 4, 587, 2295, 2, 2, 775, 7, 48, 34, 191, 44, 35, 1795, 505, 17, 12])
 list([1, 53, 12, 284, 15, 14, 272, 26, 53, 959, 32, 818, 15, 14, 272, 26, 39, 684, 70, 11, 14, 12, 3886, 18, 180, 183, 187, 70, 11, 14, 102, 32, 11, 29, 53, 44, 704, 15, 14, 19, 758, 15, 53, 959, 47, 1013, 15, 14, 19, 132, 15, 39, 965, 32, 11, 14, 147, 72, 11, 180, 183, 187, 44, 11, 14, 102, 19, 11, 123, 186, 90, 67, 960, 4, 78, 13, 68, 467, 511, 110, 59, 89, 90, 67, 139

In [45]:
# Build the reverse lookup (integer index -> word) from the dataset's word -> index table.
word_index = reuters.get_word_index()
index_word = {index: word for word, index in word_index.items()}

# Sample indices are shifted by 3 relative to word_index (hence `token - 3`);
# any index without a matching word is rendered as '?'.
decoded_newswire = ' '.join(index_word.get(token - 3, '?') for token in train_data[0])



In [46]:
# Show the first training newswire (train_data[0]) decoded back into words.
print(decoded_newswire)

? ? ? said as a result of its december acquisition of space co it expects earnings per share in 1987 of 1 15 to 1 30 dlrs per share up from 70 cts in 1986 the company said pretax net should rise to nine to 10 mln dlrs from six mln dlrs in 1986 and rental operation revenues to 19 to 22 mln dlrs from 12 5 mln dlrs it said cash flow per share this year should be 2 50 to three dlrs reuter 3


In [36]:
import numpy as np
def vectorize_sequences(sequences, dimension=10000):
    """Encode integer sequences as a matrix of per-index occurrence counts.

    Args:
        sequences: Sized iterable of iterables of integer indices.
        dimension: Number of columns in the output; every index must be a
            valid column index for this width.

    Returns:
        A float array of shape (len(sequences), dimension) in which entry
        [i, j] is the number of times index j occurs in sequences[i].
    """
    counts = np.zeros((len(sequences), dimension))
    # Iterating over a 2-D array yields row views, so updating `row`
    # writes straight into `counts`.
    for row, token_ids in zip(counts, sequences):
        for token in token_ids:
            row[token] += 1.0
    return counts
    

In [37]:
# Encode the train and test newswires as count matrices (default width of 10000 columns).
x_train, x_test = (vectorize_sequences(split) for split in (train_data, test_data))


In [41]:
def to_one_hot(labels, dimension=46):
    """One-hot encode a sequence of integer class labels.

    Args:
        labels: Sized iterable of integer class indices.
        dimension: Number of classes, i.e. the width of each encoded row.

    Returns:
        A float array of shape (len(labels), dimension) whose row i has a
        1.0 in column labels[i] and 0.0 everywhere else.
    """
    encoded = np.zeros((len(labels), dimension))
    # Each `row` is a view into `encoded`, so the in-place update is kept.
    for row, class_id in zip(encoded, labels):
        row[class_id] += 1.0
    return encoded

In [57]:
# One-hot encode the integer topic labels of both splits (46 columns by default).
categorizedTrainLabel, categorizedTestLabel = (
    to_one_hot(split_labels) for split_labels in (train_labels, test_labels)
)

In [405]:
from keras import backend as K

def selfdef_activation(x):
    """Custom activation: ln(relu(x) + 0.639)**2 / 2 + 0.7 * relu(x) - 0.1003.

    For x <= 0 the ReLU term is 0, so the output is the constant
    ln(0.639)**2 / 2 - 0.1003, which is approximately 0.
    """
    # LaTeX form:
    # f\left(x\right)=\frac{\ln\left(\max\left(x,0\right)+0.639\right)^{2}}{2}+0.7\max\left(x,0\right)-0.1003
    rectified = K.relu(x)
    log_term = K.log(rectified + 0.639) ** 2 / 2
    linear_term = 0.7 * rectified
    return log_term + linear_term - 0.1003
# ReLU is a good activation function because its derivative is constant (for positive inputs),
# which means that the effective learning rate is governed almost exclusively by the weights.
# In this way, the gradient explosion and gradient vanishing problems can be alleviated.
# Sigmoid and tanh have derivatives that approach 0 as the input approaches infinity in
# either direction; this reduces the gradient explosion problem but may cause the gradient
# vanishing problem. ReLU is not a smooth function (its derivative jumps from 0 to 1 when
# crossing the x=0 threshold). This causes inconsistency in training (accuracy may jump
# around a little, especially when the batch size is small, in which case a smooth
# activation function may perform better). Because of its angularity, ReLU is less
# susceptible to being trapped in local minima (enabling the use of larger batch sizes).
# It is also more similar to the firing threshold and pattern of a biological neuron.
# Smoothing the ReLU function can theoretically make its performance more stable. Further,
# reducing the derivative of ReLU near infinity may help to suppress the gradient explosion
# problem, and introducing a smooth transition of the derivative near 0 may help to reduce
# the dead-neuron problem. However, if done inappropriately, this could increase the
# computation time and the susceptibility to local minima.


In [414]:
from keras import models
from keras import layers

# Fully connected classifier: 10000 input features -> 128 -> 256 -> 46 output classes.
# Only the first layer declares `input_shape`; every later layer in a Sequential model
# takes its input size from the previous layer's output, so repeating it is redundant.
model = models.Sequential()
model.add(layers.Dense(128, activation='relu', input_shape=(10000,)))
# Why does blowing the layer up work? Blowing the layer up exacerbates the gradient explosion problem while
# reducing the gradient vanishing problem. This is often good, especially in cases where the relationship between
# features and their effects seems direct (that is, a feature directly contributes to the probability of classifying
# to a specific group; a feature is not much related to other features in the sense that they must jointly satisfy
# some condition to contribute to classifying to a specific group).
model.add(layers.Dense(256, activation='relu'))
# At any time, the number of nodes in a layer should be greater than the number of output nodes. This is apparent if
# we view each layer as a representation of the input information provided: a representation more compressed than
# desired may lose us information.
model.add(layers.Dense(46, activation='softmax'))
# The softmax activation function is applied to the entire layer: it calculates, for each node, the probability
# that it is the correctly activated node (as in a classification problem). It doesn't just divide a number by
# the total; it applies the exponential function to the values before dividing each by the total. This makes
# sense because the exponential is often internally related to probability.

In [415]:
# Configure training: RMSprop optimizer, categorical cross-entropy loss (the labels are
# one-hot encoded), and accuracy reported as a metric.
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [416]:
# Hold out the first 1000 training samples (features and labels) for validation
# and train on the remainder.
x_val, partial_x_train = x_train[:1000], x_train[1000:]
y_val, partial_y_train = categorizedTrainLabel[:1000], categorizedTrainLabel[1000:]

In [417]:
# Train for 5 epochs in mini-batches of 32, evaluating on the held-out validation
# split after each epoch; the returned object is kept in `history`.
history = model.fit(
    partial_x_train,
    partial_y_train,
    epochs=5,
    batch_size=32,
    validation_data=(x_val, y_val),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
