<a href="https://colab.research.google.com/github/mrtzcardo/Short-Newswire-Multiclassification/blob/main/Short_Newswire_Multiclassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook classifies short newswires (published by Reuters in 1986) by topic. There are 46 different topics, and each topic has at least 10 examples.

In [None]:
from keras.datasets import reuters
# Load the Reuters newswires, restricting the data to only the 10,000 most
# frequently occurring words found in the data.
(train_data, train_labels), (test_data, test_labels) = reuters.load_data(num_words=10000)

# Quick look at the topic labels of the training split.
print(train_labels)
#print(len(train_data))
#print(len(test_data))

In [None]:
'''Looking at what the data looks like a bit'''

# Show training sample 45 as stored (a sequence of word indices), then its topic label.
print(train_data[45], train_labels[45], sep='\n')

# Invert the word -> index vocabulary so indices can be mapped back to words.
word_index = reuters.get_word_index()
reverse_word_index = {index: word for word, index in word_index.items()}

# Each index is shifted down by 3 before the lookup; indices with no vocabulary
# entry are rendered as '?'.
# NOTE(review): the offset of 3 presumably skips reserved indices - confirm against the Keras docs.
decoded_review = ' '.join(reverse_word_index.get(token - 3, '?') for token in train_data[45])
print(decoded_review)

In [None]:
'''Vectorize data'''
import numpy as np

def vectorize_sequences(sequences, dimension=10000):
  """Multi-hot encode sequences of integer indices.

  Args:
    sequences: Sized iterable (supports len()) of integer index sequences.
      Every index must be a valid column index for a row of length `dimension`.
    dimension: Length of each output row.

  Returns:
    A float NumPy array of shape (len(sequences), dimension) where
    results[i, j] is 1 if index j occurs in sequences[i] and 0 otherwise.
    Repeated indices within one sequence still produce a single 1.
  """
  results = np.zeros((len(sequences), dimension))  # one all-zero row per sequence
  # The loop variable has its own name so it does not rebind the `sequences` argument.
  for row, sequence in enumerate(sequences):
    results[row, sequence] = 1  # index-list assignment sets every listed column at once
  return results

In [None]:
# Encode the training and test newswires with the same helper so both splits
# end up in the same feature layout.
x_train, x_test = map(vectorize_sequences, (train_data, test_data))

To vectorize the labels, there are two possibilities: you can cast the label list as an integer
tensor, or you can use one-hot encoding. One-hot encoding is a widely used format
for categorical data, also called categorical encoding.

In [None]:
'''In this case, one-hot encoding of the labels consists of embedding 
each label as an all-zero vector with a 1 in the place of the label index.
It looks like this, but Keras already has it built in.

def to_one_hot(labels, dimension=46):
  results = np.zeros((len(labels), dimension))
  for i, label in enumerate(labels):
    results[i, label] = 1
  return results

one_hot_train_labels = to_one_hot(train_labels)
one_hot_test_labels = to_one_hot(test_labels)
'''

In [None]:
# Import from the public `keras.utils` namespace; the private `keras.utils.np_utils`
# module path is not available in newer Keras releases.
from keras.utils import to_categorical

# One-hot encode the integer topic labels. num_classes is pinned to the 46 topics so
# both splits get the same width even if one split lacks the highest label value
# (otherwise the width is inferred from the largest label present).
one_hot_train_labels = to_categorical(train_labels, num_classes=46)
one_hot_test_labels = to_categorical(test_labels, num_classes=46)
#print(one_hot_test_labels)

In [None]:
from keras import models
from keras import layers

# Two 64-unit ReLU hidden layers feeding a 46-way softmax: the output is a
# probability distribution over the 46 different classes and sums to 1.
model = models.Sequential([
    layers.Dense(64, activation='relu', input_shape=(10000,)),
    layers.Dense(64, activation='relu'),
    layers.Dense(46, activation='softmax'),
])

In [None]:
# RMSprop as a general-purpose optimizer, categorical cross-entropy as the loss
# for the one-hot encoded labels, and accuracy as the reported metric.
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
'''Setting aside a validation set'''

# Hold out the first 1,000 training samples (and their labels) for validation;
# the remainder is what the model is fitted on.
x_val, partial_x_train = x_train[:1000], x_train[1000:]
y_val, partial_y_train = one_hot_train_labels[:1000], one_hot_train_labels[1000:]

In [None]:
'''Training model'''

# Fit for 20 epochs in mini-batches of 512 while monitoring the held-out
# validation split; `history` keeps the recorded metrics for plotting.
history = model.fit(
    x=partial_x_train,
    y=partial_y_train,
    batch_size=512,
    epochs=20,
    validation_data=(x_val, y_val),
)

In [None]:
'''Plotting the training and validation loss'''

import matplotlib.pyplot as plt
# Training and validation loss values recorded by model.fit().
loss = history.history['loss']
val_loss = history.history['val_loss']

# Number the epochs from 1 for the x-axis.
epochs = range(1, len(loss) + 1)

plt.plot(epochs, loss, 'bo', label='Training loss')        # blue dots
plt.plot(epochs, val_loss, 'b', label='Validation loss')   # solid blue line
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

# Call plt.show() (with parentheses) so the figure is rendered explicitly; a bare
# `plt.show` only references the function and echoes its repr as the cell output.
plt.show()

In [None]:
'''Plotting the training and validation accuracy'''

# Training and validation accuracy values recorded by model.fit().
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']

# `epochs` is the 1-based epoch range defined in the loss-plot cell.
plt.plot(epochs, acc, 'bo', label='Training accuracy')       # blue dots
plt.plot(epochs, val_acc, 'b', label='Validation accuracy')  # solid blue line
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')  # this figure shows accuracy, so the y-axis must not be labelled 'Loss'
plt.legend()
plt.show()

In [None]:
'''The model begins to overfit at around 9 epochs, so the next model is trained for 9 epochs'''

In [None]:
# Build and compile a fresh copy of the same architecture so training starts
# from newly initialised weights.
model = models.Sequential([
    layers.Dense(64, activation='relu', input_shape=(10000,)),
    layers.Dense(64, activation='relu'),
    layers.Dense(46, activation='softmax'),
])
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Train for 9 epochs this time, still monitoring the validation split.
model.fit(
    x=partial_x_train,
    y=partial_y_train,
    batch_size=512,
    epochs=9,
    validation_data=(x_val, y_val),
)

# Score the trained model on the test set and report the result.
results = model.evaluate(x=x_test, y=one_hot_test_labels)
print(results)

In [None]:
'''Generating predictions for new data'''

# One vector of class probabilities per test newswire.
predictions = model.predict(x_test)

print(predictions[0].shape)   # expect a vector of length 46
print(predictions[0].sum())   # the probabilities should sum to 1

# Index of the most probable class for the first test sample (the original run
# reported class index 3, i.e. the 4th class, at about 71% probability).
predictions[0].argmax()

#for i in predictions[0]:
#  print(i)

In [None]:
'''A model with an information bottleneck, i.e. one layer has fewer than 46 nodes'''

# Width of the optional middle layer. A value below 46 (the number of output
# classes) makes that layer an information bottleneck. None leaves the middle
# layer out entirely, which is what this cell did while the layer was commented out.
middle_units = None

model = models.Sequential()
model.add(layers.Dense(64, activation='relu', input_shape=(10000,)))
if middle_units is not None:
  model.add(layers.Dense(middle_units, activation='relu'))
model.add(layers.Dense(46, activation='softmax'))

model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Same training data as before, but with mini-batches of 128 and 20 epochs.
model.fit(partial_x_train,
          partial_y_train,
          epochs=20,
          batch_size=128,
          validation_data=(x_val, y_val))

# Score this variant on the test set and report the result.
results = model.evaluate(x_test, one_hot_test_labels)

print(results)

With 4 nodes in the 2nd layer: val_accuracy of 0.6710 at epoch 8/20

With 32 nodes, val_acc bounced between 79% and 80% from the 3rd epoch onward

With 128 nodes, val_acc stayed around 80% from epoch 6 onward

Similar results taking out the middle layer