In [None]:
import csv
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, LSTM, Dropout, Activation, Embedding, Bidirectional

In [None]:
!wget --no-check-certificate \
    https://storage.googleapis.com/dataset-uploader/bbc/bbc-text.csv \
    -O /tmp/bbc-text.csv

--2020-09-02 11:17:18--  https://storage.googleapis.com/dataset-uploader/bbc/bbc-text.csv
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.20.128, 74.125.142.128, 74.125.195.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.20.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5057493 (4.8M) [text/csv]
Saving to: ‘/tmp/bbc-text.csv’


2020-09-02 11:17:18 (106 MB/s) - ‘/tmp/bbc-text.csv’ saved [5057493/5057493]



In [None]:
# Fetch NLTK's stopword corpus, then keep the English list as a set so that
# membership checks and iteration work on unique words.
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

STOPWORDS = {*stopwords.words('english')}

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Settings shared by the tokenising, padding, splitting and model cells.
vocab_size = 5000          # Tokenizer num_words and Embedding input size
embedding_dim = 64         # Embedding output size and LSTM units
max_length = 200           # maxlen passed to pad_sequences
trunc_type = padding_type = 'post'   # truncate and pad at the end of each sequence
oov_tok = '<OOV>'          # placeholder token; OOV = Out of Vocabulary
training_portion = 0.8     # leading fraction of rows used for training

In [None]:
# Read the CSV into two parallel lists:
#   labels[i]   - column 0 of data row i
#   articles[i] - column 1 of data row i with English stopwords removed
articles = []
labels = []

# newline='' lets the csv module do its own line-ending handling, as its
# documentation recommends for files passed to csv.reader.
with open("/tmp/bbc-text.csv", 'r', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    next(reader)  # skip the first row (treated as the header)
    for row in reader:
        if not row:
            # csv.reader yields [] for a blank line; nothing to record.
            continue
        labels.append(row[0])
        # Filter word by word instead of str.replace(' word ', ' '):
        #  * stopwords at the very start/end of the text are removed too,
        #  * repeated adjacent stopwords ("the the") are all removed,
        #  * runs of whitespace collapse to a single space (the original
        #    replace(' ', ' ') was a no-op),
        #  * one pass per article rather than one pass per stopword.
        kept_words = [word for word in row[1].split() if word not in STOPWORDS]
        articles.append(' '.join(kept_words))

In [None]:
# Ordered (unshuffled) split: the first `training_portion` of the rows become
# the training set and everything after that index is held out for validation.
train_size = int(len(articles) * training_portion)

train_articles, validation_articles = articles[:train_size], articles[train_size:]
train_labels, validation_labels = labels[:train_size], labels[train_size:]

In [None]:
# Build the word index from the training articles only; words outside the
# vocabulary are represented by the `oov_tok` placeholder.
tokenizer = Tokenizer(
    num_words=vocab_size,
    oov_token=oov_tok,
)
tokenizer.fit_on_texts(train_articles)

# Full word -> integer-id mapping learned by the tokenizer.
word_index = tokenizer.word_index

In [None]:
# Turn each training article into a list of integer word ids using the
# tokenizer fitted above.
train_sequences = tokenizer.texts_to_sequences(train_articles)

In [None]:
# Pad or truncate every training sequence to exactly `max_length` ids so they
# can be stacked into one 2-D array.
train_padded = pad_sequences(
    train_sequences,
    truncating=trunc_type,
    padding=padding_type,
    maxlen=max_length,
)

In [None]:
# `tokenizer`, `word_index`, `train_sequences` and `train_padded` are already
# built by the preceding cells from the same inputs and settings, so they are
# reused here rather than re-fitted and recomputed.
#
# Validation text is encoded with the tokenizer fitted on the training
# articles and padded/truncated with the same settings as the training data.
validation_sequences = tokenizer.texts_to_sequences(validation_articles)
validation_padded = pad_sequences(
    validation_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

In [None]:
# Encode the category names as integers with a second Tokenizer fitted on all
# labels. Keras' Tokenizer reserves index 0, so the label ids start at 1.
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(labels)

# texts_to_sequences returns one list per label; np.array stacks those lists.
training_label_seq, validation_label_seq = (
    np.array(label_tokenizer.texts_to_sequences(label_split))
    for label_split in (train_labels, validation_labels)
)

In [None]:
import tensorflow as tf

# Classifier: word embedding -> dropout -> bidirectional LSTM -> softmax.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim)))
#model.add(tf.keras.layers.Dense(64, activation='relu'))
# Six output units: the label ids come from a Tokenizer, which never assigns
# id 0, so the output size must be one more than the largest label id.
# NOTE(review): presumably five categories with ids 1..5 — confirm.
model.add(tf.keras.layers.Dense(6, activation='softmax'))

In [None]:
# Adam with a slowly decaying learning rate.
#
# The original call used Adam(lr=0.001, decay=1e-6). `lr` is a deprecated
# alias of `learning_rate`, and the legacy `decay` argument is rejected by
# newer Keras optimizers. The legacy decay rule is
#     lr_t = lr0 / (1 + decay * step)
# which is exactly InverseTimeDecay with decay_steps=1, so the schedule below
# keeps the same behaviour while working across TensorFlow 2.x releases.
lr_schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
    initial_learning_rate=0.001,
    decay_steps=1,
    decay_rate=1e-6,
)
opt = tf.keras.optimizers.Adam(learning_rate=lr_schedule)

# Labels are integer class ids (not one-hot), hence the sparse loss.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=opt,
    metrics=['accuracy'],
)

In [None]:
# Train for a fixed number of epochs, evaluating on the held-out split after
# each one; verbose=2 prints one summary line per epoch.
num_epochs = 10
validation_set = (validation_padded, validation_label_seq)
history = model.fit(
    train_padded,
    training_label_seq,
    epochs=num_epochs,
    validation_data=validation_set,
    verbose=2,
)


Epoch 1/10
56/56 - 12s - loss: 1.5842 - accuracy: 0.2798 - val_loss: 1.3054 - val_accuracy: 0.5551
Epoch 2/10
56/56 - 11s - loss: 1.2491 - accuracy: 0.4888 - val_loss: 0.8940 - val_accuracy: 0.6831
Epoch 3/10
56/56 - 11s - loss: 0.6714 - accuracy: 0.7837 - val_loss: 0.4813 - val_accuracy: 0.8764
Epoch 4/10
56/56 - 11s - loss: 0.3183 - accuracy: 0.8978 - val_loss: 0.4201 - val_accuracy: 0.8472
Epoch 5/10
56/56 - 11s - loss: 0.5183 - accuracy: 0.8562 - val_loss: 0.5305 - val_accuracy: 0.8404
Epoch 6/10
56/56 - 11s - loss: 0.1851 - accuracy: 0.9596 - val_loss: 0.1952 - val_accuracy: 0.9483
Epoch 7/10
56/56 - 11s - loss: 0.0827 - accuracy: 0.9781 - val_loss: 0.1678 - val_accuracy: 0.9416
Epoch 8/10
56/56 - 11s - loss: 0.0633 - accuracy: 0.9826 - val_loss: 0.1575 - val_accuracy: 0.9596
Epoch 9/10
56/56 - 11s - loss: 0.0379 - accuracy: 0.9876 - val_loss: 0.1427 - val_accuracy: 0.9551
Epoch 10/10
56/56 - 11s - loss: 0.0294 - accuracy: 0.9938 - val_loss: 0.1611 - val_accuracy: 0.9551
