<a href="https://colab.research.google.com/github/memoandrea/Sinandah/blob/main/Text_classification_AG_News_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
#IMPORT NECESSARY MODULES

import pandas as pd
import tensorflow_datasets as tfds
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SimpleRNN, Bidirectional,Flatten,GRU,Conv1D,GlobalMaxPooling1D
import tensorflow_hub as hub
from sklearn.model_selection import train_test_split


In [5]:
#LOAD THE AG NEWS DATASET FROM TFDS
"""with_info=True ensures that the datasets metadata is also collected.
as_supervised=True ensures that the datasets are returned as (input, target) pairs."""

dataset, info = tfds.load('ag_news_subset', with_info=True, as_supervised=True)
train_dataset, test_dataset = dataset['train'], dataset['test']


Downloading and preparing dataset 11.24 MiB (download: 11.24 MiB, generated: 35.79 MiB, total: 47.03 MiB) to /root/tensorflow_datasets/ag_news_subset/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/2 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/120000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/ag_news_subset/incomplete.KYD4VZ_1.0.0/ag_news_subset-train.tfrecord*...: …

Generating test examples...:   0%|          | 0/7600 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/ag_news_subset/incomplete.KYD4VZ_1.0.0/ag_news_subset-test.tfrecord*...:  …

Dataset ag_news_subset downloaded and prepared to /root/tensorflow_datasets/ag_news_subset/1.0.0. Subsequent calls will reuse this data.


In [6]:
#DATA PREPARATION FOR MODELLING
#TOKENIZE AND PAD SEQUENCES

"""initialize the tokenizer used to convert data from text to a sequence of integers.
num_words = 20000 considers the first 20000 occurring words in the dataset for tokenization.
oov_token = "<OOV>" ensures we cater for unseen words that may be encountered during model inferencing.
padding each sequence ensures that the data inputed in the model is of a consistent shape.
post ensures padding is applied at the end of the sequence"""


tokenizer = Tokenizer(num_words=20000, oov_token="<OOV>")
train_texts = [x[0].numpy().decode('utf-8') for x in train_dataset]
tokenizer.fit_on_texts(train_texts)
sequences = tokenizer.texts_to_sequences(train_texts)
sequences = pad_sequences(sequences, padding = 'post')

In [7]:
#CONVERT THE LABELS TO ONE_HOT ENCODING

train_labels = [label.numpy() for _, label in train_dataset]
train_labelsl =  to_categorical(train_labels, num_classes = 4) # assuming 4 classes




In [8]:
#SPLIT THE TRAINING SET INTO TRAINING AND VALIDATION SETS

train_sequences, val_sequences, train_labels, val_labels = train_test_split(sequences, train_labels,test_size = 0.2)

In [9]:
#SET THE vocab_size, embedding_dim and max_lenght parameters

""" selecting vocab size is important to strike a good balance between computation efficiency, model complexity and ability to capture language nuances"""

vocab_size = 20000
embedding_dim = 64 #represent each word in our vocab by a 64-dimensional vector
max_length = sequences.shape[1]#set to match longest padded and tokenized sequence

In [10]:
#MODEL BUILDING: STARTING WITH A DEEP NEURAL NETWORK
#define the DNN model
"""i use the Sequential API from TensorFlow to build a DNN made up of an ebedding layer,
a flatten layer, two hidden layers and an output layer for multi-class classification"""


model_dnn = Sequential([
    Embedding(vocab_size, embedding_dim, input_length = max_length),
    Flatten(),
    tf.keras.layers.Dense(64, activation = 'relu'),
    Dense(16, activation = 'relu'),
    Dense(4, activation = 'softmax')
])

In [11]:
#MODEL 2:CONVOLUTIONAL NEURAL NETWORK

""" i'll use a Conv1D layer with 128 filters(feature detectors) and a kernel size of 5,
meaning i'll consider 5 words at a time.
GlobalMaxPooling1D() to downsample the output of the convolutional layer to the most significant features"""

#define the convolutional neural network
model_cnn = Sequential([
    Embedding(vocab_size, embedding_dim, input_length = max_length),
    Conv1D(128, 5, activation = 'relu'),
    GlobalMaxPooling1D(),
    tf.keras.layers.Dense(64, activation = 'relu'),
    Dense(4, activation = 'softmax')
])

In [12]:
#MODEL 3: LONG SHORT TERM MEMORY

""" My LSTM  architecture is made up of 2 LSTM layers with 32 units each.
set the return_sequences = True in the first LSTM to pass the output of one layer to the next.
This allows the model to capture dependencies across the entire sequence.
feed the output of the last LSTM layer to a fully connected layer to classify the data"""

#define the LSTM model

model_lstm = Sequential([
    Embedding(vocab_size, embedding_dim, input_length = max_length),
    LSTM(32, return_sequences = True),
    LSTM(32),
    tf.keras.layers.Dense(64, activation = 'relu'),
    Dense(4, activation = 'softmax')
])

In [13]:
#MODEL 4: BIDIRECTIONAL MODEL

""" I add two bidirectional LSTM layers instead of the LSTM layers.
bidirectional wrapper allows each LSTM layer access to both past and future context
when processing each element of the input sequence"""

#define the bi_LSTM model

model_BiLSTM = Sequential([
    Embedding(vocab_size, embedding_dim, input_length = max_length),
    Bidirectional(LSTM(32, return_sequences = True)),
    Bidirectional(LSTM(16)),
    tf.keras.layers.Dense(64, activation = 'relu'),
    Dense(4, activation = 'softmax')
])

In [17]:
#COMPILE AND FIT ALL THE MODELS
#COMPILE AND FIT ALL THE MODELS

"""I'll use a for loop to compile and fit all my models.
set verbose to false to avoid printing out the training information.
i'll train for 10 epochs"""

models = [model_dnn, model_cnn, model_lstm, model_BiLSTM]
for model in models:
    model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'], run_eagerly=True) # Add run_eagerly=True

In [18]:
#EVALUATE THE MODEL ON UNSEEN DATA

test_texts = [x[0].numpy().decode('utf-8') for x in test_dataset]
test_sequences = tokenizer.texts_to_sequences(test_texts)
# Pad sequences to match the max_length used during training
test_sequences = pad_sequences(test_sequences, maxlen=173, padding = 'post')
test_labels = [label.numpy() for _, label in test_dataset]
test_labels = to_categorical(test_labels, num_classes = 4)

model_names= ["Model_DNN", "Model_CNN", "Model_LSTM", "Model_BiLSTM"]
for i, model in enumerate(models):
  loss, accuracy = model.evaluate(test_sequences, test_labels)

  print("Model Evaluation -", model_names[i])
  print("Loss:", loss)
  print("Accuracy:", accuracy)
  print()


Model Evaluation - Model_DNN
Loss: 1.3868361711502075
Accuracy: 0.25

Model Evaluation - Model_CNN
Loss: 1.3871707916259766
Accuracy: 0.25631579756736755

Model Evaluation - Model_LSTM
Loss: 1.386448621749878
Accuracy: 0.25

Model Evaluation - Model_BiLSTM
Loss: 1.3864014148712158
Accuracy: 0.25

