<a href="https://colab.research.google.com/github/memoandrew/Sinandah/blob/main/Text_classification_pretrained_Embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#IMPORT THE NECESSARY LIBRARIES

import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM,Dense,Flatten
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow_datasets as tfds

In [None]:
#DOWNLOAD THE PRETRAINED EMBEDDING

!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip   -d glove.6B

--2024-05-03 08:53:34--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2024-05-03 08:53:34--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2024-05-03 08:53:34--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip.1’


2

In [None]:
#LOAD THE AG-NEWS DATASET AS (TEXT, LABEL) PAIRS AND PICK OUT THE TRAINING AND TESTING SPLITS

dataset, info = tfds.load(
    'ag_news_subset',
    as_supervised = True,
    with_info = True,
)
train_dataset = dataset['train']
test_dataset = dataset['test']

In [None]:
#TOKENIZE AND SEQUENCE THE TRAINING SET
#DECODE EVERY TRAINING TEXT FROM BYTES TO A PYTHON STRING
train_texts = [text.numpy().decode('utf-8') for text, _ in train_dataset]

#LEARN THE VOCABULARY FROM THE TRAINING TEXTS ONLY
tokenizer = Tokenizer(num_words = 20000, oov_token = "<OOV>")
tokenizer.fit_on_texts(train_texts)

#CONVERT THE TEXTS TO INTEGER SEQUENCES AND PAD THEM AT THE END TO A COMMON LENGTH
train_sequences = pad_sequences(
    tokenizer.texts_to_sequences(train_texts),
    padding = 'post',
)
#THE WIDTH OF THE PADDED MATRIX IS REUSED WHEREVER A SEQUENCE LENGTH IS NEEDED
max_length = train_sequences.shape[1]

In [None]:
#PREPROCESS THE TEST DATA
#fit_on_texts IS NOT CALLED ON THE TEST SET: THE TEST TEXTS REUSE THE VOCABULARY LEARNED FROM THE TRAINING SET
#THE PADDED RESULT IS STORED IN test_sequences SO THAT train_sequences KEEPS THE TRAINING DATA

test_texts = [x[0].numpy().decode('utf-8') for x in test_dataset]
test_sequences = tokenizer.texts_to_sequences(test_texts)
#maxlen PADS OR TRUNCATES EVERY TEST SEQUENCE TO THE WIDTH OF THE TRAINING MATRIX
test_sequences = pad_sequences(test_sequences, padding = 'post', maxlen = max_length)



In [None]:
#THE EMBEDDING PARAMETERS
#embedding_dim IS 50 BECAUSE THE 50d PRETRAINED GLOVE EMBEDDING IS USED
#vocab_size IS THE NUMBER OF WORDS IN THE TOKENIZER'S WORD INDEX PLUS ONE

embedding_dim = 50
vocab_size = 1 + len(tokenizer.word_index)

In [None]:
#APPLY THE PRETRAINED WORD EMBEDDING
#THE GLOVE FILE IS READ LINE BY LINE; EACH LINE HOLDS A WORD FOLLOWED BY ITS EMBEDDING VALUES
#WORDS IN THE GLOVE FILE ARE CROSS-MATCHED WITH THE VOCABULARY BUILT BY THE KERAS TOKENIZER
#ON A MATCH, THE ROW OF THE EMBEDDING MATRIX AT THE WORD'S TOKENIZER INDEX IS SET TO THE GLOVE VECTOR
#WORDS WITHOUT A MATCH REMAIN AS ZERO VECTORS IN THE MATRIX
#THIS EMBEDDING MATRIX IS USED TO INITIALIZE THE WEIGHTS OF THE EMBEDDING LAYER

#START FROM AN ALL-ZERO MATRIX WITH ONE ROW PER VOCABULARY INDEX
embedding_matrix = np.zeros((vocab_size, embedding_dim), dtype = np.float32)

#THE PATH IS RELATIVE TO THE FOLDER THE ARCHIVE WAS EXTRACTED INTO (unzip -d glove.6B)
#THE FILE NAME FOLLOWS embedding_dim, E.G. glove.6B.50d.txt FOR THE 50d EMBEDDING
with open(f'glove.6B/glove.6B.{embedding_dim}d.txt', 'r', encoding = 'utf-8') as f:
  for line in f:
    values = line.split()
    word = values[0]
    idx = tokenizer.word_index.get(word)
    if idx is not None:
      #values[0] IS THE WORD ITSELF; THE VECTOR IS EVERYTHING AFTER IT
      embedding_matrix[idx] = np.asarray(values[1:], dtype = np.float32)

In [None]:
#BUILD THE LSTM MODEL
#THE PRETRAINED EMBEDDING MATRIX INITIALIZES THE EMBEDDING LAYER
#trainable = False KEEPS THE PRETRAINED WEIGHTS UNCHANGED DURING TRAINING

model_lstm = Sequential([
    Embedding(
        vocab_size,
        embedding_dim,
        input_length = max_length,
        #A Constant INITIALIZER LOADS THE MATRIX WITHOUT RELYING ON THE weights CONSTRUCTOR ARGUMENT
        embeddings_initializer = tf.keras.initializers.Constant(embedding_matrix),
        trainable = False,
    ),
    #THE FIRST LSTM RETURNS THE FULL SEQUENCE SO THE SECOND LSTM CAN CONSUME IT
    LSTM(32, return_sequences = True),
    LSTM(32),
    Dense(64, activation = 'relu'),
    #ONE OUTPUT UNIT PER NEWS CATEGORY
    Dense(4, activation = 'softmax')
])

In [None]:
#COMPILE THE MODEL
#ADAM OPTIMIZER, CATEGORICAL CROSS-ENTROPY LOSS, ACCURACY REPORTED AS THE METRIC

model_lstm.compile(
    loss = 'categorical_crossentropy',
    optimizer = 'adam',
    metrics = ['accuracy'],
)

In [None]:
#CONVERT THE TRAINING LABELS TO ONE-HOT ENCODING AND TRAIN THE MODEL
#THE LABELS ARE READ FROM train_dataset SO THEY BELONG TO THE SAME SPLIT AS train_sequences
#THIS RELIES ON train_dataset YIELDING ITS EXAMPLES IN THE SAME ORDER AS WHEN THE TEXTS WERE READ

train_labels = tf.keras.utils.to_categorical(
    [label.numpy() for _, label in train_dataset],
    num_classes = 4
)

#FAIL EARLY WITH A CLEAR MESSAGE IF THE INPUTS AND THE LABELS DO NOT COME FROM THE SAME SPLIT
assert len(train_sequences) == len(train_labels), "train_sequences and train_labels have different lengths"

#validation_split HOLDS OUT 20% OF THE TRAINING DATA FOR VALIDATION
model_lstm.fit(train_sequences, train_labels, epochs = 10, validation_split = 0.2)

In [None]:
#MODEL EVALUATION
#ONE-HOT ENCODE THE TEST LABELS THE SAME WAY AS THE TRAINING LABELS

test_labels = tf.keras.utils.to_categorical(
    [label.numpy() for _, label in test_dataset],
    num_classes = 4
)

#MAKE SURE THE TEST INPUT IS A MATRIX OF WIDTH max_length
#ROWS THAT ALREADY HAVE THAT WIDTH PASS THROUGH pad_sequences UNCHANGED
test_inputs = pad_sequences(test_sequences, padding = 'post', maxlen = max_length)

loss, accuracy = model_lstm.evaluate(test_inputs, test_labels)
print("Loss:", loss)
print("Accuracy:", accuracy)