In [1]:
import keras as k
from keras.models import Sequential
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Bidirectional
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.optimizers import SGD, Adagrad, Adam, RMSprop
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import regularizers

from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd
import gensim.downloader


2023-10-25 19:27:26.884827: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
path = '../Datasets/Processed/TREC'

training_dev_df = pd.read_csv(f'{path}/train.dev.csv')
training_df = pd.read_csv(f'{path}/train.csv')
test_df = pd.read_csv(f'{path}/test.csv')
print("Training Size: ", training_df.shape[0])
print("Development Size: ", training_dev_df.shape[0])
print("Test Size: ", test_df.shape[0])

Training Size:  3861
Development Size:  500
Test Size:  1091


In [3]:
label_encoder = LabelEncoder()

training_encoded_labels = label_encoder.fit_transform(training_df['label-coarse'])
dev_encoded_labels = label_encoder.fit_transform(training_dev_df['label-coarse'])
test_encoded_labels = label_encoder.fit_transform(test_df['label-coarse'])

training_df['label-coarse'] = training_encoded_labels
training_dev_df['label-coarse'] = dev_encoded_labels
test_df['label-coarse'] = test_encoded_labels

In [4]:
label_mapping = dict(zip(label_encoder.classes_, range(len(label_encoder.classes_))))
print(label_mapping)
num_labels = len(label_mapping)

{'0': 0, '2': 1, '3': 2, '5': 3, 'OTHERS': 4}


In [5]:
# Tokenize the text
tokenizer = Tokenizer()
#tokenizer.fit_on_texts(training_dev_df['text'])
tokenizer.fit_on_texts(training_df['text'])

# Convert text to sequences
X_train_sequences = tokenizer.texts_to_sequences(training_df['text'])
X_val_sequences = tokenizer.texts_to_sequences(training_dev_df['text'])
X_test_sequences = tokenizer.texts_to_sequences(test_df['text'])

In [6]:
w2v_model = gensim.downloader.load("word2vec-google-news-300")

In [7]:
# Load Pretrained Model
pretrained_weights = w2v_model.vectors
google_vocab_size, embedding_size = pretrained_weights.shape

voc = tokenizer.word_index
vocab_size = len(voc) + 1
word2idx = {k: v for v, k in enumerate(voc)}

max_sentence_len = max([len(s) for s in training_df['text']])

print("Vocab Size: ", vocab_size) # vocab size taken from training dataset
print("Embedding Size: ", embedding_size)
print("Voc Size: ", vocab_size)
print("Max Sentence Len: ", max_sentence_len)

Vocab Size:  6884
Embedding Size:  300
Voc Size:  6884
Max Sentence Len:  194


In [8]:
embedding_matrix = np.zeros((vocab_size, embedding_size))

for word, i in tokenizer.word_index.items():
    if word in w2v_model:
        embedding_matrix[i] = w2v_model[word]

In [9]:
# Padding sequences
sequence_length = max_sentence_len  # Choose an appropriate sequence length (follow the pretrain)
X_train_padded = pad_sequences(X_train_sequences, maxlen=sequence_length, padding='post')
X_val_padded = pad_sequences(X_val_sequences, maxlen=sequence_length, padding='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=sequence_length, padding='post')

# Convert labels to one-hot encoded format
y_train = to_categorical(training_df['label-coarse'])
y_val = to_categorical(training_dev_df['label-coarse'])
y_test = to_categorical(test_df['label-coarse'])

In [19]:
num_classes = num_labels

model = Sequential()
model.add(Embedding(input_dim=vocab_size, 
                    output_dim=embedding_size, 
                    weights=[embedding_matrix],
                    input_length=max_sentence_len,
                    embeddings_initializer=k.initializers.Constant(embedding_matrix))
        )

model.add(Bidirectional(LSTM(units=128, 
                dropout=0.5, 
                recurrent_dropout=0.5, 
                kernel_initializer=k.initializers.he_normal()
                )
        ))

model.add(Dense(units=embedding_size, activation= "relu", kernel_regularizer=regularizers.L1L2(l1=0.0025, l2=0.0025)))
model.add(Dense(units=num_classes, activation='softmax'))
optimizer = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 194, 300)          2065200   
                                                                 
 bidirectional_3 (Bidirecti  (None, 256)               439296    
 onal)                                                           
                                                                 
 dense_8 (Dense)             (None, 300)               77100     
                                                                 
 dense_9 (Dense)             (None, 5)                 1505      
                                                                 
Total params: 2583101 (9.85 MB)
Trainable params: 2583101 (9.85 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [20]:
num_epochs = 40
batch_size = 128

early_stopping = EarlyStopping(monitor='accuracy', patience=10)

# Save 'best' model
model_path = '../Models/Question_Classification/'
checkpoint = ModelCheckpoint(model_path + 'questions_best.h5',  # Save the model to a file named 'model-<epoch_number>.h5'
                             monitor='accuracy',      # Monitor validation loss
                             verbose=1,               # Verbosity mode: 1 = print progress bar, 0 = silent
                             save_best_only=True,     # Only save the model if 'val_loss' has improved
                             mode='auto')             # Mode: 'auto' decides whether to maximize or minimize 'val_loss' based on its name ('loss' should be minimized, 'acc' should be maximized)


model.fit(X_train_padded, 
          y_train, 
          epochs=num_epochs, 
          batch_size=batch_size, 
          shuffle=True, 
          validation_data = (X_val_padded, y_val), 
          callbacks = [early_stopping, checkpoint], 
          workers = 4)

# Save latest model
save_path = model_path + 'questions_last.h5'
model.save(save_path)

Epoch 1/40
Epoch 1: accuracy improved from -inf to 0.44367, saving model to ../Models/Question_Classification/questions_best.h5


  saving_api.save_model(


Epoch 2/40
Epoch 2: accuracy improved from 0.44367 to 0.68687, saving model to ../Models/Question_Classification/questions_best.h5
Epoch 3/40
Epoch 3: accuracy improved from 0.68687 to 0.80627, saving model to ../Models/Question_Classification/questions_best.h5
Epoch 4/40
Epoch 4: accuracy improved from 0.80627 to 0.86169, saving model to ../Models/Question_Classification/questions_best.h5
Epoch 5/40
Epoch 5: accuracy improved from 0.86169 to 0.90417, saving model to ../Models/Question_Classification/questions_best.h5
Epoch 6/40
Epoch 6: accuracy improved from 0.90417 to 0.92852, saving model to ../Models/Question_Classification/questions_best.h5
Epoch 7/40
Epoch 7: accuracy improved from 0.92852 to 0.94639, saving model to ../Models/Question_Classification/questions_best.h5
Epoch 8/40
Epoch 8: accuracy improved from 0.94639 to 0.96296, saving model to ../Models/Question_Classification/questions_best.h5
Epoch 9/40
Epoch 9: accuracy improved from 0.96296 to 0.96503, saving model to ../M

<keras.src.callbacks.History at 0x18404e7d0>

In [None]:
from keras.models import load_model
load_path = '../Models/Question_Classification/questions_best.h5'
loaded_model = load_model(load_path)

In [21]:
loss, accuracy = model.evaluate(X_test_padded, y_test)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

Test Loss: 0.8159940242767334, Test Accuracy: 0.8276810050010681
