In [None]:
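# Uncomment the two lines below to download and unpack the GloVe 6B vectors
# (glove.6B.100d.txt is read further down in this notebook):
# !wget http://nlp.stanford.edu/data/glove.6B.zip
# !unzip -q glove.6B.zip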

In [None]:
# --- SETUP ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ast

import tensorflow as tf
from collections import Counter
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers, models
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score

In [None]:
# --- LOAD DATA ---
# Read the training and test splits from the CSV files in /content.
train_csv_path = '/content/Dataset_A_POS_train.csv'
test_csv_path = '/content/Dataset_A_POS_test.csv'
train_df, test_df = pd.read_csv(train_csv_path), pd.read_csv(test_csv_path)

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): the CSV and GloVe paths visible in this notebook point at
# /content directly, not at /content/drive — confirm this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Quick sanity check: frame shapes, the first training row, and how many
# distinct values the POS column holds.
for caption, frame in (("Train shape:", train_df), ("Test shape:", test_df)):
    print(caption, frame.shape)

first_row = train_df.iloc[0]
print("\nSample row:\n", first_row)

n_unique_pos = train_df['POS'].nunique()
print("\nUnique POS sequences:", n_unique_pos)

## Loading the GloVe embeddings


In [None]:
glove_path = '/content/glove.6B.100d.txt'

# Build a word -> vector lookup. Each line of the file is split on whitespace:
# the first field is the word, the remaining fields are its float32 coefficients.
glove_embedding = {}
with open(glove_path, encoding='utf8') as glove_file:
    for raw_line in glove_file:
        fields = raw_line.split()
        glove_embedding[fields[0]] = np.array(fields[1:], dtype='float32')

print(f"Loaded {len(glove_embedding)} word vectors.")


## Text Processing

In [None]:
def _parse_tag_list(value):
    """Turn a string-encoded Python literal into the object it describes.

    Values that are not strings are returned unchanged, so re-running this
    cell on an already-parsed column does not raise.
    """
    return ast.literal_eval(value) if isinstance(value, str) else value


# The POS column is stored as text in the CSV; convert each cell back into
# a Python sequence of tags.
train_df['POS'] = train_df['POS'].apply(_parse_tag_list)

# Flatten every sentence's tags into one list and count occurrences.
all_tags = [tag for sentence in train_df['POS'] for tag in sentence]

tag_counts = Counter(all_tags)

print("Most Frequent POS Tags:")
# Counter has no `.unique` attribute; most_common() lists (tag, count) pairs
# ordered from the most to the least frequent tag.
print(tag_counts.most_common())

test_df['POS'] = test_df['POS'].apply(_parse_tag_list)

In [None]:
# Preview only the first rows instead of rendering the whole frame.
train_df.head()

In [None]:
# --- PREPROCESSING ---
# Pull the sentence and tag columns out of each frame as plain Python lists.
train_sentences, train_tags = (train_df[column].tolist() for column in ('Sentence', 'POS'))
test_sentences, test_tags = (test_df[column].tolist() for column in ('Sentence', 'POS'))

In [None]:
# Tokenize words
# The vocabulary is learned from the training sentences only; words that are
# not in it are mapped to the "<OOV>" token.
# NOTE(review): Tokenizer's default `filters` strip punctuation characters, so a
# sentence may yield fewer token ids than it has POS tags if the data contains
# punctuation tokens — TODO confirm token/tag alignment on this dataset.
word_tokenizer = Tokenizer(lower=True, oov_token="<OOV>")
word_tokenizer.fit_on_texts(train_sentences)

word_index = word_tokenizer.word_index
# +1 because index 0 is not assigned to any word (it is used as the pad id later).
vocab_size = len(word_index) + 1

X_train, X_test = (
    word_tokenizer.texts_to_sequences(texts)
    for texts in (train_sentences, test_sentences)
)

In [None]:
# Tokenize POS tags
# NOTE: everything below is a single string literal, not executed code. It holds
# an earlier Tokenizer-based tag encoding; as a bare expression it has no effect
# on any variable. The opening delimiter has four quote characters, so the
# string's first character is a literal double quote.
""""
all_tags = set(tag for seq in train_tags for tag in seq)
tag_tokenizer = Tokenizer(lower=False)
tag_tokenizer.fit_on_texts([' '.join(seq) for seq in train_tags])
y_train = tag_tokenizer.texts_to_sequences([' '.join(seq) for seq in train_tags])
y_test = tag_tokenizer.texts_to_sequences([' '.join(seq) for seq in test_tags])
tag_index = tag_tokenizer.word_index
num_tags = len(tag_index) + 1"""

In [None]:
from sklearn.preprocessing import LabelEncoder

# Collect every tag occurrence from the training sentences into one flat list.
flat_train_tags = []
for sentence_tags in train_tags:
    flat_train_tags.extend(sentence_tags)

# Learn the tag -> integer id mapping from the training tags only.
le = LabelEncoder().fit(flat_train_tags)


In [None]:

# Convert each sentence's tag sequence into an array of integer ids.
encoded_train_tags = list(map(le.transform, train_tags))
encoded_test_tags = list(map(le.transform, test_tags))

In [None]:
# Number of distinct tags the encoder learned (size of the softmax output).
num_tags = le.classes_.size

In [None]:
# Pad sequences
# Every input and label sequence is right-padded with 0 to the longest
# tokenized sentence found in either split.
# NOTE(review): LabelEncoder ids start at 0, so the pad value 0 in y_* is
# indistinguishable from the first tag class — downstream code must not use
# "label == 0" to detect padding.
longest_train = max(map(len, X_train))
longest_test = max(map(len, X_test))
max_len = max(longest_train, longest_test)

pad_options = dict(maxlen=max_len, padding='post', value=0)

y_train = pad_sequences(encoded_train_tags, **pad_options)
y_test = pad_sequences(encoded_test_tags, **pad_options)

X_train = pad_sequences(X_train, **pad_options)
X_test = pad_sequences(X_test, **pad_options)

In [None]:

# One-hot encode the padded integer labels along a new last axis of size num_tags.
y_train, y_test = (
    tf.keras.utils.to_categorical(label_ids, num_classes=num_tags)
    for label_ids in (y_train, y_test)
)


In [None]:
# Show the common length that the padded input and label sequences share.
print(max_len)

In [None]:
embedding_dim = 100

# Row i holds the GloVe vector of the word whose tokenizer index is i.
# Rows for words without a GloVe entry (and row 0) stay all-zero.
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in word_index.items():
    if word in glove_embedding:
        embedding_matrix[i] = glove_embedding[word]

print(f"Embedding matrix shape: {embedding_matrix.shape}")

In [None]:
from tensorflow.keras.layers import (Input, Embedding, SimpleRNN, LSTM, GRU, Bidirectional,
                                     Dense, TimeDistributed,Dropout)

## Simple RNN


In [None]:
# --- RNN MODEL ---
# Frozen GloVe embeddings -> SimpleRNN -> per-timestep softmax over the tags.
model = models.Sequential()
# An explicit Input layer replaces the deprecated `input_length` argument and
# builds the model as layers are added, so no separate build() call is needed.
model.add(layers.Input(shape=(max_len,)))
model.add(layers.Embedding(input_dim=vocab_size,
                           output_dim=embedding_dim,
                           # Load the pretrained vectors through a Constant initializer
                           # (the form used in the Keras pretrained-embedding example).
                           embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
                           # Token id 0 is only ever produced by padding. Masking it keeps
                           # padded timesteps out of the loss and the accuracy metric;
                           # otherwise they would be trained and scored as tag class 0.
                           mask_zero=True,
                           trainable=False))
model.add(layers.SimpleRNN(64, return_sequences=True))
model.add(layers.Dropout(0.1))
model.add(layers.TimeDistributed(layers.Dense(num_tags, activation='softmax')))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()



In [None]:
# --- TRAINING ---
# Train the SimpleRNN tagger; the last 10% of the training data is held out
# by Keras for validation.
fit_options = dict(batch_size=64, epochs=10, validation_split=0.1)
history = model.fit(X_train, y_train, **fit_options)

In [None]:

# Training vs. validation loss per epoch for the SimpleRNN model.
plt.figure(figsize=(12, 5))
for history_key, curve_label in (('loss', 'Training Loss'), ('val_loss', 'validation Loss')):
    plt.plot(history.history[history_key], label=curve_label)
plt.title('Model Loss Progression')
plt.xlabel('Epoch')
plt.ylabel('categorical_crossentropy')
plt.grid(True)
plt.legend()
plt.show()

In [None]:
# --- EVALUATION ---
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=-1)
y_true_classes = np.argmax(y_test, axis=-1)

# Score only positions that hold both a real token and a real gold tag.
# LabelEncoder ids start at 0, so label id 0 is a genuine tag: filtering on
# "true != 0" would silently drop that whole class from every metric.
# Padding is identified instead from the token ids (0 = pad) and from the
# true length of each gold tag sequence (labels are right-padded).
gold_lengths = np.array([min(len(tags), max_len) for tags in encoded_test_tags])
has_gold_tag = np.arange(max_len)[None, :] < gold_lengths[:, None]
eval_mask = (np.asarray(X_test) != 0) & has_gold_tag

# Flatten (boolean indexing walks the arrays sentence by sentence)
y_true_flat = y_true_classes[eval_mask].tolist()
y_pred_flat = y_pred_classes[eval_mask].tolist()
filtered_true, filtered_pred = y_true_flat, y_pred_flat

# Label set actually present in the gold data, with readable tag names
labels = sorted(set(filtered_true))
idx_to_tag = {i: tag for i, tag in enumerate(le.classes_)}
target_names = [idx_to_tag[i] for i in labels]

# Evaluation
print("\nClassification Report:")
print(classification_report(filtered_true, filtered_pred, labels=labels, target_names=target_names))

# Confusion Matrix
cm = confusion_matrix(filtered_true, filtered_pred, labels=labels)
plt.figure(figsize=(12, 10))
sns.heatmap(cm, xticklabels=target_names, yticklabels=target_names, cmap='Blues', annot=False)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

accuracy = accuracy_score(filtered_true, filtered_pred)
f1_macro = f1_score(filtered_true, filtered_pred, average='macro')
f1_weighted = f1_score(filtered_true, filtered_pred, average='weighted')

# Accuracy and F1
print("Accuracy:", accuracy)
print("F1 Score (Macro):", f1_macro)
print("F1 Score (Weighted):", f1_weighted)


## LSTM

In [None]:
# --- LSTM MODEL ---
# Frozen GloVe embeddings -> LSTM -> per-timestep softmax over the tags.
lstm = models.Sequential()

# An explicit Input layer replaces the deprecated `input_length` argument and
# builds the model as layers are added, so no separate build() call is needed.
lstm.add(layers.Input(shape=(max_len,)))
lstm.add(layers.Embedding(input_dim=vocab_size,
                          output_dim=embedding_dim,
                          # Load the pretrained vectors through a Constant initializer
                          # (the form used in the Keras pretrained-embedding example).
                          embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
                          # Token id 0 is only ever produced by padding. Masking it keeps
                          # padded timesteps out of the loss and the accuracy metric;
                          # otherwise they would be trained and scored as tag class 0.
                          mask_zero=True,
                          trainable=False))

lstm.add(layers.LSTM(64, return_sequences=True))

lstm.add(layers.Dropout(0.1))

lstm.add(layers.TimeDistributed(layers.Dense(num_tags, activation='softmax')))

lstm.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

lstm.summary()

In [None]:
# Train the LSTM tagger; Keras holds out the last 10% of the data for validation.
lstm_fit_options = dict(batch_size=64, epochs=10, validation_split=0.1)
history_lstm1 = lstm.fit(X_train, y_train, **lstm_fit_options)


In [None]:
# Training vs. validation loss per epoch for the LSTM model.
plt.figure(figsize=(12, 5))
for history_key, curve_label in (('loss', 'Training Loss'), ('val_loss', 'validation Loss')):
    plt.plot(history_lstm1.history[history_key], label=curve_label)
plt.title('Model Loss Progression')
plt.xlabel('Epoch')
plt.ylabel('categorical_crossentropy')
plt.grid(True)
plt.legend()
plt.show()

In [None]:
# --- EVALUATION ---
y_pred = lstm.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=-1)
y_true_classes = np.argmax(y_test, axis=-1)

# Score only positions that hold both a real token and a real gold tag.
# LabelEncoder ids start at 0, so label id 0 is a genuine tag: filtering on
# "true != 0" would silently drop that whole class from every metric.
# Padding is identified instead from the token ids (0 = pad) and from the
# true length of each gold tag sequence (labels are right-padded).
gold_lengths = np.array([min(len(tags), max_len) for tags in encoded_test_tags])
has_gold_tag = np.arange(max_len)[None, :] < gold_lengths[:, None]
eval_mask = (np.asarray(X_test) != 0) & has_gold_tag

# Flatten (boolean indexing walks the arrays sentence by sentence)
y_true_flat = y_true_classes[eval_mask].tolist()
y_pred_flat = y_pred_classes[eval_mask].tolist()
filtered_true, filtered_pred = y_true_flat, y_pred_flat

# Label set actually present in the gold data, with readable tag names
labels = sorted(set(filtered_true))
idx_to_tag = {i: tag for i, tag in enumerate(le.classes_)}
target_names = [idx_to_tag[i] for i in labels]

# Evaluation
print("\nClassification Report:")
print(classification_report(filtered_true, filtered_pred, labels=labels, target_names=target_names))

# Confusion Matrix
cm = confusion_matrix(filtered_true, filtered_pred, labels=labels)
plt.figure(figsize=(12, 10))
sns.heatmap(cm, xticklabels=target_names, yticklabels=target_names, cmap='Blues', annot=False)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

lstm_accuracy = accuracy_score(filtered_true, filtered_pred)
lstm_f1_macro = f1_score(filtered_true, filtered_pred, average='macro')
lstm_f1_weighted = f1_score(filtered_true, filtered_pred, average='weighted')

# Accuracy and F1
print("Accuracy:", lstm_accuracy)
print("F1 Score (Macro):", lstm_f1_macro)
print("F1 Score (Weighted):", lstm_f1_weighted)


## GRU

In [None]:
# --- GRU MODEL ---
# Frozen GloVe embeddings -> GRU -> per-timestep softmax over the tags.
gru = models.Sequential()
# An explicit Input layer replaces the deprecated `input_length` argument and
# builds the model as layers are added, so no separate build() call is needed.
gru.add(layers.Input(shape=(max_len,)))
gru.add(layers.Embedding(input_dim=vocab_size,
                         output_dim=embedding_dim,
                         # Load the pretrained vectors through a Constant initializer
                         # (the form used in the Keras pretrained-embedding example).
                         embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
                         # Token id 0 is only ever produced by padding. Masking it keeps
                         # padded timesteps out of the loss and the accuracy metric;
                         # otherwise they would be trained and scored as tag class 0.
                         mask_zero=True,
                         trainable=False))
gru.add(layers.GRU(64, return_sequences=True))
gru.add(layers.Dropout(0.1))
gru.add(layers.TimeDistributed(layers.Dense(num_tags, activation='softmax')))

gru.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
gru.summary()


In [None]:
# Train the GRU tagger; Keras holds out the last 10% of the data for validation.
gru_fit_options = dict(batch_size=64, epochs=10, validation_split=0.1)
history_gru = gru.fit(X_train, y_train, **gru_fit_options)


In [None]:
# Training vs. validation loss per epoch for the GRU model.
plt.figure(figsize=(12, 5))
for history_key, curve_label in (('loss', 'Training Loss'), ('val_loss', 'validation Loss')):
    plt.plot(history_gru.history[history_key], label=curve_label)
plt.title('Model Loss Progression')
plt.xlabel('Epoch')
plt.ylabel('categorical_crossentropy')
plt.grid(True)
plt.legend()
plt.show()

In [None]:
# --- EVALUATION ---
y_pred = gru.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=-1)
y_true_classes = np.argmax(y_test, axis=-1)

# Score only positions that hold both a real token and a real gold tag.
# LabelEncoder ids start at 0, so label id 0 is a genuine tag: filtering on
# "true != 0" would silently drop that whole class from every metric.
# Padding is identified instead from the token ids (0 = pad) and from the
# true length of each gold tag sequence (labels are right-padded).
gold_lengths = np.array([min(len(tags), max_len) for tags in encoded_test_tags])
has_gold_tag = np.arange(max_len)[None, :] < gold_lengths[:, None]
eval_mask = (np.asarray(X_test) != 0) & has_gold_tag

# Flatten (boolean indexing walks the arrays sentence by sentence)
y_true_flat = y_true_classes[eval_mask].tolist()
y_pred_flat = y_pred_classes[eval_mask].tolist()
filtered_true, filtered_pred = y_true_flat, y_pred_flat

# Label set actually present in the gold data, with readable tag names
labels = sorted(set(filtered_true))
idx_to_tag = {i: tag for i, tag in enumerate(le.classes_)}
target_names = [idx_to_tag[i] for i in labels]

# Evaluation
print("\nClassification Report:")
print(classification_report(filtered_true, filtered_pred, labels=labels, target_names=target_names))

# Confusion Matrix
cm = confusion_matrix(filtered_true, filtered_pred, labels=labels)
plt.figure(figsize=(12, 10))
sns.heatmap(cm, xticklabels=target_names, yticklabels=target_names, cmap='Blues', annot=False)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

gru_accuracy = accuracy_score(filtered_true, filtered_pred)
gru_f1_macro = f1_score(filtered_true, filtered_pred, average='macro')
gru_f1_weighted = f1_score(filtered_true, filtered_pred, average='weighted')
# Accuracy and F1
print("Accuracy:", gru_accuracy)
print("F1 Score (Macro):", gru_f1_macro)
print("F1 Score (Weighted):", gru_f1_weighted)


## BiLSTM

In [None]:
# --- BiLSTM MODEL ---
# Frozen GloVe embeddings -> bidirectional LSTM -> per-timestep softmax over the tags.
bi_lstm = models.Sequential()
# An explicit Input layer replaces the deprecated `input_length` argument and
# builds the model as layers are added, so no separate build() call is needed.
bi_lstm.add(layers.Input(shape=(max_len,)))
bi_lstm.add(layers.Embedding(input_dim=vocab_size,
                             output_dim=embedding_dim,
                             # Load the pretrained vectors through a Constant initializer
                             # (the form used in the Keras pretrained-embedding example).
                             embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
                             # Token id 0 is only ever produced by padding. Masking it keeps
                             # padded timesteps out of the loss and the accuracy metric;
                             # otherwise they would be trained and scored as tag class 0.
                             mask_zero=True,
                             trainable=False))
bi_lstm.add(layers.Bidirectional(layers.LSTM(64, return_sequences=True)))
bi_lstm.add(layers.Dropout(0.1))
bi_lstm.add(layers.TimeDistributed(layers.Dense(num_tags, activation='softmax')))

bi_lstm.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
bi_lstm.summary()



In [None]:
# Train the BiLSTM tagger (20 epochs); Keras holds out the last 10% of the
# data for validation.
bi_lstm_fit_options = dict(batch_size=64, epochs=20, validation_split=0.1)
history_lstm = bi_lstm.fit(X_train, y_train, **bi_lstm_fit_options)

In [None]:
# Training vs. validation loss per epoch for the BiLSTM model.
plt.figure(figsize=(10, 6))
for history_key, curve_label in (('loss', 'Training Loss'), ('val_loss', 'Validation Loss')):
    plt.plot(history_lstm.history[history_key], label=curve_label)
plt.title('Training vs Validation Loss')
plt.ylabel('categorical_crossentropy')
plt.xlabel('Epoch')
plt.grid(True)
plt.legend()
plt.show()


In [None]:
# --- EVALUATION ---
y_pred = bi_lstm.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=-1)
y_true_classes = np.argmax(y_test, axis=-1)

# Score only positions that hold both a real token and a real gold tag.
# LabelEncoder ids start at 0, so label id 0 is a genuine tag: filtering on
# "true != 0" would silently drop that whole class from every metric.
# Padding is identified instead from the token ids (0 = pad) and from the
# true length of each gold tag sequence (labels are right-padded).
gold_lengths = np.array([min(len(tags), max_len) for tags in encoded_test_tags])
has_gold_tag = np.arange(max_len)[None, :] < gold_lengths[:, None]
eval_mask = (np.asarray(X_test) != 0) & has_gold_tag

# Flatten (boolean indexing walks the arrays sentence by sentence)
y_true_flat = y_true_classes[eval_mask].tolist()
y_pred_flat = y_pred_classes[eval_mask].tolist()
filtered_true, filtered_pred = y_true_flat, y_pred_flat

# Label set actually present in the gold data, with readable tag names
labels = sorted(set(filtered_true))
idx_to_tag = {i: tag for i, tag in enumerate(le.classes_)}
target_names = [idx_to_tag[i] for i in labels]

# Evaluation
print("\nClassification Report:")
print(classification_report(filtered_true, filtered_pred, labels=labels, target_names=target_names))

# Confusion Matrix
cm = confusion_matrix(filtered_true, filtered_pred, labels=labels)
plt.figure(figsize=(12, 10))
sns.heatmap(cm, xticklabels=target_names, yticklabels=target_names, cmap='Greens', annot=False)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

bi_lstm_accuracy = accuracy_score(filtered_true, filtered_pred)
bi_lstm_f1_macro = f1_score(filtered_true, filtered_pred, average='macro')
bi_lstm_f1_weighted = f1_score(filtered_true, filtered_pred, average='weighted')
# Accuracy and F1
print("Accuracy:", bi_lstm_accuracy)
print("F1 Score (Macro):", bi_lstm_f1_macro)
print("F1 Score (Weighted):", bi_lstm_f1_weighted)

In [None]:
# Collect the test metrics of the four models into one table (one row per model).
model_names = ['Simple RNN', 'LSTM', 'GRU', 'BiLSTM']
data = {
    'Accuracy': [accuracy, lstm_accuracy, gru_accuracy, bi_lstm_accuracy],
    'F1 Macro': [f1_macro, lstm_f1_macro, gru_f1_macro, bi_lstm_f1_macro],
    'F1 Weighted': [f1_weighted, lstm_f1_weighted, gru_f1_weighted, bi_lstm_f1_weighted],
}
df_metrics = pd.DataFrame(data, index=model_names)

# One group of three bars per model, centred on the integer positions in x.
x = np.arange(len(model_names))
bar_width = 0.25

fig, ax = plt.subplots(figsize=(10, 6))

colors = ['#FFA07A', '#ADD8E6', '#90EE90']


def autolabel(rects):
    """Annotate every bar in `rects` with its height, formatted to two decimals."""
    for bar in rects:
        bar_height = bar.get_height()
        bar_center = bar.get_x() + bar.get_width()/2
        ax.annotate(f'{bar_height:.2f}',
                    xy=(bar_center, bar_height),
                    xytext=(0, 3),  # 3 points above the top of the bar
                    textcoords="offset points",
                    ha='center', va='bottom')


# Draw the three metric series side by side: left of, on, and right of each tick.
rects1, rects2, rects3 = (
    ax.bar(x + slot * bar_width, df_metrics[metric], width=bar_width, label=metric, color=shade)
    for slot, metric, shade in zip((-1, 0, 1), df_metrics.columns, colors)
)

# Axis labels, title, tick labels and legend.
ax.set_xlabel('Model')
ax.set_ylabel('Score')
ax.set_title('Performance Comparison of Models')
ax.set_xticks(x)
ax.set_xticklabels(model_names)
ax.legend()

for rects in (rects1, rects2, rects3):
    autolabel(rects)

plt.tight_layout()
plt.show()

