In [None]:
!pip install torchtext=='0.10.0'

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchtext==0.10.0
  Downloading torchtext-0.10.0-cp37-cp37m-manylinux1_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 4.6 MB/s 
Collecting torch==1.9.0
  Downloading torch-1.9.0-cp37-cp37m-manylinux1_x86_64.whl (831.4 MB)
[K     |████████████████████████████████| 831.4 MB 2.8 kB/s 
Installing collected packages: torch, torchtext
  Attempting uninstall: torch
    Found existing installation: torch 1.11.0+cu113
    Uninstalling torch-1.11.0+cu113:
      Successfully uninstalled torch-1.11.0+cu113
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.12.0
    Uninstalling torchtext-0.12.0:
      Successfully uninstalled torchtext-0.12.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.12.0+

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

import torchtext
from torchtext.legacy import data
from torchtext.legacy import datasets

import spacy
import numpy as np

import time
import random
from tqdm import tqdm

# torch.__version__, torchtext.__version__ # ('1.9.0+cu102', '0.10.0')

In [None]:
# Seed every random number generator the notebook touches so that a fresh
# "Restart & Run All" starts from the same state.
SEED = 1234

random.seed(SEED)        # Python's built-in RNG
np.random.seed(SEED)     # NumPy's global RNG
torch.manual_seed(SEED)  # PyTorch RNG

# Request deterministic cuDNN kernels and keep the cuDNN autotuner off, since the
# autotuner may otherwise pick different algorithms from one run to the next.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
# One torchtext Field per column of a UDPOS example. Tokens are lower-cased;
# both tag fields are created without an unknown token (unk_token=None).
TEXT = data.Field(lower=True)
UD_TAGS = data.Field(unk_token=None)
PTB_TAGS = data.Field(unk_token=None)

# (attribute name, Field) pairs handed to UDPOS.splits.
fields = (("text", TEXT), ("udtags", UD_TAGS), ("ptbtags", PTB_TAGS))
train_data, valid_data, test_data = datasets.UDPOS.splits(fields)

for split_name, split in (
    ("training", train_data),
    ("validation", valid_data),
    ("testing", test_data),
):
    print(f"Number of {split_name} examples: {len(split)}")

downloading en-ud-v2.zip


en-ud-v2.zip: 100%|██████████| 688k/688k [00:00<00:00, 4.70MB/s]


extracting
Number of training examples: 12543
Number of validation examples: 2002
Number of testing examples: 2077


In [None]:
# Inspect the first training example: the whole attribute dict first,
# then each of its three columns on its own.
first_example = vars(train_data.examples[0])

print(f"{first_example}\n\n")

print(f"TEXT:\n{first_example['text']}\n")
print(f"UDTAGS:\n{first_example['udtags']}\n")
print(f"PTBTAGS:\n{first_example['ptbtags']}\n")

{'text': ['al', '-', 'zaman', ':', 'american', 'forces', 'killed', 'shaikh', 'abdullah', 'al', '-', 'ani', ',', 'the', 'preacher', 'at', 'the', 'mosque', 'in', 'the', 'town', 'of', 'qaim', ',', 'near', 'the', 'syrian', 'border', '.'], 'udtags': ['PROPN', 'PUNCT', 'PROPN', 'PUNCT', 'ADJ', 'NOUN', 'VERB', 'PROPN', 'PROPN', 'PROPN', 'PUNCT', 'PROPN', 'PUNCT', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'ADP', 'PROPN', 'PUNCT', 'ADP', 'DET', 'ADJ', 'NOUN', 'PUNCT'], 'ptbtags': ['NNP', 'HYPH', 'NNP', ':', 'JJ', 'NNS', 'VBD', 'NNP', 'NNP', 'NNP', 'HYPH', 'NNP', ',', 'DT', 'NN', 'IN', 'DT', 'NN', 'IN', 'DT', 'NN', 'IN', 'NNP', ',', 'IN', 'DT', 'JJ', 'NN', '.']}


TEXT:
['al', '-', 'zaman', ':', 'american', 'forces', 'killed', 'shaikh', 'abdullah', 'al', '-', 'ani', ',', 'the', 'preacher', 'at', 'the', 'mosque', 'in', 'the', 'town', 'of', 'qaim', ',', 'near', 'the', 'syrian', 'border', '.']

UDTAGS:
['PROPN', 'PUNCT', 'PROPN', 'PUNCT', 'ADJ', 'NOUN', 'VERB', 'PROPN', 'PROPN', 'P

In [None]:
# Minimum frequency passed to the TEXT vocabulary builder.
MIN_FREQ = 2

# Alternative kept for reference: build the TEXT vocabulary together with
# pre-trained GloVe vectors.
# TEXT.build_vocab(
#     train_data, min_freq=MIN_FREQ, vectors="glove.6B.100d", unk_init=torch.Tensor.normal_
# )
TEXT.build_vocab(train_data, min_freq=MIN_FREQ)

# The tag vocabularies are built from the training split without a frequency cut-off.
UD_TAGS.build_vocab(train_data)
PTB_TAGS.build_vocab(train_data)

for label, field in (("TEXT", TEXT), ("UD_TAG", UD_TAGS), ("PTB_TAG", PTB_TAGS)):
    print(f"Unique tokens in {label} vocabulary: {len(field.vocab)}")

Unique tokens in TEXT vocabulary: 8866
Unique tokens in UD_TAG vocabulary: 18
Unique tokens in PTB_TAG vocabulary: 51


In [None]:
# Ten most frequent entries of each vocabulary, as (item, count) pairs.
for field in (TEXT, UD_TAGS, PTB_TAGS):
    print(f"{field.vocab.freqs.most_common(10)}\n")

[('the', 9076), ('.', 8640), (',', 7021), ('to', 5137), ('and', 5002), ('a', 3782), ('of', 3622), ('i', 3379), ('in', 3112), ('is', 2239)]

[('NOUN', 34781), ('PUNCT', 23679), ('VERB', 23081), ('PRON', 18577), ('ADP', 17638), ('DET', 16285), ('PROPN', 12946), ('ADJ', 12477), ('AUX', 12343), ('ADV', 10548)]

[('NN', 26915), ('IN', 20724), ('DT', 16817), ('NNP', 12449), ('PRP', 12193), ('JJ', 11591), ('RB', 10831), ('.', 10317), ('VB', 9476), ('NNS', 8438)]



In [None]:
# Index -> tag lookup tables of the two tag vocabularies.
for tag_field in (UD_TAGS, PTB_TAGS):
    print(f"{tag_field.vocab.itos}\n\n")

['<pad>', 'NOUN', 'PUNCT', 'VERB', 'PRON', 'ADP', 'DET', 'PROPN', 'ADJ', 'AUX', 'ADV', 'CCONJ', 'PART', 'NUM', 'SCONJ', 'X', 'INTJ', 'SYM']


['<pad>', 'NN', 'IN', 'DT', 'NNP', 'PRP', 'JJ', 'RB', '.', 'VB', 'NNS', ',', 'CC', 'VBD', 'VBP', 'VBZ', 'CD', 'VBN', 'VBG', 'MD', 'TO', 'PRP$', '-RRB-', '-LRB-', 'WDT', 'WRB', ':', '``', "''", 'WP', 'RP', 'UH', 'POS', 'HYPH', 'JJR', 'NNPS', 'JJS', 'EX', 'NFP', 'GW', 'ADD', 'RBR', '$', 'PDT', 'RBS', 'SYM', 'LS', 'FW', 'AFX', 'WP$', 'XX']




In [None]:
def tag_percentage(tag_counts):
    """Attach each tag's share of the total count to its ``(tag, count)`` pair.

    Args:
        tag_counts: Iterable of ``(tag, count)`` pairs, e.g. the result of
            ``Counter.most_common()``. One-shot iterables such as generators
            are accepted as well.

    Returns:
        A list of ``(tag, count, fraction)`` tuples in input order, where
        ``fraction`` is ``count / total`` (not multiplied by 100). If the
        counts sum to zero, every fraction is ``0.0``.
    """
    # Materialise once: the pairs are walked twice (sum, then ratios), which
    # would silently produce an empty result for a generator.
    pairs = list(tag_counts)
    total_count = sum(count for _, count in pairs)

    if total_count == 0:
        # Avoid ZeroDivisionError when every count is zero.
        return [(tag, count, 0.0) for tag, count in pairs]

    return [(tag, count, count / total_count) for tag, count in pairs]

In [None]:
# Distribution of the UD tags, most frequent first.
ud_tag_stats = tag_percentage(UD_TAGS.vocab.freqs.most_common())

print("Tag\t\tCount\t\tPercentage\n----------------------------------------")
for tag, count, percent in ud_tag_stats:
    print(f"{tag}\t\t{count}\t\t{percent*100:4.1f}%")

Tag		Count		Percentage
----------------------------------------
NOUN		34781		17.0%
PUNCT		23679		11.6%
VERB		23081		11.3%
PRON		18577		 9.1%
ADP		17638		 8.6%
DET		16285		 8.0%
PROPN		12946		 6.3%
ADJ		12477		 6.1%
AUX		12343		 6.0%
ADV		10548		 5.2%
CCONJ		6707		 3.3%
PART		5567		 2.7%
NUM		3999		 2.0%
SCONJ		3843		 1.9%
X		847		 0.4%
INTJ		688		 0.3%
SYM		599		 0.3%


In [None]:
# Distribution of the PTB tags, most frequent first.
ptb_tag_stats = tag_percentage(PTB_TAGS.vocab.freqs.most_common())

print("Tag\t\tCount\t\tPercentage\n----------------------------------------")
for tag, count, percent in ptb_tag_stats:
    print(f"{tag}\t\t{count}\t\t{percent*100:4.1f}%")

Tag		Count		Percentage
----------------------------------------
NN		26915		13.2%
IN		20724		10.1%
DT		16817		 8.2%
NNP		12449		 6.1%
PRP		12193		 6.0%
JJ		11591		 5.7%
RB		10831		 5.3%
.		10317		 5.0%
VB		9476		 4.6%
NNS		8438		 4.1%
,		8062		 3.9%
CC		6706		 3.3%
VBD		5402		 2.6%
VBP		5374		 2.6%
VBZ		4578		 2.2%
CD		3998		 2.0%
VBN		3967		 1.9%
VBG		3330		 1.6%
MD		3294		 1.6%
TO		3286		 1.6%
PRP$		3068		 1.5%
-RRB-		1008		 0.5%
-LRB-		973		 0.5%
WDT		948		 0.5%
WRB		869		 0.4%
:		866		 0.4%
``		813		 0.4%
''		785		 0.4%
WP		760		 0.4%
RP		755		 0.4%
UH		689		 0.3%
POS		684		 0.3%
HYPH		664		 0.3%
JJR		503		 0.2%
NNPS		498		 0.2%
JJS		383		 0.2%
EX		359		 0.2%
NFP		338		 0.2%
GW		294		 0.1%
ADD		292		 0.1%
RBR		276		 0.1%
$		258		 0.1%
PDT		175		 0.1%
RBS		169		 0.1%
SYM		156		 0.1%
LS		117		 0.1%
FW		93		 0.0%
AFX		48		 0.0%
WP$		15		 0.0%
XX		1		 0.0%


In [None]:
BATCH_SIZE = 128

# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One iterator per split, all sharing the same batch size and target device.
dataset_splits = (train_data, valid_data, test_data)
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    dataset_splits, batch_size=BATCH_SIZE, device=device
)

In [None]:
class BiLSTMPOSTagger(nn.Module):
    """Sequence tagger: embedding -> (bi)LSTM -> linear layer applied at every time step.

    Args:
        input_dim: Number of rows in the embedding table (vocabulary size).
        embedding_dim: Size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        output_dim: Number of scores produced per token (tag vocabulary size).
        n_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM runs in both directions.
        dropout: Dropout probability, applied to the embeddings, to the LSTM
            outputs and (when ``n_layers > 1``) between LSTM layers.
        pad_idx: Index passed to ``nn.Embedding`` as ``padding_idx``.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim,
                 n_layers, bidirectional, dropout, pad_idx):
        super().__init__()

        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            # Inter-layer dropout is only requested when there is more than one
            # layer; nn.LSTM warns about non-zero dropout with a single layer.
            dropout=dropout if n_layers > 1 else 0,
        )
        # A bidirectional LSTM concatenates both directions, doubling the feature size.
        self.fc = nn.Linear(
            hidden_dim * 2 if bidirectional else hidden_dim,
            output_dim,
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Score every tag for every token.

        Args:
            text: LongTensor of token indices, shape [sent len, batch size].

        Returns:
            Tensor of unnormalised tag scores, shape [sent len, batch size, output dim].
        """
        # embedded: [sent len, batch size, emb dim]
        embedded = self.dropout(self.embedding(text))
        # outputs: [sent len, batch size, hid dim * n directions]
        # The final (hidden, cell) states, each of shape
        # [n layers * n directions, batch size, hid dim], are not needed for tagging.
        outputs, _ = self.lstm(embedded)
        # predictions: [sent len, batch size, output dim]
        return self.fc(self.dropout(outputs))


In [None]:
# Sizes and indices taken from the vocabularies.
INPUT_DIM = len(TEXT.vocab)                # rows of the embedding table
OUTPUT_DIM = len(UD_TAGS.vocab)            # one score per UD tag
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]  # index of the padding token in TEXT

# Architecture hyperparameters.
EMBEDDING_DIM = 100
HIDDEN_DIM = 128
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.25

In [None]:
# Build the tagger from the hyperparameters; keyword arguments make the
# mapping between constants and constructor parameters explicit.
model = BiLSTMPOSTagger(
    input_dim=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    bidirectional=BIDIRECTIONAL,
    dropout=DROPOUT,
    pad_idx=PAD_IDX,
)

In [None]:
def init_weights(m):
    """Re-draw every parameter of module ``m`` in place from a normal distribution.

    All parameters (weights and biases alike) are sampled with mean 0 and
    standard deviation 0.1.
    """
    for param in m.parameters():
        nn.init.normal_(param.data, mean=0, std=0.1)
        
# NOTE: Module.apply calls init_weights on the model itself and on each of its
# submodules, and init_weights re-draws every parameter reachable from the module
# it receives, so each parameter is sampled more than once; only the last draw
# is kept. The whole embedding matrix is re-drawn too, so the row at padding_idx
# is no longer zero after this call (the cell that would reset it is commented
# out further down).
model.apply(init_weights)

BiLSTMPOSTagger(
  (embedding): Embedding(8866, 100, padding_idx=1)
  (lstm): LSTM(100, 128, num_layers=2, dropout=0.25, bidirectional=True)
  (fc): Linear(in_features=256, out_features=18, bias=True)
  (dropout): Dropout(p=0.25, inplace=False)
)

In [None]:
def count_parameters(model):
    """Return the total number of elements across the model's trainable parameters.

    Parameters whose ``requires_grad`` flag is False are not counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 1,522,010 trainable parameters


In [None]:
# # initialize the model's embedding layer with pre-trained embedding values loaded earlier
# pretrained_embeddings = TEXT.vocab.vectors
# print(pretrained_embeddings.shape)

# model.embedding.weight.data.copy_(pretrained_embeddings)

In [None]:
# model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)
# print(model.embedding.weight.data)

In [None]:
# Index of the padding tag; target positions carrying it are ignored by the loss.
TAG_PAD_IDX = UD_TAGS.vocab.stoi[UD_TAGS.pad_token]

# Move the model to the target device before constructing the optimizer, so the
# optimizer is built over the parameters as they exist on that device.
model = model.to(device)
optimizer = optim.Adam(model.parameters())  # Adam with its default hyperparameters

criterion = nn.CrossEntropyLoss(ignore_index=TAG_PAD_IDX)
criterion = criterion.to(device)

In [None]:
def categorical_accuracy(preds, y, tag_pad_idx):
    """Fraction of non-padding positions whose highest-scoring class matches the target.

    Args:
        preds: Score tensor of shape [n positions, n classes].
        y: Target class indices of shape [n positions].
        tag_pad_idx: Target value marking padding; those positions are skipped.

    Returns:
        A 0-dim tensor holding correct / number of non-padding positions.
    """
    keep = y != tag_pad_idx
    predicted = preds.argmax(dim=1)
    hits = predicted[keep].eq(y[keep])
    return hits.sum() / hits.shape[0]

In [None]:
def train(model, iterator, optimizer, criterion, tag_pad_idx):
    """Run one training epoch over ``iterator`` and update ``model`` in place.

    Args:
        model: Module called as ``model(batch.text)``.
        iterator: Sized iterable of batches exposing ``.text`` and ``.udtags``.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called on the flattened predictions and tags.
        tag_pad_idx: Padding tag index, forwarded to ``categorical_accuracy``.

    Returns:
        ``(mean loss, mean accuracy)``, each averaged over the number of batches.
    """
    model.train()
    total_loss = 0
    total_acc = 0

    for batch in tqdm(iterator):
        optimizer.zero_grad()

        # Collapse all leading dimensions so that each token is one row for
        # the loss: [..., n tags] -> [n tokens, n tags].
        scores = model(batch.text)
        flat_scores = scores.view(-1, scores.shape[-1])
        flat_tags = batch.udtags.view(-1)

        loss = criterion(flat_scores, flat_tags)
        acc = categorical_accuracy(flat_scores, flat_tags, tag_pad_idx)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        total_acc += acc.item()

    n_batches = len(iterator)
    return total_loss/n_batches, total_acc/n_batches

In [None]:
def evaluate(model, iterator, criterion, tag_pad_idx):
    """Compute loss and accuracy over ``iterator`` without updating the model.

    The model is switched to eval mode and gradients are disabled.

    Args:
        model: Module called as ``model(batch.text)``.
        iterator: Sized iterable of batches exposing ``.text`` and ``.udtags``.
        criterion: Loss called on the flattened predictions and tags.
        tag_pad_idx: Padding tag index, forwarded to ``categorical_accuracy``.

    Returns:
        ``(mean loss, mean accuracy)``, each averaged over the number of batches.
    """
    model.eval()
    total_loss = 0
    total_acc = 0

    with torch.no_grad():
        for batch in tqdm(iterator):
            # Collapse all leading dimensions: [..., n tags] -> [n tokens, n tags].
            scores = model(batch.text)
            flat_scores = scores.view(-1, scores.shape[-1])
            flat_tags = batch.udtags.view(-1)

            total_loss += criterion(flat_scores, flat_tags).item()
            total_acc += categorical_accuracy(flat_scores, flat_tags, tag_pad_idx).item()

    n_batches = len(iterator)
    return total_loss / n_batches, total_acc / n_batches

In [None]:
N_EPOCHS = 10

# Train for N_EPOCHS passes over the training data, checking the validation
# split after every pass.
for epoch in range(N_EPOCHS):
    # Report epochs 1-based so the last line reads "10 / 10" instead of "9 / 10".
    print(f"Epoch: {epoch + 1} / {N_EPOCHS}")
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion, TAG_PAD_IDX)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion, TAG_PAD_IDX)
    print(f"\tTrain Loss: {train_loss:.4f}, Train Accuracy: {train_acc*100:.2f}%")
    print(f"\tValid Loss: {valid_loss:.4f}, Valid Accuracy: {valid_acc*100:.2f}%")

Epoch: 0 / 10


100%|██████████| 98/98 [01:31<00:00,  1.07it/s]
100%|██████████| 16/16 [00:01<00:00, 13.72it/s]


	Train Loss: 0.5386, Train Accuracy: 83.36%
	Valid Loss: 0.5433, Valid Accuracy: 84.15%
Epoch: 1 / 10


100%|██████████| 98/98 [01:30<00:00,  1.08it/s]
100%|██████████| 16/16 [00:01<00:00, 13.80it/s]


	Train Loss: 0.3155, Train Accuracy: 90.42%
	Valid Loss: 0.4569, Valid Accuracy: 86.47%
Epoch: 2 / 10


100%|██████████| 98/98 [01:31<00:00,  1.07it/s]
100%|██████████| 16/16 [00:01<00:00, 13.92it/s]


	Train Loss: 0.2420, Train Accuracy: 92.51%
	Valid Loss: 0.4162, Valid Accuracy: 88.00%
Epoch: 3 / 10


100%|██████████| 98/98 [01:31<00:00,  1.07it/s]
100%|██████████| 16/16 [00:01<00:00, 13.70it/s]


	Train Loss: 0.2061, Train Accuracy: 93.65%
	Valid Loss: 0.3985, Valid Accuracy: 88.29%
Epoch: 4 / 10


100%|██████████| 98/98 [01:31<00:00,  1.07it/s]
100%|██████████| 16/16 [00:01<00:00, 13.76it/s]


	Train Loss: 0.1807, Train Accuracy: 94.33%
	Valid Loss: 0.3875, Valid Accuracy: 88.86%
Epoch: 5 / 10


100%|██████████| 98/98 [01:33<00:00,  1.05it/s]
100%|██████████| 16/16 [00:01<00:00, 13.74it/s]


	Train Loss: 0.1632, Train Accuracy: 94.87%
	Valid Loss: 0.3779, Valid Accuracy: 89.11%
Epoch: 6 / 10


100%|██████████| 98/98 [01:31<00:00,  1.07it/s]
100%|██████████| 16/16 [00:01<00:00, 13.82it/s]


	Train Loss: 0.1498, Train Accuracy: 95.21%
	Valid Loss: 0.3758, Valid Accuracy: 89.26%
Epoch: 7 / 10


100%|██████████| 98/98 [01:31<00:00,  1.07it/s]
100%|██████████| 16/16 [00:01<00:00, 13.29it/s]


	Train Loss: 0.1361, Train Accuracy: 95.69%
	Valid Loss: 0.3744, Valid Accuracy: 89.12%
Epoch: 8 / 10


100%|██████████| 98/98 [01:35<00:00,  1.03it/s]
100%|██████████| 16/16 [00:01<00:00, 13.54it/s]


	Train Loss: 0.1250, Train Accuracy: 96.04%
	Valid Loss: 0.3706, Valid Accuracy: 89.53%
Epoch: 9 / 10


100%|██████████| 98/98 [01:35<00:00,  1.03it/s]
100%|██████████| 16/16 [00:01<00:00, 13.58it/s]

	Train Loss: 0.1159, Train Accuracy: 96.32%
	Valid Loss: 0.3714, Valid Accuracy: 89.05%





In [None]:
# Final check on the held-out test split.
test_loss, test_acc = evaluate(
    model=model,
    iterator=test_iterator,
    criterion=criterion,
    tag_pad_idx=TAG_PAD_IDX,
)
print(f"\n\tTest Loss: {test_loss:.4f} \n\tTest Accuracy: {test_acc*100:.2f}%")

100%|██████████| 17/17 [00:01<00:00, 14.81it/s]


	Test Loss: 0.3817 
	Test Accuracy: 88.44%





In [None]:
# Loaded spaCy pipelines keyed by model name, so repeated calls to tag_sentence
# do not reload the model every time.
_SPACY_PIPELINES = {}


def _get_spacy_pipeline(name):
    """Return the spaCy pipeline called ``name``, loading it on first use only."""
    if name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[name] = spacy.load(name)
    return _SPACY_PIPELINES[name]


def tag_sentence(model, device, sentence, text_field, tag_field,
                 spacy_model_name='en_core_web_sm'):
    """Predict one tag per token of a single sentence.

    Args:
        model: Tagger called with a LongTensor of shape [sent len, 1]; expected
            to return scores of shape [sent len, 1, n tags].
        device: Device the token tensor is moved to before calling the model.
        sentence: Either a raw string (tokenised with spaCy) or an iterable of
            already-split tokens.
        text_field: Field providing ``lower``, ``unk_token`` and ``vocab.stoi``.
        tag_field: Field providing ``vocab.itos`` for the predicted indices.
        spacy_model_name: spaCy model used to tokenise string input.

    Returns:
        ``(tokens, predicted_tags, unks)``: the (possibly lower-cased) tokens,
        the predicted tag string for each token, and the tokens that mapped to
        the unknown-token index. All three are empty for an empty sentence.
    """
    model.eval()

    if isinstance(sentence, str):
        nlp = _get_spacy_pipeline(spacy_model_name)
        tokens = [token.text for token in nlp(sentence)]
    else:
        tokens = list(sentence)

    if text_field.lower:
        tokens = [t.lower() for t in tokens]

    if not tokens:
        # Nothing to tag; a zero-length sequence must not be fed to the model.
        return tokens, [], []

    stoi = text_field.vocab.stoi
    numericalized_tokens = [stoi[t] for t in tokens]
    unk_idx = stoi[text_field.unk_token]
    unks = [t for t, n in zip(tokens, numericalized_tokens) if n == unk_idx]

    # [sent len] -> [sent len, 1]: a batch containing just this sentence.
    token_tensor = torch.LongTensor(numericalized_tokens).unsqueeze(-1).to(device)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        predictions = model(token_tensor)

    top_predictions = predictions.argmax(-1).flatten().tolist()
    predicted_tags = [tag_field.vocab.itos[idx] for idx in top_predictions]
    return tokens, predicted_tags, unks

In [None]:
# Pick one training example and keep its tokens and gold UD tags for comparison.
example_index = 1
example = vars(train_data.examples[example_index])

sentence = example['text']
actual_tags = example['udtags']

print(sentence)
print(actual_tags)

['[', 'this', 'killing', 'of', 'a', 'respected', 'cleric', 'will', 'be', 'causing', 'us', 'trouble', 'for', 'years', 'to', 'come', '.', ']']
['PUNCT', 'DET', 'NOUN', 'ADP', 'DET', 'ADJ', 'NOUN', 'AUX', 'AUX', 'VERB', 'PRON', 'NOUN', 'ADP', 'NOUN', 'PART', 'VERB', 'PUNCT', 'PUNCT']


In [None]:
# Tag the chosen example and list the tokens that fell back to the unknown token.
tokens, pred_tags, unks = tag_sentence(model, device, sentence, TEXT, UD_TAGS)

print(unks)

['respected', 'cleric']


In [None]:
# Side-by-side comparison of predicted and gold tags, one row per token.
print("Pred. Tag\tActual Tag\tCorrect?\tToken\n-----------------------------------------------------")
for token, pred_tag, actual_tag in zip(tokens, pred_tags, actual_tags):
    if pred_tag == actual_tag:
        correct = '✔'
    else:
        correct = '✘'
    print(f"{pred_tag}\t\t{actual_tag}\t\t{correct}\t\t{token}")

Pred. Tag	Actual Tag	Correct?	Token
-----------------------------------------------------
PUNCT		PUNCT		✔		[
DET		DET		✔		this
NOUN		NOUN		✔		killing
ADP		ADP		✔		of
DET		DET		✔		a
ADJ		ADJ		✔		respected
NOUN		NOUN		✔		cleric
AUX		AUX		✔		will
AUX		AUX		✔		be
VERB		VERB		✔		causing
PRON		PRON		✔		us
NOUN		NOUN		✔		trouble
ADP		ADP		✔		for
NOUN		NOUN		✔		years
PART		PART		✔		to
VERB		VERB		✔		come
PUNCT		PUNCT		✔		.
PUNCT		PUNCT		✔		]


In [None]:
# Tag a raw string; tag_sentence tokenises it with spaCy first.
sentence = 'The Quick brown Fox jumps Over The lazy Dog.'

tokens, tags, unks = tag_sentence(model, device, sentence, TEXT, UD_TAGS)
print(unks)

print("\n\nPred. Tag\tToken\n------------------------")
for tag, token in zip(tags, tokens):
    print(f"{tag}\t\t{token}")

['fox']


Pred. Tag	Token
------------------------
DET		the
ADJ		quick
PROPN		brown
PROPN		fox
VERB		jumps
ADP		over
DET		the
ADJ		lazy
NOUN		dog
PUNCT		.


References <br>
[1] https://github.com/bentrevett/pytorch-pos-tagging/blob/master/1_bilstm.ipynb