In [38]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [39]:
import torch
from torch.utils.data import Dataset, DataLoader

# Download and read in the data
train_data_file = "/content/drive/MyDrive/Colab Notebooks/DL_ASS2/EN-English/en_train.conll"
val_data_file = "/content/drive/MyDrive/Colab Notebooks/DL_ASS2/EN-English/en_dev.conll"
test_data_file = "/content/drive/MyDrive/Colab Notebooks/DL_ASS2/EN-English/en_test.conll"
def read_data(file_path):
    dataset = []
    with open(file_path, 'r', encoding='utf-8') as f:
        lst = []
        for line in f:
            if line == '\n' or line[0] == '#':
                if lst:
                  dataset.append(lst)
                  lst = []
            else:
                parts = line.strip().split(' _ _ ', 1)
                word = parts[0]
                tag = parts[1] if len(parts) > 1 else ''
                lst.append((word, tag))
    return dataset

train_dataset = read_data(train_data_file)
print("Train")
for i in range(5):
  print(train_dataset[i])
dev_dataset = read_data(val_data_file)
print("Dev")
for i in range(5):
  print(dev_dataset[i])
test_dataset = read_data(test_data_file)
print("Test")
for i in range(5):
  print(test_dataset[i])
dataset =train_dataset + dev_dataset + test_dataset
print(len(train_dataset), len(dev_dataset), len(test_dataset), len(dataset)) 
print(len(train_dataset)+len(dev_dataset)+len(test_dataset))

# SEQ_LEN = min(100, (len(max(train_dataset, key=len))))
# print(len(max(dev_dataset, key=len)))
# print(len(max(test_dataset, key=len)))

Train
[('robert', 'B-OtherPER'), ('gottschalk', 'I-OtherPER'), ('1939', 'O'), ('academy', 'B-VisualWork'), ('award', 'I-VisualWork'), ('winner', 'O'), ('and', 'O'), ('founder', 'O'), ('of', 'O'), ('panavision', 'B-ORG')]
[('during', 'O'), ('the', 'O'), ('reign', 'O'), ('of', 'O'), ('the', 'O'), ('tongzhi', 'B-OtherPER'), ('emperor', 'I-OtherPER'), ('(', 'O'), ('r', 'O'), ('.', 'O'), ('1861', 'O'), ('–', 'O'), ('1875', 'O'), (')', 'O'), (':', 'O')]
[('further', 'O'), ('research', 'O'), ('led', 'O'), ('in', 'O'), ('the', 'O'), ('1960s', 'O'), ('to', 'O'), ('the', 'O'), ('bahadur', 'B-OtherPER'), ('representation', 'O'), ('which', 'O'), ('provides', 'O'), ('information', 'O'), ('about', 'O'), ('the', 'O'), ('errorbounds', 'O'), ('.', 'O')]
[('the', 'O'), ('ideas', 'O'), ('were', 'O'), ('introduced', 'O'), ('by', 'O'), ('william', 'B-OtherPER'), ('burnside', 'I-OtherPER'), ('at', 'O'), ('the', 'O'), ('end', 'O'), ('of', 'O'), ('the', 'O'), ('nineteenth', 'O'), ('century', 'O'), ('.', 'O')]

In [40]:
# Create word_to_idx and tag_to_idx mappings
SEQ_LEN = 30
word_to_idx = {"<PAD>": 0, "<UNK>": 1}
tag_to_idx = {"<PAD>": 0}


def preprocess(dataset):
    # Extract sentences and tags
    sent = [[token.lower() for token, tag in sentence] for sentence in dataset]
    tags = [[tag for token, tag in sentence] for sentence in dataset]

    for i in range(len(sent)):
        while len(sent[i]) < SEQ_LEN:
            sent[i].append('<PAD>')
            tags[i].append('<PAD>')

        if len(sent[i]) > SEQ_LEN:
            sent[i] = sent[i][:SEQ_LEN]
            tags[i] = tags[i][:SEQ_LEN]
    
    for sentence_tags in tags:
        for tag in sentence_tags:
            if tag not in tag_to_idx:
                tag_to_idx[tag] = len(tag_to_idx)
    
    for sentence in sent:
        for word in sentence:
            if word not in word_to_idx:
                word_to_idx[word] = len(word_to_idx)

    # Convert words and tags to indices
    X = torch.tensor([[word_to_idx.get(word, 1) for word in sentence] for sentence in sent], dtype=torch.int).type(torch.LongTensor)
    Y = torch.tensor([[tag_to_idx[tag] for tag in sentence] for sentence in tags], dtype=torch.int).type(torch.LongTensor)
    
    return X, Y


In [41]:
train_X, train_Y = preprocess(train_dataset)
dev_X, dev_Y = preprocess(dev_dataset)
test_X, test_Y = preprocess(test_dataset)

In [42]:
# Print the sizes of the datasets
print(f"Number of training examples: {len(train_X)}")
print(f"Number of validation examples: {len(dev_X)}")
print(f"Number of testing examples: {len(test_X)}")

Number of training examples: 16778
Number of validation examples: 871
Number of testing examples: 249980


In [43]:
%pip install pytorch-lightning 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [44]:
import torch
import torch.nn as nn
import torch.optim as optim
import pytorch_lightning as pl

class NERModel(pl.LightningModule):
    def __init__(self, vocab_size, tagset_size, embedding_dim, hidden_dim, num_layers=1, bidirectional=False):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim) #B * seq_len, B * seq_len * embedding_dim
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, num_layers=num_layers, bidirectional=bidirectional)
        #B * seq_len * embedding_dim -> B * seq_len * hidden_dim 
        #tags
        if bidirectional:
            self.fc = nn.Linear(2*hidden_dim, tagset_size)
        else:
            self.fc = nn.Linear(hidden_dim, tagset_size)
        self.loss_fn = nn.CrossEntropyLoss()
    
    def forward(self, x):
        embeds = self.embedding(x)
        #print(embeds.shape)
        lstm_out, _ = self.lstm(embeds)
        tag_space = self.fc(lstm_out)
        tag_scores = nn.functional.log_softmax(tag_space, dim=2)
        return tag_scores
    
    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.forward(x)
        loss = self.loss_fn(y_hat.view(-1, y_hat.shape[-1]), y.view(-1))
        self.log('train_loss', loss)
        return loss
    
    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.forward(x)
        loss = self.loss_fn(y_hat.view(-1, y_hat.shape[-1]), y.view(-1))
        self.log('val_loss', loss)
        return loss

    def test_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.forward(x)
        loss = self.loss_fn(y_hat.view(-1, y_hat.shape[-1]), y.view(-1))
        self.log('test_loss', loss)
        return loss
    
    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters())
        return optimizer

In [45]:
from torch.utils.data import DataLoader, TensorDataset
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

EMBEDDING_DIM = 100
HIDDEN_DIM    = 100
NUM_EPOCHS    = 10
BATCH_SIZE    = 32

train_dataset = TensorDataset(train_X, train_Y)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

val_dataset = TensorDataset(dev_X, dev_Y)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

test_dataset = TensorDataset(test_X, test_Y)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [46]:
model = NERModel(vocab_size=len(word_to_idx), tagset_size=len(tag_to_idx), embedding_dim=EMBEDDING_DIM, hidden_dim=HIDDEN_DIM, bidirectional=True)
early_stopping = EarlyStopping(monitor="val_loss", patience=3, mode="min")


In [47]:
trainer = pl.Trainer(max_epochs=NUM_EPOCHS, callbacks=[early_stopping])

trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=val_loader)

trainer.test(dataloaders=test_loader)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name      | Type             | Params
-----------------------------------------------
0 | embedding | Embedding        | 24.2 M
1 | lstm      | LSTM             | 161 K 
2 | fc        | Linear           | 13.7 K
3 | loss_fn   | CrossEntropyLoss | 0     
-----------------------------------------------
24.4 M    Trainable params
0         Non-trainable params
24.4 M    Total params
97.562    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:Restoring states from the checkpoint path at /content/lightning_logs/version_3/checkpoints/epoch=8-step=4725.ckpt
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loaded model weights from the checkpoint at /content/lightning_logs/version_3/checkpoints/epoch=8-step=4725.ckpt


Testing: 0it [00:00, ?it/s]

[{'test_loss': 0.4126929044723511}]

In [48]:
idx_to_tag = {idx: tag for tag, idx in tag_to_idx.items()}

In [49]:
from sklearn.metrics import classification_report

# define idx_to_tag


# define device
device = torch.device('cpu')

# Create a dataloader for the test set
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# Set the model to evaluation mode
model.eval()

y_true = []
y_pred = []

with torch.no_grad():
    for x, y in test_loader:
        # Move the data to the device
        x = x.to(device)
        y = y.to(device)

        # Forward pass
        y_hat = model(x)

        # Compute the predicted tags
        y_pred += [idx_to_tag[i] for i in y_hat.argmax(-1).cpu().numpy().flatten().tolist()]

        # Compute the true tags
        y_true += [idx_to_tag[i] for i in y.cpu().numpy().flatten().tolist()]

print(classification_report(y_true, y_pred))

                         precision    recall  f1-score   support

                  <PAD>       1.00      1.00      1.00   3727316
B-AerospaceManufacturer       0.29      0.25      0.27      1015
  B-AnatomicalStructure       0.31      0.19      0.24      5838
              B-ArtWork       0.18      0.07      0.10      1270
               B-Artist       0.51      0.55      0.53     57034
              B-Athlete       0.46      0.43      0.44     27624
      B-CarManufacturer       0.32      0.20      0.24      2984
               B-Cleric       0.30      0.19      0.23      4732
             B-Clothing       0.20      0.05      0.08      2243
              B-Disease       0.40      0.18      0.25      5622
                B-Drink       0.27      0.15      0.19      2246
             B-Facility       0.33      0.32      0.32     16181
                 B-Food       0.14      0.03      0.05      5317
      B-HumanSettlement       0.61      0.54      0.57     41099
     B-MedicalProcedure 

In [50]:
idx_to_word = {idx: word for word, idx in word_to_idx.items()}

In [51]:
# Set the model to evaluation mode
model.eval()

y_true = []
y_pred = []

with torch.no_grad():
    for x, y in test_loader:
        # Move the data to the device
        x = x.to(device)
        y = y.to(device)

        # Forward pass
        y_hat = model(x)

        # Get back the sentence
        x_sent = [idx_to_word[i] for i in x.cpu().numpy().flatten().tolist()]

        # Compute the predicted tags
        y_pred += [idx_to_tag[i] for i in y_hat.argmax(-1).cpu().numpy().flatten().tolist()]

        # Compute the true tags
        y_true += [idx_to_tag[i] for i in y.cpu().numpy().flatten().tolist()]
        print("Sentence")
        print(x_sent)
        print("Predicted tags")
        print(y_pred)
        break

Sentence
['the', 'species', 'was', 'described', 'by', 'dietrich', 'brandis', 'after', 'the', 'forester', 't.', 'f.', 'bourdillon', '.', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', 'by', 'this', 'time', 'she', 'was', 'competing', 'against', 'a', 'new', 'generation', 'of', 'young', 'drivers', 'including', 'stirling', 'moss', 'and', 'peter', 'collins', '.', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', 'their', 'son', 'was', 'the', 'opera', 'producer', 'knut', 'hendriksen', '(', '1944', '–', '2020', ')', '.', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', 'he', 'is', 'the', 'younger', 'brother', 'of', 'adam', 'mosseri', '.', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<P

In [52]:
save_dict = {'model_state_dict': model.state_dict()}
torch.save(save_dict, '/content/drive/MyDrive/Colab Notebooks/DL_ASS2/english_coarse.pt')

In [53]:
# Load the saved model
import torch
load_dict = torch.load('/content/drive/MyDrive/Colab Notebooks/DL_ASS2/english_coarse.pt')

# Create a new model and optimizer
model_english = NERModel(vocab_size=len(word_to_idx), tagset_size=len(tag_to_idx), embedding_dim=EMBEDDING_DIM, hidden_dim=HIDDEN_DIM, bidirectional=True)
# Load the state dictionary into the model and optimizer
model.load_state_dict(load_dict['model_state_dict'])

<All keys matched successfully>

In [54]:
import torch

def predict_sentence(sentence, model, word_to_idx, idx_to_tag, unknown_token="<UNK>"):
    # Tokenize the sentence
    words = sentence.split()

    # Convert words to lowercase
    words = [word.lower() for word in words]

    # Get the index of each word
    word_indices = [word_to_idx.get(word, word_to_idx[unknown_token]) for word in words]

    # Convert the word indices to a tensor
    tensor = torch.tensor(word_indices).unsqueeze(0)

    # Make a prediction with the model
    predictions = model(tensor)

    # Convert the predictions to tags
    tags = [idx_to_tag[pred] for pred in predictions.argmax(-1).cpu().numpy().flatten().tolist()]

    # Merge consecutive words with the same tag
    tagged_words = []
    for word, tag in zip(words, tags):
        if tag.startswith("B-"):
            tagged_words.append(f"{word}[{tag[2:]}]")
        elif tag.startswith("I-"):
            tagged_words[-1] += f" {word}"
        else:
            tagged_words.append(word)

    # Join the tagged words into a sentence
    tagged_sentence = " ".join(tagged_words)

    return tagged_sentence


print(predict_sentence("Jim bought 300 shares of Acme Corp. in 2022.", model, word_to_idx, idx_to_tag))

jim[OtherPER] bought 300 shares of acme corp. in 2022.[HumanSettlement]
