In [53]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [54]:
import torch
from torch.utils.data import Dataset, DataLoader

# Download and read in the data
train_data_file = "/content/drive/MyDrive/Colab Notebooks/DL_ASS2/BN-Bangla/bn_train.conll"
val_data_file = "/content/drive/MyDrive/Colab Notebooks/DL_ASS2/BN-Bangla/bn_dev.conll"
test_data_file = "/content/drive/MyDrive/Colab Notebooks/DL_ASS2/BN-Bangla/bn_test.conll"
def read_data(file_path):
    dataset = []
    with open(file_path, 'r', encoding='utf-8') as f:
        lst = []
        for line in f:
            if line == '\n' or line[0] == '#':
                if lst:
                  dataset.append(lst)
                  lst = []
            else:
                parts = line.strip().split(' _ _ ', 1)
                word = parts[0]
                tag = parts[1] if len(parts) > 1 else ''
                lst.append((word, tag))
    return dataset

train_dataset = read_data(train_data_file)
print("Train")
for i in range(5):
  print(train_dataset[i])
dev_dataset = read_data(val_data_file)
print("Dev")
for i in range(5):
  print(dev_dataset[i])
test_dataset = read_data(test_data_file)
print("Test")
for i in range(5):
  print(test_dataset[i])
dataset =train_dataset + dev_dataset + test_dataset
print(len(train_dataset), len(dev_dataset), len(test_dataset), len(dataset)) 
print(len(train_dataset)+len(dev_dataset)+len(test_dataset))

# SEQ_LEN = min(100, (len(max(train_dataset, key=len))))
# print(len(max(dev_dataset, key=len)))
# print(len(max(test_dataset, key=len)))

Train
[('স্টেশনটি', 'O'), ('প্ল্যাটফর্ম', 'B-OtherPROD'), ('স্ক্রিন', 'I-OtherPROD'), ('ডোর', 'I-OtherPROD'), ('দিয়ে', 'O'), ('সজ্জিত।', 'O')]
[('উদ্ভিদটির', 'O'), ('৯০', 'O'), ('০০০', 'O'), ('এরও', 'O'), ('বেশি', 'O'), ('সৌর', 'B-OtherPROD'), ('প্যানেল', 'I-OtherPROD'), ('২৩৫', 'O'), ('একরও', 'O'), ('বেশি', 'O'), ('রয়েছে।', 'O')]
[('এই', 'O'), ('মিশনটি', 'O'), ('চন্দ্র', 'B-OtherPROD'), ('এক্স-রশ্মি', 'I-OtherPROD'), ('মানমন্দির', 'I-OtherPROD'), ('মোতায়েন', 'O'), ('করেছে।', 'O')]
[('সংস্থাটি', 'O'), ('বেশ', 'O'), ('কয়েকটি', 'O'), ('বিমান', 'O'), ('এবং', 'O'), ('রাডার', 'B-OtherPROD'), ('স্কোয়াড্রনগুলিতে', 'O'), ('কমান্ড', 'O'), ('এবং', 'O'), ('নিয়ন্ত্রণ', 'O'), ('সরবরাহ', 'O'), ('করেছে।', 'O')]
[('ট্রাম', 'B-OtherPROD'), ('স্টেশনটি', 'O'), ('কোচি', 'B-HumanSettlement'), ('কোওচি', 'B-ORG'), ('প্রশাসনিক', 'I-ORG'), ('অঞ্চল', 'I-ORG'), ('সূর্য', 'B-HumanSettlement'), ('উদয়ের', 'I-HumanSettlement'), ('দেশ', 'I-HumanSettlement'), ('।', 'O')]
Dev
[('ফ্যারাডে', 'O'), ('একটি', 'O'), (

In [55]:
# Create word_to_idx and tag_to_idx mappings
SEQ_LEN = 30
word_to_idx = {"<PAD>": 0, "<UNK>": 1}
tag_to_idx = {"<PAD>": 0}


def preprocess(dataset):
    # Extract sentences and tags
    sent = [[token.lower() for token, tag in sentence] for sentence in dataset]
    tags = [[tag for token, tag in sentence] for sentence in dataset]

    for i in range(len(sent)):
        while len(sent[i]) < SEQ_LEN:
            sent[i].append('<PAD>')
            tags[i].append('<PAD>')

        if len(sent[i]) > SEQ_LEN:
            sent[i] = sent[i][:SEQ_LEN]
            tags[i] = tags[i][:SEQ_LEN]
    
    for sentence_tags in tags:
        for tag in sentence_tags:
            if tag not in tag_to_idx:
                tag_to_idx[tag] = len(tag_to_idx)
    
    for sentence in sent:
        for word in sentence:
            if word not in word_to_idx:
                word_to_idx[word] = len(word_to_idx)

    # Convert words and tags to indices
    X = torch.tensor([[word_to_idx.get(word, 1) for word in sentence] for sentence in sent], dtype=torch.int).type(torch.LongTensor)
    Y = torch.tensor([[tag_to_idx[tag] for tag in sentence] for sentence in tags], dtype=torch.int).type(torch.LongTensor)
    
    return X, Y


In [56]:
train_X, train_Y = preprocess(train_dataset)
dev_X, dev_Y = preprocess(dev_dataset)
test_X, test_Y = preprocess(test_dataset)

In [57]:
# Print the sizes of the datasets
print(f"Number of training examples: {len(train_X)}")
print(f"Number of validation examples: {len(dev_X)}")
print(f"Number of testing examples: {len(test_X)}")

Number of training examples: 9708
Number of validation examples: 507
Number of testing examples: 19859


In [58]:
%pip install pytorch-lightning 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [59]:
import torch
import torch.nn as nn
import torch.optim as optim
import pytorch_lightning as pl

class NERModel(pl.LightningModule):
    def __init__(self, vocab_size, tagset_size, embedding_dim, hidden_dim, num_layers=1, bidirectional=False):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim) #B * seq_len, B * seq_len * embedding_dim
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, num_layers=num_layers, bidirectional=bidirectional)
        #B * seq_len * embedding_dim -> B * seq_len * hidden_dim 
        #tags
        if bidirectional:
            self.fc = nn.Linear(2*hidden_dim, tagset_size)
        else:
            self.fc = nn.Linear(hidden_dim, tagset_size)
        self.loss_fn = nn.CrossEntropyLoss()
    
    def forward(self, x):
        embeds = self.embedding(x)
        #print(embeds.shape)
        lstm_out, _ = self.lstm(embeds)
        tag_space = self.fc(lstm_out)
        tag_scores = nn.functional.log_softmax(tag_space, dim=2)
        return tag_scores
    
    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.forward(x)
        loss = self.loss_fn(y_hat.view(-1, y_hat.shape[-1]), y.view(-1))
        self.log('train_loss', loss)
        return loss
    
    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.forward(x)
        loss = self.loss_fn(y_hat.view(-1, y_hat.shape[-1]), y.view(-1))
        self.log('val_loss', loss)
        return loss

    def test_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.forward(x)
        loss = self.loss_fn(y_hat.view(-1, y_hat.shape[-1]), y.view(-1))
        self.log('test_loss', loss)
        return loss
    
    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters())
        return optimizer

In [60]:
from torch.utils.data import DataLoader, TensorDataset
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

EMBEDDING_DIM = 100
HIDDEN_DIM    = 100
NUM_EPOCHS    = 10
BATCH_SIZE    = 32

train_dataset = TensorDataset(train_X, train_Y)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

val_dataset = TensorDataset(dev_X, dev_Y)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

test_dataset = TensorDataset(test_X, test_Y)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [61]:
model = NERModel(vocab_size=len(word_to_idx), tagset_size=len(tag_to_idx), embedding_dim=EMBEDDING_DIM, hidden_dim=HIDDEN_DIM, bidirectional=True)
early_stopping = EarlyStopping(monitor="val_loss", patience=3, mode="min")


In [62]:
trainer = pl.Trainer(max_epochs=NUM_EPOCHS, callbacks=[early_stopping])

trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=val_loader)

trainer.test(dataloaders=test_loader)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name      | Type             | Params
-----------------------------------------------
0 | embedding | Embedding        | 4.3 M 
1 | lstm      | LSTM             | 161 K 
2 | fc        | Linear           | 13.7 K
3 | loss_fn   | CrossEntropyLoss | 0     
-----------------------------------------------
4.4 M     Trainable params
0         Non-trainable params
4.4 M     Total params
17.748    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=10` reached.
  rank_zero_warn(
INFO:pytorch_lightning.utilities.rank_zero:Restoring states from the checkpoint path at /content/lightning_logs/version_3/checkpoints/epoch=9-step=3040.ckpt
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loaded model weights from the checkpoint at /content/lightning_logs/version_3/checkpoints/epoch=9-step=3040.ckpt


Testing: 0it [00:00, ?it/s]

[{'test_loss': 0.2614765763282776}]

In [63]:
idx_to_tag = {idx: tag for tag, idx in tag_to_idx.items()}

In [64]:
from sklearn.metrics import classification_report

# define idx_to_tag


# define device
device = torch.device('cpu')

# Create a dataloader for the test set
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# Set the model to evaluation mode
model.eval()

y_true = []
y_pred = []

with torch.no_grad():
    for x, y in test_loader:
        # Move the data to the device
        x = x.to(device)
        y = y.to(device)

        # Forward pass
        y_hat = model(x)

        # Compute the predicted tags
        y_pred += [idx_to_tag[i] for i in y_hat.argmax(-1).cpu().numpy().flatten().tolist()]

        # Compute the true tags
        y_true += [idx_to_tag[i] for i in y.cpu().numpy().flatten().tolist()]

print(classification_report(y_true, y_pred))

                         precision    recall  f1-score   support

                  <PAD>       1.00      1.00      1.00    339381
B-AerospaceManufacturer       0.20      0.04      0.07        97
  B-AnatomicalStructure       0.62      0.51      0.56       532
              B-ArtWork       0.22      0.01      0.02       455
               B-Artist       0.42      0.37      0.39      2744
              B-Athlete       0.35      0.30      0.32      1087
      B-CarManufacturer       0.56      0.88      0.69        84
               B-Cleric       0.60      0.52      0.56       240
             B-Clothing       0.12      0.71      0.20        17
              B-Disease       0.58      0.61      0.60       554
                B-Drink       0.60      0.75      0.67       120
             B-Facility       0.49      0.40      0.44       894
                 B-Food       0.32      0.34      0.33       453
      B-HumanSettlement       0.74      0.63      0.68      6011
     B-MedicalProcedure 

In [65]:
idx_to_word = {idx: word for word, idx in word_to_idx.items()}

In [66]:
# Set the model to evaluation mode
model.eval()

y_true = []
y_pred = []

with torch.no_grad():
    for x, y in test_loader:
        # Move the data to the device
        x = x.to(device)
        y = y.to(device)

        # Forward pass
        y_hat = model(x)

        # Get back the sentence
        x_sent = [idx_to_word[i] for i in x.cpu().numpy().flatten().tolist()]

        # Compute the predicted tags
        y_pred += [idx_to_tag[i] for i in y_hat.argmax(-1).cpu().numpy().flatten().tolist()]

        # Compute the true tags
        y_true += [idx_to_tag[i] for i in y.cpu().numpy().flatten().tolist()]
        print("Sentence")
        print(x_sent)
        print("Predicted tags")
        print(y_pred)
        break

Sentence
['প্রোপেলারটি', 'একটি', 'ডি', 'হ্যাভিল্যান্ড', 'এয়ারক্রাফ্ট', 'কোম্পানি', 'স্থির', 'পিচ', 'টাইপ', 'ছিল।', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', 'এটি', '১৯৫৫', 'সালে', '১৯৫৭', 'সালে', 'নর্ড', 'এভিয়েশন', 'পরে', 'snias', 'দ্বারা', 'ডিজাইন', 'করা', 'হয়েছিল।', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', 'পাঁচটি', 'f108', 'চালিত', 'উদাহরণগুলি', 'বোয়িং', 'কোম্পানি', 'থেকে', 'সরাসরি', 'অনুরোধ', 'করা', 'হয়েছিল', 'বাকী', 'অন্যান্য', 'পক্ষ', 'থেকে।', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', 'গ্রিনহিল', '১৯৯৫', 'সালে', "l'auto-neige", 'bombardier', 'limitée', 'যোগদানের', 'আগে', 'mckinsey', '&', 'company,', 'inc.', 'এ', 'তাঁর', 'কেরিয়ার', 'শুরু'

In [67]:
save_dict = {'model_state_dict': model.state_dict()}
torch.save(save_dict, '/content/drive/MyDrive/Colab Notebooks/DL_ASS2/bengali_coarse.pt')

In [68]:
# Load the saved model
import torch
load_dict = torch.load('/content/drive/MyDrive/Colab Notebooks/DL_ASS2/bengali_coarse.pt')

# Create a new model and optimizer
model_bengali = NERModel(vocab_size=len(word_to_idx), tagset_size=len(tag_to_idx), embedding_dim=EMBEDDING_DIM, hidden_dim=HIDDEN_DIM, bidirectional=True)
# Load the state dictionary into the model and optimizer
model.load_state_dict(load_dict['model_state_dict'])

<All keys matched successfully>

In [71]:
import torch

def predict_sentence(sentence, model, word_to_idx, idx_to_tag, unknown_token="<UNK>"):
    # Tokenize the sentence
    words = sentence.split()

    # Convert words to lowercase
    words = [word.lower() for word in words]

    # Get the index of each word
    word_indices = [word_to_idx.get(word, word_to_idx[unknown_token]) for word in words]

    # Convert the word indices to a tensor
    tensor = torch.tensor(word_indices).unsqueeze(0)

    # Make a prediction with the model
    predictions = model(tensor)

    # Convert the predictions to tags
    tags = [idx_to_tag[pred] for pred in predictions.argmax(-1).cpu().numpy().flatten().tolist()]

    # Merge consecutive words with the same tag
    tagged_words = []
    for word, tag in zip(words, tags):
        if tag.startswith("B-"):
            tagged_words.append(f"{word}[{tag[2:]}]")
        elif tag.startswith("I-"):
            tagged_words[-1] += f" {word}"
        else:
            tagged_words.append(word)

    # Join the tagged words into a sentence
    tagged_sentence = " ".join(tagged_words)

    return tagged_sentence


print(predict_sentence("জিম লন্ডনে একটি নতুন বাড়ি কিনেছেন।", model, word_to_idx, idx_to_tag))

জিম[Artist] লন্ডনে একটি নতুন বাড়ি কিনেছেন।[Vehicle]
