In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import torch
from torch.utils.data import Dataset, DataLoader

# Download and read in the data
train_data_file = "/content/drive/MyDrive/Colab Notebooks/DL_ASS2/HI-Hindi/hi_train.conll"
val_data_file = "/content/drive/MyDrive/Colab Notebooks/DL_ASS2/HI-Hindi/hi_dev.conll"
test_data_file = "/content/drive/MyDrive/Colab Notebooks/DL_ASS2/HI-Hindi/hi_test.conll"
def read_data(file_path):
    dataset = []
    with open(file_path, 'r', encoding='utf-8') as f:
        lst = []
        for line in f:
            if line == '\n' or line[0] == '#':
                if lst:
                  dataset.append(lst)
                  lst = []
            else:
                parts = line.strip().split(' _ _ ', 1)
                word = parts[0]
                tag = parts[1] if len(parts) > 1 else ''
                lst.append((word, tag))
    return dataset

train_dataset = read_data(train_data_file)
print("Train")
for i in range(5):
  print(train_dataset[i])
dev_dataset = read_data(val_data_file)
print("Dev")
for i in range(5):
  print(dev_dataset[i])
test_dataset = read_data(test_data_file)
print("Test")
for i in range(5):
  print(test_dataset[i])
dataset =train_dataset + dev_dataset + test_dataset
print(len(train_dataset), len(dev_dataset), len(test_dataset), len(dataset)) 
print(len(train_dataset)+len(dev_dataset)+len(test_dataset))

# SEQ_LEN = min(100, (len(max(train_dataset, key=len))))
# print(len(max(dev_dataset, key=len)))
# print(len(max(test_dataset, key=len)))

Train
[('यह', 'O'), ('झियान', 'B-HumanSettlement'), ('चीन', 'B-HumanSettlement'), ('के', 'O'), ('केंद्र', 'O'), ('भाग', 'O'), ('में', 'O'), ('स्थित', 'O'), ('है।', 'O')]
[('२००३', 'O'), ('में', 'O'), ('विंबलडन,', 'B-HumanSettlement'), ('लंदन', 'I-HumanSettlement'), ('में', 'O'), ('एक', 'O'), ('साइकिल', 'O'), ('चालक', 'O'), ('के', 'O'), ('साथ', 'O'), ('टकराव', 'O'), ('के', 'O'), ('दौरान', 'O'), ('रोड्स', 'O'), ('को', 'O'), ('मार', 'O'), ('दिया', 'O'), ('गया', 'O'), ('था।', 'O')]
[('उन्होंने', 'O'), ('अल्जियर्स', 'B-HumanSettlement'), ('में', 'O'), ('राजनीति', 'O'), ('विज्ञान', 'O'), ('का', 'O'), ('अध्ययन', 'O'), ('किया।', 'O')]
[('चार्ल्स', 'O'), ('कोर्डोबा', 'B-HumanSettlement'), ('अमीरात', 'I-HumanSettlement'), ('के', 'O'), ('साथ', 'O'), ('कूटनीति', 'O'), ('में', 'O'), ('लगे', 'O'), ('हुए', 'O'), ('८६५', 'O'), ('में', 'O'), ('मुहम्मद', 'B-Politician'), ('प्रथम', 'I-Politician'), ('कोर्डोबा', 'I-Politician'), ('से', 'O'), ('ऊंट', 'O'), ('प्राप्त', 'O'), ('करते', 'O'), ('हैं।', 'O')]
[(

In [3]:
# Create word_to_idx and tag_to_idx mappings
SEQ_LEN = 30
word_to_idx = {"<PAD>": 0, "<UNK>": 1}
tag_to_idx = {"<PAD>": 0}


def preprocess(dataset):
    # Extract sentences and tags
    sent = [[token.lower() for token, tag in sentence] for sentence in dataset]
    tags = [[tag for token, tag in sentence] for sentence in dataset]

    for i in range(len(sent)):
        while len(sent[i]) < SEQ_LEN:
            sent[i].append('<PAD>')
            tags[i].append('<PAD>')

        if len(sent[i]) > SEQ_LEN:
            sent[i] = sent[i][:SEQ_LEN]
            tags[i] = tags[i][:SEQ_LEN]
    
    for sentence_tags in tags:
        for tag in sentence_tags:
            if tag not in tag_to_idx:
                tag_to_idx[tag] = len(tag_to_idx)
    
    for sentence in sent:
        for word in sentence:
            if word not in word_to_idx:
                word_to_idx[word] = len(word_to_idx)

    # Convert words and tags to indices
    X = torch.tensor([[word_to_idx.get(word, 1) for word in sentence] for sentence in sent], dtype=torch.int).type(torch.LongTensor)
    Y = torch.tensor([[tag_to_idx[tag] for tag in sentence] for sentence in tags], dtype=torch.int).type(torch.LongTensor)
    
    return X, Y


In [4]:
train_X, train_Y = preprocess(train_dataset)
dev_X, dev_Y = preprocess(dev_dataset)
test_X, test_Y = preprocess(test_dataset)

In [5]:
# Print the sizes of the datasets
print(f"Number of training examples: {len(train_X)}")
print(f"Number of validation examples: {len(dev_X)}")
print(f"Number of testing examples: {len(test_X)}")

Number of training examples: 9632
Number of validation examples: 514
Number of testing examples: 18399


In [6]:
%pip install pytorch-lightning 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytorch-lightning
  Downloading pytorch_lightning-2.0.1-py3-none-any.whl (716 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m716.4/716.4 KB[0m [31m48.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchmetrics>=0.7.0
  Downloading torchmetrics-0.11.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.2/519.2 KB[0m [31m53.1 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities>=0.7.0
  Downloading lightning_utilities-0.8.0-py3-none-any.whl (20 kB)
Collecting aiohttp!=4.0.0a0,!=4.0.0a1
  Downloading aiohttp-3.8.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m70.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multidict<7.0,>=4.5
  Downloading multidict-6.0.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x8

In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
import pytorch_lightning as pl

class NERModel(pl.LightningModule):
    def __init__(self, vocab_size, tagset_size, embedding_dim, hidden_dim, num_layers=1, bidirectional=False):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim) #B * seq_len, B * seq_len * embedding_dim
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, num_layers=num_layers, bidirectional=bidirectional)
        #B * seq_len * embedding_dim -> B * seq_len * hidden_dim 
        #tags
        if bidirectional:
            self.fc = nn.Linear(2*hidden_dim, tagset_size)
        else:
            self.fc = nn.Linear(hidden_dim, tagset_size)
        self.loss_fn = nn.CrossEntropyLoss()
    
    def forward(self, x):
        embeds = self.embedding(x)
        #print(embeds.shape)
        lstm_out, _ = self.lstm(embeds)
        tag_space = self.fc(lstm_out)
        tag_scores = nn.functional.log_softmax(tag_space, dim=2)
        return tag_scores
    
    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.forward(x)
        loss = self.loss_fn(y_hat.view(-1, y_hat.shape[-1]), y.view(-1))
        self.log('train_loss', loss)
        return loss
    
    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.forward(x)
        loss = self.loss_fn(y_hat.view(-1, y_hat.shape[-1]), y.view(-1))
        self.log('val_loss', loss)
        return loss

    def test_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.forward(x)
        loss = self.loss_fn(y_hat.view(-1, y_hat.shape[-1]), y.view(-1))
        self.log('test_loss', loss)
        return loss
    
    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters())
        return optimizer

In [8]:
from torch.utils.data import DataLoader, TensorDataset
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

EMBEDDING_DIM = 100
HIDDEN_DIM    = 100
NUM_EPOCHS    = 10
BATCH_SIZE    = 32

train_dataset = TensorDataset(train_X, train_Y)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

val_dataset = TensorDataset(dev_X, dev_Y)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

test_dataset = TensorDataset(test_X, test_Y)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [9]:
model = NERModel(vocab_size=len(word_to_idx), tagset_size=len(tag_to_idx), embedding_dim=EMBEDDING_DIM, hidden_dim=HIDDEN_DIM, bidirectional=True)
early_stopping = EarlyStopping(monitor="val_loss", patience=3, mode="min")


In [10]:
trainer = pl.Trainer(max_epochs=NUM_EPOCHS, callbacks=[early_stopping])

trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=val_loader)

trainer.test(dataloaders=test_loader)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name      | Type             | Params
-----------------------------------------------
0 | embedding | Embedding        | 3.1 M 
1 | lstm      | LSTM             | 161 K 
2 | fc        | Linear           | 13.7 K
3 | loss_fn   | CrossEntropyLoss | 0     
-----------------------------------------------
3.2 M     Trainable params
0         Non-trainable params
3.2 M     Total params
12.937    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=10` reached.
  rank_zero_warn(
INFO:pytorch_lightning.utilities.rank_zero:Restoring states from the checkpoint path at /content/lightning_logs/version_0/checkpoints/epoch=9-step=3010.ckpt
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loaded model weights from the checkpoint at /content/lightning_logs/version_0/checkpoints/epoch=9-step=3010.ckpt


Testing: 0it [00:00, ?it/s]

[{'test_loss': 0.27055782079696655}]

In [11]:
idx_to_tag = {idx: tag for tag, idx in tag_to_idx.items()}

In [12]:
from sklearn.metrics import classification_report

# define idx_to_tag


# define device
device = torch.device('cpu')

# Create a dataloader for the test set
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# Set the model to evaluation mode
model.eval()

y_true = []
y_pred = []

with torch.no_grad():
    for x, y in test_loader:
        # Move the data to the device
        x = x.to(device)
        y = y.to(device)

        # Forward pass
        y_hat = model(x)

        # Compute the predicted tags
        y_pred += [idx_to_tag[i] for i in y_hat.argmax(-1).cpu().numpy().flatten().tolist()]

        # Compute the true tags
        y_true += [idx_to_tag[i] for i in y.cpu().numpy().flatten().tolist()]

print(classification_report(y_true, y_pred))

                         precision    recall  f1-score   support

                  <PAD>       1.00      1.00      1.00    258203
B-AerospaceManufacturer       0.31      0.05      0.08        85
  B-AnatomicalStructure       0.64      0.55      0.59       489
              B-ArtWork       0.20      0.00      0.01       426
               B-Artist       0.39      0.35      0.37      1852
              B-Athlete       0.51      0.55      0.53      1171
      B-CarManufacturer       0.72      0.77      0.74       146
               B-Cleric       0.49      0.69      0.57       188
             B-Clothing       0.52      0.78      0.62        77
              B-Disease       0.67      0.56      0.61       633
                B-Drink       0.51      0.79      0.62       135
             B-Facility       0.45      0.39      0.42       859
                 B-Food       0.55      0.46      0.50       428
      B-HumanSettlement       0.71      0.62      0.67      5825
     B-MedicalProcedure 

In [13]:
idx_to_word = {idx: word for word, idx in word_to_idx.items()}

In [14]:
# Set the model to evaluation mode
model.eval()

y_true = []
y_pred = []

with torch.no_grad():
    for x, y in test_loader:
        # Move the data to the device
        x = x.to(device)
        y = y.to(device)

        # Forward pass
        y_hat = model(x)

        # Get back the sentence
        x_sent = [idx_to_word[i] for i in x.cpu().numpy().flatten().tolist()]

        # Compute the predicted tags
        y_pred += [idx_to_tag[i] for i in y_hat.argmax(-1).cpu().numpy().flatten().tolist()]

        # Compute the true tags
        y_true += [idx_to_tag[i] for i in y.cpu().numpy().flatten().tolist()]
        print("Sentence")
        print(x_sent)
        print("Predicted tags")
        print(y_pred)
        break

Sentence
['उनकी', 'विशेषताओं', 'आंदोलनों', 'और', 'खेल', 'शैली', 'के', 'कारण', 'उनकी', 'तुलना', 'वाल्टर', 'ज़ेंगा', 'से', 'की', 'गई', 'है।', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', 'एथन', 'काट्ज़', '(', 'जन्म', '१९८३', ')', 'शिकागो', 'व्हाइट', 'सोक्स', 'के', 'लिए', 'पिचिंग', 'कोच', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', 'वह', 'प्रसिद्ध', 'रैंडविक', 'रेसकोर्स', 'ट्रेनर', 'इसहाक', 'अर्नशॉ', 'के', 'लिए', 'प्रेरित', 'था', 'और', 'उस', 'समय', 'के', 'कुछ', 'बेहतरीन', 'घोड़ों', 'के', 'लिए', 'सवार', 'था।', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', 'चेल्सी', 'ने', 'उथल', '-पुथल', 'में', 'मैच', 'में', 'प्रवेश', 'किया', 'उनके', 'प्रबंधक', 'टॉमी', 'डोचेर्टी', 'एक', 'दिन', 'पहले', 'इस्तीफा', 'दे', 'दिया।', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>

In [15]:
save_dict = {'model_state_dict': model.state_dict()}
torch.save(save_dict, '/content/drive/MyDrive/Colab Notebooks/DL_ASS2/hindi_coarse.pt')

In [16]:
# Load the saved model
import torch
load_dict = torch.load('/content/drive/MyDrive/Colab Notebooks/DL_ASS2/hindi_coarse.pt')

# Create a new model and optimizer
model_hindi = NERModel(vocab_size=len(word_to_idx), tagset_size=len(tag_to_idx), embedding_dim=EMBEDDING_DIM, hidden_dim=HIDDEN_DIM, bidirectional=True)
# Load the state dictionary into the model and optimizer
model.load_state_dict(load_dict['model_state_dict'])

<All keys matched successfully>

In [22]:
import torch

def predict_sentence(sentence, model, word_to_idx, idx_to_tag, unknown_token="<UNK>"):
    # Tokenize the sentence
    words = sentence.split()

    # Convert words to lowercase
    words = [word.lower() for word in words]

    # Get the index of each word
    word_indices = [word_to_idx.get(word, word_to_idx[unknown_token]) for word in words]

    # Convert the word indices to a tensor
    tensor = torch.tensor(word_indices).unsqueeze(0)

    # Make a prediction with the model
    predictions = model(tensor)

    # Convert the predictions to tags
    tags = [idx_to_tag[pred] for pred in predictions.argmax(-1).cpu().numpy().flatten().tolist()]

    # Merge consecutive words with the same tag
    tagged_words = []
    for word, tag in zip(words, tags):
        if tag.startswith("B-"):
            tagged_words.append(f"{word}[{tag[2:]}]")
        elif tag.startswith("I-"):
            tagged_words[-1] += f" {word}"
        else:
            tagged_words.append(word)

    # Join the tagged words into a sentence
    tagged_sentence = " ".join(tagged_words)

    return tagged_sentence


print(predict_sentence("जिम को आलू खाना पसंद है।", model, word_to_idx, idx_to_tag))

जिम को आलू[Food] खाना पसंद है।
