In [None]:
pip install --upgrade accelerate

In [None]:
pip install datasets

In [None]:
pip install torch

In [None]:
import torch

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

#Additional Info when using cuda
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')


In [None]:
pip install tokenizers

In [None]:
pip install transformers

In [None]:
pip install seqeval

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!ls '/content/drive/MyDrive/Research'

Bangla_Natural_Language_Processing_A_Comprehensive_Analysis_of_Classical_Machine_Learning_and_Deep_Learning-Based_Methods.pdf
BLNERC_BNER_dataset.json
chowdhury2018.pdf
Hello4
Papers


In [None]:
import os
os.chdir("/content/drive/MyDrive/Research")

In [None]:
from datasets import load_metric
import numpy as np
metric = load_metric("seqeval")
import json

In [None]:
import pickle

with open('BLNERC_BNER.json', 'rb') as f:
    label_list = pickle.load(f)

In [None]:
label_list

{0: 'O',
 1: 'B-PER',
 2: 'B-LOC',
 3: 'B-ORG',
 4: 'B-TIM',
 5: 'I-PER',
 6: 'I-LOC',
 7: 'I-ORG',
 8: 'I-TIM'}

In [None]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [None]:
def tokenize_and_align_labels(examples, label_all_tokens = True):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True, max_length=128)
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    print(len(examples))
    return tokenized_inputs

## Preprocessing the dataset

In [None]:
import pandas as pd

with open("BLNERC_BNER_dataset.json") as f:
  data = json.load(f)

df = pd.json_normalize(data['train'])


In [None]:
result = df.dtypes

print("Output:")
print(result)

Output:
id           int64
ner_tags    object
tokens      object
dtype: object


In [None]:
convert_dict = {'id': int, 'ner_tags': str, 'tokens': str}

df = df.astype(convert_dict)
print(df.dtypes)

id           int64
ner_tags    object
tokens      object
dtype: object


## Load Dataset

In [None]:
from datasets import Sequence

In [None]:
from datasets import Dataset, load_dataset

In [None]:
train_dataset = load_dataset("json", data_files={'train': "BLNERC_BNER_dataset.json"}, field ='train')

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
train_dataset["train"].features

{'ner_tags': Value(dtype='string', id=None),
 'id': Value(dtype='int64', id=None),
 'tokens': Value(dtype='string', id=None)}

In [None]:
datasets = load_dataset("json", data_files={'train': "BLNERC_BNER_dataset.json"})

In [None]:
train_dataset["train"].features[f"ner_tags"]

Value(dtype='string', id=None)

In [None]:
type(datasets["train"].features['train'])

list

In [None]:
datasets["train"].features

{'train': [{'id': Value(dtype='int64', id=None),
   'ner_tags': Value(dtype='string', id=None),
   'tokens': Value(dtype='string', id=None)}],
 'test': [{'id': Value(dtype='int64', id=None),
   'ner_tags': Value(dtype='string', id=None),
   'tokens': Value(dtype='string', id=None)}],
 'validation': [{'id': Value(dtype='int64', id=None),
   'ner_tags': Value(dtype='string', id=None),
   'tokens': Value(dtype='string', id=None)}]}

In [None]:
train_dataset

DatasetDict({
    train: Dataset({
        features: ['ner_tags', 'tokens', 'id'],
        num_rows: 41418
    })
})

In [None]:
from datasets import ClassLabel, Sequence, Value

string_value = Value(dtype='string', id=None)


converted_sequence = Sequence(
    feature=ClassLabel(
        num_classes=9,
        names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-TIM', 'I-TIM'],
        names_file=None,
        id=None
    ),
    length=-1,
    id=None
)

In [None]:
train_dataset["train"].features[f"ner_tags"] = converted_sequence

In [None]:
train_dataset["train"].features

{'ner_tags': Value(dtype='string', id=None),
 'id': Value(dtype='int64', id=None),
 'tokens': Value(dtype='string', id=None)}

In [None]:
map(lambda x: None if x in none_items else x, train_dataset)

<map at 0x7f500ced5600>

In [None]:
test_dataset = load_dataset("json", data_files={'test': "BLNERC_BNER_dataset.json"}, field ='test')

Generating test split: 0 examples [00:00, ? examples/s]

In [None]:
validation_dataset = load_dataset("json", data_files={'validation': "BLNERC_BNER_dataset.json"}, field ='validation')

Generating validation split: 0 examples [00:00, ? examples/s]

In [None]:
from datasets import ClassLabel, Sequence
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
        elif isinstance(typ, Sequence) and isinstance(typ.feature, ClassLabel):
            df[column] = df[column].transform(lambda x: [typ.feature.names[i] for i in x])
    display(HTML(df.to_html()))

In [None]:
show_random_elements(train_dataset["train"])

Unnamed: 0,ner_tags,id,tokens
0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",15039,"['একটি', 'সময়ের', 'পরে', 'নিজের', 'উপর', 'ধকল', 'নেবেন', 'না', 'কারণ', 'কিছু', 'নির্দিষ্ট', 'সমস্যা', 'থেকে', 'দূরে', 'থাকাই', 'শ্রেয়']"
1,"[0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1, 5, 5, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0]",22227,"['পাঠকের', 'স্মরণ', 'থাকে', 'যেন', 'যে', ',', 'ভারতবর্ষের', 'ডাকাইত', 'শাসন', 'করিতে', 'মার্কুইস', 'অব', 'হেষ্টিংসকে', 'যত', 'বড়', 'যুদ্ধোদ্যম', 'করিতে', 'হইয়াছিল', ',', 'পঞ্জাবের', 'লড়াইয়ের', 'পূর্বে', 'আর', 'কখন', 'তত', 'করিতে', 'হয়', 'নাই']"
2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",16293,"['খানিক', 'পরে', 'ছোটবৌ', 'অতিশয়', 'সঙ্কুচিতভাবে', 'মৃদুস্বরে', 'বলিল', ',', 'এ', '-', 'সব', 'কথা', 'হয়ত', 'সত্যি', 'নয়', ',', 'বাবা', '!', 'কোন্\u200c', 'সব', 'কথা', 'মা', '?', 'তোমার', 'দিদির', 'কথা', '?', 'ছোটবৌ', 'নতমুখে', 'মৌন', 'হইয়া', 'রহিল']"
3,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]",1564,"['মোড়া', 'টেনে', 'ছেলের', 'খানিকটা', 'দূরে', 'বসলেন', 'মনোরমা', ',', 'তোর', 'সঙ্গে', 'কয়েকটা', 'কথা', 'ছিল']"
4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",33183,"['করিয়াছিল', 'কিনা', 'জানি', 'না', ',', 'কিন্তু', 'প্রায়', 'সকল', 'গ্রামের', 'মত', 'এখানেও', 'একটা', 'জনশ্রুতি', 'আছে']"
5,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]",39189,"['ননীবালা', 'তখন', 'সাগ্রহে', 'বলিলেন\u200c', ',', 'কিন্তু', 'ওদিকের', 'কি', 'হল\u200c', ',', 'ব্যোমকেশবাবু', '?', 'নিমাই', 'নিতাইকে', 'দেখতে', 'গিছলেন', 'নাকি', '?', '’', 'ব্যোমকেশ', 'বলিল\u200c', ',', '‘', 'গিয়েছিলাম']"
6,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",21300,"['শরতের', 'আকাশ', 'তখন', 'আর', 'নীল', 'সাদার', 'চাঁদোয়া', 'নয়', 'রীতিমতো', 'রঙিন', 'যুদ্ধক্ষেত্র', 'লাল', 'নীল', 'সবুজ', 'কালো', 'ঘুড়ির', 'মেলায়', 'বোঝা', 'মুশকিল', 'আকাশের', 'আসল', 'রং']"
7,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",15533,"['অনুরূপ', 'অবস্থায়', 'পড়িয়া', 'সেকালের', 'ঠাকুর', '-', 'দেবতারাও', 'কিরূপ', 'বিহ্বল', 'বে', '-', 'এক্তিয়ার', 'হইয়া', 'পড়িতেন', ',', 'তাহা', 'তো', 'ভক্ত', 'কবিগণ', 'লিপিবদ্ধ', 'করিয়াই', 'গিয়াছেন']"
8,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0]",5604,"['কিন্তু', 'মার্থার', 'জীবনে', 'তার', 'মায়ের', 'ইতিহাসের', 'পুনরাবৃত্তি', 'অমন', 'অবিকলভাবে', 'ঘটে', 'যাবে', 'সে', 'কথা', 'মার্থা', 'কেন', 'শওকতও', 'কোনদিন', 'কল্পনা', 'করতে', 'পারে', 'নি']"
9,"[0, 0, 0, 0, 0, 1, 5, 0, 0, 1, 0, 0, 0, 0, 0, 0]",26227,"['কিন্তু', 'সে', 'কি', 'হয়', '?', 'আচার্য', 'ধূর্জটি', 'কবিরাজ', 'যে', 'শিবরামের', 'পাংশু', 'মুখের', 'দিকে', 'তাকিয়েই', 'বুঝতে', 'পারবেন']"


## Make tokenizer

In [None]:
def build_fast_bert_tokenizer(files, max_vocab_size):
    from tokenizers import decoders, models, normalizers, pre_tokenizers, processors, trainers, Tokenizer

    from transformers import BertTokenizerFast, AutoTokenizer, DistilBertTokenizerFast
    assert isinstance(files, list)
    tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))
    tokenizer.normalizer = normalizers.Sequence(
        [normalizers.NFD(), normalizers.Lowercase()]
    )
    tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
    special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
    trainer = trainers.WordPieceTrainer(vocab_size=max_vocab_size, special_tokens=special_tokens)
    tokenizer.train(files=files, trainer=trainer)
    cls_token_id = tokenizer.token_to_id("[CLS]")
    sep_token_id = tokenizer.token_to_id("[SEP]")
    tokenizer.post_processor = processors.TemplateProcessing(
        single=f"[CLS]:0 $A:0 [SEP]:0",
        pair=f"[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", cls_token_id),
            ("[SEP]", sep_token_id),
        ],
    )
    tokenizer.decoder = decoders.WordPiece(prefix="##")
    return BertTokenizerFast(tokenizer_object=tokenizer)

In [None]:
tokenizer = build_fast_bert_tokenizer(["ner_all_tokens_vocab.txt"], 8000)

In [None]:
tokenizer("মনির ঢাকায় থাকে").tokens()

['[CLS]', 'মনি', '##র', 'ঢাকা', '##য়', 'থাকে', '[SEP]']

In [None]:
tokenizer.decode(tokenizer("b_ner on the way to train!!")["input_ids"])

'[CLS] [UNK] _ [UNK] [UNK] [UNK] [UNK] [UNK] [UNK]!! [SEP]'

In [None]:
tokenizer("মনির ঢাকায় থাকে").tokens()

['[CLS]', 'মনি', '##র', 'ঢাকা', '##য়', 'থাকে', '[SEP]']

In [None]:
len(tokenizer.vocab)

8000

In [None]:
from transformers import AutoTokenizer


model_checkpoint = "bert-base-uncased"
batch_size = 16
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
tokenizer("মনির ঢাকায় থাকে").tokens()

['[CLS]', 'ম', '##ন', '##ি', '##র', '[UNK]', 'থ', '##া', '##ক', '##ে', '[SEP]']

## Data Collator

In [None]:
batch_size = 16

In [None]:
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
train_ds = train_dataset.map(tokenize_and_align_labels, batched=True)
test_ds = test_dataset.map(tokenize_and_align_labels, batched=True)
validation_ds = validation_dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/41418 [00:00<?, ? examples/s]

IndexError: list index out of range

In [None]:
validation_dataset

DatasetDict({
    validation: Dataset({
        features: ['ner_tags', 'tokens'],
        num_rows: 2967
    })
})

## Train

In [None]:
from transformers import TrainingArguments

In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, BertForTokenClassification

model = BertForTokenClassification.from_pretrained("sagorsarker/bangla-bert-base", num_labels=len(label_list))

Downloading (…)lve/main/config.json:   0%|          | 0.00/491 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/660M [00:00<?, ?B/s]

Some weights of the model checkpoint at sagorsarker/bangla-bert-base were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at sagorsarker/bangla-bert-base and a

In [None]:
args = TrainingArguments(
    output_dir="Hello2",
    evaluation_strategy = "epoch",
    learning_rate=2.5e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_steps=1,
    report_to=[],
    save_steps=2000,
)

In [None]:
trainer = Trainer(
    model,
    args,
    train_dataset=train_ds["train"],
    eval_dataset=validation_ds["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,

)

In [None]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1607,0.14397,0.694177,0.684456,0.689282,0.957393
2,0.0189,0.114316,0.792293,0.69737,0.741807,0.966072
3,0.1075,0.115226,0.747985,0.750895,0.749437,0.965493
4,0.0455,0.133038,0.791473,0.742337,0.766118,0.968203
5,0.0072,0.137761,0.769087,0.766454,0.767768,0.967679
6,0.0192,0.151556,0.77204,0.763031,0.767509,0.96793
7,0.1674,0.169506,0.773834,0.76661,0.770205,0.967952
8,0.0021,0.181835,0.804809,0.755096,0.77916,0.969635
9,0.0083,0.188545,0.791566,0.768166,0.77969,0.969406
10,0.0005,0.191824,0.792009,0.764898,0.778218,0.969504


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=14820, training_loss=0.055715892982624755, metrics={'train_runtime': 3815.2844, 'train_samples_per_second': 62.126, 'train_steps_per_second': 3.884, 'total_flos': 8512943510044674.0, 'train_loss': 0.055715892982624755, 'epoch': 10.0})

## Tests

In [None]:
trainer.evaluate(test_ds['test'])

{'eval_loss': 0.1996864378452301,
 'eval_precision': 0.8123115976519117,
 'eval_recall': 0.7584061620500666,
 'eval_f1': 0.7844338899954038,
 'eval_accuracy': 0.9686602791864213,
 'eval_runtime': 14.1665,
 'eval_samples_per_second': 209.155,
 'eval_steps_per_second': 13.13,
 'epoch': 10.0}

In [None]:
'''model.save_pretrained('sagorsarker-bangla-bert-base/ner_model')
tokenizer.save_pretrained('sagorsarker-bangla-bert-base/ner_tokenizer')'''

"model.save_pretrained('sagorsarker-bangla-bert-base/ner_model')\ntokenizer.save_pretrained('sagorsarker-bangla-bert-base/ner_tokenizer')"