In [17]:
!pip3 install transformers
!pip3 install datasets
!pip3 install sentencepiece
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m594.8 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25ldone
[?25h  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=6f89136709afc7da03424b555a315b57df89099021660e981d53201e4807664c
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


**Fine Tuning the IndicNER Model**

**Naampadam Dataset**

In [4]:
# Let's download the Naampadam (Indic NER) dataset
from datasets import ClassLabel, load_dataset, load_metric, DownloadMode

lang='gu'

raw_datasets = load_dataset('ai4bharat/naamapadam', lang)

Downloading builder script:   0%|          | 0.00/2.86k [00:00<?, ?B/s]

Downloading and preparing dataset naamapadam_pr/gu to /root/.cache/huggingface/datasets/ai4bharat___naamapadam_pr/gu/1.0.0/99b5ec77eabfaa3fbff510d8cf70d7c34519486cb7dbee99ede19474ddff9b20...


Downloading data:   0%|          | 0.00/26.6M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset naamapadam_pr downloaded and prepared to /root/.cache/huggingface/datasets/ai4bharat___naamapadam_pr/gu/1.0.0/99b5ec77eabfaa3fbff510d8cf70d7c34519486cb7dbee99ede19474ddff9b20. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 472845
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 1076
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 2389
    })
})

In [5]:
raw_datasets.column_names

{'train': ['tokens', 'ner_tags'],
 'test': ['tokens', 'ner_tags'],
 'validation': ['tokens', 'ner_tags']}

In [6]:
# let's print an instance of dataset
idx=0
rec=raw_datasets['train'][idx]
for w, t in zip(rec['tokens'],rec['ner_tags']):
  print('{}\t{}'.format(w,t))

લક્ઝરી	0
સેન્ટ	5
એન્ડ્રુ	6
માતાનો	6
ચર્ચ	6


In [7]:
features = raw_datasets["train"].features
print(features)

{'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'ner_tags': Sequence(feature=ClassLabel(num_classes=7, names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None)}


In [8]:
text_column_name = "tokens"
label_column_name = "ner_tags"

In [9]:
label_list = features[label_column_name].feature.names

label_to_id = {label_list[i]: features[label_column_name].feature.str2int( label_list[i] ) for i in range(len(label_list))}

print(label_list)

num_labels = len(label_list)

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']


**Training an NER Model with the dataset**

**Load Pre-trained Model**

In [10]:
from transformers import AutoModelForTokenClassification, AutoConfig, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForTokenClassification, EarlyStoppingCallback, IntervalStrategy
import numpy as np

config = AutoConfig.from_pretrained('ai4bharat/IndicNER', num_labels=num_labels, finetuning_task='ner')
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/IndicNER")
model = AutoModelForTokenClassification.from_pretrained('ai4bharat/IndicNER', num_labels=num_labels )

config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/346 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.72M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/667M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


**Tokenize all texts and align the labels with them**

In [11]:
# Tokenize all texts and align the labels with them.
padding = "max_length"
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples[text_column_name],
        padding=padding,
        truncation=True,
        max_length=512,
        # We use this argument because the texts in our dataset are lists of words (with a label for each word).
        is_split_into_words=True,
    )
    labels = []
    for i, label in enumerate(examples[label_column_name]):
        # print('=====')
        # print('{} {}'.format(i,label)) #ak
        word_ids = tokenized_inputs.word_ids(batch_index=i)

        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx

        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

**Let us not tokenize the train and validation**

In [12]:
train_dataset = raw_datasets["train"]
# train_dataset = train_dataset.map(
#     tokenize_and_align_labels,
#     batched=True,
#     num_proc=4,
#     load_from_cache_file=True,
#     desc="Running tokenizer on train dataset",
# )
subset_train_dataset = train_dataset.select(range(20000))
subset_train_dataset = subset_train_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    num_proc=4,
    load_from_cache_file=True,
    desc="Running tokenizer on subset of train dataset",
)


       

Running tokenizer on subset of train dataset #0:   0%|          | 0/5 [00:00<?, ?ba/s]

 

Running tokenizer on subset of train dataset #1:   0%|          | 0/5 [00:00<?, ?ba/s]

Running tokenizer on subset of train dataset #2:   0%|          | 0/5 [00:00<?, ?ba/s]

Running tokenizer on subset of train dataset #3:   0%|          | 0/5 [00:00<?, ?ba/s]

In [13]:
eval_dataset = raw_datasets["validation"]
subset_eval_dataset = eval_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    num_proc=4,
    load_from_cache_file=True,
    desc="Running tokenizer on Validation dataset",
)

       

Running tokenizer on Validation dataset #0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Running tokenizer on Validation dataset #2:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on Validation dataset #1:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on Validation dataset #3:   0%|          | 0/1 [00:00<?, ?ba/s]

**Create Data Collator, Metrics**

In [15]:
data_collator = DataCollatorForTokenClassification(tokenizer)


In [18]:
# Metrics
metric = load_metric("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    # results = metric.compute(predictions=true_predictions, references=true_labels)
    # Unpack nested dictionaries
    # final_results = {}
    # for key, value in results.items():
    #     if isinstance(value, dict):
    #         for n, v in value.items():
    #             final_results[f"{key}_{n}"] = v
    #     else:
    #         final_results[key] = value
    # return final_results
    results = metric.compute(predictions=true_predictions, references=true_labels)
    # Unpack nested dictionaries
    final_results = {}
    for key, value in results.items():
        if isinstance(value, dict):
            for n, v in value.items():
                final_results[f"{key}_{n}"] = v
        else:
            final_results[key] = value
    return final_results

**Set Training Arguments**

In [19]:
batch_size=16
args=TrainingArguments(
    output_dir='output_dir',
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=4,
    evaluation_strategy = "epoch",
    learning_rate=5e-7)

**Training**

In [20]:
trainer = Trainer(
    model=model,
    train_dataset=subset_train_dataset,
    eval_dataset=subset_eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # callbacks=[early_stopping_callback],
    args=args,
)

In [21]:
trainer.args

TrainingArguments(
_n_gpu=2,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=epoch,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_

In [22]:
train_result = trainer.train()
metrics = train_result.metrics
# Assuming `metrics` is a dictionary containing the training metrics
with open("training_metrics.txt", "w") as file:
    for key, value in metrics.items():
        file.write(f"{key}: {value}\n")

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




Epoch,Training Loss,Validation Loss,Loc Precision,Loc Recall,Loc F1,Loc Number,Org Precision,Org Recall,Org F1,Org Number,Per Precision,Per Recall,Per F1,Per Number,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
1,6.7389,2.881739,0.000172,0.000828,0.000285,1208,0.020333,0.010377,0.013741,1060,0.137566,0.193189,0.1607,1615,0.037578,0.083441,0.051819,0.051778
2,1.8012,0.74846,0.0,0.0,0.0,1208,0.0,0.0,0.0,1060,0.247294,0.183901,0.210938,1615,0.172674,0.076487,0.106015,0.798603
3,0.7595,0.599956,0.0,0.0,0.0,1208,0.028807,0.006604,0.010744,1060,0.339776,0.413622,0.37308,1615,0.255006,0.173835,0.206738,0.812762
4,0.5695,0.564811,0.0,0.0,0.0,1208,0.03321,0.008491,0.013524,1060,0.357075,0.468731,0.405355,1615,0.27465,0.19727,0.229616,0.816762




In [20]:

metrics = trainer.evaluate()

trainer.log_metrics("eval", metrics)

***** eval metrics *****
  epoch                   =        3.0
  eval_LOC_f1             =     0.8068
  eval_LOC_number         =       1208
  eval_LOC_precision      =     0.7898
  eval_LOC_recall         =     0.8245
  eval_ORG_f1             =     0.7159
  eval_ORG_number         =       1060
  eval_ORG_precision      =     0.7058
  eval_ORG_recall         =     0.7264
  eval_PER_f1             =     0.8114
  eval_PER_number         =       1615
  eval_PER_precision      =     0.7974
  eval_PER_recall         =      0.826
  eval_loss               =     0.1966
  eval_overall_accuracy   =     0.9414
  eval_overall_f1         =      0.784
  eval_overall_precision  =     0.7702
  eval_overall_recall     =     0.7984
  eval_runtime            = 0:00:54.61
  eval_samples_per_second =      43.74
  eval_steps_per_second   =      2.746


**Evaluate the Trained Model**

**Let us now evaluate the trained model on the test sets of all languages**

**We need to first tokenize the test sets**

In [23]:
test_dataset = raw_datasets["test"]
test_dataset = test_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    num_proc=4,
    load_from_cache_file=True,
    desc="Running tokenizer on Test dataset",
)
print(test_dataset)

       

Running tokenizer on Test dataset #0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

Running tokenizer on Test dataset #1:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on Test dataset #2:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on Test dataset #3:   0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 1076
})


**Run prediction on test set of each of the language separately and extract overall Precison, Recall and F-Score separately**

In [24]:
predictions, labels, metrics = trainer.predict(test_dataset, metric_key_prefix=lang)


In [25]:
lang_specific_results = {}
for key in metrics:
    if 'overall_precision' in key:
      lang_specific_results['Precision'] = metrics[key]
    elif 'overall_recall' in key:
      lang_specific_results['Recall'] = metrics[key]
    elif 'overall_f1' in key:
      lang_specific_results['macro-F1 Score'] = metrics[key]

In [26]:
print(lang_specific_results)

{'Precision': 0.2912698412698413, 'Recall': 0.21067738231917335, 'macro-F1 Score': 0.24450366422385075}
