In [None]:
"""!pip install transformers
!pip install datasets
!pip install tokenizer
!pip install seqeval
"""
#either you can use above statement or use single command
# Install
!pip install transformers datasets tokenizers seqeval -q

In [None]:
import datasets
import numpy as np
from transformers import BertTokenizerFast
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification

In [None]:
# Load the CoNLL-2003 dataset by name through the `datasets` library.
conll2003 = datasets.load_dataset("conll2003")

In [None]:
# Display the loaded object to see which splits and columns it contains.
conll2003

In [None]:
# Look at the training split on its own.
conll2003["train"]

In [None]:
# First training example; later cells read its "tokens" and "ner_tags" fields.
conll2003["train"][0]

In [None]:
# NOTE(review): same expression as two cells above — appears redundant.
conll2003["train"]

In [None]:
# Feature definition of the `ner_tags` column; its `.feature.names` is used
# later in the notebook to build the list of label strings.
conll2003["train"].features['ner_tags']

In [None]:
# Free-text description attached to the training split.
conll2003["train"].description

In [None]:
# Fast BERT tokenizer for the "bert-base-uncased" checkpoint; the same
# checkpoint name is used for the model later on.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

In [None]:
# NOTE(review): repeats an earlier inspection cell — appears redundant.
conll2003['train'][0]

In [None]:
# NOTE(review): repeats an earlier inspection cell — appears redundant.
conll2003["train"].features['ner_tags']

In [None]:
# Keep one training example around to walk through tokenization by hand.
example_text = conll2003['train'][0]

In [None]:
example_text

In [None]:
# The sentence is stored already split into words.
example_text["tokens"]

In [None]:
# `is_split_into_words=True` because the input is a list of words rather
# than a raw string.
tokenized_input = tokenizer(example_text["tokens"], is_split_into_words=True)

In [None]:
tokenized_input["input_ids"]

In [None]:
tokenized_input

In [None]:
# Map the ids back to token strings to see what the tokenizer produced.
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])

In [None]:
tokens

In [None]:
# For every produced token, the index of the original word it came from.
# `tokenize_and_align_labels` below relies on this mapping and treats a
# `None` entry as a special token.
word_ids = tokenized_input.word_ids()

print(word_ids)

In [None]:
# One tag id per original word (not per produced token).
example_text["ner_tags"]

In [None]:
# Print each word position next to its tag id.
for i, label in enumerate(example_text["ner_tags"]):
  print(i,label)

In [None]:
def tokenize_and_align_labels(examples, label_all_tokens=True):
    """Tokenize a batch of pre-split sentences and align NER tags to the tokens.

    Uses the module-level `tokenizer`. Every produced token gets a label:

    * tokens with no source word (word id `None`, i.e. special tokens) get
      -100 so that the loss function ignores them;
    * the first token of each word gets that word's tag;
    * further sub-word tokens of the same word get the word's tag when
      `label_all_tokens` is true, otherwise -100.

    Args:
        examples: batch with a "tokens" entry (list of word lists) and a
            "ner_tags" entry (list of per-word tag-id lists).
        label_all_tokens: whether continuation sub-word tokens inherit the
            tag of their word instead of being masked with -100.

    Returns:
        The tokenizer output for the batch, with an added "labels" entry
        holding one aligned label list per sentence.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned_labels = []
    for batch_idx, word_tags in enumerate(examples["ner_tags"]):
        # Index of the originating word for every token of this sentence.
        token_word_ids = tokenized_inputs.word_ids(batch_index=batch_idx)

        sentence_labels = []
        last_word = None
        for current_word in token_word_ids:
            if current_word is None:
                # Special token: never contributes to the loss.
                sentence_labels.append(-100)
            elif current_word == last_word and not label_all_tokens:
                # Continuation piece of the previous word, masked on request.
                sentence_labels.append(-100)
            else:
                # First piece of a word, or a continuation that inherits the tag.
                sentence_labels.append(word_tags[current_word])
            last_word = current_word

        aligned_labels.append(sentence_labels)

    tokenized_inputs["labels"] = aligned_labels
    return tokenized_inputs

In [None]:
# A one-example slice of the training split, used as a small batch to try
# the alignment function on.
conll2003["train"][4:5]

In [None]:
# Run the alignment on that one-example batch.
q=tokenize_and_align_labels(conll2003["train"][4:5])

In [None]:
# Print every token of the first (only) sentence next to its aligned label;
# the token is left-aligned and padded with "_" to 40 characters.
for token, label in zip(tokenizer.convert_ids_to_tokens(q["input_ids"][0]),q["labels"][0]):
    print(f"{token:_<40} {label}")

In [None]:
# Apply the alignment to the whole dataset; `batched=True` hands the
# function batches of examples, which is the input shape it is written for.
tokenized_datasets = conll2003.map(tokenize_and_align_labels, batched=True)

In [None]:
# Check one processed training example.
tokenized_datasets["train"][0]

In [None]:
# Token-classification head on top of "bert-base-uncased". The number of
# output classes is taken from the dataset's own tag names instead of a
# hard-coded 9, so the head always matches the labels being trained on.
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(conll2003["train"].features["ner_tags"].feature.names),
)

In [None]:
from transformers import TrainingArguments, Trainer

In [None]:
# Training configuration: outputs go to "test-ner", evaluation runs once
# per epoch, and training lasts a single epoch.
# NOTE(review): newer transformers releases rename `evaluation_strategy`
# to `eval_strategy` — confirm against the installed version.
args = TrainingArguments(
    "test-ner",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
)

In [None]:
!pip install transformers[torch]

In [None]:
# The Trainer is constructed in a later cell, after `compute_metrics` and
# `data_collator` have been defined. Building it here referenced both names
# before they existed (NameError on a fresh kernel) and discarded the result.

In [None]:
# Load the "seqeval" metric; `compute_metrics` below calls `metric.compute`.
# NOTE(review): `datasets.load_metric` has been deprecated in favour of the
# separate `evaluate` package — confirm the installed `datasets` still has it.
metric=datasets.load_metric("seqeval")

In [None]:
# One training example to try the metric on.
example=conll2003['train'][0]

In [None]:
# Label strings, positioned so that `label_list[tag_id]` is the name of a tag id.
label_list = conll2003["train"].features["ner_tags"].feature.names

label_list

In [None]:
# Print the raw tag ids of the example.
for i in example["ner_tags"]:
  print(i)

In [None]:
 example

In [None]:
# Convert the example's tag ids into label strings.
labels = [label_list[i] for i in example["ner_tags"]]
labels

In [None]:
# Sanity check: score the labels against themselves.
metric.compute(predictions=[labels],references=[labels])

In [None]:
def compute_metrics(eval_preds):
    """Turn raw evaluation output into overall seqeval scores.

    Args:
        eval_preds: pair `(logits, label_ids)`. The logits are reduced with
            an argmax over axis 2; label ids equal to -100 mark positions
            that are left out of the evaluation.

    Returns:
        Dict with "precision", "recall", "f1" and "accuracy", taken from the
        overall scores reported by the module-level `metric`.
    """
    logits, gold_ids = eval_preds

    # Argmax over logits equals argmax over probabilities, so no softmax is needed.
    predicted_ids = np.argmax(logits, axis=2)

    predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only positions whose gold label is not the ignore marker.
        kept_pairs = [
            (predicted, gold)
            for predicted, gold in zip(predicted_row, gold_row)
            if gold != -100
        ]
        predictions.append([label_list[predicted] for predicted, _ in kept_pairs])
        true_labels.append([label_list[gold] for _, gold in kept_pairs])

    results = metric.compute(predictions=predictions, references=true_labels)

    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [None]:
# Collator that assembles the tokenized examples into batches, built from
# the same tokenizer that produced them.
data_collator=DataCollatorForTokenClassification(tokenizer)

In [None]:
# Wire model, training arguments, the train/validation splits, the collator
# and the metric function together.
trainer=Trainer(
   model,
   args,
   train_dataset=tokenized_datasets["train"],
   eval_dataset=tokenized_datasets["validation"],
   data_collator=data_collator,
   tokenizer=tokenizer,
   compute_metrics=compute_metrics
)

In [None]:
# Fine-tune with the settings from `args`.
trainer.train()

In [None]:
# Save the model into the "ner_model" directory; its config.json is edited
# and the model reloaded from there further down.
model.save_pretrained("ner_model")

In [None]:
# Save the tokenizer into the "tokenizer" directory. The pipeline below
# uses the in-memory `tokenizer`, not this saved copy.
tokenizer.save_pretrained("tokenizer")

In [None]:
label_list

In [None]:
# Map each class id (as a string key) to its label name; this mapping is
# written into the saved model's config.json below.
id2label = {
    str(i): label for i,label in enumerate(label_list)
}

In [None]:
id2label

In [None]:
# Map each label name to its integer class id. The ids are kept as ints
# (not strings), the value type a model config's `label2id` expects.
label2id = {
    label: i for i, label in enumerate(label_list)
}

In [None]:
label2id

In [None]:
import json

In [None]:
# Read the saved model's config. The path is relative, matching the
# "ner_model" directory passed to `save_pretrained`, and the file is
# closed as soon as it has been parsed.
with open("ner_model/config.json") as config_file:
    config = json.load(config_file)

In [None]:
config

In [None]:
# Attach the id -> label-name mapping to the loaded config dict.
config["id2label"] = id2label


In [None]:
# Attach the label-name -> id mapping to the loaded config dict.
config["label2id"] = label2id

In [None]:
# Write the patched config back. The `with` block guarantees the file is
# flushed and closed before the model is reloaded from this directory.
with open("ner_model/config.json", "w") as config_file:
    json.dump(config, config_file)

In [None]:
# Reload the model from the "ner_model" directory, whose config.json now
# carries the label mappings written above.
model_fine_tuned=AutoModelForTokenClassification.from_pretrained("ner_model")

In [None]:
# Show the reloaded model object.
model_fine_tuned

# transformer pipeline

In [None]:
from transformers import pipeline

In [None]:
# Wrap the reloaded model and the in-memory tokenizer in a "ner" pipeline
# that can be called on raw strings.
nlp_pipeline=pipeline("ner",model=model_fine_tuned,tokenizer=tokenizer)

In [None]:
nlp_pipeline

In [None]:
# From here on `example` holds a plain input string; earlier in the
# notebook the same name held a dataset example.
example="sudhanshu kumar is a foundar of iNeuron"

In [None]:
# Run the pipeline on the current example string.
nlp_pipeline(example)

In [None]:
example="sunny is a founder of microsoft"

In [None]:
nlp_pipeline(example)

In [None]:
# The word "apple" appears twice in this sentence.
example="apple launch mobile while eating apple which taste like orange"


In [None]:
nlp_pipeline(example)

In [None]:
example="vikas is working ai engineer in google"

In [None]:
nlp_pipeline(example)

In [None]:
# The word "apple" appears twice in this sentence.
example="apple founder loves eating apple"


In [None]:
nlp_pipeline(example)

In [None]:
# Contains both "Windows" and "window".
example="Microsoft Windows created their software by idea that came from the window of the house"


In [None]:
nlp_pipeline(example)