In [1]:
import pandas as pd 
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data = load_dataset('conllpp')
data

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [3]:
data['train'].features

{'id': Value(dtype='string', id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'pos_tags': Sequence(feature=ClassLabel(names=['"', "''", '#', '$', '(', ')', ',', '.', ':', '``', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'NN|SYM', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB'], id=None), length=-1, id=None),
 'chunk_tags': Sequence(feature=ClassLabel(names=['O', 'B-ADJP', 'I-ADJP', 'B-ADVP', 'I-ADVP', 'B-CONJP', 'I-CONJP', 'B-INTJ', 'I-INTJ', 'B-LST', 'I-LST', 'B-NP', 'I-NP', 'B-PP', 'I-PP', 'B-PRT', 'I-PRT', 'B-SBAR', 'I-SBAR', 'B-UCP', 'I-UCP', 'B-VP', 'I-VP'], id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)}

In [4]:
pd.DataFrame(data['train'][:])[['tokens','ner_tags']].iloc[0]

tokens      [EU, rejects, German, call, to, boycott, Briti...
ner_tags                          [3, 0, 7, 0, 0, 0, 7, 0, 0]
Name: 0, dtype: object

In [5]:
tags = data['train'].features['ner_tags']

index2tag = {index:tag for index, tag in enumerate(tags.feature.names)}
tag2index = {tag:index for index, tag in enumerate(tags.feature.names)}
index2tag

{0: 'O',
 1: 'B-PER',
 2: 'I-PER',
 3: 'B-ORG',
 4: 'I-ORG',
 5: 'B-LOC',
 6: 'I-LOC',
 7: 'B-MISC',
 8: 'I-MISC'}

In [6]:
tags = data['train'].features['ner_tags'].feature

index2tag = {index:tag for index, tag in enumerate(tags.names)}
tag2index = {tag:index for index, tag in enumerate(tags.names)}
index2tag

{0: 'O',
 1: 'B-PER',
 2: 'I-PER',
 3: 'B-ORG',
 4: 'I-ORG',
 5: 'B-LOC',
 6: 'I-LOC',
 7: 'B-MISC',
 8: 'I-MISC'}

In [7]:
tags.int2str(3)

'B-ORG'

In [8]:
def create_tag_names(batch):
    tag_name = {'ner_tags_str' : [tags.int2str(index) for index in batch['ner_tags']]}
    return tag_name

In [9]:
data = data.map(create_tag_names)

In [10]:
data

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'ner_tags_str'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'ner_tags_str'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'ner_tags_str'],
        num_rows: 3453
    })
})

In [11]:
pd.DataFrame(data['train'][:])[['tokens','ner_tags', 'ner_tags_str']].iloc[0]

tokens          [EU, rejects, German, call, to, boycott, Briti...
ner_tags                              [3, 0, 7, 0, 0, 0, 7, 0, 0]
ner_tags_str            [B-ORG, O, B-MISC, O, O, O, B-MISC, O, O]
Name: 0, dtype: object

Model Building

In [12]:
from transformers import AutoTokenizer

model_checkpoint = "distilbert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [13]:
tokenizer.is_fast

True

In [14]:
inputs  = data['train'][0]['tokens']
inputs = tokenizer(inputs, is_split_into_words= True)
inputs.tokens()

['[CLS]',
 'EU',
 'rejects',
 'German',
 'call',
 'to',
 'boycott',
 'British',
 'la',
 '##mb',
 '.',
 '[SEP]']

In [15]:
inputs.word_ids()

[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]

In [16]:
def align_labels_with_tokens(labels, words_ids):
    new_labels = []
    current_word = None
    for word_id in words_ids:
        if word_id != current_word:
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)

        elif word_id is None:
            new_labels.append(-100)

        else:
            label = labels[word_id]

            if label%2 == 1:
                label = label +1
            new_labels.append(label)

    return new_labels

In [17]:
labels  = data['train'][0]['ner_tags']
word_ids = inputs.word_ids()
print(labels, word_ids)

[3, 0, 7, 0, 0, 0, 7, 0, 0] [None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]


In [18]:
align_labels_with_tokens(labels, word_ids)

[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]

In [19]:
def tokenize_and_align_labels(examples):
  tokenized_inputs = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True)

  all_labels = examples['ner_tags']

  new_labels = []
  for i, labels in enumerate(all_labels):
    word_ids = tokenized_inputs.word_ids(i)
    new_labels.append(align_labels_with_tokens(labels, word_ids))

  tokenized_inputs['labels'] = new_labels

  return tokenized_inputs

In [20]:
tokenized_datasets = data.map(tokenize_and_align_labels, batched=True, remove_columns=data['train'].column_names)


In [21]:
tokenized_datasets


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3453
    })
})

In [22]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [23]:
batch = data_collator([tokenized_datasets['train'][i] for i in range(2)])
batch

{'input_ids': tensor([[  101,  7270, 22961,  1528,  1840,  1106, 21423,  1418,  2495, 12913,
           119,   102],
        [  101,  1943, 14428,   102,     0,     0,     0,     0,     0,     0,
             0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]), 'labels': tensor([[-100,    3,    0,    7,    0,    0,    0,    7,    0,    0,    0, -100],
        [-100,    1,    2, -100, -100, -100, -100, -100, -100, -100, -100, -100]])}

In [24]:
!pip install seqeval
!pip install evaluate




You should consider upgrading via the 'b:\prep\repository\nlp-with-ml\nlpvenv\scripts\python.exe -m pip install --upgrade pip' command.




You should consider upgrading via the 'b:\prep\repository\nlp-with-ml\nlpvenv\scripts\python.exe -m pip install --upgrade pip' command.


In [25]:

import evaluate
metric = evaluate.load('seqeval')

In [26]:
ner_feature = data['train'].features['ner_tags']
ner_feature

Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)

In [27]:
label_names = ner_feature.feature.names
label_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [28]:
labels = data['train'][0]['ner_tags']
labels = [label_names[i] for i in labels]
labels

['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']

In [29]:
predictions = labels.copy()
predictions[2] = "O"

metric.compute(predictions=[predictions], references=[labels])

{'MISC': {'precision': 1.0,
  'recall': 0.5,
  'f1': 0.6666666666666666,
  'number': 2},
 'ORG': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 0.6666666666666666,
 'overall_f1': 0.8,
 'overall_accuracy': 0.8888888888888888}

In [30]:
import numpy as np

def compute_metrics(eval_preds):
  logits, labels = eval_preds

  predictions = np.argmax(logits, axis=-1)

  true_labels = [[label_names[l] for l in label if l!=-100] for label in labels]

  true_predictions = [[label_names[p] for p,l in zip(prediction, label) if l!=-100]
                      for prediction, label in zip(predictions, labels)]

  all_metrics = metric.compute(predictions=true_predictions, references=true_labels)

  return {"precision": all_metrics['overall_precision'],
          "recall": all_metrics['overall_recall'],
          "f1": all_metrics['overall_f1'],
          "accuracy": all_metrics['overall_accuracy']}

Model Training

In [31]:
id2label = {i:label for i, label in enumerate(label_names)}
label2id = {label:i for i, label in enumerate(label_names)}

In [32]:
print(id2label)


{0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC'}


In [33]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
                                                    model_checkpoint,
                                                    id2label=id2label,
                                                    label2id=label2id)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
model.config.num_labels


9

In [35]:
from transformers import TrainingArguments

args = TrainingArguments("distilbert-finetuned-ner",
                         evaluation_strategy = "epoch",
                         save_strategy="epoch",
                         learning_rate = 2e-5,
                         num_train_epochs=3,
                         weight_decay=0.01)

In [36]:
from transformers import Trainer
trainer = Trainer(model=model,
                  args=args,
                  train_dataset = tokenized_datasets['train'],
                  eval_dataset = tokenized_datasets['validation'],
                  data_collator=data_collator,
                  compute_metrics=compute_metrics,
                  tokenizer=tokenizer)

trainer.train()

  9%|▉         | 500/5268 [09:39<1:32:47,  1.17s/it]

{'loss': 0.2829, 'learning_rate': 1.810174639331815e-05, 'epoch': 0.28}


 19%|█▉        | 1000/5268 [19:07<1:24:41,  1.19s/it]

{'loss': 0.1324, 'learning_rate': 1.6203492786636296e-05, 'epoch': 0.57}


 28%|██▊       | 1500/5268 [28:52<1:26:30,  1.38s/it]

{'loss': 0.0992, 'learning_rate': 1.4305239179954442e-05, 'epoch': 0.85}


                                                     
 33%|███▎      | 1756/5268 [35:15<1:03:31,  1.09s/it]

{'eval_loss': 0.09908612817525864, 'eval_precision': 0.8693982074263764, 'eval_recall': 0.9141703130259172, 'eval_f1': 0.8912223133716162, 'eval_accuracy': 0.972405368811444, 'eval_runtime': 89.1332, 'eval_samples_per_second': 36.462, 'eval_steps_per_second': 4.566, 'epoch': 1.0}


 38%|███▊      | 2000/5268 [40:00<1:03:16,  1.16s/it] 

{'loss': 0.0671, 'learning_rate': 1.240698557327259e-05, 'epoch': 1.14}


 47%|████▋     | 2500/5268 [49:47<56:09,  1.22s/it]  

{'loss': 0.0616, 'learning_rate': 1.0508731966590738e-05, 'epoch': 1.42}


 57%|█████▋    | 3000/5268 [59:29<39:37,  1.05s/it]  

{'loss': 0.0493, 'learning_rate': 8.610478359908885e-06, 'epoch': 1.71}


 66%|██████▋   | 3500/5268 [1:09:08<33:25,  1.13s/it]

{'loss': 0.0502, 'learning_rate': 6.712224753227031e-06, 'epoch': 1.99}


                                                     
 67%|██████▋   | 3512/5268 [1:10:53<31:21,  1.07s/it]

{'eval_loss': 0.06899060308933258, 'eval_precision': 0.9098360655737705, 'eval_recall': 0.9340289464826658, 'eval_f1': 0.9217737917289486, 'eval_accuracy': 0.9817801848472362, 'eval_runtime': 91.2283, 'eval_samples_per_second': 35.625, 'eval_steps_per_second': 4.461, 'epoch': 2.0}


 76%|███████▌  | 4000/5268 [1:20:27<23:35,  1.12s/it]   

{'loss': 0.0301, 'learning_rate': 4.8139711465451785e-06, 'epoch': 2.28}


 85%|████████▌ | 4500/5268 [1:30:08<14:16,  1.11s/it]

{'loss': 0.0286, 'learning_rate': 2.9157175398633257e-06, 'epoch': 2.56}


 95%|█████████▍| 5000/5268 [1:39:46<05:39,  1.27s/it]

{'loss': 0.0293, 'learning_rate': 1.0174639331814731e-06, 'epoch': 2.85}


                                                     
100%|██████████| 5268/5268 [1:46:32<00:00,  1.03it/s]

{'eval_loss': 0.07084232568740845, 'eval_precision': 0.9111183994752378, 'eval_recall': 0.9350387075058902, 'eval_f1': 0.9229235880398671, 'eval_accuracy': 0.98292812150468, 'eval_runtime': 93.7255, 'eval_samples_per_second': 34.676, 'eval_steps_per_second': 4.342, 'epoch': 3.0}


100%|██████████| 5268/5268 [1:46:34<00:00,  1.21s/it]

{'train_runtime': 6394.4526, 'train_samples_per_second': 6.587, 'train_steps_per_second': 0.824, 'train_loss': 0.08056521705360963, 'epoch': 3.0}





TrainOutput(global_step=5268, training_loss=0.08056521705360963, metrics={'train_runtime': 6394.4526, 'train_samples_per_second': 6.587, 'train_steps_per_second': 0.824, 'train_loss': 0.08056521705360963, 'epoch': 3.0})

In [97]:
from transformers import pipeline

checkpoint = "B:/Prep/repository/NLP-with-ML/Assignment/distilbert-finetuned-ner/checkpoint-5268"
token_classifier = pipeline(
    "token-classification", model=checkpoint, aggregation_strategy="simple"
)

token_classifier("Can you google price of jim beam for me and I work in google")

[]

In [51]:
%pip install zip

Collecting zip
  Using cached zip-0.0.2.tar.gz (3.0 kB)
Collecting Flask-Admin>=1.0.4
  Using cached Flask_Admin-1.6.1-py3-none-any.whl (7.5 MB)
Collecting Flask-Bootstrap>=2.2.2-1
  Using cached Flask-Bootstrap-3.3.7.1.tar.gz (456 kB)
Collecting Flask-Cache>=0.10.1
  Using cached Flask-Cache-0.13.1.tar.gz (45 kB)
Collecting Flask-FlatPages>=0.3
  Using cached Flask_FlatPages-0.8.2-py3-none-any.whl (10 kB)
Collecting Flask-Gravatar>=0.2.4
  Using cached Flask_Gravatar-0.5.0-py2.py3-none-any.whl (7.5 kB)
Collecting Flask-Login>=0.1.3
  Using cached Flask_Login-0.6.3-py3-none-any.whl (17 kB)
Collecting Flask-Mail>=0.7.4
  Using cached Flask-Mail-0.9.1.tar.gz (45 kB)
Collecting Flask-PyMongo>=0.2.1
  Using cached Flask_PyMongo-2.3.0-py2.py3-none-any.whl (12 kB)
Collecting Flask-Restless>=0.9.1
  Using cached Flask-Restless-0.17.0.tar.gz (42 kB)
Collecting Flask-SQLAlchemy>=0.16
  Using cached flask_sqlalchemy-3.1.1-py3-none-any.whl (25 kB)
Collecting Flask-Themes>=0.1.3
  Using cached Fla

    ERROR: Command errored out with exit status 1:
     command: 'b:\Prep\repository\NLP-with-ML\nlpvenv\Scripts\python.exe' -c 'import io, os, sys, setuptools, tokenize; sys.argv[0] = '"'"'C:\\Users\\whiz\\AppData\\Local\\Temp\\pip-install-6o7zx6mv\\wsgiref_61c2bfd39c5446b2a52c62a18cb0493d\\setup.py'"'"'; __file__='"'"'C:\\Users\\whiz\\AppData\\Local\\Temp\\pip-install-6o7zx6mv\\wsgiref_61c2bfd39c5446b2a52c62a18cb0493d\\setup.py'"'"';f = getattr(tokenize, '"'"'open'"'"', open)(__file__) if os.path.exists(__file__) else io.StringIO('"'"'from setuptools import setup; setup()'"'"');code = f.read().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.close();exec(compile(code, __file__, '"'"'exec'"'"'))' egg_info --egg-base 'C:\Users\whiz\AppData\Local\Temp\pip-pip-egg-info-l9253cx3'
         cwd: C:\Users\whiz\AppData\Local\Temp\pip-install-6o7zx6mv\wsgiref_61c2bfd39c5446b2a52c62a18cb0493d\
    Complete output (8 lines):
    Traceback (most recent call last):
      File "<string>", line 1, in <module

In [49]:
!zip -r "distilbert_ner.zip" "B:/Prep/repository/NLP-with-ML/Assignment/distilbert-finetuned-ner/checkpoint-5268"


'zip' is not recognized as an internal or external command,
operable program or batch file.
