<a href="https://colab.research.google.com/github/marendtz/News_NER/blob/master/code_prodigy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Preparations: Setup Working Directory**

In [1]:
import getpass
#get github access token for collab
token = getpass.getpass()

··········


In [2]:
%cd /content/
%rm -r /content/*
%ls -la
%pwd

/content
total 16
drwxr-xr-x 1 root root 4096 Apr 29 11:42 [0m[01;34m.[0m/
drwxr-xr-x 1 root root 4096 Apr 29 10:36 [01;34m..[0m/
drwxr-xr-x 4 root root 4096 Apr 27 13:34 [01;34m.config[0m/


'/content'

In [3]:
# clone github repo
!git clone https://{token}@github.com/marendtz/News_NER.git

Cloning into 'News_NER'...
remote: Enumerating objects: 8, done.[K
remote: Counting objects: 100% (8/8), done.[K
remote: Compressing objects: 100% (7/7), done.[K
remote: Total 8 (delta 1), reused 8 (delta 1), pack-reused 0[K
Unpacking objects: 100% (8/8), 96.31 KiB | 1.28 MiB/s, done.


In [4]:
%pip install -r /content/News_NER/requirements.txt


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


**Create huggingface DatasetDict (maren-hugg/news_ner_dict) from prodigy annotations.jsonl**

In [5]:
import datasets
from datasets import Features, Sequence
from datasets.features import Value, ClassLabel

import random

from spacy.training import offsets_to_biluo_tags, biluo_to_iob
import spacy

from huggingface_hub import notebook_login, create_repo



In [6]:
# login to huggingface
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [65]:
# we will create a function to convert offset formatted labels to BILUO tags and after to IOB tag
def label_and_fill_table(dataset):
    nlp = spacy.blank("en")

    tokens = [word['text'] for word in dataset['tokens']]
    dataset['tokens'] = tokens
    print("tokens")
    print(tokens)
    
    dataset['ner_tags_biluo'] = offsets_to_biluo_tags(
        nlp(dataset['text']), 
        [(d['start'], d['end'], d['label']) for d in [d for d in (dataset['spans'] or [])]]
        )
    #print('ner_tags_biluo')
    #print(dataset['ner_tags_biluo'])
    
    dataset['ner_tags_iob'] = biluo_to_iob(
        dataset['ner_tags_biluo']
        )       
    #print('ner_tags_iob')
    
    if set(dataset['ner_tags_iob'])-set(['O','B-PER','I-PER','B-ORG','I-ORG','B-LOC','I-LOC']):              # handle misaligned tokens even tough the same tokenizer is used: https://github.com/explosion/spaCy/discussions/12247
      dataset['ner_tags_iob'] =  ['O' if x == '-' else x for x in dataset['ner_tags_iob']]

    #print(dataset['ner_tags_iob'])


    return dataset

In [47]:
"""
A class to hold the data and labels for the model.
Can be initialized from a JSONL file and procesed, or from a preprocessed Dataset that has been saved.
"""

# A helper function that returns True if 'jsonl' is found in a string, otherwise False
def is_jsonl(string):
    if 'jsonl' in string:
      print("is jsonl")
      return True
    else:
      return False

class NERdataset():
    def __init__(self, data_file, tokenizer,seed):
        self.data_file = data_file
        self.tokenizer = tokenizer
        self.seed = seed
        
    # Load dataset
        self.load()
    # Process the data (and save if save is not None)
        self.process()


    # Load using datasets.Dataset.from_json if the data is an unprocessed jsonl file, otherwise load from a preprocessed dataset using load_dataset
    def load(self):
        if is_jsonl(self.data_file):
            self.dataset = datasets.Dataset.from_json(self.data_file)
            print("dataset build from data_file")
        else:
            print("no jsonl supplied as data_file")
        return self

    def process_jsonl(self):
      
      print(self.dataset)  
      # create necessary columns from prodigy output + convert offsets to iob format
      self.dataset = self.dataset.map(label_and_fill_table)

      # create information about labels used in the dataset      
      # ensure fixed labeling for transparency
      self.label2id = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6}
      self.id2label = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'}
      self.num_classes = len(self.label2id)
      self.label_list = list(self.label2id.keys())

      # create label colum with info about labels used
      label_column = []
      for sentence in self.dataset["ner_tags_iob"]:
        label_column.append([self.label2id[tag] for tag in sentence])
      self.dataset = self.dataset.add_column("ner_tags", label_column)

      # remove unneccesary columns
      rm_cols = list(set(self.dataset.column_names) - set(['tokens','ner_tags']))      
      self.dataset = self.dataset.map(remove_columns=rm_cols)

      print(self.dataset)
      # adapt features    
      features = Features(
          { 'ner_tags': Sequence(feature=ClassLabel(num_classes=self.num_classes, names=self.label_list, names_file=None, id=None),length=-1, id=None),
            'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)
          }
      )    
      self.dataset = self.dataset.map(features.encode_example, features=features)

    
    def create_datasetdict(self): 

      # create    
      intermediate_dataset = self.dataset
      # shuffle dataset
      shuffled_dataset = intermediate_dataset.shuffle(seed=self.seed)
      
      # build DataSetDict with train, test, val split
      
      # 90% train, 10% test + validation
      train_testvalid = shuffled_dataset.train_test_split(test_size=0.5)
      # Split the 10% test + valid in half test, half valid
      test_valid = train_testvalid['test'].train_test_split(test_size=0.5)
      
      self.datasetdict = DatasetDict({
          'train': train_testvalid['train'],
          'test': test_valid['test'],
          'validation': test_valid['train']})
      
      
      # push the tokenized dataset to the hub
      self.datasetdict.push_to_hub(repo_id="maren-hugg/news_ner_dict", private=True)

   
    # Process the data
    def process(self):
        if is_jsonl(self.data_file):
            self.process_jsonl()
            self.create_datasetdict()  
        else:
            print("data_file is not of format jsonl")
        

In [48]:
from collections import defaultdict
import pandas as pd
import numpy as np
from datasets import load_dataset_builder, get_dataset_config_names, load_dataset, DatasetDict

from transformers import AutoTokenizer, TrainingArguments, DataCollatorForTokenClassification, Trainer
import nltk.data
nltk.download('punkt')

import torch.nn as nn
from transformers import XLMRobertaConfig, AutoConfig, XLMRobertaForTokenClassification
from transformers.modeling_outputs import TokenClassifierOutput

from huggingface_hub import notebook_login
from seqeval.metrics import f1_score, precision_score, recall_score, accuracy_score, classification_report
from seqeval.scheme import IOB2


from torch.nn.functional import cross_entropy
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [52]:
# create huggingface DatasetDict and upload

# tokenizer must be the same than the one used for training later
xlmr_model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(xlmr_model_name)

NERdataset(
     data_file="/content/News_NER/data/annotations_news2.jsonl", 
     tokenizer=tokenizer,
     seed = 42)

is jsonl
Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-db0d82a8d271dbcc/0.0.0...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-db0d82a8d271dbcc/0.0.0. Subsequent calls will reuse this data.
dataset build from data_file
is jsonl
Dataset({
    features: ['text', 'tokens', '_input_hash', '_task_hash', '_view_id', 'spans', 'answer', '_timestamp'],
    num_rows: 12
})


Map:   0%|          | 0/12 [00:00<?, ? examples/s]

---------------------------------------------------------------
Annotated data given by prodigy:
{'text': "Hello my Name is Maren and I'm testing from Mannheim.", 'tokens': [{'text': 'Hello', 'id': 0, 'start': 0, 'end': 5, 'disabled': False, 'ws': True}, {'text': 'my', 'id': 1, 'start': 6, 'end': 8, 'disabled': False, 'ws': True}, {'text': 'Name', 'id': 2, 'start': 9, 'end': 13, 'disabled': False, 'ws': True}, {'text': 'is', 'id': 3, 'start': 14, 'end': 16, 'disabled': False, 'ws': True}, {'text': 'Maren', 'id': 4, 'start': 17, 'end': 22, 'disabled': False, 'ws': True}, {'text': 'and', 'id': 5, 'start': 23, 'end': 26, 'disabled': False, 'ws': True}, {'text': "I'm", 'id': 6, 'start': 27, 'end': 30, 'disabled': False, 'ws': True}, {'text': 'testing', 'id': 7, 'start': 31, 'end': 38, 'disabled': False, 'ws': True}, {'text': 'from', 'id': 8, 'start': 39, 'end': 43, 'disabled': False, 'ws': True}, {'text': 'Mannheim.', 'id': 9, 'start': 44, 'end': 53, 'disabled': False, 'ws': True}], '_inpu

Map:   0%|          | 0/12 [00:00<?, ? examples/s]

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 12
})


Map:   0%|          | 0/12 [00:00<?, ? examples/s]



Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]



Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]



Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/687 [00:00<?, ?B/s]

<__main__.NERdataset at 0x7fe0b425c340>

**Load Model and custom DatasetDict and Finetune**

In [54]:
news_ner_dict = load_dataset("maren-hugg/news_ner_dict", use_auth_token=True)

Downloading readme:   0%|          | 0.00/684 [00:00<?, ?B/s]

Downloading and preparing dataset None/None to /root/.cache/huggingface/datasets/maren-hugg___parquet/maren-hugg--news_ner_dict-69e527e6c3faee64/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/3.11k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.33k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/6 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/maren-hugg___parquet/maren-hugg--news_ner_dict-69e527e6c3faee64/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [55]:
# check features
print(news_ner_dict["train"].features)
tags = news_ner_dict["train"].features["ner_tags"].feature
print(tags)

# generate function to convert name and id of tags
index2tag = {idx: tag for idx, tag in enumerate(tags.names)}
tag2index = {tag: idx for idx, tag in enumerate(tags.names)}
print(index2tag)
print(tag2index)

{'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None), 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}
ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None)
{0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'}
{'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6}


In [60]:
news_ner_dict["train"][4]

{'ner_tags': [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 5, 6],
 'tokens': ['Hello',
  'my',
  'Name',
  'is',
  'Maren',
  'and',
  "I'm",
  'testing',
  'from',
  'Mannheim.']}