<a href="https://colab.research.google.com/github/marendtz/ModelNER/blob/master/prodigy_create_custom_datasetdict.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Preparations**

In [1]:
import getpass
# Prompt for the GitHub access token that Colab needs to clone the repository.
# getpass is used so the typed token is masked instead of being echoed into the cell output.
token = getpass.getpass()

··········


In [2]:
# Clean the working directory so the notebook starts from a known state:
# change to /content, recursively delete everything matched by /content/*
# (dot-entries such as .config are not matched and remain, see the listing below),
# then list what is left and print the current directory.
%cd /content/
%rm -r /content/*
%ls -la
%pwd

/content
total 16
drwxr-xr-x 1 root root 4096 Jun 12 06:05 [0m[01;34m.[0m/
drwxr-xr-x 1 root root 4096 Jun 12 06:03 [01;34m..[0m/
drwxr-xr-x 4 root root 4096 Jun  8 18:17 [01;34m.config[0m/


'/content'

In [3]:
# Clone the GitHub repo into /content/ModelNER.
# `{token}` is interpolated by IPython from the Python variable `token`, so the
# access token becomes part of the clone URL.
# NOTE(review): the token presumably stays in the clone's stored remote URL - confirm,
# and do not share this runtime's checkout while the token is still valid.
!git clone https://{token}@github.com/marendtz/ModelNER.git

Cloning into 'ModelNER'...
remote: Enumerating objects: 47, done.[K
remote: Counting objects: 100% (47/47), done.[K
remote: Compressing objects: 100% (42/42), done.[K
remote: Total 47 (delta 24), reused 16 (delta 5), pack-reused 0[K
Unpacking objects: 100% (47/47), 423.01 KiB | 1.75 MiB/s, done.


In [5]:
# Install the dependencies listed in the cloned repository's requirements.txt.
# %pip (rather than !pip) is used so the packages go into the running kernel's environment.
%pip install -r /content/ModelNER/requirements.txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers (from -r /content/ModelNER/requirements.txt (line 3))
  Downloading transformers-4.30.1-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m68.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets (from -r /content/ModelNER/requirements.txt (line 4))
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m46.9 MB/s[0m eta [36m0:00:00[0m
Collecting seqeval==1.2.2 (from -r /content/ModelNER/requirements.txt (line 8))
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting evaluate (from -r /content/ModelNER/requirements.txt (line 9))
  Downloadi

**Create huggingface dataset from prodigy annotations.jsonl**

In [6]:
import datasets
from datasets import Features, Sequence, DatasetDict
from datasets.features import Value, ClassLabel

import random

import spacy

from transformers import AutoTokenizer

from huggingface_hub import notebook_login, create_repo

In [7]:
# Show the Hugging Face login widget (imported from huggingface_hub above).
# NOTE(review): presumably required so the later push_to_hub call is authorised - confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [8]:
# helper function to convert offset formatted labels to IOB tag
def convert_spans_to_iob(text, ents):
  """Build one IOB tag per token from token-index entity spans.

  Args:
    text: Sequence of tokens; only its length is used.
    ents: Iterable of dicts with the keys ``token_start``, ``token_end``
      (both token indices, the end index is inclusive) and ``label``.

  Returns:
    A list with ``len(text)`` entries: ``"B-<label>"`` on the first token of
    each span, ``"I-<label>"`` on the remaining tokens of the span and ``"O"``
    everywhere else.
  """
  iob = ["O" for _ in range(len(text))]
  for span in ents:
    first = span["token_start"]
    last = span["token_end"]
    kind = span["label"]
    # The opening token of a span is always tagged B-.
    iob[first] = "B-" + kind
    if first != last:
      # Every following token up to and including `last` continues the entity.
      iob[first + 1: last + 1] = ["I-" + kind] * (last - first)
  return iob

In [9]:
# function to convert prodigy info to needed features
def label_and_fill_table(dataset):
  """Reshape one Prodigy record into the columns needed for NER training.

  Despite the parameter name, this receives a single example (it is applied
  row by row through ``Dataset.map``) and modifies it in place:

  * ``tokens`` is replaced by the plain list of each token's ``text``.
  * ``ner_tags_iob`` is added with one IOB tag per token, derived from
    ``spans`` (a missing/empty ``spans`` value yields all ``"O"`` tags).

  Returns:
    The same mapping that was passed in, after the updates above.
  """
  token_texts = []
  for token in dataset['tokens']:
    token_texts.append(token['text'])
  dataset['tokens'] = token_texts

  # `spans` may be None/empty for examples without annotated entities.
  spans = dataset['spans'] or []
  dataset['ner_tags_iob'] = convert_spans_to_iob(token_texts, spans)

  return dataset

In [11]:
"""
A class to hold the data and labels for the model.
Initialized from a JSONL file and processed.
"""

# A helper function that returns True if 'jsonl' is found in a string, otherwise False
def is_jsonl(string):
  """Report whether the substring ``'jsonl'`` occurs anywhere in ``string``.

  Prints ``is jsonl`` as a side effect when the substring is found.

  Returns:
    True if ``'jsonl'`` is contained in ``string``, otherwise False.
  """
  if 'jsonl' not in string:
    return False
  print("is jsonl")
  return True

class NERdataset():
  """Turn a Prodigy NER export (JSONL) into a Hugging Face ``DatasetDict`` and push it to the Hub.

  Constructing an instance runs the whole pipeline (``load()`` followed by
  ``process()``). For a JSONL ``data_file`` this reads the file, converts the
  span annotations to IOB tags, encodes them as integer ``ner_tags``, splits
  the rows into train/validation/test and uploads the result.

  Args:
    data_file: Path to the annotation file. Only paths for which ``is_jsonl``
      returns True are processed; for anything else a message is printed and
      ``dataset`` / ``datasetdict`` stay ``None``.
    seed: Seed used for shuffling and for both train/test splits, so the
      resulting splits are reproducible.
    holdout_size: ``test_size`` of the first split, i.e. the share of rows set
      aside for validation + test (which then share it half and half).
      Defaults to 0.5, the value this code has always used; pass 0.1 for a
      90/5/5 split.
    repo_id: Hub dataset repository the ``DatasetDict`` is pushed to.
    private: Whether the pushed repository is private.

  Attributes:
    dataset: The loaded (and, after processing, encoded) ``datasets.Dataset``.
    label2id / id2label: Fixed tag <-> id mappings for O/PER/ORG/LOC.
    num_classes: Number of distinct tags.
    label_list: Tag names ordered by id.
    datasetdict: ``DatasetDict`` with ``train``, ``test`` and ``validation``.
  """

  def __init__(self, data_file, seed, holdout_size=0.5,
               repo_id="maren-hugg/sustainability_ner", private=True):
    self.data_file = data_file
    self.seed = seed
    self.holdout_size = holdout_size
    self.repo_id = repo_id
    self.private = private

    # Defined up front so the attributes exist even when data_file is not JSONL.
    self.dataset = None
    self.datasetdict = None

    # Load dataset
    self.load()
    # Process the data (conversion, split and upload)
    self.process()

  # 1) Load using datasets.Dataset.from_json if the data is a jsonl file
  def load(self):
    """Read ``data_file`` into ``self.dataset`` when it is JSONL; returns ``self``."""
    if is_jsonl(self.data_file):
      self.dataset = datasets.Dataset.from_json(self.data_file)
      print("dataset build from data_file")
    else:
      print("no jsonl supplied as data_file")
    return self

  # 2) Process loaded data to create features needed
  def process_jsonl(self):
    """Convert the raw Prodigy columns into ``tokens`` + class-encoded ``ner_tags``."""
    print(self.dataset)
    # create necessary columns from prodigy output + convert offsets to iob format
    self.dataset = self.dataset.map(label_and_fill_table)

    # create information about labels used in the dataset
    # ensure fixed labeling for transparency (adapted from PAN-X)
    self.label2id = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6}
    # Derived from label2id so the two mappings cannot drift apart.
    self.id2label = {idx: tag for tag, idx in self.label2id.items()}
    self.num_classes = len(self.label2id)
    self.label_list = list(self.label2id.keys())

    # create label column with the integer id of every IOB tag
    # (a tag outside label2id raises KeyError)
    label_column = [
      [self.label2id[tag] for tag in sentence]
      for sentence in self.dataset["ner_tags_iob"]
    ]
    self.dataset = self.dataset.add_column("ner_tags", label_column)

    # remove unnecessary columns; remove_columns does this directly instead of
    # running a function-less map over every row
    rm_cols = [col for col in self.dataset.column_names if col not in ('tokens', 'ner_tags')]
    self.dataset = self.dataset.remove_columns(rm_cols)

    print(self.dataset)
    # adapt features so ner_tags is a sequence of ClassLabel and tokens a sequence of strings
    features = Features(
      { 'ner_tags': Sequence(feature=ClassLabel(num_classes=self.num_classes, names=self.label_list, names_file=None, id=None),length=-1, id=None),
        'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)
      }
    )
    self.dataset = self.dataset.map(features.encode_example, features=features)

  # 3) Create DatasetDict that can be loaded to Huggingface for later use in training
  def create_datasetdict(self):
    """Split ``self.dataset`` into train/validation/test and push the result to the Hub."""
    # shuffle dataset
    shuffled_dataset = self.dataset.shuffle(seed=self.seed)

    # First split: (1 - holdout_size) train, holdout_size test + validation.
    # The seed is passed explicitly; without it train_test_split draws a fresh
    # random permutation on every run and the splits are not reproducible.
    train_testvalid = shuffled_dataset.train_test_split(test_size=self.holdout_size, seed=self.seed)
    # Second split: the held-out part is divided half test, half validation.
    test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=self.seed)

    self.datasetdict = DatasetDict({
        'train': train_testvalid['train'],
        'test': test_valid['test'],
        'validation': test_valid['train']})

    # push the word-level dataset (tokens + ner_tags) to the hub
    self.datasetdict.push_to_hub(repo_id=self.repo_id, private=self.private)

  # Process the data
  def process(self):
    """Run conversion, split and upload for JSONL input; otherwise only print a notice."""
    if is_jsonl(self.data_file):
      self.process_jsonl()
      self.create_datasetdict()
    else:
      print("data_file is not of format jsonl")
        


In [12]:
# create class object 
ANNOTATIONS_FILE = "/content/ModelNER/custom_data/annotations.jsonl"
SPLIT_SEED = 42

# Constructing the object immediately runs load() and process() on the file.
dataset_class = NERdataset(data_file=ANNOTATIONS_FILE, seed=SPLIT_SEED)

is jsonl
Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-e8e748665d555397/0.0.0...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-e8e748665d555397/0.0.0. Subsequent calls will reuse this data.
dataset build from data_file
is jsonl
Dataset({
    features: ['text', 'tokens', '_input_hash', '_task_hash', '_view_id', 'spans', 'answer', '_timestamp'],
    num_rows: 1000
})


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 1000
})


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]



Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]



Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]



Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]