In [None]:
!pip install transformers

In [None]:
!pip install datasets

In [None]:
!pip install seqeval

In [None]:
# Used to automatically convert slow tokenizer to fast tokenizer
!pip install sentencepiece

In [None]:
# Mount Google Drive at /content/drive; the dataset paths configured in this
# notebook point into /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf

from datasets import Features, ClassLabel, Sequence, load_metric
from tensorflow.data import Dataset

from transformers import AutoConfig, AutoTokenizer, TFAutoModelForTokenClassification
from transformers import BertConfig, BertTokenizer, BertForTokenClassification
from transformers import TFTrainingArguments, TFTrainer
from transformers import DataCollatorForTokenClassification

In [None]:
# Entity labels in class-id order: the position of a label in this list is the
# integer id the model is trained to predict for it.
LABEL_LIST = [
    "LOC",
    "ORG",
    "PER",
    "O",
    "IDENTIFIER_LATITUDE_LONGITUDE",
    "TITLE",
    "NATIONALITY",
    "TEMPORAL_DATE",
    "TEMPORAL_TIME",
    "PRODUCT",
    "RELIGION",
    "IDENTIFIER_MONEY",
    "IDENTIFIER_URL",
    "IDENTIFIER_DISTANCE",
    "IDENTIFIER_EMAIL",
    "IDENTIFIER_PHONE_NUMBER",
    "IDENTIFIER_NUMBER",
]

# Label name -> class id, derived from the ordering above.
LABEL_MAP = {label: index for index, label in enumerate(LABEL_LIST)}

# Input corpora (one annotated sentence per line) and the local output folder.
INPUT_FILE_TRAIN = "/content/drive/MyDrive/datasets/ron_ner_train_5.txt"
INPUT_FILE_VALIDATION = "/content/drive/MyDrive/datasets/ron_ner_test_5.txt"
OUTPUT_DIR = "test-ner"

# Training setup.
BATCH_SIZE = 16
MODEL_CHECKPOINT = "jplu/tf-xlm-r-ner-40-lang"
# Alternative checkpoint:
# MODEL_CHECKPOINT = "distilbert-base-uncased"
TASK = "ner"

In [None]:
# Short aliases for the label tables, plus the inverse (class id -> tag) lookup.
unique_tags, tag2id = LABEL_LIST, LABEL_MAP
id2tag = {tag_id: tag_name for tag_name, tag_id in tag2id.items()}

In [None]:
def migration(lines):
    """Convert inline-annotated sentences into parallel token / tag lists.

    Each line is split on whitespace. A piece containing ``<START:`` opens an
    entity span whose type is the piece with ``<START:`` and ``>`` removed
    (``PERSON``, ``LOCATION`` and ``ORGANIZATION`` are shortened to ``PER``,
    ``LOC`` and ``ORG``). A piece containing ``<END>`` closes the span. Marker
    pieces are dropped; every other piece is kept as a token and tagged with
    the type of the currently open span, or ``"O"`` when none is open.

    Args:
        lines: iterable of annotated sentence strings.

    Returns:
        A list with one ``{"tokens": [...], "ner_tags": [...]}`` dict per
        input line, the two lists having equal length.
    """
    abbreviations = (
        ("PERSON", "PER"),
        ("LOCATION", "LOC"),
        ("ORGANIZATION", "ORG"),
    )
    records = []

    for sentence in lines:
        tokens, tags = [], []
        current_tag = "O"

        for piece in sentence.split():
            if "<START:" in piece:
                # Opening marker: remember the entity type, emit nothing.
                current_tag = piece.replace("<START:", "").replace(">", "")
                for long_name, short_name in abbreviations:
                    current_tag = current_tag.replace(long_name, short_name)
            elif "<END>" in piece:
                # Closing marker: fall back to the outside tag, emit nothing.
                current_tag = "O"
            else:
                tokens.append(piece)
                tags.append(current_tag)

        records.append({"tokens": tokens, "ner_tags": tags})

    return records

DATA

In [None]:
# Lines with this many characters or more are dropped from both splits.
MAX_LINE_CHARS = 1000


def _read_lines(path):
    """Return every line of the text file at `path`, without line endings.

    The file is decoded explicitly as UTF-8 so the Romanian diacritics in the
    corpus are read the same way regardless of the runtime's default locale.
    """
    with open(path, "r", encoding="utf-8") as f:
        return f.read().splitlines()


def _keep_short_lines(lines, max_chars=MAX_LINE_CHARS):
    """Return the lines whose length is strictly below `max_chars` characters."""
    return [line for line in lines if len(line) < max_chars]


# --- training split ---
data_train = _read_lines(INPUT_FILE_TRAIN)
data_train_clean = _keep_short_lines(data_train)

data_migrated_train = migration(data_train_clean)
train_texts = [item["tokens"] for item in data_migrated_train]
train_tags = [item["ner_tags"] for item in data_migrated_train]

# --- validation split ---
data_validation = _read_lines(INPUT_FILE_VALIDATION)
data_validation_clean = _keep_short_lines(data_validation)

# Line counts before and after the length filter.
print(len(data_train))
print(len(data_validation))
print()
print(len(data_train_clean))
print(len(data_validation_clean))

data_migrated_validation = migration(data_validation_clean)
validation_texts = [item["tokens"] for item in data_migrated_validation]
validation_tags = [item["ner_tags"] for item in data_migrated_validation]

83007
7977

82923
7941


TOKENIZER

In [None]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

# `truncation=True` on its own does nothing for this checkpoint: the tokenizer
# reports no predefined maximum length and falls back to "no truncation" (see
# the warning this cell used to emit). Give it an explicit cap instead.
# NOTE(review): 512 is the usual XLM-R input limit — confirm against the
# checkpoint's `max_position_embeddings` before relying on it.
MAX_SEQ_LENGTH = 512

# Inputs are already split into words (see `migration`), so the tokenizer must
# be told not to re-split on whitespace; `padding=True` pads every sequence in
# a split to that split's longest sequence.
train_encodings = tokenizer(
    train_texts,
    is_split_into_words=True,
    padding=True,
    truncation=True,
    max_length=MAX_SEQ_LENGTH,
)
validation_encodings = tokenizer(
    validation_texts,
    is_split_into_words=True,
    padding=True,
    truncation=True,
    max_length=MAX_SEQ_LENGTH,
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=699.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=5069051.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=150.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2.0, style=ProgressStyle(description_wi…




Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
# Sample:
# Sentence: ['În', 'dimineaţa', 'de', '24', 'ianuarie,', 'la', '29°', ',', 'am', 'zărit', 'insula', 'Keeling,', 'ridicătură', 'madreporică', 'plină', 'de', 'cocotieri', 'minunaţi,', 'care', 'a', 'fost', 'vizitată', 'de', 'Darwin', 'şi', 'de', 'căpitanul', 'Fitz-Roy.']
# Tags: ['O', 'O', 'O', 'O', 'O', 'O', 'IDENTIFIER_LATITUDE_LONGITUDE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
# Tokens: ['<s>', '▁În', '▁di', 'mine', 'a', 'ţa', '▁de', '▁24', '▁ianuarie', ',', '▁la', '▁29', '°', '▁', ',', '▁am', '▁z', 'ări', 't', '▁insula', '▁Ke', 'eling', ',', '▁ridic', 'ătură', '▁madre', 'por', 'ică', '▁plină', '▁de', '▁coco', 'tier', 'i', '▁minun', 'aţi', ',', '▁care', '▁a', '▁fost', '▁vizitat', 'ă', '▁de', '▁Darwin', '▁şi', '▁de', '▁că', 'pitan', 'ul', '▁Fit', 'z', '-', 'Ro', 'y', '.', '</s>']
# Word Ids: [None, 0, 1, 1, 1, 1, 2, 3, 4, 4, 5, 6, 6, 7, 7, 8, 9, 9, 9, 10, 11, 11, 11, 12, 12, 13, 13, 13, 14, 15, 16, 16, 16, 17, 17, 17, 18, 19, 20, 21, 21, 22, 23, 24, 25, 26, 26, 26, 27, 27, 27, 27, 27, 27, None]
# Encoded Tags: [-100, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, -100]


def encode_tags(input_ner_tags, input_tokens, label_all_tokens=True):
    """Align word-level NER tags with the sub-word tokens of a batch encoding.

    Args:
        input_ner_tags: one list of tag names per sentence, in the same order
            as the sentences in `input_tokens`. Every tag must be a key of the
            module-level `tag2id` mapping.
        input_tokens: tokenizer output exposing `word_ids(batch_index=i)`,
            which yields, for every token of sentence `i`, the index of the
            word it came from, or `None` for special / padding tokens.
        label_all_tokens: when True (the default, and the previous hard-coded
            behaviour) every sub-word token receives its word's label; when
            False only the first sub-word of each word is labelled and the
            remaining ones get -100.

    Returns:
        One list of integer label ids per sentence, as long as that sentence's
        token sequence. Tokens without a word id are set to -100.

    Raises:
        KeyError: if a tag name is not present in `tag2id`.
    """
    result = []

    for i, ner_tags in enumerate(input_ner_tags):
        ner_ids = [tag2id[str(tag)] for tag in ner_tags]
        word_ids = input_tokens.word_ids(batch_index=i)

        encoded_ner_ids = []
        previous_word_id = None
        for word_id in word_ids:
            if word_id is None:
                # Special tokens have no word id. They get -100 so that they
                # are automatically ignored in the loss function.
                encoded_ner_ids.append(-100)
            elif word_id != previous_word_id or label_all_tokens:
                # First sub-word of a word, or a continuation sub-word when
                # every token is to be labelled.
                encoded_ner_ids.append(ner_ids[word_id])
            else:
                # Continuation sub-word that should not contribute a label.
                encoded_ner_ids.append(-100)
            previous_word_id = word_id

        result.append(encoded_ner_ids)

    return result

# Token-aligned label ids for each split (training first, then validation).
train_labels, validation_labels = (
    encode_tags(split_tags, split_encodings)
    for split_tags, split_encodings in (
        (train_tags, train_encodings),
        (validation_tags, validation_encodings),
    )
)

In [None]:
# Pair each split's encoded inputs (as a plain dict of features) with its
# token-aligned label ids.
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), train_labels))

validation_dataset = tf.data.Dataset.from_tensor_slices((dict(validation_encodings), validation_labels))

MODEL

In [None]:
# Training hyper-parameters and checkpointing policy.
# Left disabled: evaluation_strategy="epoch", label_names=LABEL_LIST
args = TFTrainingArguments(
    output_dir=f"{OUTPUT_DIR}/results",
    # optimisation
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    # batching
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # checkpointing
    save_steps=10000,
    save_total_limit=3,
)

In [None]:
# Load the pretrained token-classification model inside the distribution
# strategy scope exposed by the training arguments.

with args.strategy.scope():
    model = TFAutoModelForTokenClassification.from_pretrained(MODEL_CHECKPOINT)

In [None]:
# Inverse of LABEL_MAP: class id -> label name.
id2label = {label_id: label_name for label_name, label_id in LABEL_MAP.items()}

# Checkpoint configuration overridden with this notebook's label set.
config = AutoConfig.from_pretrained(
    MODEL_CHECKPOINT,
    label2id=LABEL_MAP,
    id2label=id2label,
    num_labels=len(LABEL_LIST),
)

In [None]:
# Reinitialize the final classification layer to match the new number of labels.
# NOTE(review): presumably the checkpoint's own head has a different output
# size than this notebook's label set — confirm against the checkpoint config.

from transformers.modeling_tf_utils import get_initializer

# Swap in a fresh Dense layer with `config.num_labels` output units, using the
# initializer range taken from the configuration.
model.classifier = tf.keras.layers.Dense(
                config.num_labels, kernel_initializer=get_initializer(config.initializer_range),
                name="classifier"
            )

# Keep the model's metadata consistent with the replaced head.
model.config = config
model.num_labels = config.num_labels

In [None]:
# Wire the model, training arguments and both datasets into the trainer.
# Left disabled: data_collator=data_collator, tokenizer=tokenizer,
# compute_metrics=compute_metrics
trainer = TFTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
)

In [None]:
# Run fine-tuning with the arguments and datasets given to the trainer.
trainer.train()

In [None]:
# trainer.evaluate()

In [None]:
# Write the fine-tuned model under the notebook's output directory.
trainer.save_model(f"{OUTPUT_DIR}/model")

In [None]:
# !rm -rf test-ner 

In [None]:
# !cp /content/test-ner/model/tf_model.h5 drive/MyDrive/tf_model.h5

In [None]:
# !cp /content/test-ner/model/config.json drive/MyDrive/config.json