<a href="https://colab.research.google.com/github/overfit-ir/persian-twitter-ner/blob/master/benchmark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [2]:
!pip -q install transformers==4.2.2
!pip -q install sentencepiece
!pip -q install nereval

[K     |████████████████████████████████| 1.8MB 6.1MB/s 
[K     |████████████████████████████████| 870kB 44.0MB/s 
[K     |████████████████████████████████| 2.9MB 41.4MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 1.2MB 4.3MB/s 
[?25h

In [3]:
import transformers
transformers.__version__

'4.2.2'

In [4]:
import pandas as pd
import numpy as np
from transformers import (
    pipeline, 
    AutoConfig, 
    AutoTokenizer, 
    AutoModel, 
    AutoModelForTokenClassification
)
from pprint import pprint

In [94]:
! rm -rf data
! wget -q --show-progress https://raw.githubusercontent.com/overfit-ir/persian-twitter-ner/master/twitter_data/persian-ner-twitter-data/test.txt
! wget -q --show-progress https://raw.githubusercontent.com/overfit-ir/persian-twitter-ner/master/twitter_data/persian-ner-twitter-data/train.txt
! mkdir data && mv test.txt data/ && mv train.txt data/



In [111]:
! rm -rf data_peyma
! wget -q --show-progress https://raw.githubusercontent.com/overfit-ir/persian-twitter-ner/master/ner_data/peyma/test.txt
! wget -q --show-progress https://raw.githubusercontent.com/overfit-ir/persian-twitter-ner/master/ner_data/peyma/train.txt
! mkdir data_peyma && mv test.txt data_peyma/ && mv train.txt data_peyma/



# Convert to Text

In [7]:
from pathlib import Path
import re

def convert_lines_to_text(file_path, separator='\t'):
    file_path = Path(file_path)

    raw_text = file_path.read_text().strip()
    raw_docs = re.split(r'\n\t?\n', raw_text)
    token_docs = []
    tag_docs = []
    for doc in raw_docs:
        tokens = []
        tags = []
        for line in doc.split('\n'):
            token, tag = line.split(separator, 1)
            tokens.append(token)
            tags.append(tag)
        token_docs.append(tokens)
        tag_docs.append(tags)

    return token_docs, tag_docs

In [95]:
texts, tags = convert_lines_to_text('data/test.txt')
texts_train, tags_train = convert_lines_to_text('data/train.txt')

In [9]:
s = ''
for word in texts[8]:
  s += word + ' '
print(s)

شعار ندهید همین الان بسیاری از سران نظام فعلی ایران دارای تابعیت دوگانه ایرانی و عراقی هستند برادران لاریجانی سردار نقدی دکتر صالحی حتی مرحوم شاهرودی زمانی که رییس قوه قضاییه بود فارسی را به لهجه عربی صحبت میکرد مهم تعهد و عشق به وطن است که در سایه آموزش و رضایت بدست میآید 


In [112]:
texts_peyma, tags_peyma = convert_lines_to_text('data_peyma/test.txt', separator='|')
texts_peyma_train, tags_peyma_train = convert_lines_to_text('data_peyma/test.txt', separator='|')

In [11]:
s = ''
for word in texts_peyma[0]:
  s += word + ' '
print(s)

کنایه سرلشگر فیروزآبادی به پادشاه عربستان و پسرش 


# Benchmark

In [12]:
import logging
from collections import namedtuple
from copy import deepcopy

logging.basicConfig(
    format="%(asctime)s %(name)s %(levelname)s: %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level="DEBUG",
)

Entity = namedtuple("Entity", "e_type start_offset end_offset")

class Evaluator():

    def __init__(self, true, pred, tags):
        """
        """

        if len(true) != len(pred):
            raise ValueError("Number of predicted documents does not equal true")

        self.true = true
        self.pred = pred
        self.tags = tags

        # Setup dict into which metrics will be stored.

        self.metrics_results = {
            'correct': 0,
            'incorrect': 0,
            'partial': 0,
            'missed': 0,
            'spurious': 0,
            'possible': 0,
            'actual': 0,
            'precision': 0,
            'recall': 0,
        }

        # Copy results dict to cover the four schemes.

        self.results = {
            'strict': deepcopy(self.metrics_results),
            'ent_type': deepcopy(self.metrics_results),
            'partial':deepcopy(self.metrics_results),
            'exact':deepcopy(self.metrics_results),
            }

        # Create an accumulator to store results

        self.evaluation_agg_entities_type = {e: deepcopy(self.results) for e in tags}


    def evaluate(self):

        logging.info(
            "Imported %s predictions for %s true examples",
            len(self.pred), len(self.true)
        )

        for true_ents, pred_ents in zip(self.true, self.pred):

            # Check that the length of the true and predicted examples are the
            # same. This must be checked here, because another error may not
            # be thrown if the lengths do not match.

            if len(true_ents) != len(pred_ents):
                raise ValueError("Prediction length does not match true example length")

            # Compute results for one message

            tmp_results, tmp_agg_results = compute_metrics(
                collect_named_entities(true_ents),
                collect_named_entities(pred_ents),
                self.tags
            )

            # Cycle through each result and accumulate

            # TODO: Combine these loops below:

            for eval_schema in self.results:

                for metric in self.results[eval_schema]:

                    self.results[eval_schema][metric] += tmp_results[eval_schema][metric]

            # Calculate global precision and recall

            self.results = compute_precision_recall_wrapper(self.results)

            # Aggregate results by entity type

            for e_type in self.tags:

                for eval_schema in tmp_agg_results[e_type]:

                    for metric in tmp_agg_results[e_type][eval_schema]:

                        self.evaluation_agg_entities_type[e_type][eval_schema][metric] += tmp_agg_results[e_type][eval_schema][metric]

                # Calculate precision recall at the individual entity level

                self.evaluation_agg_entities_type[e_type] = compute_precision_recall_wrapper(self.evaluation_agg_entities_type[e_type])

        return self.results, self.evaluation_agg_entities_type


def collect_named_entities(tokens):
    """
    Creates a list of Entity named-tuples, storing the entity type and the start and end
    offsets of the entity.

    :param tokens: a list of tags
    :return: a list of Entity named-tuples
    """

    named_entities = []
    start_offset = None
    end_offset = None
    ent_type = None

    for offset, token_tag in enumerate(tokens):

        if token_tag == 'O':
            if ent_type is not None and start_offset is not None:
                end_offset = offset - 1
                named_entities.append(Entity(ent_type, start_offset, end_offset))
                start_offset = None
                end_offset = None
                ent_type = None

        elif ent_type is None:
            ent_type = token_tag[2:]
            start_offset = offset

        elif ent_type != token_tag[2:] or (ent_type == token_tag[2:] and token_tag[:1] == 'B'):

            end_offset = offset - 1
            named_entities.append(Entity(ent_type, start_offset, end_offset))

            # start of a new entity
            ent_type = token_tag[2:]
            start_offset = offset
            end_offset = None

    # catches an entity that goes up until the last token

    if ent_type is not None and start_offset is not None and end_offset is None:
        named_entities.append(Entity(ent_type, start_offset, len(tokens)-1))

    return named_entities


def compute_metrics(true_named_entities, pred_named_entities, tags):


    eval_metrics = {'correct': 0, 'incorrect': 0, 'partial': 0, 'missed': 0, 'spurious': 0, 'precision': 0, 'recall': 0}

    # overall results
    
    evaluation = {
        'strict': deepcopy(eval_metrics),
        'ent_type': deepcopy(eval_metrics),
        'partial': deepcopy(eval_metrics),
        'exact': deepcopy(eval_metrics)
    }

    # results by entity type

    evaluation_agg_entities_type = {e: deepcopy(evaluation) for e in tags}

    # keep track of entities that overlapped

    true_which_overlapped_with_pred = []

    # Subset into only the tags that we are interested in.
    # NOTE: we remove the tags we don't want from both the predicted and the
    # true entities. This covers the two cases where mismatches can occur:
    #
    # 1) Where the model predicts a tag that is not present in the true data
    # 2) Where there is a tag in the true data that the model is not capable of
    # predicting.

    true_named_entities = [ent for ent in true_named_entities if ent.e_type in tags]
    pred_named_entities = [ent for ent in pred_named_entities if ent.e_type in tags]

    # go through each predicted named-entity

    for pred in pred_named_entities:
        found_overlap = False

        # Check each of the potential scenarios in turn. See
        # http://www.davidsbatista.net/blog/2018/05/09/Named_Entity_Evaluation/
        # for scenario explanation.

        # Scenario I: Exact match between true and pred

        if pred in true_named_entities:
            true_which_overlapped_with_pred.append(pred)
            evaluation['strict']['correct'] += 1
            evaluation['ent_type']['correct'] += 1
            evaluation['exact']['correct'] += 1
            evaluation['partial']['correct'] += 1

            # for the agg. by e_type results
            evaluation_agg_entities_type[pred.e_type]['strict']['correct'] += 1
            evaluation_agg_entities_type[pred.e_type]['ent_type']['correct'] += 1
            evaluation_agg_entities_type[pred.e_type]['exact']['correct'] += 1
            evaluation_agg_entities_type[pred.e_type]['partial']['correct'] += 1

        else:

            # check for overlaps with any of the true entities

            for true in true_named_entities:

                pred_range = range(pred.start_offset, pred.end_offset)
                true_range = range(true.start_offset, true.end_offset)

                # Scenario IV: Offsets match, but entity type is wrong

                if true.start_offset == pred.start_offset and pred.end_offset == true.end_offset \
                        and true.e_type != pred.e_type:

                    # overall results
                    evaluation['strict']['incorrect'] += 1
                    evaluation['ent_type']['incorrect'] += 1
                    evaluation['partial']['correct'] += 1
                    evaluation['exact']['correct'] += 1

                    # aggregated by entity type results
                    evaluation_agg_entities_type[true.e_type]['strict']['incorrect'] += 1
                    evaluation_agg_entities_type[true.e_type]['ent_type']['incorrect'] += 1
                    evaluation_agg_entities_type[true.e_type]['partial']['correct'] += 1
                    evaluation_agg_entities_type[true.e_type]['exact']['correct'] += 1

                    true_which_overlapped_with_pred.append(true)
                    found_overlap = True

                    break

                # check for an overlap i.e. not exact boundary match, with true entities

                elif find_overlap(true_range, pred_range):

                    true_which_overlapped_with_pred.append(true)

                    # Scenario V: There is an overlap (but offsets do not match
                    # exactly), and the entity type is the same.
                    # 2.1 overlaps with the same entity type

                    if pred.e_type == true.e_type:

                        # overall results
                        evaluation['strict']['incorrect'] += 1
                        evaluation['ent_type']['correct'] += 1
                        evaluation['partial']['partial'] += 1
                        evaluation['exact']['incorrect'] += 1

                        # aggregated by entity type results
                        evaluation_agg_entities_type[true.e_type]['strict']['incorrect'] += 1
                        evaluation_agg_entities_type[true.e_type]['ent_type']['correct'] += 1
                        evaluation_agg_entities_type[true.e_type]['partial']['partial'] += 1
                        evaluation_agg_entities_type[true.e_type]['exact']['incorrect'] += 1

                        found_overlap = True

                        break

                    # Scenario VI: Entities overlap, but the entity type is
                    # different.

                    else:
                        # overall results
                        evaluation['strict']['incorrect'] += 1
                        evaluation['ent_type']['incorrect'] += 1
                        evaluation['partial']['partial'] += 1
                        evaluation['exact']['incorrect'] += 1

                        # aggregated by entity type results
                        # Results against the true entity
                        # print(pred)
                        # print(true)
                        evaluation_agg_entities_type[true.e_type]['strict']['incorrect'] += 1
                        evaluation_agg_entities_type[true.e_type]['partial']['partial'] += 1
                        evaluation_agg_entities_type[true.e_type]['ent_type']['incorrect'] += 1
                        evaluation_agg_entities_type[true.e_type]['exact']['incorrect'] += 1

                        # Results against the predicted entity

                        # evaluation_agg_entities_type[pred.e_type]['strict']['spurious'] += 1

                        found_overlap = True

                        break

            # Scenario II: Entities are spurious (i.e., over-generated).

            if not found_overlap:

                # Overall results

                evaluation['strict']['spurious'] += 1
                evaluation['ent_type']['spurious'] += 1
                evaluation['partial']['spurious'] += 1
                evaluation['exact']['spurious'] += 1
                # print(pred)
                # Aggregated by entity type results
                # print('Pred : ' ,pred)
                evaluation_agg_entities_type[pred.e_type]['strict']['spurious'] += 1
                evaluation_agg_entities_type[pred.e_type]['partial']['spurious'] += 1
                evaluation_agg_entities_type[pred.e_type]['ent_type']['spurious'] += 1
                evaluation_agg_entities_type[pred.e_type]['exact']['spurious'] += 1

                # NOTE: when pred.e_type is not found in tags
                # or when it simply does not appear in the test set, then it is
                # spurious, but it is not clear where to assign it at the tag
                # level. In this case, it is applied to all target_tags
                # found in this example. This will mean that the sum of the
                # evaluation_agg_entities will not equal evaluation.

                # for true in tags:                    
                #     print('True : ' ,true)
                #     evaluation_agg_entities_type[true]['strict']['spurious'] += 1
                #     evaluation_agg_entities_type[true]['ent_type']['spurious'] += 1
                #     evaluation_agg_entities_type[true]['partial']['spurious'] += 1
                #     evaluation_agg_entities_type[true]['exact']['spurious'] += 1

    # Scenario III: Entity was missed entirely.

    for true in true_named_entities:
        if true in true_which_overlapped_with_pred:
            continue
        else:
            # overall results
            evaluation['strict']['missed'] += 1
            evaluation['ent_type']['missed'] += 1
            evaluation['partial']['missed'] += 1
            evaluation['exact']['missed'] += 1

            # for the agg. by e_type
            evaluation_agg_entities_type[true.e_type]['strict']['missed'] += 1
            evaluation_agg_entities_type[true.e_type]['ent_type']['missed'] += 1
            evaluation_agg_entities_type[true.e_type]['partial']['missed'] += 1
            evaluation_agg_entities_type[true.e_type]['exact']['missed'] += 1

    # Compute 'possible', 'actual' according to SemEval-2013 Task 9.1 on the
    # overall results, and use these to calculate precision and recall.

    for eval_type in evaluation:
        evaluation[eval_type] = compute_actual_possible(evaluation[eval_type])

    # Compute 'possible', 'actual', and precision and recall on entity level
    # results. Start by cycling through the accumulated results.

    for entity_type, entity_level in evaluation_agg_entities_type.items():

        # Cycle through the evaluation types for each dict containing entity
        # level results.

        for eval_type in entity_level:

            evaluation_agg_entities_type[entity_type][eval_type] = compute_actual_possible(
                entity_level[eval_type]
            )

    return evaluation, evaluation_agg_entities_type


def find_overlap(true_range, pred_range):
    """Find the overlap between two ranges

    Find the overlap between two ranges. Return the overlapping values if
    present, else return an empty set().

    Examples:

    >>> find_overlap((1, 2), (2, 3))
    2
    >>> find_overlap((1, 2), (3, 4))
    set()
    """

    true_set = set(true_range)
    pred_set = set(pred_range)

    overlaps = true_set.intersection(pred_set)

    return overlaps


def compute_actual_possible(results):
    """
    Takes a result dict that has been output by compute metrics.
    Returns the results dict with actual, possible populated.

    When the results dicts is from partial or ent_type metrics, then
    partial_or_type=True to ensure the right calculation is used for
    calculating precision and recall.
    """

    correct = results['correct']
    incorrect = results['incorrect']
    partial = results['partial']
    missed = results['missed']
    spurious = results['spurious']

    # Possible: number annotations in the gold-standard which contribute to the
    # final score

    possible = correct + incorrect + partial + missed

    # Actual: number of annotations produced by the NER system

    actual = correct + incorrect + spurious

    results["actual"] = actual
    results["possible"] = possible

    return results


def compute_precision_recall(results, partial_or_type=False):
    """
    Takes a result dict that has been output by compute metrics.
    Returns the results dict with precison and recall populated.

    When the results dicts is from partial or ent_type metrics, then
    partial_or_type=True to ensure the right calculation is used for
    calculating precision and recall.
    """

    actual = results["actual"]
    possible = results["possible"]
    partial = results['partial']
    correct = results['correct']

    if partial_or_type:
        precision = (correct + 0.5 * partial) / actual if actual > 0 else 0
        recall = (correct + 0.5 * partial) / possible if possible > 0 else 0

    else:
        precision = correct / actual if actual > 0 else 0
        recall = correct / possible if possible > 0 else 0

    results["precision"] = precision
    results["recall"] = recall

    return results


def compute_precision_recall_wrapper(results):
    """
    Wraps the compute_precision_recall function and runs on a dict of results
    """

    results_a = {key: compute_precision_recall(value, True) for key, value in results.items() if
                 key in ['partial', 'ent_type']}
    results_b = {key: compute_precision_recall(value) for key, value in results.items() if
                 key in ['strict', 'exact']}

    results = {**results_a, **results_b}

    return results


In [13]:
def map_index2label(text, labels):
  index2label = {}
  start_index = 0
  for i, word in enumerate(text):
    end_index = start_index + len(word)
    index2label[(start_index, end_index)] = labels[i]
    start_index = end_index + 1
  return index2label

In [14]:
def align_prediction(texts, labels_list, model, tag_map):
  y_true = []
  y_pred = []
  index = 0
  for text, labels in zip(texts, labels_list):
    index2label_true = map_index2label(text, labels)
    y_true += [value for key, value in index2label_true.items()]
    model_result = model(" ".join(text))
    index2label_pred = {}
    for key, value in index2label_true.items():
      temp = []
      for item in model_result:
        if item['start'] >= key[0] and item['end'] <= key[1]:
          temp.append(item['entity'])
      index2label_pred[(key[0], key[1])] = temp[0]
    y_pred += [ tag_map[value] for key, value in index2label_pred.items()]
    index += 1
    if index%10 == 0:
      print(index)
    if index==500:
      break
  return [y_true], [y_pred]

In [15]:
def benchmark(y_true, y_pred, defined_labels_to_evaluate):
  evaluator = Evaluator(y_true, y_pred, tags=defined_labels_to_evaluate)
  return evaluator.evaluate()

### Albert

In [None]:
tokenizer = AutoTokenizer.from_pretrained("HooshvareLab/albert-fa-zwnj-base-v2-ner")
model = AutoModelForTokenClassification.from_pretrained("HooshvareLab/albert-fa-zwnj-base-v2-ner")
model.eval()
albert_ner = pipeline('ner', model=model, tokenizer=tokenizer, ignore_labels=[])

2021-04-15 11:46:53 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-15 11:46:53 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /HooshvareLab/albert-fa-zwnj-base-v2-ner/resolve/main/config.json HTTP/1.1" 200 0
2021-04-15 11:46:53 filelock DEBUG: Attempting to acquire lock 140377837699664 on /root/.cache/huggingface/transformers/fa5b2b8037a29a88c692d6aa8acb666ea97c74b19ae19cf4469412f27dbdcb2c.b8e9dbd19707b0813c96c809830537f4417f29e1e032848a54aa349912414965.lock
2021-04-15 11:46:53 filelock INFO: Lock 140377837699664 acquired on /root/.cache/huggingface/transformers/fa5b2b8037a29a88c692d6aa8acb666ea97c74b19ae19cf4469412f27dbdcb2c.b8e9dbd19707b0813c96c809830537f4417f29e1e032848a54aa349912414965.lock
2021-04-15 11:46:53 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-15 11:46:54 urllib3.connectionpool DEBUG: https://huggingface.co:443 "GET /HooshvareLab/albert-fa-zwnj-base-v2-ner/resolve/

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1572.0, style=ProgressStyle(description…

2021-04-15 11:46:54 filelock DEBUG: Attempting to release lock 140377837699664 on /root/.cache/huggingface/transformers/fa5b2b8037a29a88c692d6aa8acb666ea97c74b19ae19cf4469412f27dbdcb2c.b8e9dbd19707b0813c96c809830537f4417f29e1e032848a54aa349912414965.lock
2021-04-15 11:46:54 filelock INFO: Lock 140377837699664 released on /root/.cache/huggingface/transformers/fa5b2b8037a29a88c692d6aa8acb666ea97c74b19ae19cf4469412f27dbdcb2c.b8e9dbd19707b0813c96c809830537f4417f29e1e032848a54aa349912414965.lock
2021-04-15 11:46:54 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443





2021-04-15 11:46:54 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /HooshvareLab/albert-fa-zwnj-base-v2-ner/resolve/main/spiece.model HTTP/1.1" 302 0
2021-04-15 11:46:54 filelock DEBUG: Attempting to acquire lock 140377837446096 on /root/.cache/huggingface/transformers/1e61e7998995a2c7b20ae37f17dbd451a93bd0325ee7871fa9992cb75efeff10.d998083122643312c83ef4329b4631782a875c6ee59927666f3a972d118e6c56.lock
2021-04-15 11:46:54 filelock INFO: Lock 140377837446096 acquired on /root/.cache/huggingface/transformers/1e61e7998995a2c7b20ae37f17dbd451a93bd0325ee7871fa9992cb75efeff10.d998083122643312c83ef4329b4631782a875c6ee59927666f3a972d118e6c56.lock
2021-04-15 11:46:54 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): cdn-lfs.huggingface.co:443
2021-04-15 11:46:54 urllib3.connectionpool DEBUG: https://cdn-lfs.huggingface.co:443 "GET /HooshvareLab/albert-fa-zwnj-base-v2-ner/903319b1a4a7e58e49383764d33897a7f49784510247d68438e4f3bff25b01f1 HTTP/1.1" 200 857476


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=857476.0, style=ProgressStyle(descripti…

2021-04-15 11:46:54 filelock DEBUG: Attempting to release lock 140377837446096 on /root/.cache/huggingface/transformers/1e61e7998995a2c7b20ae37f17dbd451a93bd0325ee7871fa9992cb75efeff10.d998083122643312c83ef4329b4631782a875c6ee59927666f3a972d118e6c56.lock
2021-04-15 11:46:54 filelock INFO: Lock 140377837446096 released on /root/.cache/huggingface/transformers/1e61e7998995a2c7b20ae37f17dbd451a93bd0325ee7871fa9992cb75efeff10.d998083122643312c83ef4329b4631782a875c6ee59927666f3a972d118e6c56.lock
2021-04-15 11:46:54 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443





2021-04-15 11:46:55 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /HooshvareLab/albert-fa-zwnj-base-v2-ner/resolve/main/tokenizer.json HTTP/1.1" 200 0
2021-04-15 11:46:55 filelock DEBUG: Attempting to acquire lock 140377838575952 on /root/.cache/huggingface/transformers/309e59ec44042dd72731f77ac9f1b3ea7f27e798b4dda075b4e5b574cfc02823.be961fb8b21d67dbb8485194ab3ba872cca87271b562bf02a994d188607ca243.lock
2021-04-15 11:46:55 filelock INFO: Lock 140377838575952 acquired on /root/.cache/huggingface/transformers/309e59ec44042dd72731f77ac9f1b3ea7f27e798b4dda075b4e5b574cfc02823.be961fb8b21d67dbb8485194ab3ba872cca87271b562bf02a994d188607ca243.lock
2021-04-15 11:46:55 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-15 11:46:55 urllib3.connectionpool DEBUG: https://huggingface.co:443 "GET /HooshvareLab/albert-fa-zwnj-base-v2-ner/resolve/main/tokenizer.json HTTP/1.1" 200 3229336


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=3229336.0, style=ProgressStyle(descript…

2021-04-15 11:46:56 filelock DEBUG: Attempting to release lock 140377838575952 on /root/.cache/huggingface/transformers/309e59ec44042dd72731f77ac9f1b3ea7f27e798b4dda075b4e5b574cfc02823.be961fb8b21d67dbb8485194ab3ba872cca87271b562bf02a994d188607ca243.lock
2021-04-15 11:46:56 filelock INFO: Lock 140377838575952 released on /root/.cache/huggingface/transformers/309e59ec44042dd72731f77ac9f1b3ea7f27e798b4dda075b4e5b574cfc02823.be961fb8b21d67dbb8485194ab3ba872cca87271b562bf02a994d188607ca243.lock
2021-04-15 11:46:56 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443





2021-04-15 11:46:56 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /HooshvareLab/albert-fa-zwnj-base-v2-ner/resolve/main/added_tokens.json HTTP/1.1" 404 0
2021-04-15 11:46:56 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-15 11:46:56 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /HooshvareLab/albert-fa-zwnj-base-v2-ner/resolve/main/special_tokens_map.json HTTP/1.1" 200 0
2021-04-15 11:46:56 filelock DEBUG: Attempting to acquire lock 140377838576400 on /root/.cache/huggingface/transformers/ca8b9cdacd210800376d99bc8fa4ff0ad3ec558f1546ea0edc09066878079e8d.923ec3af02797bf352e5d19ea7c70e2afaa87ef38bcdc8088cbe93a9bf2ba9ab.lock
2021-04-15 11:46:56 filelock INFO: Lock 140377838576400 acquired on /root/.cache/huggingface/transformers/ca8b9cdacd210800376d99bc8fa4ff0ad3ec558f1546ea0edc09066878079e8d.923ec3af02797bf352e5d19ea7c70e2afaa87ef38bcdc8088cbe93a9bf2ba9ab.lock
2021-04-15 11:46:56 urllib3.connectionpool DEBUG: St

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=186.0, style=ProgressStyle(description_…

2021-04-15 11:46:56 filelock DEBUG: Attempting to release lock 140377838576400 on /root/.cache/huggingface/transformers/ca8b9cdacd210800376d99bc8fa4ff0ad3ec558f1546ea0edc09066878079e8d.923ec3af02797bf352e5d19ea7c70e2afaa87ef38bcdc8088cbe93a9bf2ba9ab.lock
2021-04-15 11:46:56 filelock INFO: Lock 140377838576400 released on /root/.cache/huggingface/transformers/ca8b9cdacd210800376d99bc8fa4ff0ad3ec558f1546ea0edc09066878079e8d.923ec3af02797bf352e5d19ea7c70e2afaa87ef38bcdc8088cbe93a9bf2ba9ab.lock
2021-04-15 11:46:56 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443





2021-04-15 11:46:57 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /HooshvareLab/albert-fa-zwnj-base-v2-ner/resolve/main/tokenizer_config.json HTTP/1.1" 200 0
2021-04-15 11:46:57 filelock DEBUG: Attempting to acquire lock 140377837380304 on /root/.cache/huggingface/transformers/48a4c7ed555c3de1299c3fc7a2163800468ce034a5d81f9bfe337882c1a88cf4.a89c16d93351e1a6169a82c9eb6324edc64ff127f4ec6a9a72f6462a98aaf017.lock
2021-04-15 11:46:57 filelock INFO: Lock 140377837380304 acquired on /root/.cache/huggingface/transformers/48a4c7ed555c3de1299c3fc7a2163800468ce034a5d81f9bfe337882c1a88cf4.a89c16d93351e1a6169a82c9eb6324edc64ff127f4ec6a9a72f6462a98aaf017.lock
2021-04-15 11:46:57 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-15 11:46:57 urllib3.connectionpool DEBUG: https://huggingface.co:443 "GET /HooshvareLab/albert-fa-zwnj-base-v2-ner/resolve/main/tokenizer_config.json HTTP/1.1" 200 499


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=499.0, style=ProgressStyle(description_…

2021-04-15 11:46:57 filelock DEBUG: Attempting to release lock 140377837380304 on /root/.cache/huggingface/transformers/48a4c7ed555c3de1299c3fc7a2163800468ce034a5d81f9bfe337882c1a88cf4.a89c16d93351e1a6169a82c9eb6324edc64ff127f4ec6a9a72f6462a98aaf017.lock
2021-04-15 11:46:57 filelock INFO: Lock 140377837380304 released on /root/.cache/huggingface/transformers/48a4c7ed555c3de1299c3fc7a2163800468ce034a5d81f9bfe337882c1a88cf4.a89c16d93351e1a6169a82c9eb6324edc64ff127f4ec6a9a72f6462a98aaf017.lock
2021-04-15 11:46:57 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443





2021-04-15 11:46:57 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /HooshvareLab/albert-fa-zwnj-base-v2-ner/resolve/main/config.json HTTP/1.1" 200 0
2021-04-15 11:46:57 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-15 11:46:58 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /HooshvareLab/albert-fa-zwnj-base-v2-ner/resolve/main/pytorch_model.bin HTTP/1.1" 302 0
2021-04-15 11:46:58 filelock DEBUG: Attempting to acquire lock 140377837382800 on /root/.cache/huggingface/transformers/7b0f0e27645e482092088f175e8c18e4e349fe940811383871e74b60a5ac6fa2.15b683ef18b8b9444c37732797d895b000f4badc046ed32fa916f88decc7ed1d.lock
2021-04-15 11:46:58 filelock INFO: Lock 140377837382800 acquired on /root/.cache/huggingface/transformers/7b0f0e27645e482092088f175e8c18e4e349fe940811383871e74b60a5ac6fa2.15b683ef18b8b9444c37732797d895b000f4badc046ed32fa916f88decc7ed1d.lock
2021-04-15 11:46:58 urllib3.connectionpool DEBUG: Starting new H

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=44451929.0, style=ProgressStyle(descrip…

2021-04-15 11:46:59 filelock DEBUG: Attempting to release lock 140377837382800 on /root/.cache/huggingface/transformers/7b0f0e27645e482092088f175e8c18e4e349fe940811383871e74b60a5ac6fa2.15b683ef18b8b9444c37732797d895b000f4badc046ed32fa916f88decc7ed1d.lock
2021-04-15 11:46:59 filelock INFO: Lock 140377837382800 released on /root/.cache/huggingface/transformers/7b0f0e27645e482092088f175e8c18e4e349fe940811383871e74b60a5ac6fa2.15b683ef18b8b9444c37732797d895b000f4badc046ed32fa916f88decc7ed1d.lock





In [None]:
  tag_map = {
    "B-DAT": 'O',
    "B-EVE": "B-EVE",
    "B-FAC": "B-ORG",
    "B-LOC": "B-LOC",
    "B-MON": "O",
    "B-ORG": "B-ORG",
    "B-PER": "B-PER",
    "B-PRO": "O",
    "B-TIM": "O",
    "B-PCT": "O",
    "I-DAT": "O",
    "I-EVE": "I-EVE",
    "I-FAC": "I-ORG",
    "I-LOC": "I-LOC",
    "I-MON": "O",
    "I-ORG": "I-ORG",
    "I-PER": "I-PER",
    "I-PRO": "O",
    "I-TIM": "O",
    "I-PCT": "O",
    'O': 'O'
  }
y_true, y_perd = align_prediction(texts, tags, albert_ner, tag_map)
benchmark(y_true, y_perd, ['PER', 'LOC', 'ORG', 'EVE'])

10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300


2021-04-15 11:48:12 root INFO: Imported 1 predictions for 1 true examples


({'ent_type': {'actual': 591,
   'correct': 391,
   'incorrect': 46,
   'missed': 173,
   'partial': 0,
   'possible': 610,
   'precision': 0.6615905245346869,
   'recall': 0.6409836065573771,
   'spurious': 154},
  'exact': {'actual': 591,
   'correct': 403,
   'incorrect': 34,
   'missed': 173,
   'partial': 0,
   'possible': 610,
   'precision': 0.6818950930626058,
   'recall': 0.660655737704918,
   'spurious': 154},
  'partial': {'actual': 557,
   'correct': 403,
   'incorrect': 0,
   'missed': 173,
   'partial': 34,
   'possible': 610,
   'precision': 0.7540394973070018,
   'recall': 0.6885245901639344,
   'spurious': 154},
  'strict': {'actual': 591,
   'correct': 365,
   'incorrect': 72,
   'missed': 173,
   'partial': 0,
   'possible': 610,
   'precision': 0.6175972927241963,
   'recall': 0.5983606557377049,
   'spurious': 154}},
 {'EVE': {'ent_type': {'actual': 1,
    'correct': 1,
    'incorrect': 0,
    'missed': 13,
    'partial': 0,
    'possible': 14,
    'precision': 1.0

### Pars Bert

In [None]:
tokenizer = AutoTokenizer.from_pretrained("HooshvareLab/bert-base-parsbert-ner-uncased")
model = AutoModelForTokenClassification.from_pretrained("HooshvareLab/bert-base-parsbert-ner-uncased")
model.eval()
parsbert_ner = pipeline('ner', model=model, tokenizer=tokenizer, ignore_labels=[])

2021-04-15 11:48:12 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-15 11:48:12 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /HooshvareLab/bert-base-parsbert-ner-uncased/resolve/main/config.json HTTP/1.1" 200 0
2021-04-15 11:48:12 filelock DEBUG: Attempting to acquire lock 140377830001744 on /root/.cache/huggingface/transformers/cae79f4815c0cf33be75b6a1aa1d6236ab2646d25c5c26f5083ec2f31a6bc82f.c3f34538a8ec527cdbc5de919e808441d6737abb85fe85a68db484f12dceb83a.lock
2021-04-15 11:48:12 filelock INFO: Lock 140377830001744 acquired on /root/.cache/huggingface/transformers/cae79f4815c0cf33be75b6a1aa1d6236ab2646d25c5c26f5083ec2f31a6bc82f.c3f34538a8ec527cdbc5de919e808441d6737abb85fe85a68db484f12dceb83a.lock
2021-04-15 11:48:12 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-15 11:48:13 urllib3.connectionpool DEBUG: https://huggingface.co:443 "GET /HooshvareLab/bert-base-parsbert-ner-uncased/

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1357.0, style=ProgressStyle(description…

2021-04-15 11:48:13 filelock DEBUG: Attempting to release lock 140377830001744 on /root/.cache/huggingface/transformers/cae79f4815c0cf33be75b6a1aa1d6236ab2646d25c5c26f5083ec2f31a6bc82f.c3f34538a8ec527cdbc5de919e808441d6737abb85fe85a68db484f12dceb83a.lock
2021-04-15 11:48:13 filelock INFO: Lock 140377830001744 released on /root/.cache/huggingface/transformers/cae79f4815c0cf33be75b6a1aa1d6236ab2646d25c5c26f5083ec2f31a6bc82f.c3f34538a8ec527cdbc5de919e808441d6737abb85fe85a68db484f12dceb83a.lock
2021-04-15 11:48:13 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443





2021-04-15 11:48:13 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /HooshvareLab/bert-base-parsbert-ner-uncased/resolve/main/vocab.txt HTTP/1.1" 200 0
2021-04-15 11:48:13 filelock DEBUG: Attempting to acquire lock 140377830232016 on /root/.cache/huggingface/transformers/5df91a905c8036634a1cddbbcdab55e558aec2fd4309a294217d01f403e371f7.6699f2ee4745b6531f79b9781879071b6ace2d2768df83889391421fb44d4474.lock
2021-04-15 11:48:13 filelock INFO: Lock 140377830232016 acquired on /root/.cache/huggingface/transformers/5df91a905c8036634a1cddbbcdab55e558aec2fd4309a294217d01f403e371f7.6699f2ee4745b6531f79b9781879071b6ace2d2768df83889391421fb44d4474.lock
2021-04-15 11:48:13 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-15 11:48:13 urllib3.connectionpool DEBUG: https://huggingface.co:443 "GET /HooshvareLab/bert-base-parsbert-ner-uncased/resolve/main/vocab.txt HTTP/1.1" 200 1215509


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1215509.0, style=ProgressStyle(descript…

2021-04-15 11:48:14 filelock DEBUG: Attempting to release lock 140377830232016 on /root/.cache/huggingface/transformers/5df91a905c8036634a1cddbbcdab55e558aec2fd4309a294217d01f403e371f7.6699f2ee4745b6531f79b9781879071b6ace2d2768df83889391421fb44d4474.lock
2021-04-15 11:48:14 filelock INFO: Lock 140377830232016 released on /root/.cache/huggingface/transformers/5df91a905c8036634a1cddbbcdab55e558aec2fd4309a294217d01f403e371f7.6699f2ee4745b6531f79b9781879071b6ace2d2768df83889391421fb44d4474.lock
2021-04-15 11:48:14 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443





2021-04-15 11:48:14 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /HooshvareLab/bert-base-parsbert-ner-uncased/resolve/main/tokenizer.json HTTP/1.1" 404 0
2021-04-15 11:48:14 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-15 11:48:14 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /HooshvareLab/bert-base-parsbert-ner-uncased/resolve/main/added_tokens.json HTTP/1.1" 404 0
2021-04-15 11:48:14 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-15 11:48:15 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /HooshvareLab/bert-base-parsbert-ner-uncased/resolve/main/special_tokens_map.json HTTP/1.1" 200 0
2021-04-15 11:48:15 filelock DEBUG: Attempting to acquire lock 140377830230032 on /root/.cache/huggingface/transformers/9c399f248ec41d037b934b424658c33cd7050d5eb47eadfaaebef9859880f728.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d.lock
2021-04-1

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…

2021-04-15 11:48:15 filelock DEBUG: Attempting to release lock 140377830230032 on /root/.cache/huggingface/transformers/9c399f248ec41d037b934b424658c33cd7050d5eb47eadfaaebef9859880f728.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d.lock
2021-04-15 11:48:15 filelock INFO: Lock 140377830230032 released on /root/.cache/huggingface/transformers/9c399f248ec41d037b934b424658c33cd7050d5eb47eadfaaebef9859880f728.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d.lock
2021-04-15 11:48:15 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443





2021-04-15 11:48:15 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /HooshvareLab/bert-base-parsbert-ner-uncased/resolve/main/tokenizer_config.json HTTP/1.1" 200 0
2021-04-15 11:48:15 filelock DEBUG: Attempting to acquire lock 140377837293008 on /root/.cache/huggingface/transformers/d9c378c1d878d4339ca3cfb8759681d66bb8a5b79d66e9905733939c2e47416e.5cc6e825eb228a7a5cfd27cb4d7151e97a79fb962b31aaf1813aa102e746584b.lock
2021-04-15 11:48:15 filelock INFO: Lock 140377837293008 acquired on /root/.cache/huggingface/transformers/d9c378c1d878d4339ca3cfb8759681d66bb8a5b79d66e9905733939c2e47416e.5cc6e825eb228a7a5cfd27cb4d7151e97a79fb962b31aaf1813aa102e746584b.lock
2021-04-15 11:48:15 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-15 11:48:16 urllib3.connectionpool DEBUG: https://huggingface.co:443 "GET /HooshvareLab/bert-base-parsbert-ner-uncased/resolve/main/tokenizer_config.json HTTP/1.1" 200 2


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2.0, style=ProgressStyle(description_wi…

2021-04-15 11:48:16 filelock DEBUG: Attempting to release lock 140377837293008 on /root/.cache/huggingface/transformers/d9c378c1d878d4339ca3cfb8759681d66bb8a5b79d66e9905733939c2e47416e.5cc6e825eb228a7a5cfd27cb4d7151e97a79fb962b31aaf1813aa102e746584b.lock
2021-04-15 11:48:16 filelock INFO: Lock 140377837293008 released on /root/.cache/huggingface/transformers/d9c378c1d878d4339ca3cfb8759681d66bb8a5b79d66e9905733939c2e47416e.5cc6e825eb228a7a5cfd27cb4d7151e97a79fb962b31aaf1813aa102e746584b.lock





2021-04-15 11:48:16 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-15 11:48:16 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /HooshvareLab/bert-base-parsbert-ner-uncased/resolve/main/config.json HTTP/1.1" 200 0
2021-04-15 11:48:16 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-15 11:48:17 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /HooshvareLab/bert-base-parsbert-ner-uncased/resolve/main/pytorch_model.bin HTTP/1.1" 302 0
2021-04-15 11:48:17 filelock DEBUG: Attempting to acquire lock 140377837156816 on /root/.cache/huggingface/transformers/74434d7ca32de74f5c4e1ef13ddcee2b2fe7b4a2861e8141267ef695011b0df7.aee1184fa3234eec730b51786e2e50883bb3b2e1312308fcb0999d13cb2ba8c5.lock
2021-04-15 11:48:17 filelock INFO: Lock 140377837156816 acquired on /root/.cache/huggingface/transformers/74434d7ca32de74f5c4e1ef13ddcee2b2fe7b4a2861e8141267ef695011b0df7.aee1184fa3234eec730b51

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=651478185.0, style=ProgressStyle(descri…

2021-04-15 11:48:31 filelock DEBUG: Attempting to release lock 140377837156816 on /root/.cache/huggingface/transformers/74434d7ca32de74f5c4e1ef13ddcee2b2fe7b4a2861e8141267ef695011b0df7.aee1184fa3234eec730b51786e2e50883bb3b2e1312308fcb0999d13cb2ba8c5.lock
2021-04-15 11:48:31 filelock INFO: Lock 140377837156816 released on /root/.cache/huggingface/transformers/74434d7ca32de74f5c4e1ef13ddcee2b2fe7b4a2861e8141267ef695011b0df7.aee1184fa3234eec730b51786e2e50883bb3b2e1312308fcb0999d13cb2ba8c5.lock





In [None]:
  tag_map = {
    "B-date": 'O',
    "B-event": "B-EVE",
    "B-facility": "B-ORG",
    "B-location": "B-LOC",
    "B-money": "O",
    "B-organization": "B-ORG",
    "B-person": "B-PER",
    "B-product": "O",
    "B-time": "O",
    "B-percent": "O",
    "I-date": "O",
    "I-event": "I-EVE",
    "I-facility": "I-ORG",
    "I-location": "I-LOC",
    "I-money": "O",
    "I-organization": "I-ORG",
    "I-person": "I-PER",
    "I-product": "O",
    "I-time": "O",
    "I-percent": "O",
    'O': 'O'
  }
y_true, y_perd = align_prediction(texts, tags, parsbert_ner, tag_map)
benchmark(y_true, y_perd, ['PER', 'LOC', 'ORG', 'EVE'])

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300


2021-04-15 11:49:30 root INFO: Imported 1 predictions for 1 true examples


({'ent_type': {'actual': 598,
   'correct': 434,
   'incorrect': 37,
   'missed': 137,
   'partial': 0,
   'possible': 608,
   'precision': 0.725752508361204,
   'recall': 0.7138157894736842,
   'spurious': 127},
  'exact': {'actual': 598,
   'correct': 452,
   'incorrect': 19,
   'missed': 137,
   'partial': 0,
   'possible': 608,
   'precision': 0.7558528428093646,
   'recall': 0.743421052631579,
   'spurious': 127},
  'partial': {'actual': 579,
   'correct': 452,
   'incorrect': 0,
   'missed': 137,
   'partial': 19,
   'possible': 608,
   'precision': 0.7970639032815199,
   'recall': 0.759046052631579,
   'spurious': 127},
  'strict': {'actual': 598,
   'correct': 419,
   'incorrect': 52,
   'missed': 137,
   'partial': 0,
   'possible': 608,
   'precision': 0.7006688963210702,
   'recall': 0.6891447368421053,
   'spurious': 127}},
 {'EVE': {'ent_type': {'actual': 2,
    'correct': 1,
    'incorrect': 0,
    'missed': 13,
    'partial': 0,
    'possible': 14,
    'precision': 0.5,


In [None]:
benchmark(y_true, y_perd, ['PER', 'LOC', 'ORG', 'EVE'])

2021-04-15 11:49:30 root INFO: Imported 1 predictions for 1 true examples


({'ent_type': {'actual': 598,
   'correct': 434,
   'incorrect': 37,
   'missed': 137,
   'partial': 0,
   'possible': 608,
   'precision': 0.725752508361204,
   'recall': 0.7138157894736842,
   'spurious': 127},
  'exact': {'actual': 598,
   'correct': 452,
   'incorrect': 19,
   'missed': 137,
   'partial': 0,
   'possible': 608,
   'precision': 0.7558528428093646,
   'recall': 0.743421052631579,
   'spurious': 127},
  'partial': {'actual': 579,
   'correct': 452,
   'incorrect': 0,
   'missed': 137,
   'partial': 19,
   'possible': 608,
   'precision': 0.7970639032815199,
   'recall': 0.759046052631579,
   'spurious': 127},
  'strict': {'actual': 598,
   'correct': 419,
   'incorrect': 52,
   'missed': 137,
   'partial': 0,
   'possible': 608,
   'precision': 0.7006688963210702,
   'recall': 0.6891447368421053,
   'spurious': 127}},
 {'EVE': {'ent_type': {'actual': 2,
    'correct': 1,
    'incorrect': 0,
    'missed': 13,
    'partial': 0,
    'possible': 14,
    'precision': 0.5,


### XLMR

In [None]:
nlp_ner = pipeline(
    "ner",
    model="jplu/tf-xlm-r-ner-40-lang",
    tokenizer=(
        'jplu/tf-xlm-r-ner-40-lang',  
        {"use_fast": True}),
    framework="tf",
    ignore_labels=[],
)

2021-04-15 11:49:30 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-15 11:49:30 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /jplu/tf-xlm-r-ner-40-lang/resolve/main/config.json HTTP/1.1" 200 0
2021-04-15 11:49:30 filelock DEBUG: Attempting to acquire lock 140380418734544 on /root/.cache/huggingface/transformers/592edd8ca19e03c273c0d78107e6114dd71159783068fd8a7055ef64a925bff4.8febe5547bd288d4fdf088fad643ea91ad188f30915e46d3b989a3d06364d222.lock
2021-04-15 11:49:30 filelock INFO: Lock 140380418734544 acquired on /root/.cache/huggingface/transformers/592edd8ca19e03c273c0d78107e6114dd71159783068fd8a7055ef64a925bff4.8febe5547bd288d4fdf088fad643ea91ad188f30915e46d3b989a3d06364d222.lock
2021-04-15 11:49:30 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-15 11:49:31 urllib3.connectionpool DEBUG: https://huggingface.co:443 "GET /jplu/tf-xlm-r-ner-40-lang/resolve/main/config.json HTTP/1.1" 2

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=699.0, style=ProgressStyle(description_…

2021-04-15 11:49:31 filelock DEBUG: Attempting to release lock 140380418734544 on /root/.cache/huggingface/transformers/592edd8ca19e03c273c0d78107e6114dd71159783068fd8a7055ef64a925bff4.8febe5547bd288d4fdf088fad643ea91ad188f30915e46d3b989a3d06364d222.lock
2021-04-15 11:49:31 filelock INFO: Lock 140380418734544 released on /root/.cache/huggingface/transformers/592edd8ca19e03c273c0d78107e6114dd71159783068fd8a7055ef64a925bff4.8febe5547bd288d4fdf088fad643ea91ad188f30915e46d3b989a3d06364d222.lock
2021-04-15 11:49:31 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443





2021-04-15 11:49:31 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /jplu/tf-xlm-r-ner-40-lang/resolve/main/sentencepiece.bpe.model HTTP/1.1" 200 0
2021-04-15 11:49:31 filelock DEBUG: Attempting to acquire lock 140377798113168 on /root/.cache/huggingface/transformers/610d44b2598cdb5337559ed970ff2fc69117c0d36786c2d108aacecc439b4599.00628a9eeb8baf4080d44a0abe9fe8057893de20c7cb6e6423cddbf452f7d4d8.lock
2021-04-15 11:49:31 filelock INFO: Lock 140377798113168 acquired on /root/.cache/huggingface/transformers/610d44b2598cdb5337559ed970ff2fc69117c0d36786c2d108aacecc439b4599.00628a9eeb8baf4080d44a0abe9fe8057893de20c7cb6e6423cddbf452f7d4d8.lock
2021-04-15 11:49:31 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-15 11:49:31 urllib3.connectionpool DEBUG: https://huggingface.co:443 "GET /jplu/tf-xlm-r-ner-40-lang/resolve/main/sentencepiece.bpe.model HTTP/1.1" 200 5069051


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=5069051.0, style=ProgressStyle(descript…

2021-04-15 11:49:32 filelock DEBUG: Attempting to release lock 140377798113168 on /root/.cache/huggingface/transformers/610d44b2598cdb5337559ed970ff2fc69117c0d36786c2d108aacecc439b4599.00628a9eeb8baf4080d44a0abe9fe8057893de20c7cb6e6423cddbf452f7d4d8.lock
2021-04-15 11:49:32 filelock INFO: Lock 140377798113168 released on /root/.cache/huggingface/transformers/610d44b2598cdb5337559ed970ff2fc69117c0d36786c2d108aacecc439b4599.00628a9eeb8baf4080d44a0abe9fe8057893de20c7cb6e6423cddbf452f7d4d8.lock
2021-04-15 11:49:32 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443





2021-04-15 11:49:32 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /jplu/tf-xlm-r-ner-40-lang/resolve/main/tokenizer.json HTTP/1.1" 404 0
2021-04-15 11:49:32 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-15 11:49:33 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /jplu/tf-xlm-r-ner-40-lang/resolve/main/added_tokens.json HTTP/1.1" 404 0
2021-04-15 11:49:33 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-15 11:49:33 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /jplu/tf-xlm-r-ner-40-lang/resolve/main/special_tokens_map.json HTTP/1.1" 200 0
2021-04-15 11:49:33 filelock DEBUG: Attempting to acquire lock 140377798113168 on /root/.cache/huggingface/transformers/fb6b2e33c0bb4c25b56431adf3869c18c5a20e80acb8a45672b2020ab6019f03.0dc5b1041f62041ebbd23b1297f2f573769d5c97d8b7c28180ec86b8f6185aa8.lock
2021-04-15 11:49:33 filelock INFO: Lock 140377798113168 acquire

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=150.0, style=ProgressStyle(description_…

2021-04-15 11:49:33 filelock DEBUG: Attempting to release lock 140377798113168 on /root/.cache/huggingface/transformers/fb6b2e33c0bb4c25b56431adf3869c18c5a20e80acb8a45672b2020ab6019f03.0dc5b1041f62041ebbd23b1297f2f573769d5c97d8b7c28180ec86b8f6185aa8.lock
2021-04-15 11:49:33 filelock INFO: Lock 140377798113168 released on /root/.cache/huggingface/transformers/fb6b2e33c0bb4c25b56431adf3869c18c5a20e80acb8a45672b2020ab6019f03.0dc5b1041f62041ebbd23b1297f2f573769d5c97d8b7c28180ec86b8f6185aa8.lock
2021-04-15 11:49:33 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443





2021-04-15 11:49:33 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /jplu/tf-xlm-r-ner-40-lang/resolve/main/tokenizer_config.json HTTP/1.1" 200 0
2021-04-15 11:49:33 filelock DEBUG: Attempting to acquire lock 140377798101968 on /root/.cache/huggingface/transformers/192171f27b852eb9adff1624e0ed6dca9bf1aed82ece5566077669556838808c.5cc6e825eb228a7a5cfd27cb4d7151e97a79fb962b31aaf1813aa102e746584b.lock
2021-04-15 11:49:33 filelock INFO: Lock 140377798101968 acquired on /root/.cache/huggingface/transformers/192171f27b852eb9adff1624e0ed6dca9bf1aed82ece5566077669556838808c.5cc6e825eb228a7a5cfd27cb4d7151e97a79fb962b31aaf1813aa102e746584b.lock
2021-04-15 11:49:33 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-15 11:49:34 urllib3.connectionpool DEBUG: https://huggingface.co:443 "GET /jplu/tf-xlm-r-ner-40-lang/resolve/main/tokenizer_config.json HTTP/1.1" 200 2


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2.0, style=ProgressStyle(description_wi…

2021-04-15 11:49:34 filelock DEBUG: Attempting to release lock 140377798101968 on /root/.cache/huggingface/transformers/192171f27b852eb9adff1624e0ed6dca9bf1aed82ece5566077669556838808c.5cc6e825eb228a7a5cfd27cb4d7151e97a79fb962b31aaf1813aa102e746584b.lock
2021-04-15 11:49:34 filelock INFO: Lock 140377798101968 released on /root/.cache/huggingface/transformers/192171f27b852eb9adff1624e0ed6dca9bf1aed82ece5566077669556838808c.5cc6e825eb228a7a5cfd27cb4d7151e97a79fb962b31aaf1813aa102e746584b.lock





2021-04-15 11:49:35 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-15 11:49:36 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /jplu/tf-xlm-r-ner-40-lang/resolve/main/modelcard.json HTTP/1.1" 404 0
2021-04-15 11:49:36 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-15 11:49:36 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /jplu/tf-xlm-r-ner-40-lang/resolve/main/config.json HTTP/1.1" 200 0
2021-04-15 11:49:36 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-15 11:49:36 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /jplu/tf-xlm-r-ner-40-lang/resolve/main/tf_model.h5 HTTP/1.1" 302 0
2021-04-15 11:49:36 filelock DEBUG: Attempting to acquire lock 140377798113168 on /root/.cache/huggingface/transformers/309b1bef385d3c2d3a7f8e15f32f93a5347508e13edfc0f562999e06a78887e7.52fe00cc1309b8aa154b26b5e0c7b34af351c439d525ff6

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1112459008.0, style=ProgressStyle(descr…

2021-04-15 11:50:01 filelock DEBUG: Attempting to release lock 140377798113168 on /root/.cache/huggingface/transformers/309b1bef385d3c2d3a7f8e15f32f93a5347508e13edfc0f562999e06a78887e7.52fe00cc1309b8aa154b26b5e0c7b34af351c439d525ff6501d3fe634daf8aef.h5.lock
2021-04-15 11:50:01 filelock INFO: Lock 140377798113168 released on /root/.cache/huggingface/transformers/309b1bef385d3c2d3a7f8e15f32f93a5347508e13edfc0f562999e06a78887e7.52fe00cc1309b8aa154b26b5e0c7b34af351c439d525ff6501d3fe634daf8aef.h5.lock





Some layers from the model checkpoint at jplu/tf-xlm-r-ner-40-lang were not used when initializing TFXLMRobertaForTokenClassification: ['dropout_38']
- This IS expected if you are initializing TFXLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFXLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFXLMRobertaForTokenClassification were initialized from the model checkpoint at jplu/tf-xlm-r-ner-40-lang.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLMRobertaForTokenClassification for predictions without further training.


In [None]:
tag_map = {
    "PER": 'B-PER',
    "LOC": "B-LOC",
    "ORG": "B-ORG",
    'O': 'O'
}
y_true, y_perd = align_prediction(texts, tags, nlp_ner, tag_map)
benchmark(y_true, y_perd, ['PER', 'LOC', 'ORG'])

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Instructions for updating:
Use tf.identity instead.


Instructions for updating:
Use tf.identity instead.


10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300


2021-04-15 11:51:50 root INFO: Imported 1 predictions for 1 true examples


({'ent_type': {'actual': 1223,
   'correct': 247,
   'incorrect': 49,
   'missed': 298,
   'partial': 0,
   'possible': 594,
   'precision': 0.2019623875715454,
   'recall': 0.4158249158249158,
   'spurious': 927},
  'exact': {'actual': 1223,
   'correct': 296,
   'incorrect': 0,
   'missed': 298,
   'partial': 0,
   'possible': 594,
   'precision': 0.24202780049059688,
   'recall': 0.4983164983164983,
   'spurious': 927},
  'partial': {'actual': 1223,
   'correct': 296,
   'incorrect': 0,
   'missed': 298,
   'partial': 0,
   'possible': 594,
   'precision': 0.24202780049059688,
   'recall': 0.4983164983164983,
   'spurious': 927},
  'strict': {'actual': 1223,
   'correct': 247,
   'incorrect': 49,
   'missed': 298,
   'partial': 0,
   'possible': 594,
   'precision': 0.2019623875715454,
   'recall': 0.4158249158249158,
   'spurious': 927}},
 {'LOC': {'ent_type': {'actual': 259,
    'correct': 131,
    'incorrect': 34,
    'missed': 56,
    'partial': 0,
    'possible': 221,
    'prec

### Our Model: using fine tunning

In [17]:
from transformers import TFAutoModelForTokenClassification

In [18]:
tokenizer = AutoTokenizer.from_pretrained("overfit/twiner-bert-base")
model = TFAutoModelForTokenClassification.from_pretrained("overfit/twiner-bert-base")
twiner_seq = pipeline('ner', model=model, tokenizer=tokenizer, ignore_labels=[])

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1124.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1215509.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=354.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=649311328.0, style=ProgressStyle(descri…




All model checkpoint layers were used when initializing TFBertForTokenClassification.

All the layers of TFBertForTokenClassification were initialized from the model checkpoint at overfit/twiner-bert-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForTokenClassification for predictions without further training.


In [None]:
tag_map = {
  "B-EVE": "B-EVE",
  "B-LOC": "B-LOC",
  "B-ORG": "B-ORG",
  "B-PER": "B-PER",
  "B-POG": "B_POG",
  "B-NAT": "B-NAT",
  "I-EVE": "I-EVE",
  "I-LOC": "I-LOC",
  "I-ORG": "I-ORG",
  "I-PER": "I-PER",
  "I-POG": "I_POG",
  "I-NAT": "I-NAT",
  'O': 'O'
}
y_true, y_perd = align_prediction(texts, tags, twiner_seq, tag_map)
benchmark(y_true, y_perd, ['PER', 'LOC', 'ORG', 'EVE'])

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Instructions for updating:
Use tf.identity instead.


Instructions for updating:
Use tf.identity instead.


10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300


2021-04-16 09:43:35 root INFO: Imported 1 predictions for 1 true examples


({'ent_type': {'actual': 631,
   'correct': 516,
   'incorrect': 21,
   'missed': 71,
   'partial': 0,
   'possible': 608,
   'precision': 0.8177496038034865,
   'recall': 0.8486842105263158,
   'spurious': 94},
  'exact': {'actual': 631,
   'correct': 517,
   'incorrect': 20,
   'missed': 71,
   'partial': 0,
   'possible': 608,
   'precision': 0.8193343898573693,
   'recall': 0.850328947368421,
   'spurious': 94},
  'partial': {'actual': 611,
   'correct': 517,
   'incorrect': 0,
   'missed': 71,
   'partial': 20,
   'possible': 608,
   'precision': 0.8625204582651391,
   'recall': 0.8667763157894737,
   'spurious': 94},
  'strict': {'actual': 631,
   'correct': 498,
   'incorrect': 39,
   'missed': 71,
   'partial': 0,
   'possible': 608,
   'precision': 0.7892234548335975,
   'recall': 0.819078947368421,
   'spurious': 94}},
 {'EVE': {'ent_type': {'actual': 13,
    'correct': 7,
    'incorrect': 0,
    'missed': 7,
    'partial': 0,
    'possible': 14,
    'precision': 0.5384615384

In [None]:
y_true, y_perd = align_prediction(texts, tags, twiner_seq, tag_map)
benchmark(y_true, y_perd, ['PER', 'LOC', 'ORG', 'EVE', 'POG', 'NAT'])

10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300


2021-04-16 09:52:43 root INFO: Imported 1 predictions for 1 true examples


({'ent_type': {'actual': 689,
   'correct': 549,
   'incorrect': 30,
   'missed': 81,
   'partial': 0,
   'possible': 660,
   'precision': 0.7968069666182874,
   'recall': 0.8318181818181818,
   'spurious': 110},
  'exact': {'actual': 689,
   'correct': 558,
   'incorrect': 21,
   'missed': 81,
   'partial': 0,
   'possible': 660,
   'precision': 0.8098693759071117,
   'recall': 0.8454545454545455,
   'spurious': 110},
  'partial': {'actual': 668,
   'correct': 558,
   'incorrect': 0,
   'missed': 81,
   'partial': 21,
   'possible': 660,
   'precision': 0.8510479041916168,
   'recall': 0.8613636363636363,
   'spurious': 110},
  'strict': {'actual': 689,
   'correct': 530,
   'incorrect': 49,
   'missed': 81,
   'partial': 0,
   'possible': 660,
   'precision': 0.7692307692307693,
   'recall': 0.803030303030303,
   'spurious': 110}},
 {'EVE': {'ent_type': {'actual': 13,
    'correct': 7,
    'incorrect': 0,
    'missed': 7,
    'partial': 0,
    'possible': 14,
    'precision': 0.53846

In [24]:
tag_map = {
  "B-EVE": "O",
  "B-LOC": "B_LOC",
  "B-ORG": "B_ORG",
  "B-PER": "B_PER",
  "B-POG": "O",
  "B-NAT": "O",
  "I-EVE": "O",
  "I-LOC": "I_LOC",
  "I-ORG": "I_ORG",
  "I-PER": "I_PER",
  "I-POG": "O",
  "I-NAT": "O",
  'O': 'O'
}
y_true, y_perd = align_prediction(texts_peyma, tags_peyma, twiner_seq, tag_map)
benchmark(y_true, y_perd, ['PER', 'LOC', 'ORG'])

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Instructions for updating:
Use tf.identity instead.


Instructions for updating:
Use tf.identity instead.


10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490


2021-04-17 15:02:10 root INFO: Imported 1 predictions for 1 true examples


500


({'ent_type': {'actual': 850,
   'correct': 684,
   'incorrect': 14,
   'missed': 147,
   'partial': 0,
   'possible': 845,
   'precision': 0.8047058823529412,
   'recall': 0.8094674556213017,
   'spurious': 152},
  'exact': {'actual': 850,
   'correct': 643,
   'incorrect': 55,
   'missed': 147,
   'partial': 0,
   'possible': 845,
   'precision': 0.7564705882352941,
   'recall': 0.7609467455621302,
   'spurious': 152},
  'partial': {'actual': 795,
   'correct': 643,
   'incorrect': 0,
   'missed': 147,
   'partial': 55,
   'possible': 845,
   'precision': 0.8433962264150944,
   'recall': 0.7934911242603551,
   'spurious': 152},
  'strict': {'actual': 850,
   'correct': 638,
   'incorrect': 60,
   'missed': 147,
   'partial': 0,
   'possible': 845,
   'precision': 0.7505882352941177,
   'recall': 0.7550295857988165,
   'spurious': 152}},
 {'LOC': {'ent_type': {'actual': 318,
    'correct': 249,
    'incorrect': 3,
    'missed': 54,
    'partial': 0,
    'possible': 306,
    'precision

### Our Model using MTL

In [15]:
from transformers import TFAutoModelForTokenClassification

In [16]:
tokenizer = AutoTokenizer.from_pretrained("overfit/twiner-bert-base-mtl")
model = TFAutoModelForTokenClassification.from_pretrained("overfit/twiner-bert-base-mtl")
twiner_mtl = pipeline('ner', model=model, tokenizer=tokenizer, ignore_labels=[])

2021-04-17 17:52:11 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-17 17:52:11 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /overfit/twiner-bert-base-mtl/resolve/main/config.json HTTP/1.1" 200 0
2021-04-17 17:52:11 filelock DEBUG: Attempting to acquire lock 140370185980624 on /root/.cache/huggingface/transformers/6aae46c25a22a58e18b1c1b6adc8a8b0af058d1c9d46d9235c77312bfcedcb5c.24ecbabe775c7edabcc88227cb13d0c504150601e1c0dbb1dd2c7295f6766cde.lock
2021-04-17 17:52:11 filelock INFO: Lock 140370185980624 acquired on /root/.cache/huggingface/transformers/6aae46c25a22a58e18b1c1b6adc8a8b0af058d1c9d46d9235c77312bfcedcb5c.24ecbabe775c7edabcc88227cb13d0c504150601e1c0dbb1dd2c7295f6766cde.lock
2021-04-17 17:52:11 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-17 17:52:11 urllib3.connectionpool DEBUG: https://huggingface.co:443 "GET /overfit/twiner-bert-base-mtl/resolve/main/config.json HTTP/

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1124.0, style=ProgressStyle(description…

2021-04-17 17:52:11 filelock DEBUG: Attempting to release lock 140370185980624 on /root/.cache/huggingface/transformers/6aae46c25a22a58e18b1c1b6adc8a8b0af058d1c9d46d9235c77312bfcedcb5c.24ecbabe775c7edabcc88227cb13d0c504150601e1c0dbb1dd2c7295f6766cde.lock
2021-04-17 17:52:11 filelock INFO: Lock 140370185980624 released on /root/.cache/huggingface/transformers/6aae46c25a22a58e18b1c1b6adc8a8b0af058d1c9d46d9235c77312bfcedcb5c.24ecbabe775c7edabcc88227cb13d0c504150601e1c0dbb1dd2c7295f6766cde.lock
2021-04-17 17:52:11 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-17 17:52:11 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /overfit/twiner-bert-base-mtl/resolve/main/vocab.txt HTTP/1.1" 200 0
2021-04-17 17:52:11 filelock DEBUG: Attempting to acquire lock 140370181925136 on /root/.cache/huggingface/transformers/0e254b340265789f234ea0a71cd8228e05e94a5d19068a74ea35abe12aa012c9.6699f2ee4745b6531f79b9781879071b6ace2d2768df83889391421fb44d447




2021-04-17 17:52:11 urllib3.connectionpool DEBUG: https://huggingface.co:443 "GET /overfit/twiner-bert-base-mtl/resolve/main/vocab.txt HTTP/1.1" 200 1215509


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1215509.0, style=ProgressStyle(descript…

2021-04-17 17:52:12 filelock DEBUG: Attempting to release lock 140370181925136 on /root/.cache/huggingface/transformers/0e254b340265789f234ea0a71cd8228e05e94a5d19068a74ea35abe12aa012c9.6699f2ee4745b6531f79b9781879071b6ace2d2768df83889391421fb44d4474.lock
2021-04-17 17:52:12 filelock INFO: Lock 140370181925136 released on /root/.cache/huggingface/transformers/0e254b340265789f234ea0a71cd8228e05e94a5d19068a74ea35abe12aa012c9.6699f2ee4745b6531f79b9781879071b6ace2d2768df83889391421fb44d4474.lock
2021-04-17 17:52:12 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-17 17:52:12 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /overfit/twiner-bert-base-mtl/resolve/main/tokenizer.json HTTP/1.1" 404 0
2021-04-17 17:52:12 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443





2021-04-17 17:52:12 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /overfit/twiner-bert-base-mtl/resolve/main/added_tokens.json HTTP/1.1" 404 0
2021-04-17 17:52:12 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-17 17:52:12 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /overfit/twiner-bert-base-mtl/resolve/main/special_tokens_map.json HTTP/1.1" 200 0
2021-04-17 17:52:12 filelock DEBUG: Attempting to acquire lock 140370185980624 on /root/.cache/huggingface/transformers/cfb340e9b5920789a748ef17883dab7fe74644822190f9624abb224a48e96638.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d.lock
2021-04-17 17:52:12 filelock INFO: Lock 140370185980624 acquired on /root/.cache/huggingface/transformers/cfb340e9b5920789a748ef17883dab7fe74644822190f9624abb224a48e96638.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d.lock
2021-04-17 17:52:12 urllib3.connectionpool DEBUG: Starting new HTTPS conne

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…

2021-04-17 17:52:12 filelock DEBUG: Attempting to release lock 140370185980624 on /root/.cache/huggingface/transformers/cfb340e9b5920789a748ef17883dab7fe74644822190f9624abb224a48e96638.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d.lock
2021-04-17 17:52:12 filelock INFO: Lock 140370185980624 released on /root/.cache/huggingface/transformers/cfb340e9b5920789a748ef17883dab7fe74644822190f9624abb224a48e96638.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d.lock
2021-04-17 17:52:12 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-17 17:52:12 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /overfit/twiner-bert-base-mtl/resolve/main/tokenizer_config.json HTTP/1.1" 200 0
2021-04-17 17:52:12 filelock DEBUG: Attempting to acquire lock 140372767008848 on /root/.cache/huggingface/transformers/22ab5ca7b32afe47eef18e996cf9bf7c5a87fb7315b285918612fd9c38a56ac6.f5bfb2432b46fe53cc72280487a37c915c677add081f32fdf0d




2021-04-17 17:52:12 urllib3.connectionpool DEBUG: https://huggingface.co:443 "GET /overfit/twiner-bert-base-mtl/resolve/main/tokenizer_config.json HTTP/1.1" 200 354


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=354.0, style=ProgressStyle(description_…

2021-04-17 17:52:12 filelock DEBUG: Attempting to release lock 140372767008848 on /root/.cache/huggingface/transformers/22ab5ca7b32afe47eef18e996cf9bf7c5a87fb7315b285918612fd9c38a56ac6.f5bfb2432b46fe53cc72280487a37c915c677add081f32fdf0d70a6dca4539ac.lock
2021-04-17 17:52:12 filelock INFO: Lock 140372767008848 released on /root/.cache/huggingface/transformers/22ab5ca7b32afe47eef18e996cf9bf7c5a87fb7315b285918612fd9c38a56ac6.f5bfb2432b46fe53cc72280487a37c915c677add081f32fdf0d70a6dca4539ac.lock





2021-04-17 17:52:13 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-17 17:52:13 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /overfit/twiner-bert-base-mtl/resolve/main/config.json HTTP/1.1" 200 0
2021-04-17 17:52:13 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-17 17:52:13 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /overfit/twiner-bert-base-mtl/resolve/main/tf_model.h5 HTTP/1.1" 302 0
2021-04-17 17:52:13 filelock DEBUG: Attempting to acquire lock 140370181635408 on /root/.cache/huggingface/transformers/3ef81553e6fd3d5f3cf0bae8f0b3e6f2a9f5ce9bd2f14a52fb1d1f23baddc3ed.c2e65ea90d8672dd4b4142ea7dd4040883e024e5a78acfc46626bed3f02421cd.h5.lock
2021-04-17 17:52:13 filelock INFO: Lock 140370181635408 acquired on /root/.cache/huggingface/transformers/3ef81553e6fd3d5f3cf0bae8f0b3e6f2a9f5ce9bd2f14a52fb1d1f23baddc3ed.c2e65ea90d8672dd4b4142ea7dd4040883e024e5a78acfc46626bed

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=649311328.0, style=ProgressStyle(descri…

2021-04-17 17:52:28 filelock DEBUG: Attempting to release lock 140370181635408 on /root/.cache/huggingface/transformers/3ef81553e6fd3d5f3cf0bae8f0b3e6f2a9f5ce9bd2f14a52fb1d1f23baddc3ed.c2e65ea90d8672dd4b4142ea7dd4040883e024e5a78acfc46626bed3f02421cd.h5.lock
2021-04-17 17:52:28 filelock INFO: Lock 140370181635408 released on /root/.cache/huggingface/transformers/3ef81553e6fd3d5f3cf0bae8f0b3e6f2a9f5ce9bd2f14a52fb1d1f23baddc3ed.c2e65ea90d8672dd4b4142ea7dd4040883e024e5a78acfc46626bed3f02421cd.h5.lock





All model checkpoint layers were used when initializing TFBertForTokenClassification.

All the layers of TFBertForTokenClassification were initialized from the model checkpoint at overfit/twiner-bert-base-mtl.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForTokenClassification for predictions without further training.


In [None]:
tag_map = {
  "B-EVE": "B-EVE",
  "B-LOC": "B-LOC",
  "B-ORG": "B-ORG",
  "B-PER": "B-PER",
  "B-POG": "B-POG",
  "B-NAT": "B-NAT",
  "I-EVE": "I-EVE",
  "I-LOC": "I-LOC",
  "I-ORG": "I-ORG",
  "I-PER": "I-PER",
  "I-POG": "I_POG",
  "I-NAT": "I-NAT",
  'O': 'O'
}
y_true, y_perd = align_prediction(texts, tags, twiner_mtl, tag_map)
benchmark(y_true, y_perd, ['PER', 'LOC', 'ORG', 'EVE'])

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Instructions for updating:
Use tf.identity instead.


Instructions for updating:
Use tf.identity instead.


10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300


2021-04-17 10:17:59 root INFO: Imported 1 predictions for 1 true examples


({'ent_type': {'actual': 630,
   'correct': 514,
   'incorrect': 15,
   'missed': 79,
   'partial': 0,
   'possible': 608,
   'precision': 0.8158730158730159,
   'recall': 0.8453947368421053,
   'spurious': 101},
  'exact': {'actual': 630,
   'correct': 509,
   'incorrect': 20,
   'missed': 79,
   'partial': 0,
   'possible': 608,
   'precision': 0.807936507936508,
   'recall': 0.837171052631579,
   'spurious': 101},
  'partial': {'actual': 610,
   'correct': 509,
   'incorrect': 0,
   'missed': 79,
   'partial': 20,
   'possible': 608,
   'precision': 0.8508196721311475,
   'recall': 0.8536184210526315,
   'spurious': 101},
  'strict': {'actual': 630,
   'correct': 496,
   'incorrect': 33,
   'missed': 79,
   'partial': 0,
   'possible': 608,
   'precision': 0.7873015873015873,
   'recall': 0.8157894736842105,
   'spurious': 101}},
 {'EVE': {'ent_type': {'actual': 15,
    'correct': 10,
    'incorrect': 0,
    'missed': 4,
    'partial': 0,
    'possible': 14,
    'precision': 0.66666

In [None]:
y_true, y_perd = align_prediction(texts, tags, twiner_mtl, tag_map)
benchmark(y_true, y_perd, ['PER', 'LOC', 'ORG', 'EVE', 'POG', 'NAT'])

10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300


2021-04-17 10:19:32 root INFO: Imported 1 predictions for 1 true examples


({'ent_type': {'actual': 685,
   'correct': 557,
   'incorrect': 18,
   'missed': 85,
   'partial': 0,
   'possible': 660,
   'precision': 0.8131386861313868,
   'recall': 0.843939393939394,
   'spurious': 110},
  'exact': {'actual': 685,
   'correct': 555,
   'incorrect': 20,
   'missed': 85,
   'partial': 0,
   'possible': 660,
   'precision': 0.8102189781021898,
   'recall': 0.8409090909090909,
   'spurious': 110},
  'partial': {'actual': 665,
   'correct': 555,
   'incorrect': 0,
   'missed': 85,
   'partial': 20,
   'possible': 660,
   'precision': 0.849624060150376,
   'recall': 0.8560606060606061,
   'spurious': 110},
  'strict': {'actual': 685,
   'correct': 539,
   'incorrect': 36,
   'missed': 85,
   'partial': 0,
   'possible': 660,
   'precision': 0.7868613138686131,
   'recall': 0.8166666666666667,
   'spurious': 110}},
 {'EVE': {'ent_type': {'actual': 15,
    'correct': 10,
    'incorrect': 0,
    'missed': 4,
    'partial': 0,
    'possible': 14,
    'precision': 0.66666

In [18]:
tag_map = {
  "B-EVE": "O",
  "B-LOC": "B_LOC",
  "B-ORG": "B_ORG",
  "B-PER": "B_PER",
  "B-POG": "O",
  "B-NAT": "O",
  "I-EVE": "O",
  "I-LOC": "I_LOC",
  "I-ORG": "I_ORG",
  "I-PER": "I_PER",
  "I-POG": "O",
  "I-NAT": "O",
  'O': 'O'
}
y_true, y_perd = align_prediction(texts_peyma, tags_peyma, twiner_mtl, tag_map)
benchmark(y_true, y_perd, ['PER', 'LOC', 'ORG'])

10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490


2021-04-17 17:55:32 root INFO: Imported 1 predictions for 1 true examples


500


({'ent_type': {'actual': 847,
   'correct': 737,
   'incorrect': 18,
   'missed': 87,
   'partial': 0,
   'possible': 842,
   'precision': 0.8701298701298701,
   'recall': 0.8752969121140143,
   'spurious': 92},
  'exact': {'actual': 847,
   'correct': 726,
   'incorrect': 29,
   'missed': 87,
   'partial': 0,
   'possible': 842,
   'precision': 0.8571428571428571,
   'recall': 0.8622327790973872,
   'spurious': 92},
  'partial': {'actual': 818,
   'correct': 726,
   'incorrect': 0,
   'missed': 87,
   'partial': 29,
   'possible': 842,
   'precision': 0.9052567237163814,
   'recall': 0.8794536817102138,
   'spurious': 92},
  'strict': {'actual': 847,
   'correct': 714,
   'incorrect': 41,
   'missed': 87,
   'partial': 0,
   'possible': 842,
   'precision': 0.8429752066115702,
   'recall': 0.8479809976247031,
   'spurious': 92}},
 {'LOC': {'ent_type': {'actual': 310,
    'correct': 256,
    'incorrect': 3,
    'missed': 47,
    'partial': 0,
    'possible': 306,
    'precision': 0.825

### Our Model Peyma using MTL

In [None]:
from transformers import TFAutoModelForTokenClassification

In [None]:
tokenizer = AutoTokenizer.from_pretrained("overfit/peyma-ner-bert-base")
model = TFAutoModelForTokenClassification.from_pretrained("overfit/peyma-ner-bert-base")
peyma_mtl = pipeline('ner', model=model, tokenizer=tokenizer, ignore_labels=[])

2021-04-17 10:59:04 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-17 10:59:04 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /overfit/peyma-ner-bert-base/resolve/main/config.json HTTP/1.1" 200 0
2021-04-17 10:59:04 filelock DEBUG: Attempting to acquire lock 140403316194000 on /root/.cache/huggingface/transformers/213a6ec7fc886c25c24fcecf724c386a07934a0c5df7b7d54677edd6f5784679.362070e431c5b684bc5ccefba3372fe3e7088f2b24adf061c7b45684af1dc1eb.lock
2021-04-17 10:59:04 filelock INFO: Lock 140403316194000 acquired on /root/.cache/huggingface/transformers/213a6ec7fc886c25c24fcecf724c386a07934a0c5df7b7d54677edd6f5784679.362070e431c5b684bc5ccefba3372fe3e7088f2b24adf061c7b45684af1dc1eb.lock
2021-04-17 10:59:04 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-17 10:59:04 urllib3.connectionpool DEBUG: https://huggingface.co:443 "GET /overfit/peyma-ner-bert-base/resolve/main/config.json HTTP/1.

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1196.0, style=ProgressStyle(description…

2021-04-17 10:59:04 filelock DEBUG: Attempting to release lock 140403316194000 on /root/.cache/huggingface/transformers/213a6ec7fc886c25c24fcecf724c386a07934a0c5df7b7d54677edd6f5784679.362070e431c5b684bc5ccefba3372fe3e7088f2b24adf061c7b45684af1dc1eb.lock
2021-04-17 10:59:04 filelock INFO: Lock 140403316194000 released on /root/.cache/huggingface/transformers/213a6ec7fc886c25c24fcecf724c386a07934a0c5df7b7d54677edd6f5784679.362070e431c5b684bc5ccefba3372fe3e7088f2b24adf061c7b45684af1dc1eb.lock
2021-04-17 10:59:05 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-17 10:59:05 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /overfit/peyma-ner-bert-base/resolve/main/vocab.txt HTTP/1.1" 200 0
2021-04-17 10:59:05 filelock DEBUG: Attempting to acquire lock 140403524698064 on /root/.cache/huggingface/transformers/35b64b6ab37076c6af96a12b62c2effdc4d7edc3038a239694685775db44e7cb.6699f2ee4745b6531f79b9781879071b6ace2d2768df83889391421fb44d4474




2021-04-17 10:59:05 urllib3.connectionpool DEBUG: https://huggingface.co:443 "GET /overfit/peyma-ner-bert-base/resolve/main/vocab.txt HTTP/1.1" 200 1215509


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1215509.0, style=ProgressStyle(descript…

2021-04-17 10:59:05 filelock DEBUG: Attempting to release lock 140403524698064 on /root/.cache/huggingface/transformers/35b64b6ab37076c6af96a12b62c2effdc4d7edc3038a239694685775db44e7cb.6699f2ee4745b6531f79b9781879071b6ace2d2768df83889391421fb44d4474.lock
2021-04-17 10:59:05 filelock INFO: Lock 140403524698064 released on /root/.cache/huggingface/transformers/35b64b6ab37076c6af96a12b62c2effdc4d7edc3038a239694685775db44e7cb.6699f2ee4745b6531f79b9781879071b6ace2d2768df83889391421fb44d4474.lock
2021-04-17 10:59:05 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-17 10:59:05 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /overfit/peyma-ner-bert-base/resolve/main/tokenizer.json HTTP/1.1" 404 0
2021-04-17 10:59:05 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443





2021-04-17 10:59:05 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /overfit/peyma-ner-bert-base/resolve/main/added_tokens.json HTTP/1.1" 404 0
2021-04-17 10:59:05 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-17 10:59:05 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /overfit/peyma-ner-bert-base/resolve/main/special_tokens_map.json HTTP/1.1" 200 0
2021-04-17 10:59:05 filelock DEBUG: Attempting to acquire lock 140403305755792 on /root/.cache/huggingface/transformers/d5181600882759801da7c1f669c595b7d196afdcb789dca427eb95bba0fe40ce.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d.lock
2021-04-17 10:59:05 filelock INFO: Lock 140403305755792 acquired on /root/.cache/huggingface/transformers/d5181600882759801da7c1f669c595b7d196afdcb789dca427eb95bba0fe40ce.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d.lock
2021-04-17 10:59:05 urllib3.connectionpool DEBUG: Starting new HTTPS connect

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…

2021-04-17 10:59:05 filelock DEBUG: Attempting to release lock 140403305755792 on /root/.cache/huggingface/transformers/d5181600882759801da7c1f669c595b7d196afdcb789dca427eb95bba0fe40ce.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d.lock
2021-04-17 10:59:05 filelock INFO: Lock 140403305755792 released on /root/.cache/huggingface/transformers/d5181600882759801da7c1f669c595b7d196afdcb789dca427eb95bba0fe40ce.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d.lock
2021-04-17 10:59:05 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-17 10:59:06 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /overfit/peyma-ner-bert-base/resolve/main/tokenizer_config.json HTTP/1.1" 200 0
2021-04-17 10:59:06 filelock DEBUG: Attempting to acquire lock 140403305755792 on /root/.cache/huggingface/transformers/823b89ed1cc28a1c2ffe554fd665037ee101afe706c266d098b7baabae4eaf08.f5bfb2432b46fe53cc72280487a37c915c677add081f32fdf0d7




2021-04-17 10:59:06 urllib3.connectionpool DEBUG: https://huggingface.co:443 "GET /overfit/peyma-ner-bert-base/resolve/main/tokenizer_config.json HTTP/1.1" 200 354


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=354.0, style=ProgressStyle(description_…

2021-04-17 10:59:06 filelock DEBUG: Attempting to release lock 140403305755792 on /root/.cache/huggingface/transformers/823b89ed1cc28a1c2ffe554fd665037ee101afe706c266d098b7baabae4eaf08.f5bfb2432b46fe53cc72280487a37c915c677add081f32fdf0d70a6dca4539ac.lock
2021-04-17 10:59:06 filelock INFO: Lock 140403305755792 released on /root/.cache/huggingface/transformers/823b89ed1cc28a1c2ffe554fd665037ee101afe706c266d098b7baabae4eaf08.f5bfb2432b46fe53cc72280487a37c915c677add081f32fdf0d70a6dca4539ac.lock





2021-04-17 10:59:06 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-17 10:59:06 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /overfit/peyma-ner-bert-base/resolve/main/config.json HTTP/1.1" 200 0
2021-04-17 10:59:06 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-17 10:59:06 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /overfit/peyma-ner-bert-base/resolve/main/tf_model.h5 HTTP/1.1" 302 0
2021-04-17 10:59:06 filelock DEBUG: Attempting to acquire lock 140403321134864 on /root/.cache/huggingface/transformers/88b58bece2110e8763ccccc3f2cf3727d2759ca72eae6c299c5fb0a1b4d0b61b.c1dcf06d6a9561fe2acca9a7c9f2ec07038065d704e1984b293c6c142b53c3a8.h5.lock
2021-04-17 10:59:06 filelock INFO: Lock 140403321134864 acquired on /root/.cache/huggingface/transformers/88b58bece2110e8763ccccc3f2cf3727d2759ca72eae6c299c5fb0a1b4d0b61b.c1dcf06d6a9561fe2acca9a7c9f2ec07038065d704e1984b293c6c142

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=649317472.0, style=ProgressStyle(descri…

2021-04-17 10:59:26 filelock DEBUG: Attempting to release lock 140403321134864 on /root/.cache/huggingface/transformers/88b58bece2110e8763ccccc3f2cf3727d2759ca72eae6c299c5fb0a1b4d0b61b.c1dcf06d6a9561fe2acca9a7c9f2ec07038065d704e1984b293c6c142b53c3a8.h5.lock
2021-04-17 10:59:26 filelock INFO: Lock 140403321134864 released on /root/.cache/huggingface/transformers/88b58bece2110e8763ccccc3f2cf3727d2759ca72eae6c299c5fb0a1b4d0b61b.c1dcf06d6a9561fe2acca9a7c9f2ec07038065d704e1984b293c6c142b53c3a8.h5.lock





Some layers from the model checkpoint at overfit/peyma-ner-bert-base were not used when initializing TFBertForTokenClassification: ['dropout_75']
- This IS expected if you are initializing TFBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForTokenClassification were initialized from the model checkpoint at overfit/peyma-ner-bert-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForTokenClassification for predictions without further training.


In [None]:
tag_map = {
  "B_LOC": "B_LOC",
  "B_ORG": "B_ORG",
  "B_PER": "B_PER",
  "B_MON": "B_MON",
  "B_PCT": "B_PCT",
  "B_DAT": "B_DAT",
  "B_TIM": "B_TIM",
  "I_LOC": "I_LOC",
  "I_ORG": "I_ORG",
  "I_PER": "I_PER",
  "I_MON": "I_MON",
  "I_PCT": "I_PCT",
  "I_DAT": "I_DAT",
  "I_TIM": "I_TIM",
  'O': 'O'
}
y_true, y_perd = align_prediction(texts_peyma, tags_peyma, peyma_mtl, tag_map)
benchmark(y_true, y_perd, ['PER', 'LOC', 'ORG', 'MON', 'PCT', 'DAT', 'TIM'])

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490


2021-04-17 11:17:04 root INFO: Imported 1 predictions for 1 true examples


500


({'ent_type': {'actual': 1006,
   'correct': 917,
   'incorrect': 23,
   'missed': 38,
   'partial': 0,
   'possible': 978,
   'precision': 0.911530815109344,
   'recall': 0.9376278118609407,
   'spurious': 66},
  'exact': {'actual': 1006,
   'correct': 902,
   'incorrect': 38,
   'missed': 38,
   'partial': 0,
   'possible': 978,
   'precision': 0.8966202783300199,
   'recall': 0.9222903885480572,
   'spurious': 66},
  'partial': {'actual': 968,
   'correct': 902,
   'incorrect': 0,
   'missed': 38,
   'partial': 38,
   'possible': 978,
   'precision': 0.9514462809917356,
   'recall': 0.941717791411043,
   'spurious': 66},
  'strict': {'actual': 1006,
   'correct': 887,
   'incorrect': 53,
   'missed': 38,
   'partial': 0,
   'possible': 978,
   'precision': 0.8817097415506958,
   'recall': 0.9069529652351738,
   'spurious': 66}},
 {'DAT': {'ent_type': {'actual': 104,
    'correct': 92,
    'incorrect': 1,
    'missed': 5,
    'partial': 0,
    'possible': 98,
    'precision': 0.88461

In [None]:
tag_map = {
  "B_LOC": "B_LOC",
  "B_ORG": "B_ORG",
  "B_PER": "B_PER",
  "B_MON": "B_MON",
  "B_PCT": "B_PCT",
  "B_DAT": "B_DAT",
  "B_TIM": "B_TIM",
  "I_LOC": "I_LOC",
  "I_ORG": "I_ORG",
  "I_PER": "I_PER",
  "I_MON": "I_MON",
  "I_PCT": "I_PCT",
  "I_DAT": "I_DAT",
  "I_TIM": "I_TIM",
  'O': 'O'
}
y_true, y_perd = align_prediction(texts_peyma, tags_peyma, peyma_mtl, tag_map)
benchmark(y_true, y_perd, ['PER', 'LOC', 'ORG'])

10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490


2021-04-17 11:43:03 root INFO: Imported 1 predictions for 1 true examples


500


({'ent_type': {'actual': 864,
   'correct': 790,
   'incorrect': 21,
   'missed': 33,
   'partial': 0,
   'possible': 844,
   'precision': 0.9143518518518519,
   'recall': 0.9360189573459715,
   'spurious': 53},
  'exact': {'actual': 864,
   'correct': 782,
   'incorrect': 29,
   'missed': 33,
   'partial': 0,
   'possible': 844,
   'precision': 0.9050925925925926,
   'recall': 0.9265402843601895,
   'spurious': 53},
  'partial': {'actual': 835,
   'correct': 782,
   'incorrect': 0,
   'missed': 33,
   'partial': 29,
   'possible': 844,
   'precision': 0.9538922155688623,
   'recall': 0.9437203791469194,
   'spurious': 53},
  'strict': {'actual': 864,
   'correct': 769,
   'incorrect': 42,
   'missed': 33,
   'partial': 0,
   'possible': 844,
   'precision': 0.8900462962962963,
   'recall': 0.9111374407582938,
   'spurious': 53}},
 {'LOC': {'ent_type': {'actual': 315,
    'correct': 284,
    'incorrect': 3,
    'missed': 19,
    'partial': 0,
    'possible': 306,
    'precision': 0.901

In [None]:
tag_map = {
  "B_LOC": "B-LOC",
  "B_ORG": "B-ORG",
  "B_PER": "B-PER",
  "B_MON": "O",
  "B_PCT": "O",
  "B_DAT": "O",
  "B_TIM": "O",
  "I_LOC": "I-LOC",
  "I_ORG": "I-ORG",
  "I_PER": "I-PER",
  "I_MON": "O",
  "I_PCT": "O",
  "I_DAT": "O",
  "I_TIM": "O",
  'O': 'O'
}
y_true, y_perd = align_prediction(texts, tags, peyma_mtl, tag_map)
benchmark(y_true, y_perd, ['PER', 'LOC', 'ORG'])

10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300


2021-04-17 11:52:40 root INFO: Imported 1 predictions for 1 true examples


({'ent_type': {'actual': 628,
   'correct': 477,
   'incorrect': 22,
   'missed': 95,
   'partial': 0,
   'possible': 594,
   'precision': 0.7595541401273885,
   'recall': 0.803030303030303,
   'spurious': 129},
  'exact': {'actual': 628,
   'correct': 479,
   'incorrect': 20,
   'missed': 95,
   'partial': 0,
   'possible': 594,
   'precision': 0.7627388535031847,
   'recall': 0.8063973063973064,
   'spurious': 129},
  'partial': {'actual': 608,
   'correct': 479,
   'incorrect': 0,
   'missed': 95,
   'partial': 20,
   'possible': 594,
   'precision': 0.8042763157894737,
   'recall': 0.8232323232323232,
   'spurious': 129},
  'strict': {'actual': 628,
   'correct': 460,
   'incorrect': 39,
   'missed': 95,
   'partial': 0,
   'possible': 594,
   'precision': 0.732484076433121,
   'recall': 0.7744107744107744,
   'spurious': 129}},
 {'LOC': {'ent_type': {'actual': 230,
    'correct': 170,
    'incorrect': 8,
    'missed': 43,
    'partial': 0,
    'possible': 221,
    'precision': 0.7

### Pars BErt Peyma

In [None]:
tokenizer = AutoTokenizer.from_pretrained("HooshvareLab/bert-base-parsbert-peymaner-uncased")
model = AutoModelForTokenClassification.from_pretrained("HooshvareLab/bert-base-parsbert-peymaner-uncased")
model.eval()
parsbert_peyma = pipeline('ner', model=model, tokenizer=tokenizer, ignore_labels=[])

2021-04-17 11:43:06 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-17 11:43:07 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /HooshvareLab/bert-base-parsbert-peymaner-uncased/resolve/main/config.json HTTP/1.1" 200 0
2021-04-17 11:43:07 filelock DEBUG: Attempting to acquire lock 140403308553680 on /root/.cache/huggingface/transformers/63b8a6a3548fd09df2b1c8b59dafbee427c1ec85c7a2a6e896454f11611e7f04.c4ce79e951e964f7837f2d8e269cd9a0bbcee6b5be0aeb65e8fb944d5184af3c.lock
2021-04-17 11:43:07 filelock INFO: Lock 140403308553680 acquired on /root/.cache/huggingface/transformers/63b8a6a3548fd09df2b1c8b59dafbee427c1ec85c7a2a6e896454f11611e7f04.c4ce79e951e964f7837f2d8e269cd9a0bbcee6b5be0aeb65e8fb944d5184af3c.lock
2021-04-17 11:43:07 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-17 11:43:07 urllib3.connectionpool DEBUG: https://huggingface.co:443 "GET /HooshvareLab/bert-base-parsbert-peymane

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=997.0, style=ProgressStyle(description_…

2021-04-17 11:43:07 filelock DEBUG: Attempting to release lock 140403308553680 on /root/.cache/huggingface/transformers/63b8a6a3548fd09df2b1c8b59dafbee427c1ec85c7a2a6e896454f11611e7f04.c4ce79e951e964f7837f2d8e269cd9a0bbcee6b5be0aeb65e8fb944d5184af3c.lock
2021-04-17 11:43:07 filelock INFO: Lock 140403308553680 released on /root/.cache/huggingface/transformers/63b8a6a3548fd09df2b1c8b59dafbee427c1ec85c7a2a6e896454f11611e7f04.c4ce79e951e964f7837f2d8e269cd9a0bbcee6b5be0aeb65e8fb944d5184af3c.lock
2021-04-17 11:43:07 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-17 11:43:07 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /HooshvareLab/bert-base-parsbert-peymaner-uncased/resolve/main/vocab.txt HTTP/1.1" 200 0
2021-04-17 11:43:07 filelock DEBUG: Attempting to acquire lock 140403179728400 on /root/.cache/huggingface/transformers/a636f20d387de26da664850322ecd2eb83581e329c3f6e1f6efced8d86a6d613.6699f2ee4745b6531f79b9781879071b6ace2d2768d




2021-04-17 11:43:07 urllib3.connectionpool DEBUG: https://huggingface.co:443 "GET /HooshvareLab/bert-base-parsbert-peymaner-uncased/resolve/main/vocab.txt HTTP/1.1" 200 1215509


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1215509.0, style=ProgressStyle(descript…

2021-04-17 11:43:07 filelock DEBUG: Attempting to release lock 140403179728400 on /root/.cache/huggingface/transformers/a636f20d387de26da664850322ecd2eb83581e329c3f6e1f6efced8d86a6d613.6699f2ee4745b6531f79b9781879071b6ace2d2768df83889391421fb44d4474.lock
2021-04-17 11:43:07 filelock INFO: Lock 140403179728400 released on /root/.cache/huggingface/transformers/a636f20d387de26da664850322ecd2eb83581e329c3f6e1f6efced8d86a6d613.6699f2ee4745b6531f79b9781879071b6ace2d2768df83889391421fb44d4474.lock
2021-04-17 11:43:07 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-17 11:43:07 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /HooshvareLab/bert-base-parsbert-peymaner-uncased/resolve/main/tokenizer.json HTTP/1.1" 404 0
2021-04-17 11:43:07 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443





2021-04-17 11:43:07 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /HooshvareLab/bert-base-parsbert-peymaner-uncased/resolve/main/added_tokens.json HTTP/1.1" 404 0
2021-04-17 11:43:07 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-17 11:43:08 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /HooshvareLab/bert-base-parsbert-peymaner-uncased/resolve/main/special_tokens_map.json HTTP/1.1" 200 0
2021-04-17 11:43:08 filelock DEBUG: Attempting to acquire lock 140403308553104 on /root/.cache/huggingface/transformers/0ec614ae9c28937555af1539fc9e2320ebe7a5e9aecf6b712ff2bea88b62e4b8.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d.lock
2021-04-17 11:43:08 filelock INFO: Lock 140403308553104 acquired on /root/.cache/huggingface/transformers/0ec614ae9c28937555af1539fc9e2320ebe7a5e9aecf6b712ff2bea88b62e4b8.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d.lock
2021-04-17 11:43:08 urllib3.connec

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…

2021-04-17 11:43:08 filelock DEBUG: Attempting to release lock 140403308553104 on /root/.cache/huggingface/transformers/0ec614ae9c28937555af1539fc9e2320ebe7a5e9aecf6b712ff2bea88b62e4b8.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d.lock
2021-04-17 11:43:08 filelock INFO: Lock 140403308553104 released on /root/.cache/huggingface/transformers/0ec614ae9c28937555af1539fc9e2320ebe7a5e9aecf6b712ff2bea88b62e4b8.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d.lock
2021-04-17 11:43:08 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-17 11:43:08 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /HooshvareLab/bert-base-parsbert-peymaner-uncased/resolve/main/tokenizer_config.json HTTP/1.1" 200 0
2021-04-17 11:43:08 filelock DEBUG: Attempting to acquire lock 140403179728400 on /root/.cache/huggingface/transformers/1ad82da13039dcf9acaf9f82ffeb7366accb0f33f64b4f697a5716a69eee8285.5cc6e825eb228a7a5cfd27cb4d7151e




2021-04-17 11:43:08 urllib3.connectionpool DEBUG: https://huggingface.co:443 "GET /HooshvareLab/bert-base-parsbert-peymaner-uncased/resolve/main/tokenizer_config.json HTTP/1.1" 200 2


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2.0, style=ProgressStyle(description_wi…

2021-04-17 11:43:08 filelock DEBUG: Attempting to release lock 140403179728400 on /root/.cache/huggingface/transformers/1ad82da13039dcf9acaf9f82ffeb7366accb0f33f64b4f697a5716a69eee8285.5cc6e825eb228a7a5cfd27cb4d7151e97a79fb962b31aaf1813aa102e746584b.lock
2021-04-17 11:43:08 filelock INFO: Lock 140403179728400 released on /root/.cache/huggingface/transformers/1ad82da13039dcf9acaf9f82ffeb7366accb0f33f64b4f697a5716a69eee8285.5cc6e825eb228a7a5cfd27cb4d7151e97a79fb962b31aaf1813aa102e746584b.lock





2021-04-17 11:43:08 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-17 11:43:09 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /HooshvareLab/bert-base-parsbert-peymaner-uncased/resolve/main/config.json HTTP/1.1" 200 0
2021-04-17 11:43:09 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-17 11:43:09 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /HooshvareLab/bert-base-parsbert-peymaner-uncased/resolve/main/pytorch_model.bin HTTP/1.1" 302 0
2021-04-17 11:43:09 filelock DEBUG: Attempting to acquire lock 140403179623248 on /root/.cache/huggingface/transformers/070d60b171933e7e2340a97a691b9aa5eebedc5262212f6fdd00ab95ed762986.e4ae6c5a02a126ab0d1c8352dfd857fbab8490a9460b3cc1d92227127dc21742.lock
2021-04-17 11:43:09 filelock INFO: Lock 140403179623248 acquired on /root/.cache/huggingface/transformers/070d60b171933e7e2340a97a691b9aa5eebedc5262212f6fdd00ab95ed762986.e4ae6c5a02a1

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=651459729.0, style=ProgressStyle(descri…

2021-04-17 11:43:28 filelock DEBUG: Attempting to release lock 140403179623248 on /root/.cache/huggingface/transformers/070d60b171933e7e2340a97a691b9aa5eebedc5262212f6fdd00ab95ed762986.e4ae6c5a02a126ab0d1c8352dfd857fbab8490a9460b3cc1d92227127dc21742.lock
2021-04-17 11:43:28 filelock INFO: Lock 140403179623248 released on /root/.cache/huggingface/transformers/070d60b171933e7e2340a97a691b9aa5eebedc5262212f6fdd00ab95ed762986.e4ae6c5a02a126ab0d1c8352dfd857fbab8490a9460b3cc1d92227127dc21742.lock





In [None]:
tag_map = {
  "B_LOC": "B_LOC",
  "B_ORG": "B_ORG",
  "B_PER": "B_PER",
  "B_MON": "B_MON",
  "B_PCT": "B_PCT",
  "B_DAT": "B_DAT",
  "B_TIM": "B_TIM",
  "I_LOC": "I_LOC",
  "I_ORG": "I_ORG",
  "I_PER": "I_PER",
  "I_MON": "I_MON",
  "I_PCT": "I_PCT",
  "I_DAT": "I_DAT",
  "I_TIM": "I_TIM",
  'O': 'O'
}
y_true, y_perd = align_prediction(texts_peyma, tags_peyma, parsbert_peyma, tag_map)
benchmark(y_true, y_perd, ['PER', 'LOC', 'ORG', 'MON', 'PCT', 'DAT', 'TIM'])

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490


2021-04-17 11:44:58 root INFO: Imported 1 predictions for 1 true examples


500


({'ent_type': {'actual': 982,
   'correct': 959,
   'incorrect': 5,
   'missed': 11,
   'partial': 0,
   'possible': 975,
   'precision': 0.9765784114052953,
   'recall': 0.9835897435897436,
   'spurious': 18},
  'exact': {'actual': 982,
   'correct': 959,
   'incorrect': 5,
   'missed': 11,
   'partial': 0,
   'possible': 975,
   'precision': 0.9765784114052953,
   'recall': 0.9835897435897436,
   'spurious': 18},
  'partial': {'actual': 977,
   'correct': 959,
   'incorrect': 0,
   'missed': 11,
   'partial': 5,
   'possible': 975,
   'precision': 0.9841351074718526,
   'recall': 0.9861538461538462,
   'spurious': 18},
  'strict': {'actual': 982,
   'correct': 955,
   'incorrect': 9,
   'missed': 11,
   'partial': 0,
   'possible': 975,
   'precision': 0.9725050916496945,
   'recall': 0.9794871794871794,
   'spurious': 18}},
 {'DAT': {'ent_type': {'actual': 96,
    'correct': 95,
    'incorrect': 0,
    'missed': 2,
    'partial': 0,
    'possible': 97,
    'precision': 0.98958333333

In [None]:
tag_map = {
  "B_LOC": "B_LOC",
  "B_ORG": "B_ORG",
  "B_PER": "B_PER",
  "B_MON": "B_MON",
  "B_PCT": "B_PCT",
  "B_DAT": "B_DAT",
  "B_TIM": "B_TIM",
  "I_LOC": "I_LOC",
  "I_ORG": "I_ORG",
  "I_PER": "I_PER",
  "I_MON": "I_MON",
  "I_PCT": "I_PCT",
  "I_DAT": "I_DAT",
  "I_TIM": "I_TIM",
  'O': 'O'
}
y_true, y_perd = align_prediction(texts_peyma, tags_peyma, parsbert_peyma, tag_map)
benchmark(y_true, y_perd, ['PER', 'LOC', 'ORG'])

10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490


2021-04-17 11:46:17 root INFO: Imported 1 predictions for 1 true examples


500


({'ent_type': {'actual': 850,
   'correct': 828,
   'incorrect': 5,
   'missed': 9,
   'partial': 0,
   'possible': 842,
   'precision': 0.9741176470588235,
   'recall': 0.9833729216152018,
   'spurious': 17},
  'exact': {'actual': 850,
   'correct': 830,
   'incorrect': 3,
   'missed': 9,
   'partial': 0,
   'possible': 842,
   'precision': 0.9764705882352941,
   'recall': 0.9857482185273159,
   'spurious': 17},
  'partial': {'actual': 847,
   'correct': 830,
   'incorrect': 0,
   'missed': 9,
   'partial': 3,
   'possible': 842,
   'precision': 0.9817001180637545,
   'recall': 0.9875296912114014,
   'spurious': 17},
  'strict': {'actual': 850,
   'correct': 826,
   'incorrect': 7,
   'missed': 9,
   'partial': 0,
   'possible': 842,
   'precision': 0.971764705882353,
   'recall': 0.9809976247030879,
   'spurious': 17}},
 {'LOC': {'ent_type': {'actual': 311,
    'correct': 302,
    'incorrect': 0,
    'missed': 4,
    'partial': 0,
    'possible': 306,
    'precision': 0.9710610932475

### Test with Masked entities

In [47]:
new_text, new_tag = [], []
for text, tweet_tag in zip(texts, tags):
  if len([tag for tag in tweet_tag if tag.startswith('B')]) == 1:
    new_text.append(text)
    new_tag.append(tweet_tag)

In [59]:
len([x for x in new_tag if 'B-POG' in x])

2

In [69]:
masked_texts = []
for text, tweet_tag in zip(new_text, new_tag):
  masked_text = []
  for word, tag in zip(text, tweet_tag):
    if tag != 'O':
      masked_text.append('[MASK]')
    else:
      masked_text.append(word)
  masked_texts.append(masked_text)

In [77]:
with open('masked_texts_test.txt', 'w') as file:
  for text in masked_texts:
    file.write('\n'.join(text))
    file.write('\n\n')

In [81]:
!wget -q --show-progress https://raw.githubusercontent.com/overfit-ir/persian-twitter-ner/master/masked_texts.txt



In [86]:
human_tags = []
with open('masked_texts.txt', 'r') as file:
  for line in file.readlines():
    if line != '\n':
      s = line.split('\t')
      if len(s) == 1:
        human_tags.append('O')
      else:
        human_tags.append(s[1].replace('\n', ''))

In [89]:
len([x for tweet_tag in new_tag for x in tweet_tag])

2075

In [90]:
len(human_tags)

2075

In [91]:
benchmark([[x for tweet_tag in new_tag for x in tweet_tag]], [human_tags], ['PER', 'LOC', 'ORG', 'NAT', 'POG'])

2021-04-18 12:45:03 root INFO: Imported 1 predictions for 1 true examples


({'ent_type': {'actual': 79,
   'correct': 65,
   'incorrect': 12,
   'missed': 7,
   'partial': 0,
   'possible': 84,
   'precision': 0.8227848101265823,
   'recall': 0.7738095238095238,
   'spurious': 2},
  'exact': {'actual': 79,
   'correct': 77,
   'incorrect': 0,
   'missed': 7,
   'partial': 0,
   'possible': 84,
   'precision': 0.9746835443037974,
   'recall': 0.9166666666666666,
   'spurious': 2},
  'partial': {'actual': 79,
   'correct': 77,
   'incorrect': 0,
   'missed': 7,
   'partial': 0,
   'possible': 84,
   'precision': 0.9746835443037974,
   'recall': 0.9166666666666666,
   'spurious': 2},
  'strict': {'actual': 79,
   'correct': 65,
   'incorrect': 12,
   'missed': 7,
   'partial': 0,
   'possible': 84,
   'precision': 0.8227848101265823,
   'recall': 0.7738095238095238,
   'spurious': 2}},
 {'LOC': {'ent_type': {'actual': 19,
    'correct': 16,
    'incorrect': 1,
    'missed': 3,
    'partial': 0,
    'possible': 20,
    'precision': 0.8421052631578947,
    'recall

In [21]:
from transformers import TFAutoModelForTokenClassification
tokenizer = AutoTokenizer.from_pretrained("overfit/twiner-bert-base-mtl")
model = TFAutoModelForTokenClassification.from_pretrained("overfit/twiner-bert-base-mtl")
twiner_mtl = pipeline('ner', model=model, tokenizer=tokenizer, ignore_labels=[])

2021-04-18 11:09:53 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-18 11:09:53 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /overfit/twiner-bert-base-mtl/resolve/main/config.json HTTP/1.1" 200 0
2021-04-18 11:09:53 filelock DEBUG: Attempting to acquire lock 140083018445520 on /root/.cache/huggingface/transformers/6aae46c25a22a58e18b1c1b6adc8a8b0af058d1c9d46d9235c77312bfcedcb5c.24ecbabe775c7edabcc88227cb13d0c504150601e1c0dbb1dd2c7295f6766cde.lock
2021-04-18 11:09:53 filelock INFO: Lock 140083018445520 acquired on /root/.cache/huggingface/transformers/6aae46c25a22a58e18b1c1b6adc8a8b0af058d1c9d46d9235c77312bfcedcb5c.24ecbabe775c7edabcc88227cb13d0c504150601e1c0dbb1dd2c7295f6766cde.lock
2021-04-18 11:09:53 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-18 11:09:54 urllib3.connectionpool DEBUG: https://huggingface.co:443 "GET /overfit/twiner-bert-base-mtl/resolve/main/config.json HTTP/

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1124.0, style=ProgressStyle(description…

2021-04-18 11:09:54 filelock DEBUG: Attempting to release lock 140083018445520 on /root/.cache/huggingface/transformers/6aae46c25a22a58e18b1c1b6adc8a8b0af058d1c9d46d9235c77312bfcedcb5c.24ecbabe775c7edabcc88227cb13d0c504150601e1c0dbb1dd2c7295f6766cde.lock
2021-04-18 11:09:54 filelock INFO: Lock 140083018445520 released on /root/.cache/huggingface/transformers/6aae46c25a22a58e18b1c1b6adc8a8b0af058d1c9d46d9235c77312bfcedcb5c.24ecbabe775c7edabcc88227cb13d0c504150601e1c0dbb1dd2c7295f6766cde.lock
2021-04-18 11:09:54 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443





2021-04-18 11:09:55 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /overfit/twiner-bert-base-mtl/resolve/main/vocab.txt HTTP/1.1" 200 0
2021-04-18 11:09:55 filelock DEBUG: Attempting to acquire lock 140083018209872 on /root/.cache/huggingface/transformers/0e254b340265789f234ea0a71cd8228e05e94a5d19068a74ea35abe12aa012c9.6699f2ee4745b6531f79b9781879071b6ace2d2768df83889391421fb44d4474.lock
2021-04-18 11:09:55 filelock INFO: Lock 140083018209872 acquired on /root/.cache/huggingface/transformers/0e254b340265789f234ea0a71cd8228e05e94a5d19068a74ea35abe12aa012c9.6699f2ee4745b6531f79b9781879071b6ace2d2768df83889391421fb44d4474.lock
2021-04-18 11:09:55 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-18 11:09:55 urllib3.connectionpool DEBUG: https://huggingface.co:443 "GET /overfit/twiner-bert-base-mtl/resolve/main/vocab.txt HTTP/1.1" 200 1215509


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1215509.0, style=ProgressStyle(descript…

2021-04-18 11:09:56 filelock DEBUG: Attempting to release lock 140083018209872 on /root/.cache/huggingface/transformers/0e254b340265789f234ea0a71cd8228e05e94a5d19068a74ea35abe12aa012c9.6699f2ee4745b6531f79b9781879071b6ace2d2768df83889391421fb44d4474.lock
2021-04-18 11:09:56 filelock INFO: Lock 140083018209872 released on /root/.cache/huggingface/transformers/0e254b340265789f234ea0a71cd8228e05e94a5d19068a74ea35abe12aa012c9.6699f2ee4745b6531f79b9781879071b6ace2d2768df83889391421fb44d4474.lock
2021-04-18 11:09:56 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443





2021-04-18 11:09:57 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /overfit/twiner-bert-base-mtl/resolve/main/tokenizer.json HTTP/1.1" 404 0
2021-04-18 11:09:57 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-18 11:09:57 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /overfit/twiner-bert-base-mtl/resolve/main/added_tokens.json HTTP/1.1" 404 0
2021-04-18 11:09:57 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-18 11:09:58 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /overfit/twiner-bert-base-mtl/resolve/main/special_tokens_map.json HTTP/1.1" 200 0
2021-04-18 11:09:58 filelock DEBUG: Attempting to acquire lock 140083208211408 on /root/.cache/huggingface/transformers/cfb340e9b5920789a748ef17883dab7fe74644822190f9624abb224a48e96638.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d.lock
2021-04-18 11:09:58 filelock INFO: Lock 14008320821140

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…

2021-04-18 11:09:59 filelock DEBUG: Attempting to release lock 140083208211408 on /root/.cache/huggingface/transformers/cfb340e9b5920789a748ef17883dab7fe74644822190f9624abb224a48e96638.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d.lock
2021-04-18 11:09:59 filelock INFO: Lock 140083208211408 released on /root/.cache/huggingface/transformers/cfb340e9b5920789a748ef17883dab7fe74644822190f9624abb224a48e96638.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d.lock
2021-04-18 11:09:59 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443





2021-04-18 11:09:59 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /overfit/twiner-bert-base-mtl/resolve/main/tokenizer_config.json HTTP/1.1" 200 0
2021-04-18 11:09:59 filelock DEBUG: Attempting to acquire lock 140083208504656 on /root/.cache/huggingface/transformers/22ab5ca7b32afe47eef18e996cf9bf7c5a87fb7315b285918612fd9c38a56ac6.f5bfb2432b46fe53cc72280487a37c915c677add081f32fdf0d70a6dca4539ac.lock
2021-04-18 11:09:59 filelock INFO: Lock 140083208504656 acquired on /root/.cache/huggingface/transformers/22ab5ca7b32afe47eef18e996cf9bf7c5a87fb7315b285918612fd9c38a56ac6.f5bfb2432b46fe53cc72280487a37c915c677add081f32fdf0d70a6dca4539ac.lock
2021-04-18 11:09:59 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-18 11:10:00 urllib3.connectionpool DEBUG: https://huggingface.co:443 "GET /overfit/twiner-bert-base-mtl/resolve/main/tokenizer_config.json HTTP/1.1" 200 354


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=354.0, style=ProgressStyle(description_…

2021-04-18 11:10:00 filelock DEBUG: Attempting to release lock 140083208504656 on /root/.cache/huggingface/transformers/22ab5ca7b32afe47eef18e996cf9bf7c5a87fb7315b285918612fd9c38a56ac6.f5bfb2432b46fe53cc72280487a37c915c677add081f32fdf0d70a6dca4539ac.lock
2021-04-18 11:10:00 filelock INFO: Lock 140083208504656 released on /root/.cache/huggingface/transformers/22ab5ca7b32afe47eef18e996cf9bf7c5a87fb7315b285918612fd9c38a56ac6.f5bfb2432b46fe53cc72280487a37c915c677add081f32fdf0d70a6dca4539ac.lock





2021-04-18 11:10:00 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-18 11:10:01 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /overfit/twiner-bert-base-mtl/resolve/main/config.json HTTP/1.1" 200 0
2021-04-18 11:10:01 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-18 11:10:01 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /overfit/twiner-bert-base-mtl/resolve/main/tf_model.h5 HTTP/1.1" 302 0
2021-04-18 11:10:01 filelock DEBUG: Attempting to acquire lock 140085712401680 on /root/.cache/huggingface/transformers/3ef81553e6fd3d5f3cf0bae8f0b3e6f2a9f5ce9bd2f14a52fb1d1f23baddc3ed.c2e65ea90d8672dd4b4142ea7dd4040883e024e5a78acfc46626bed3f02421cd.h5.lock
2021-04-18 11:10:01 filelock INFO: Lock 140085712401680 acquired on /root/.cache/huggingface/transformers/3ef81553e6fd3d5f3cf0bae8f0b3e6f2a9f5ce9bd2f14a52fb1d1f23baddc3ed.c2e65ea90d8672dd4b4142ea7dd4040883e024e5a78acfc46626bed

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=649311328.0, style=ProgressStyle(descri…

2021-04-18 11:10:40 filelock DEBUG: Attempting to release lock 140085712401680 on /root/.cache/huggingface/transformers/3ef81553e6fd3d5f3cf0bae8f0b3e6f2a9f5ce9bd2f14a52fb1d1f23baddc3ed.c2e65ea90d8672dd4b4142ea7dd4040883e024e5a78acfc46626bed3f02421cd.h5.lock
2021-04-18 11:10:40 filelock INFO: Lock 140085712401680 released on /root/.cache/huggingface/transformers/3ef81553e6fd3d5f3cf0bae8f0b3e6f2a9f5ce9bd2f14a52fb1d1f23baddc3ed.c2e65ea90d8672dd4b4142ea7dd4040883e024e5a78acfc46626bed3f02421cd.h5.lock





All model checkpoint layers were used when initializing TFBertForTokenClassification.

All the layers of TFBertForTokenClassification were initialized from the model checkpoint at overfit/twiner-bert-base-mtl.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForTokenClassification for predictions without further training.


In [71]:
tag_map = {
  "B-EVE": "B-EVE",
  "B-LOC": "B-LOC",
  "B-ORG": "B-ORG",
  "B-PER": "B-PER",
  "B-POG": "B-POG",
  "B-NAT": "B-NAT",
  "I-EVE": "I-EVE",
  "I-LOC": "I-LOC",
  "I-ORG": "I-ORG",
  "I-PER": "I-PER",
  "I-POG": "I-POG",
  "I-NAT": "I-NAT",
  'O': 'O'
}
y_true, y_perd = align_prediction(masked_texts, new_tag, twiner_mtl, tag_map)
benchmark(y_true, y_perd, ['PER', 'LOC', 'ORG', 'NAT', 'POG'])

10
20
30
40
50
60
70
80


2021-04-18 12:10:02 root INFO: Imported 1 predictions for 1 true examples


({'ent_type': {'actual': 17,
   'correct': 4,
   'incorrect': 0,
   'missed': 80,
   'partial': 0,
   'possible': 84,
   'precision': 0.23529411764705882,
   'recall': 0.047619047619047616,
   'spurious': 13},
  'exact': {'actual': 17,
   'correct': 4,
   'incorrect': 0,
   'missed': 80,
   'partial': 0,
   'possible': 84,
   'precision': 0.23529411764705882,
   'recall': 0.047619047619047616,
   'spurious': 13},
  'partial': {'actual': 17,
   'correct': 4,
   'incorrect': 0,
   'missed': 80,
   'partial': 0,
   'possible': 84,
   'precision': 0.23529411764705882,
   'recall': 0.047619047619047616,
   'spurious': 13},
  'strict': {'actual': 17,
   'correct': 4,
   'incorrect': 0,
   'missed': 80,
   'partial': 0,
   'possible': 84,
   'precision': 0.23529411764705882,
   'recall': 0.047619047619047616,
   'spurious': 13}},
 {'LOC': {'ent_type': {'actual': 5,
    'correct': 1,
    'incorrect': 0,
    'missed': 19,
    'partial': 0,
    'possible': 20,
    'precision': 0.2,
    'recall':

In [32]:
masked_texts_peyma = []
for text, tweet_tag in zip(texts_peyma, tags_peyma):
  masked_text = []
  for word, tag in zip(text, tweet_tag):
    if tag != 'O':
      masked_text.append('[MASK]')
    else:
      masked_text.append(word)
  masked_texts_peyma.append(masked_text)

In [29]:
tokenizer = AutoTokenizer.from_pretrained("HooshvareLab/bert-base-parsbert-peymaner-uncased")
model = AutoModelForTokenClassification.from_pretrained("HooshvareLab/bert-base-parsbert-peymaner-uncased")
model.eval()
parsbert_peyma = pipeline('ner', model=model, tokenizer=tokenizer, ignore_labels=[])

2021-04-18 11:23:06 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-18 11:23:06 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /HooshvareLab/bert-base-parsbert-peymaner-uncased/resolve/main/config.json HTTP/1.1" 200 0
2021-04-18 11:23:06 filelock DEBUG: Attempting to acquire lock 140082938759696 on /root/.cache/huggingface/transformers/63b8a6a3548fd09df2b1c8b59dafbee427c1ec85c7a2a6e896454f11611e7f04.c4ce79e951e964f7837f2d8e269cd9a0bbcee6b5be0aeb65e8fb944d5184af3c.lock
2021-04-18 11:23:06 filelock INFO: Lock 140082938759696 acquired on /root/.cache/huggingface/transformers/63b8a6a3548fd09df2b1c8b59dafbee427c1ec85c7a2a6e896454f11611e7f04.c4ce79e951e964f7837f2d8e269cd9a0bbcee6b5be0aeb65e8fb944d5184af3c.lock
2021-04-18 11:23:06 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-18 11:23:07 urllib3.connectionpool DEBUG: https://huggingface.co:443 "GET /HooshvareLab/bert-base-parsbert-peymane

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=997.0, style=ProgressStyle(description_…

2021-04-18 11:23:07 filelock DEBUG: Attempting to release lock 140082938759696 on /root/.cache/huggingface/transformers/63b8a6a3548fd09df2b1c8b59dafbee427c1ec85c7a2a6e896454f11611e7f04.c4ce79e951e964f7837f2d8e269cd9a0bbcee6b5be0aeb65e8fb944d5184af3c.lock
2021-04-18 11:23:07 filelock INFO: Lock 140082938759696 released on /root/.cache/huggingface/transformers/63b8a6a3548fd09df2b1c8b59dafbee427c1ec85c7a2a6e896454f11611e7f04.c4ce79e951e964f7837f2d8e269cd9a0bbcee6b5be0aeb65e8fb944d5184af3c.lock
2021-04-18 11:23:07 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443





2021-04-18 11:23:08 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /HooshvareLab/bert-base-parsbert-peymaner-uncased/resolve/main/vocab.txt HTTP/1.1" 200 0
2021-04-18 11:23:08 filelock DEBUG: Attempting to acquire lock 140082937258192 on /root/.cache/huggingface/transformers/a636f20d387de26da664850322ecd2eb83581e329c3f6e1f6efced8d86a6d613.6699f2ee4745b6531f79b9781879071b6ace2d2768df83889391421fb44d4474.lock
2021-04-18 11:23:08 filelock INFO: Lock 140082937258192 acquired on /root/.cache/huggingface/transformers/a636f20d387de26da664850322ecd2eb83581e329c3f6e1f6efced8d86a6d613.6699f2ee4745b6531f79b9781879071b6ace2d2768df83889391421fb44d4474.lock
2021-04-18 11:23:08 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-18 11:23:08 urllib3.connectionpool DEBUG: https://huggingface.co:443 "GET /HooshvareLab/bert-base-parsbert-peymaner-uncased/resolve/main/vocab.txt HTTP/1.1" 200 1215509


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1215509.0, style=ProgressStyle(descript…

2021-04-18 11:23:09 filelock DEBUG: Attempting to release lock 140082937258192 on /root/.cache/huggingface/transformers/a636f20d387de26da664850322ecd2eb83581e329c3f6e1f6efced8d86a6d613.6699f2ee4745b6531f79b9781879071b6ace2d2768df83889391421fb44d4474.lock
2021-04-18 11:23:09 filelock INFO: Lock 140082937258192 released on /root/.cache/huggingface/transformers/a636f20d387de26da664850322ecd2eb83581e329c3f6e1f6efced8d86a6d613.6699f2ee4745b6531f79b9781879071b6ace2d2768df83889391421fb44d4474.lock
2021-04-18 11:23:09 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443





2021-04-18 11:23:10 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /HooshvareLab/bert-base-parsbert-peymaner-uncased/resolve/main/tokenizer.json HTTP/1.1" 404 0
2021-04-18 11:23:10 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-18 11:23:10 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /HooshvareLab/bert-base-parsbert-peymaner-uncased/resolve/main/added_tokens.json HTTP/1.1" 404 0
2021-04-18 11:23:10 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-18 11:23:11 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /HooshvareLab/bert-base-parsbert-peymaner-uncased/resolve/main/special_tokens_map.json HTTP/1.1" 200 0
2021-04-18 11:23:11 filelock DEBUG: Attempting to acquire lock 140082936942864 on /root/.cache/huggingface/transformers/0ec614ae9c28937555af1539fc9e2320ebe7a5e9aecf6b712ff2bea88b62e4b8.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…

2021-04-18 11:23:12 filelock DEBUG: Attempting to release lock 140082936942864 on /root/.cache/huggingface/transformers/0ec614ae9c28937555af1539fc9e2320ebe7a5e9aecf6b712ff2bea88b62e4b8.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d.lock
2021-04-18 11:23:12 filelock INFO: Lock 140082936942864 released on /root/.cache/huggingface/transformers/0ec614ae9c28937555af1539fc9e2320ebe7a5e9aecf6b712ff2bea88b62e4b8.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d.lock
2021-04-18 11:23:12 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443





2021-04-18 11:23:12 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /HooshvareLab/bert-base-parsbert-peymaner-uncased/resolve/main/tokenizer_config.json HTTP/1.1" 200 0
2021-04-18 11:23:12 filelock DEBUG: Attempting to acquire lock 140082938694544 on /root/.cache/huggingface/transformers/1ad82da13039dcf9acaf9f82ffeb7366accb0f33f64b4f697a5716a69eee8285.5cc6e825eb228a7a5cfd27cb4d7151e97a79fb962b31aaf1813aa102e746584b.lock
2021-04-18 11:23:12 filelock INFO: Lock 140082938694544 acquired on /root/.cache/huggingface/transformers/1ad82da13039dcf9acaf9f82ffeb7366accb0f33f64b4f697a5716a69eee8285.5cc6e825eb228a7a5cfd27cb4d7151e97a79fb962b31aaf1813aa102e746584b.lock
2021-04-18 11:23:12 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-18 11:23:13 urllib3.connectionpool DEBUG: https://huggingface.co:443 "GET /HooshvareLab/bert-base-parsbert-peymaner-uncased/resolve/main/tokenizer_config.json HTTP/1.1" 200 2


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2.0, style=ProgressStyle(description_wi…

2021-04-18 11:23:13 filelock DEBUG: Attempting to release lock 140082938694544 on /root/.cache/huggingface/transformers/1ad82da13039dcf9acaf9f82ffeb7366accb0f33f64b4f697a5716a69eee8285.5cc6e825eb228a7a5cfd27cb4d7151e97a79fb962b31aaf1813aa102e746584b.lock
2021-04-18 11:23:13 filelock INFO: Lock 140082938694544 released on /root/.cache/huggingface/transformers/1ad82da13039dcf9acaf9f82ffeb7366accb0f33f64b4f697a5716a69eee8285.5cc6e825eb228a7a5cfd27cb4d7151e97a79fb962b31aaf1813aa102e746584b.lock





2021-04-18 11:23:13 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-18 11:23:14 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /HooshvareLab/bert-base-parsbert-peymaner-uncased/resolve/main/config.json HTTP/1.1" 200 0
2021-04-18 11:23:14 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-18 11:23:14 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /HooshvareLab/bert-base-parsbert-peymaner-uncased/resolve/main/pytorch_model.bin HTTP/1.1" 302 0
2021-04-18 11:23:15 filelock DEBUG: Attempting to acquire lock 140082939016400 on /root/.cache/huggingface/transformers/070d60b171933e7e2340a97a691b9aa5eebedc5262212f6fdd00ab95ed762986.e4ae6c5a02a126ab0d1c8352dfd857fbab8490a9460b3cc1d92227127dc21742.lock
2021-04-18 11:23:15 filelock INFO: Lock 140082939016400 acquired on /root/.cache/huggingface/transformers/070d60b171933e7e2340a97a691b9aa5eebedc5262212f6fdd00ab95ed762986.e4ae6c5a02a1

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=651459729.0, style=ProgressStyle(descri…

2021-04-18 11:24:10 filelock DEBUG: Attempting to release lock 140082939016400 on /root/.cache/huggingface/transformers/070d60b171933e7e2340a97a691b9aa5eebedc5262212f6fdd00ab95ed762986.e4ae6c5a02a126ab0d1c8352dfd857fbab8490a9460b3cc1d92227127dc21742.lock
2021-04-18 11:24:10 filelock INFO: Lock 140082939016400 released on /root/.cache/huggingface/transformers/070d60b171933e7e2340a97a691b9aa5eebedc5262212f6fdd00ab95ed762986.e4ae6c5a02a126ab0d1c8352dfd857fbab8490a9460b3cc1d92227127dc21742.lock





In [33]:
tag_map = {
  "B_LOC": "B_LOC",
  "B_ORG": "B_ORG",
  "B_PER": "B_PER",
  "B_MON": "B_MON",
  "B_PCT": "B_PCT",
  "B_DAT": "B_DAT",
  "B_TIM": "B_TIM",
  "I_LOC": "I_LOC",
  "I_ORG": "I_ORG",
  "I_PER": "I_PER",
  "I_MON": "I_MON",
  "I_PCT": "I_PCT",
  "I_DAT": "I_DAT",
  "I_TIM": "I_TIM",
  'O': 'O'
}
y_true, y_perd = align_prediction(masked_texts_peyma, tags_peyma, parsbert_peyma, tag_map)
benchmark(y_true, y_perd, ['PER', 'LOC', 'ORG', 'MON', 'PCT', 'DAT', 'TIM'])

10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490


2021-04-18 11:29:31 root INFO: Imported 1 predictions for 1 true examples


500


({'ent_type': {'actual': 198,
   'correct': 102,
   'incorrect': 26,
   'missed': 847,
   'partial': 0,
   'possible': 975,
   'precision': 0.5151515151515151,
   'recall': 0.10461538461538461,
   'spurious': 70},
  'exact': {'actual': 198,
   'correct': 92,
   'incorrect': 36,
   'missed': 847,
   'partial': 0,
   'possible': 975,
   'precision': 0.46464646464646464,
   'recall': 0.09435897435897436,
   'spurious': 70},
  'partial': {'actual': 162,
   'correct': 92,
   'incorrect': 0,
   'missed': 847,
   'partial': 36,
   'possible': 975,
   'precision': 0.6790123456790124,
   'recall': 0.11282051282051282,
   'spurious': 70},
  'strict': {'actual': 198,
   'correct': 72,
   'incorrect': 56,
   'missed': 847,
   'partial': 0,
   'possible': 975,
   'precision': 0.36363636363636365,
   'recall': 0.07384615384615385,
   'spurious': 70}},
 {'DAT': {'ent_type': {'actual': 62,
    'correct': 26,
    'incorrect': 0,
    'missed': 71,
    'partial': 0,
    'possible': 97,
    'precision': 0

In [31]:
benchmark(y_true, y_perd, ['PER', 'LOC', 'ORG'])

2021-04-18 11:27:24 root INFO: Imported 1 predictions for 1 true examples


({'ent_type': {'actual': 116,
   'correct': 78,
   'incorrect': 10,
   'missed': 754,
   'partial': 0,
   'possible': 842,
   'precision': 0.6724137931034483,
   'recall': 0.09263657957244656,
   'spurious': 28},
  'exact': {'actual': 116,
   'correct': 87,
   'incorrect': 1,
   'missed': 754,
   'partial': 0,
   'possible': 842,
   'precision': 0.75,
   'recall': 0.10332541567695962,
   'spurious': 28},
  'partial': {'actual': 115,
   'correct': 87,
   'incorrect': 0,
   'missed': 754,
   'partial': 1,
   'possible': 842,
   'precision': 0.7608695652173914,
   'recall': 0.10391923990498812,
   'spurious': 28},
  'strict': {'actual': 116,
   'correct': 77,
   'incorrect': 11,
   'missed': 754,
   'partial': 0,
   'possible': 842,
   'precision': 0.6637931034482759,
   'recall': 0.09144893111638955,
   'spurious': 28}},
 {'LOC': {'ent_type': {'actual': 9,
    'correct': 0,
    'incorrect': 7,
    'missed': 299,
    'partial': 0,
    'possible': 306,
    'precision': 0.0,
    'recall': 0

### TEst with word only

In [41]:
new_texts = []
new_tags = []
for text, tweet_tag in zip(texts, tags):
  new_text = []
  new_tag = []
  flag = False
  for word, tag in zip(text, tweet_tag):
    if tag.startswith('B') and flag == False:
      flag = True
      new_text.append(word)
      new_tag.append(tag)
    elif tag.startswith('B') and flag == True:
      new_texts.append(new_text)
      new_tags.append(new_tag)
      flag = True
      new_text = []
      new_tag = []
      new_text.append(word)
      new_tag.append(tag)
    elif tag.startswith('I'):
      new_text.append(word)
      new_tag.append(tag)
    elif flag:
      new_texts.append(new_text)
      new_tags.append(new_tag)
      flag = False
      new_text = []
      new_tag = []

In [44]:
new_texts[5], new_tags[5]

(['جنگ', 'دوم', 'جهانی'], ['B-EVE', 'I-EVE', 'I-EVE'])

In [45]:
from transformers import TFAutoModelForTokenClassification
tokenizer = AutoTokenizer.from_pretrained("overfit/twiner-bert-base-mtl")
model = TFAutoModelForTokenClassification.from_pretrained("overfit/twiner-bert-base-mtl")
twiner_mtl = pipeline('ner', model=model, tokenizer=tokenizer, ignore_labels=[])

2021-04-18 11:49:25 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-18 11:49:26 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /overfit/twiner-bert-base-mtl/resolve/main/config.json HTTP/1.1" 200 0
2021-04-18 11:49:26 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-18 11:49:26 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /overfit/twiner-bert-base-mtl/resolve/main/vocab.txt HTTP/1.1" 200 0
2021-04-18 11:49:26 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-18 11:49:27 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /overfit/twiner-bert-base-mtl/resolve/main/tokenizer.json HTTP/1.1" 404 0
2021-04-18 11:49:27 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-18 11:49:28 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /overfit/twiner-bert-base-mtl/resolve/

In [92]:
tag_map = {
  "B-EVE": "B-EVE",
  "B-LOC": "B-LOC",
  "B-ORG": "B-ORG",
  "B-PER": "B-PER",
  "B-POG": "B-POG",
  "B-NAT": "B-NAT",
  "I-EVE": "I-EVE",
  "I-LOC": "I-LOC",
  "I-ORG": "I-ORG",
  "I-PER": "I-PER",
  "I-POG": "I-POG",
  "I-NAT": "I-NAT",
  'O': 'O'
}
y_true, y_perd = align_prediction(new_texts, new_tags, twiner_mtl, tag_map)
benchmark(y_true, y_perd, ['PER', 'LOC', 'ORG', 'EVE', 'POG', 'NAT'])

10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490


2021-04-18 13:26:00 root INFO: Imported 1 predictions for 1 true examples


500


({'ent_type': {'actual': 460,
   'correct': 404,
   'incorrect': 26,
   'missed': 70,
   'partial': 0,
   'possible': 500,
   'precision': 0.8782608695652174,
   'recall': 0.808,
   'spurious': 30},
  'exact': {'actual': 460,
   'correct': 423,
   'incorrect': 7,
   'missed': 70,
   'partial': 0,
   'possible': 500,
   'precision': 0.9195652173913044,
   'recall': 0.846,
   'spurious': 30},
  'partial': {'actual': 453,
   'correct': 423,
   'incorrect': 0,
   'missed': 70,
   'partial': 7,
   'possible': 500,
   'precision': 0.9415011037527594,
   'recall': 0.853,
   'spurious': 30},
  'strict': {'actual': 460,
   'correct': 397,
   'incorrect': 33,
   'missed': 70,
   'partial': 0,
   'possible': 500,
   'precision': 0.8630434782608696,
   'recall': 0.794,
   'spurious': 30}},
 {'EVE': {'ent_type': {'actual': 9,
    'correct': 9,
    'incorrect': 0,
    'missed': 3,
    'partial': 0,
    'possible': 12,
    'precision': 1.0,
    'recall': 0.75,
    'spurious': 0},
   'exact': {'actual

### Coverage of tokens in test set

#### Twiner

In [119]:
testset_named_entities = []
for tweet, tweet_tags in zip(texts, tags):
  for word, tag in zip(tweet, tweet_tags):
    if tag != 'O':
      testset_named_entities.append(word)

In [120]:
trainset_named_entities = []
for tweet, tweet_tags in zip(texts_train, tags_train):
  for word, tag in zip(tweet, tweet_tags):
    if tag != 'O':
      trainset_named_entities.append(word)

In [121]:
diff_test_train = set(testset_named_entities) - set(trainset_named_entities)

In [130]:
len(set(diff_test_train))

147

In [131]:
diff_test_train

{'1987',
 'Arthur',
 'Childress',
 'Demarest',
 'afc',
 'آشور',
 'آکوامن',
 'آیتی',
 'ابا',
 'ابوالفتوح',
 'ابومرزوق',
 'اردوگاه',
 'ارومچیان',
 'اسوشیتدپرس',
 'اسپیس',
 'اصلاحطلبها',
 'افسانه',
 'الوندی',
 'الیگودرز',
 'ال\u200cسی',
 'امداد',
 'امریکائی\u200cها',
 'امپریالیستی',
 'اهی',
 'اوتادی',
 'اینترسکشنالیتی',
 'ایندیانا',
 'ایکس',
 'بخارا',
 'بدوایم',
 'برنا',
 'بعث',
 'بنان',
 'بهارمست',
 'بهرگان',
 'بکا',
 'بیرانشهر',
 'بیروت',
 'بیشاپور',
 'بی\u200cآی',
 'بی\u200c\u200cبی\u200cسی',
 'تاثیرگذار\u200c',
 'تاپ',
 'تریکو',
 'تسوایک',
 'تورکییسم',
 'ج',
 'جوانان',
 'حکیمی',
 'خاچ',
 'خرمشاهی',
 'دادستانی',
 'دانش\u200cپژوهان',
 'دسته',
 'دور',
 'دیفکتو',
 'دیویدسون',
 'رافائل',
 'راونسبروک',
 'رختیانی',
 'رضوان',
 'رهبرو',
 'روتسکوی',
 'رومشکان',
 'سالیانی',
 'ساکاشویلی',
 'سبز\u200cوار',
 'سلین',
 'سندالی',
 'سورآبادی',
 'سورب',
 'سویگرت',
 'سپیدنام',
 'سیدرضی',
 'سیدعباسی',
 'شاپ\u200cکار',
 'شفرو',
 'شهدای',
 'شیخی',
 'شیرکوند',
 'صابر',
 'طبرسی',
 'عثمان',
 'عفوبین\u200cالملل

In [123]:
len(set(testset_named_entities))

702

In [136]:
diff_tweet = []
diff_tags = []
for tweet, tweet_tags in zip(texts, tags):
  flag = False
  for word, tag in zip(tweet, tweet_tags):
    if tag != 'O':
      if word not in diff_test_train:
        flag = True
  if flag == False:
    diff_tweet.append(tweet)
    diff_tags.append(tweet_tags)

In [137]:
len(diff_tweet)

87

In [138]:
len(texts)

303

In [128]:
diff_tweet

[['دلم',
  'برای',
  'شب\u200cهای',
  'میدان',
  'تنگ',
  'شده',
  '.',
  'خدایا',
  'کرونا',
  'نمیخواد',
  'بره'],
 ['بله',
  'و',
  'بسیار',
  'کار',
  'خوبیه',
  'اما',
  'ادعای',
  'پزشکان',
  'و',
  'دانشمندان',
  'بسیار',
  'بیشتر',
  'از',
  'این',
  'حرفا',
  'ست',
  'بسیار',
  'بسیار',
  'کند',
  'عمل',
  'میکنن'],
 ['گفتم', 'با', 'خبرین', 'لابد'],
 ['منم',
  'پرتغال',
  'هرچی',
  'میوه',
  'باشه',
  'پرتغال',
  'نباشه',
  'میگم',
  'هیچی',
  'میوه',
  'نداریم',
  'که'],
 ['ما',
  'مرد',
  'جنگیم',
  'چه',
  'جنگ',
  'سخت',
  'باشد',
  'و',
  'چه',
  'جنگ',
  'نرم',
  'تصویر',
  'مربوط',
  'به',
  'کانال',
  'شهداست',
  'که',
  'به',
  'مناسبت',
  '40',
  'سالگرد',
  'تجاوز',
  'دشمنان',
  'به',
  'میهن',
  'اسلامی',
  'و',
  'دفاع',
  'جانانه',
  'رزمندگان',
  'در',
  'برابر',
  'آنان',
  'برای',
  'اولین',
  'بار',
  'در',
  'شهرستان',
  'کلاله',
  'ساخته',
  'شده',
  'است',
  'شهدا',
  'کانال',
  'شهدا',
  'بصیرت'],
 ['تو',
  'را',
  'بخدا',
  'بگذارید',
  'خود',
  'مردم',

In [109]:
tokenizer = AutoTokenizer.from_pretrained("overfit/twiner-bert-base-mtl")
model = TFAutoModelForTokenClassification.from_pretrained("overfit/twiner-bert-base-mtl")
twiner_mtl = pipeline('ner', model=model, tokenizer=tokenizer, ignore_labels=[])

2021-04-18 13:50:51 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-18 13:50:52 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /overfit/twiner-bert-base-mtl/resolve/main/config.json HTTP/1.1" 200 0
2021-04-18 13:50:52 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-18 13:50:52 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /overfit/twiner-bert-base-mtl/resolve/main/vocab.txt HTTP/1.1" 200 0
2021-04-18 13:50:52 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-18 13:50:53 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /overfit/twiner-bert-base-mtl/resolve/main/tokenizer.json HTTP/1.1" 404 0
2021-04-18 13:50:53 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-18 13:50:53 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /overfit/twiner-bert-base-mtl/resolve/

In [140]:
tag_map = {
  "B-EVE": "B-EVE",
  "B-LOC": "B-LOC",
  "B-ORG": "B-ORG",
  "B-PER": "B-PER",
  "B-POG": "B-POG",
  "B-NAT": "B-NAT",
  "I-EVE": "I-EVE",
  "I-LOC": "I-LOC",
  "I-ORG": "I-ORG",
  "I-PER": "I-PER",
  "I-POG": "I_POG",
  "I-NAT": "I-NAT",
  'O': 'O'
}
y_true, y_perd = align_prediction(diff_tweet, diff_tags, twiner_mtl, tag_map)
benchmark(y_true, y_perd, ['PER', 'LOC', 'ORG'])

2021-04-18 14:51:13 root INFO: Imported 1 predictions for 1 true examples


({'ent_type': {'actual': 15,
   'correct': 5,
   'incorrect': 1,
   'missed': 2,
   'partial': 0,
   'possible': 8,
   'precision': 0.3333333333333333,
   'recall': 0.625,
   'spurious': 9},
  'exact': {'actual': 15,
   'correct': 6,
   'incorrect': 0,
   'missed': 2,
   'partial': 0,
   'possible': 8,
   'precision': 0.4,
   'recall': 0.75,
   'spurious': 9},
  'partial': {'actual': 15,
   'correct': 6,
   'incorrect': 0,
   'missed': 2,
   'partial': 0,
   'possible': 8,
   'precision': 0.4,
   'recall': 0.75,
   'spurious': 9},
  'strict': {'actual': 15,
   'correct': 5,
   'incorrect': 1,
   'missed': 2,
   'partial': 0,
   'possible': 8,
   'precision': 0.3333333333333333,
   'recall': 0.625,
   'spurious': 9}},
 {'LOC': {'ent_type': {'actual': 4,
    'correct': 3,
    'incorrect': 0,
    'missed': 0,
    'partial': 0,
    'possible': 3,
    'precision': 0.75,
    'recall': 1.0,
    'spurious': 1},
   'exact': {'actual': 4,
    'correct': 3,
    'incorrect': 0,
    'missed': 0,
  

In [None]:
tokenizer = AutoTokenizer.from_pretrained("HooshvareLab/bert-base-parsbert-peymaner-uncased")
model = AutoModelForTokenClassification.from_pretrained("HooshvareLab/bert-base-parsbert-peymaner-uncased")
model.eval()
parsbert_peyma = pipeline('ner', model=model, tokenizer=tokenizer, ignore_labels=[])

#### Peyma

In [113]:
testset_named_entities = []
for tweet, tweet_tags in zip(texts_peyma, tags_peyma):
  for word, tag in zip(tweet, tweet_tags):
    if tag != 'O':
      testset_named_entities.append(word)

In [114]:
trainset_named_entities = []
for tweet, tweet_tags in zip(texts_peyma_train, tags_peyma_train):
  for word, tag in zip(tweet, tweet_tags):
    if tag != 'O':
      trainset_named_entities.append(word)

In [115]:
diff_test_train = set(testset_named_entities) - set(trainset_named_entities)

In [116]:
len(diff_test_train)

0