In [None]:
!python -m spacy download en_core_web_lg

In [None]:
!pip install transformers

In [None]:
!pip install presidio_analyzer

In [None]:
!pip install span_marker

In [5]:
import pandas as pd

In [6]:
from span_marker import SpanMarkerModel


In [7]:
import logging
from typing import Optional, List, Tuple, Set

from presidio_analyzer import (
    RecognizerResult,
    EntityRecognizer,
    AnalysisExplanation,
)
from presidio_analyzer.nlp_engine import NlpArtifacts

try:
    from span_marker import SpanMarkerModel
except ImportError:
    print("span_marker is not installed")


logger = logging.getLogger("presidio-analyzer")


class SpanMarkerRecognizer(EntityRecognizer):
    """
    Wrapper for a SpanMarker model so it can be used within Presidio Analyzer.

    :example:
    >from presidio_analyzer import AnalyzerEngine, RecognizerRegistry

    >span_marker_recognizer = SpanMarkerRecognizer()

    >registry = RecognizerRegistry()
    >registry.add_recognizer(span_marker_recognizer)

    >analyzer = AnalyzerEngine(registry=registry)

    >results = analyzer.analyze(
    >    "My name is Christopher and I live in Irbid.",
    >    language="en",
    >    return_decision_process=True,
    >)
    >for result in results:
    >    print(result)
    >    print(result.analysis_explanation)


    """

    # Entity names this recognizer reports as supported by default. The
    # commented-out names are deliberately disabled.
    #
    # NOTE(review): analyze() only emits a span when the requested entity is a
    # member of the first set of some CHECK_LABEL_GROUPS pair. Of the names
    # listed here only 'PERSON' is, so with the defaults no other label is ever
    # returned. Confirm whether this list should hold the Presidio-side names
    # ("LOCATION", "ORGANIZATION", ...) before relying on the other entries.
    ENTITIES = [
        'QUANTITY',
        'LOC',
        'ORDINAL',
        'CARDINAL',
        'PERCENT',
        'PERSON',
        'GPE',
        'NORP',
        # 'MONEY',
        'FAC',
        # 'PRODUCT',
        # 'LANGUAGE',
        # 'LAW',
        # 'EVENT',
        'ORG',
        'DATE',
        'TIME',
        # 'WORK_OF_ART'
    ]

    DEFAULT_EXPLANATION = "Identified as {} by SpanMarker Named Entity Recognition"

    # Pairs of (requested entity names, model labels accepted for them).
    CHECK_LABEL_GROUPS = [
        ({"LOCATION"}, {"LOC", "LOCATION", 'FAC'}),
        ({"PERSON"}, {"PER", "PERSON"}),
        ({"ORGANIZATION"}, {"ORG", 'NORP'}),
        (
            {"MISCELLANEOUS"},
            {'QUANTITY', 'ORDINAL', 'CARDINAL', 'PERCENT', 'DATE', 'TIME'},
        ),
    ]

    # Model label -> entity type written into the RecognizerResult. Labels that
    # are missing from this mapping are passed through unchanged.
    PRESIDIO_EQUIVALENCES = {
        "PER": "PERSON",
        "LOC": "LOCATION",
        "ORG": "ORGANIZATION",
        'MISC': 'MISCELLANEOUS',
    }

    def __init__(
        self,
        supported_language: str = "en",
        supported_entities: Optional[List[str]] = None,
        check_label_groups: Optional[List[Tuple[Set, Set]]] = None,
        model: Optional["SpanMarkerModel"] = None,
    ):
        """
        Create the recognizer and load (or adopt) the SpanMarker model.

        :param supported_language: Language code passed to EntityRecognizer.
        :param supported_entities: Entity names to report as supported;
            falls back to ENTITIES when empty or None.
        :param check_label_groups: Pairs of (entity names, model labels) used to
            match predictions to requested entities; falls back to
            CHECK_LABEL_GROUPS when empty or None.
        :param model: An already loaded SpanMarkerModel. When None, the
            "tomaarsen/span-marker-roberta-large-ontonotes5" checkpoint is
            loaded with from_pretrained.
        """
        self.check_label_groups = (
            check_label_groups if check_label_groups else self.CHECK_LABEL_GROUPS
        )

        # Copy the class-level default so instance-side mutation of the list
        # cannot leak into other instances.
        supported_entities = (
            supported_entities if supported_entities else list(self.ENTITIES)
        )

        self.model = (
            model
            if model is not None
            else SpanMarkerModel.from_pretrained(
                "tomaarsen/span-marker-roberta-large-ontonotes5"
            )
        )

        # Only move the model to the GPU when one is present; calling .cuda()
        # unconditionally raises on CPU-only machines.
        import torch

        if torch.cuda.is_available():
            self.model.cuda()

        super().__init__(
            supported_entities=supported_entities,
            supported_language=supported_language,
            name="SpanMarker Analytics",
        )

    def load(self) -> None:
        """Load the model, not used. Model is loaded during initialization."""
        pass

    def get_supported_entities(self) -> List[str]:
        """
        Return supported entities by this model.

        :return: List of the supported entities.
        """
        return self.supported_entities

    # Entry point used when SpanMarker runs inside Presidio as an external
    # recognizer.
    def analyze(
        self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts = None
    ) -> List[RecognizerResult]:
        """
        Analyze text using the SpanMarker model.

        :param text: The text for analysis.
        :param entities: Entity names to look for. When empty or None, all
            supported entities are used. Names that are not supported are
            ignored.
        :param nlp_artifacts: Not used by this recognizer.
        :return: The list of Presidio RecognizerResult constructed from the
            recognized SpanMarker detections, one per matching predicted span.
        """
        # If there is no specific list of entities, look for all of them.
        if not entities:
            entities = self.supported_entities

        requested = [
            entity for entity in entities if entity in self.supported_entities
        ]
        # Nothing this recognizer supports was requested: skip model inference.
        if not requested:
            return []

        results = []
        for ent in self.model.predict(text):
            label = ent["label"]
            # Emit each predicted span at most once, even if several requested
            # entities map to its label.
            matches_request = any(
                self.__check_label(entity, label, self.check_label_groups)
                for entity in requested
            )
            if not matches_request:
                continue

            textual_explanation = self.DEFAULT_EXPLANATION.format(label)
            explanation = self.build_span_marker_explanation(
                round(ent["score"], 2), textual_explanation
            )
            results.append(self._convert_to_recognizer_result(ent, explanation))

        return results

    def _convert_to_recognizer_result(self, entity, explanation) -> RecognizerResult:
        """
        Build a RecognizerResult from one predicted span.

        :param entity: Prediction mapping; the keys "label", "score",
            "char_start_index" and "char_end_index" are read.
        :param explanation: AnalysisExplanation attached to the result.
        :return: RecognizerResult whose score is rounded to two decimals and
            whose entity type is translated through PRESIDIO_EQUIVALENCES.
        """
        entity_type = self.PRESIDIO_EQUIVALENCES.get(entity["label"], entity["label"])
        span_marker_score = round(entity["score"], 2)

        span_marker_results = RecognizerResult(
            entity_type=entity_type,
            start=entity["char_start_index"],
            end=entity["char_end_index"],
            score=span_marker_score,
            analysis_explanation=explanation,
        )

        return span_marker_results

    def build_span_marker_explanation(
        self, original_score: float, explanation: str
    ) -> AnalysisExplanation:
        """
        Create explanation for why this result was detected.

        :param original_score: Score given by this recognizer
        :param explanation: Explanation string
        :return: AnalysisExplanation naming this class as the recognizer.
        """
        explanation = AnalysisExplanation(
            recognizer=self.__class__.__name__,
            original_score=original_score,
            textual_explanation=explanation,
        )
        return explanation

    @staticmethod
    def __check_label(
        entity: str, label: str, check_label_groups: List[Tuple[Set, Set]]
    ) -> bool:
        """
        Tell whether a model label is accepted for a requested entity.

        :param entity: Requested entity name.
        :param label: Label predicted by the model.
        :param check_label_groups: Pairs of (entity names, model labels).
        :return: True when some pair contains both the entity and the label.
        """
        return any(
            entity in egrp and label in lgrp for egrp, lgrp in check_label_groups
        )


In [8]:
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry

# Heads-up: the first run downloads a very large (+2GB) model.
span_marker_recognizer = SpanMarkerRecognizer()



Downloading (…)lve/main/config.json:   0%|          | 0.00/5.45k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 50267. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


In [14]:
# Build a registry that holds the custom SpanMarker recognizer first, then
# Presidio's predefined recognizers, and create the analyzer on top of it.
registry = RecognizerRegistry()
registry.add_recognizer(span_marker_recognizer)
registry.load_predefined_recognizers()
analyzer = AnalyzerEngine(registry=registry)



In [16]:
# Smoke test: analyze a single sentence and print every detection followed by
# the explanation requested through return_decision_process=True.
results = analyzer.analyze(
    "My name is Christopher and I live in 03/oct/21.",
    language="en",
    return_decision_process=True,
)
for result in results:
    print(result)
    print(result.analysis_explanation)

type: PERSON, start: 11, end: 22, score: 0.98
{'recognizer': 'SpanMarkerRecognizer', 'pattern_name': None, 'pattern': None, 'original_score': 0.98, 'score': 0.98, 'textual_explanation': 'Identified as PERSON by SpanMarker Named Entity Recognition', 'score_context_improvement': 0, 'supportive_context_word': '', 'validation_result': None}
type: DATE_TIME, start: 37, end: 46, score: 0.85
{'recognizer': 'SpacyRecognizer', 'pattern_name': None, 'pattern': None, 'original_score': 0.85, 'score': 0.85, 'textual_explanation': "Identified as DATE by Spacy's Named Entity Recognition", 'score_context_improvement': 0, 'supportive_context_word': '', 'validation_result': None}


In [12]:
from tqdm import tqdm

In [17]:
# Load the PII samples; names=["Text"] supplies the column name "Text".
# NOTE(review): with explicit names pandas reads the first line as data, so a
# header row (if the file has one) would become a sample — confirm.
df=pd.read_csv('/content/redaction_train_set - PII.csv',names=["Text"])

In [18]:
# Load the samples without PII; names=["Text"] supplies the column name "Text".
# NOTE(review): with explicit names pandas reads the first line as data, so a
# header row (if the file has one) would become a sample — confirm.
df1=pd.read_csv('/content/redaction_train_set - No PII.csv',names=["Text"])

In [19]:
# Run `analyzer` over every row of `df`, keeping the detections and their
# explanations so they can be attached as new columns.
results_list = []
explanation_list = []
# Loop through each row in the DataFrame
for index, row in tqdm(df.iterrows(), total=len(df)):
    text = row["Text"]
    # Analyze the text and store the result in the list
    results = analyzer.analyze(
        text,
        language="en",
        return_decision_process=True,
    )
    results_list.append(results)
    # Keep the explanation of every detection found in *this* row. The previous
    # code read `result`, a variable left over from an earlier cell, so each
    # row received the same unrelated explanation.
    explanation_list.append([res.analysis_explanation for res in results])
# Add the results list as a new column in the DataFrame
df["Annotated_Text"] = results_list
df["Explanation"] = explanation_list

599it [01:37,  6.17it/s]


In [20]:
# Run `analyzer` over every row of `df1`, keeping the detections and their
# explanations so they can be attached as new columns.
results_list1 = []
explanation_list1 = []
# Loop through each row in the DataFrame
for index, row in tqdm(df1.iterrows(), total=len(df1)):
    text = row["Text"]
    # Analyze the text and store the result in the list
    results = analyzer.analyze(
        text,
        language="en",
        return_decision_process=True,
    )
    results_list1.append(results)
    # Keep the explanation of every detection found in *this* row. The previous
    # code read `result`, a variable left over from an earlier cell, so each
    # row received the same unrelated explanation.
    explanation_list1.append([res.analysis_explanation for res in results])

# Add the results list as a new column in the DataFrame
df1["Annotated_Text"] = results_list1
df1["Explanation"] = explanation_list1

249it [00:35,  7.04it/s]


In [21]:
# Persist the frames annotated with `analyzer` (SpanMarker plus predefined
# recognizers) to the working directory.
# NOTE(review): the file names say "PIL" while the inputs say "PII" — confirm
# the intended spelling before other tooling depends on these names.
df.to_csv("redaction_annotated_SpanMaker_PIL.csv")
df1.to_csv("redaction_annotated_SpanMaker_NO_PIL.csv")

In [22]:
from presidio_analyzer import AnalyzerEngine
from presidio_analyzer.nlp_engine import NlpEngineProvider

In [23]:
# NLP engine settings: spaCy engine, English served by en_core_web_lg.
configuration1 = dict(
    nlp_engine_name="spacy",
    models=[dict(lang_code="en", model_name="en_core_web_lg")],
)

In [24]:
# Create NLP engine based on configuration
provider1 = NlpEngineProvider(nlp_configuration=configuration1)
nlp_engine1 = provider1.create_engine()
registry1 = RecognizerRegistry()


In [25]:
# Second analyzer, built from nlp_engine1 and registry1 (the SpanMarker
# recognizer is not registered on registry1), limited to English.
analyzer1 = AnalyzerEngine(nlp_engine=nlp_engine1,
                          registry=registry1,
                          supported_languages=["en"])

In [26]:
# Load the PII samples again into a separate frame for the analyzer1 run.
dfa=pd.read_csv('/content/redaction_train_set - PII.csv',names=["Text"])

In [27]:
# Load the No-PII samples again into a separate frame for the analyzer1 run.
df1a=pd.read_csv('/content/redaction_train_set - No PII.csv',names=["Text"])

In [28]:
# Run `analyzer1` over every row of `dfa`, keeping the detections and their
# explanations so they can be attached as new columns.
results_list = []
explanation_list = []
# Loop through each row in the DataFrame
for index, row in tqdm(dfa.iterrows(), total=len(dfa)):
    text = row["Text"]
    # Analyze the text and store the result in the list
    results = analyzer1.analyze(
        text,
        language="en",
        return_decision_process=True,
    )
    results_list.append(results)
    # Keep the explanation of every detection found in *this* row. The previous
    # code read `result`, a variable left over from an earlier cell, so each
    # row received the same unrelated explanation.
    explanation_list.append([res.analysis_explanation for res in results])
# Add the results list as a new column in the DataFrame
dfa["Annotated_Text"] = results_list
dfa["Explanation"] = explanation_list

599it [00:07, 76.89it/s]


In [29]:
# Run `analyzer1` over every row of `df1a`, keeping the detections and their
# explanations so they can be attached as new columns.
results_list1 = []
explanation_list1 = []
# Loop through each row in the DataFrame
for index, row in tqdm(df1a.iterrows(), total=len(df1a)):
    text = row["Text"]
    # Analyze the text and store the result in the list
    results = analyzer1.analyze(
        text,
        language="en",
        return_decision_process=True,
    )
    results_list1.append(results)
    # Keep the explanation of every detection found in *this* row. The previous
    # code read `result`, a variable left over from an earlier cell, so each
    # row received the same unrelated explanation.
    explanation_list1.append([res.analysis_explanation for res in results])

# Add the results list as a new column in the DataFrame
df1a["Annotated_Text"] = results_list1
df1a["Explanation"] = explanation_list1

249it [00:02, 89.48it/s] 


In [30]:
# Save the frames annotated by analyzer1. The previous code wrote df / df1
# (the SpanMarker run) under these spaCy file names, so the spaCy results in
# dfa / df1a were never saved.
dfa.to_csv("redaction_annotated_Spacy_PIL.csv")
df1a.to_csv("redaction_annotated_Spacy_NO_PIL.csv")