In [35]:
# download presidio
!pip install presidio_analyzer presidio_anonymizer
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m830.1 kB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [72]:
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
from presidio_analyzer import Pattern, PatternRecognizer
from presidio_analyzer.nlp_engine import NlpEngineProvider
from presidio_analyzer.predefined_recognizers import EmailRecognizer

# NOTE(review): absolute path at the filesystem root - confirm this is where the
# config file really lives in the target environment.
LANGUAGES_CONFIG_FILE = "/languages-config.yml"

# Create NLP engine based on configuration file
provider = NlpEngineProvider(conf_file=LANGUAGES_CONFIG_FILE)
nlp_engine_with_french = provider.create_engine()

# --- NIR recognizer (French text) -------------------------------------------
# Context words attached to the recognizer (unaccented, as written in the samples).
context_nir = ["securite", "sociale", "maladie"]

# The regex matches, in order:
#   [1-478]                 one digit among 1, 2, 3, 4, 7, 8
#   [0-9]{2}                two digits
#   0[1-9]|1[0-2]|62|63     01-12, or the special values 62 / 63
#   2[ABab]|[0-9]{2}        2A / 2B (any case) or two digits
#   001-990                 three digits
#   001-999                 three digits
#   01-97                   two trailing digits
# Raw string so that any future backslash escape reaches the regex engine untouched.
# NOTE(review): the pattern has no word boundaries, so it can also match inside a
# longer digit run; and with score=1 the context words presumably cannot raise
# the score any further - confirm both are intended.
NIR_pattern = Pattern(
    name="nir_pattern",
    regex=r"[1-478][0-9]{2}(0[1-9]|1[0-2]|62|63)(2[ABab]|[0-9]{2})(00[1-9]|0[1-9][0-9]|[1-8][0-9]{2}|9[0-8][0-9]|990)(00[1-9]|0[1-9][0-9]|[1-9][0-9]{2})(0[1-9]|[1-8][0-9]|9[0-7])",
    score=1,
)
nir_recognizer = PatternRecognizer(
    supported_entity="NIR",
    patterns=[NIR_pattern],
    context=context_nir,
    supported_language="fr",
)

# --- Polish ID recognizer (registered for French text) ----------------------
# Three uppercase letters followed by six digits.
# Raw string: "\d" in a normal string literal is an invalid escape sequence
# (SyntaxWarning on recent Python versions).
polish_id_pattern = Pattern(
    name="polish_id_pattern",
    regex=r"[A-Z]{3}\d{6}",
    score=1,
)
polish_id_recognizer = PatternRecognizer(
    supported_entity="POLISH_ID",
    patterns=[polish_id_pattern],
    supported_language="fr",
)




In [73]:
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry

# Registry shared by both languages handled in this notebook.
registry = RecognizerRegistry(supported_languages=["en", "fr"])
# Load the built-in recognizers for every supported language. Without the
# explicit `languages` argument only the default language is loaded, so French
# text would be analysed by the custom recognizers alone. Passing the NLP engine
# lets the registry pick the matching NER recognizer.
registry.load_predefined_recognizers(
    languages=["en", "fr"],
    nlp_engine=nlp_engine_with_french,
)
registry.remove_recognizer("UsBankRecognizer")

# Add the custom recognizers to the existing list of recognizers
registry.add_recognizer(nir_recognizer)
registry.add_recognizer(polish_id_recognizer)

# Set up analyzer with our updated recognizer registry
analyzer = AnalyzerEngine(
    registry=registry,
    supported_languages=["en", "fr"],
    nlp_engine=nlp_engine_with_french,
)

# Sample inputs: `text1` exercises the NIR recognizer, `text` the Polish ID one.
text1 = "Mon numéro de securite sociale 201069935115371, securite sociale 112233 de Marion Hugo"
text = "Voici mon id polonais: ABC123456"

# Run with input text
results = analyzer.analyze(text=text, language="fr")
print(results)



[type: POLISH_ID, start: 23, end: 32, score: 1]


In [None]:

# Email-only analyzer for the two languages this notebook works with.
# The NLP engine built earlier is `nlp_engine_with_french`, so the language
# list is ["en", "fr"] to stay consistent with the rest of the notebook.

# Setting up an English Email recognizer:
email_recognizer_en = EmailRecognizer(supported_language="en", context=["email", "mail"])

# Setting up a French Email recognizer, so that French text also has a matching recognizer
email_recognizer_fr = EmailRecognizer(supported_language="fr", context=["email", "mail", "courriel"])

# The registry must declare the same languages as the analyzer that uses it.
registry = RecognizerRegistry(supported_languages=["en", "fr"])

# Add recognizers to registry
registry.add_recognizer(email_recognizer_en)
registry.add_recognizer(email_recognizer_fr)

# Set up analyzer with our updated recognizer registry
analyzer = AnalyzerEngine(
    registry=registry,
    supported_languages=["en", "fr"],
    nlp_engine=nlp_engine_with_french)

# Only email recognizers are registered, so this sentence is expected to yield no email entity.
analyzer.analyze(text="My name is David", language="en")