## Being able to Indentify PII from a block of text 
### The NER (Named Entity Recognition) way! - Real Time model, takes in a text string at a time
### Pacakages - Pre-trained spacy model

In [6]:
import spacy
import pycountry
import pandas as pd
import re
import numpy as np
import time
from langdetect import detect, lang_detect_exception
import warnings
import utils.pii_data_processing as pii_dp
import utils.pii_score as pii_score
import utils.pii_address_flag as pii_address
import utils.pii_phn_org_name as pii_phn_org
import utils.pii_redact as pii_redact
import utils.pii_main_flag as pii_main_flag
import utils.pii_sniffer as pii_sniffer
warnings.filterwarnings("ignore")
# !python -m spacy download de_core_news_lg

In [13]:
#test_string = "Herr Hans Muller, Hauptstrasse 38A, 3806 Bönigen b. Interlaken, hans.muller@hghg.com, +38939321, google.inc" 
#test_string = "c/o Service des Finances, Place de l'Hôtel de Ville 1, 1030 Bussigny"
#test_string = "123 Main St, New York, USA."
test_string = "John Smith lives at 123 Main St, New York, USA. Email: john.smith@example.com ph 123939402"
#test_string = "Chalet Petit Paradis, 1873 Val-d'Illiez"
#test_string = "Faubourg de l'Hôpital 31, 2000 Neuchâtel"
#test_string = "Avenue d'Epenex 19, 1024 Ecublens VD"
#test_string = "Mary Lou, 25 Rue de la République, 75001 Paris" 

# Pacakging the above functionalities into a self sustaining model

In [14]:
sniffer = pii_sniffer.pii_sniffer(test_string)
sniffer.data_processing()
processed_data = sniffer.data
processed_data

Unnamed: 0,Text,LangCode
0,"John Smith lives at 123 Main St, New York, USA...",EN


In [15]:
sniffer.address_flag()
processed_data = sniffer.data
processed_data

Unnamed: 0,Text,LangCode,SpacyLocationEntities,RegexLocationEntities,AddressFlag
0,"John Smith lives at 123 Main St, New York, USA...",EN,"[[(New, York), (USA)], [(New, York), (USA)]]",,AddressFound


In [17]:
sniffer.pii_scorer_call()
processed_data = sniffer.data
#json_processed = processed_data.to_json(orient='records', default_handler=str)
processed_data

Unnamed: 0,Text,LangCode,SpacyLocationEntities,RegexLocationEntities,AddressFlag,PII_Score,PII_Entities
0,"John Smith lives at 123 Main St, New York, USA...",EN,"[[(New, York), (USA)], [(New, York), (USA)]]",,AddressFound,3,"[(John, Smith), (New, York), (USA)]"


In [18]:
sniffer.rpi_caller()
processed_data = sniffer.data
#json_processed = processed_data.to_json(orient='records', default_handler=str)
processed_data

Unnamed: 0,Text,LangCode,SpacyLocationEntities,RegexLocationEntities,AddressFlag,PII_Score,PII_Entities,Redacted_Text,Name,Emails
0,"John Smith lives at 123 Main St, New York, USA...",EN,"[[(New, York), (USA)], [(New, York), (USA)]]",,AddressFound,3,"[(John, Smith), (New, York), (USA)]","**** lives at 123 Main St, New York, USA. Emai...",[John Smith],[john.smith@example.com]


In [19]:
sniffer.extract_phone_numbers()
processed_data = sniffer.data
#json_processed = processed_data.to_json(orient='records', default_handler=str)
processed_data

Unnamed: 0,Text,LangCode,SpacyLocationEntities,RegexLocationEntities,AddressFlag,PII_Score,PII_Entities,Redacted_Text,Name,Emails,PhoneNumber,HouseNumber
0,"John Smith lives at 123 Main St, New York, USA...",EN,"[[(New, York), (USA)], [(New, York), (USA)]]",,AddressFound,3,"[(John, Smith), (New, York), (USA)]","**** lives at 123 Main St, New York, USA. Emai...",[John Smith],[john.smith@example.com],[123939402],[123]


In [20]:
sniffer.extract_organization_names()
processed_data = sniffer.data
#json_processed = processed_data.to_json(orient='records', default_handler=str)
processed_data

Unnamed: 0,Text,LangCode,SpacyLocationEntities,RegexLocationEntities,AddressFlag,PII_Score,PII_Entities,Redacted_Text,Name,Emails,PhoneNumber,HouseNumber,OrgName
0,"John Smith lives at 123 Main St, New York, USA...",EN,"[[(New, York), (USA)], [(New, York), (USA)]]",,AddressFound,3,"[(John, Smith), (New, York), (USA)]","**** lives at 123 Main St, New York, USA. Emai...",[John Smith],[john.smith@example.com],[123939402],[123],


In [21]:
sniffer.detect_pii_flag()
processed_data = sniffer.data
#json_processed = processed_data.to_json(orient='records', default_handler=str)
processed_data

Unnamed: 0,Text,LangCode,SpacyLocationEntities,RegexLocationEntities,AddressFlag,PII_Score,PII_Entities,Redacted_Text,Name,Emails,PhoneNumber,HouseNumber,OrgName,SpacyEntities,RegexMatches,PII_Flag
0,"John Smith lives at 123 Main St, New York, USA...",EN,"[[(New, York), (USA)], [(New, York), (USA)]]",,AddressFound,3,"[(John, Smith), (New, York), (USA)]","**** lives at 123 Main St, New York, USA. Emai...",[John Smith],[john.smith@example.com],[123939402],[123],,"[(John, Smith)]",,1


In [10]:
#!python -m spacy download it_core_news_lg
#!python -m spacy download ja_core_news_sm
#!python -m spacy download ca_core_news_sm
#!python -m spacy download nl_core_news_sm
#!python -m spacy download pt_core_news_sm
#!python -m spacy download ro_core_news_sm
#!python -m spacy download es_core_news_sm

In [11]:
sniffer.data_processing()
processed_data = sniffer.data
sniffer.pii_scorer_call()
processed_data = sniffer.data
sniffer.rpi_caller()
processed_data = sniffer.data
sniffer.extract_phone_numbers()
processed_data = sniffer.data
sniffer.extract_organization_names()
processed_data = sniffer.data
sniffer.detect_pii_flag()
processed_data = sniffer.data

json_processed = processed_data.to_json(orient='records', default_handler=str)
json_processed

'[{"Text":"c\\/o Service des Finances, Place de l\'H\\u00f4tel de Ville 1, 1030 Bussigny","LangCode":"FR","PII_Score":1,"PII_Entities":["Service des Finances"],"Redacted_Text":"c\\/o Service des Finances, Place de l\'H\\u00f4tel de Ville 1, 1030 Bussigny","Name":null,"Emails":null,"PhoneNumber":null,"HouseNumber":["1","1030"],"OrgName":null,"SpacyEntities":null,"RegexMatches":null,"PII_Flag":0}]'