[![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/machinelearnear/custom-italian-entitites-recognition-comprehend/blob/main/train.ipynb)

# Extract names and addresses from Italian documents

## Set up your pipeline

In [None]:
# !pip install random_italian_person

In [None]:
import boto3
import uuid
import re
import pandas as pd
from random_italian_person import RandomItalianPerson

### Create a synthetic dataset of Italian entities (`nome`, `via`, `indirizzo`)

In [126]:
# Number of synthetic people the generation loop attempts to create.
n_samples = 10_000
# Destination CSV for the entity list (written by the export cells further down).
output_fname = 'entitylist.csv'

In [127]:
# create synthetic data
# Each RandomItalianPerson().data is collected as one record. Generation is
# best-effort: a failing sample is skipped, but failures are counted and
# reported instead of being silently swallowed.
data = []
failed_samples = 0
last_error = None
for _ in range(n_samples):
    try:
        data.append(RandomItalianPerson().data)
    except Exception as err:
        failed_samples += 1
        last_error = err

if failed_samples:
    print(f'Skipped {failed_samples} of {n_samples} samples (last error: {last_error!r})')
if not data:
    # Without any record the frame has no columns and every later cell would
    # fail with a far less helpful error.
    raise RuntimeError(f'No synthetic samples could be generated (last error: {last_error!r})')

df = pd.DataFrame(data)

In [128]:
# Preview the first rows of the generated dataset to check the available columns.
df.head()

Unnamed: 0,region,province,surname,name,sex,birth_municipality,birth_province,birth_region,birth_cap,birth_province_code,birthdate,address,house_number,cap,municipality,province_code,codice_fiscale
0,Lazio,Roma,Fontana,Matteo,M,Trissino,Vicenza,Veneto,36070,VI,1930-04-24,Via Avicenna,8/10,146,Roma,RM,FNTMTT30D24L433K
1,Puglia,Lecce,Bianchi,Loredana,F,Campagnatico,Grosseto,Toscana,58042,GR,1966-10-17,Corso Umberto I,252,73056,Taurisano,LE,BNCLDN66R57B497G
2,Puglia,Bari,Russo,Alessia,F,San Nicola Dell'alto,Crotone,Calabria,88817,KR,1965-10-19,Via Giacomo Matteotti,16,70032,Bitonto,BA,RSSLSS65R59I057I
3,Umbria,Perugia,Greppi,Maria,F,Greggio,Vercelli,Piemonte,13030,VC,1930-07-18,Piazza Mazzini,46,6083,Bastia,PG,GRPMRA30L58E163D
4,Lazio,Roma,Pappalardo,Leonardo,M,Aci Bonaccorsi,Catania,Sicilia,95020,CT,1940-12-04,Piazza Italia,1,34,Colleferro,RM,PPPLRD40T04A025N


In [129]:
# remove overlapping entities
# All filters mutate `df` in place and run in the same order as before.

# 1) Drop people whose first name or surname consists of more than one word.
for person_col in ('name', 'surname'):
    word_counts = df[person_col].apply(lambda value: len(value.split()))
    df.drop(index=df[word_counts > 1].index, inplace=True)

# 2) Drop rows where the first name or surname appears inside the street
#    address or the municipality (plain substring check).
for person_col in ('name', 'surname'):
    for place_col in ('address', 'municipality'):
        overlaps = df.apply(lambda row: row[person_col] in row[place_col], axis=1)
        df.drop(index=df[overlaps].index, inplace=True)

# 3) Drop rows where the municipality name appears inside the street address.
municipality_in_address = df.apply(lambda row: row['municipality'] in row['address'], axis=1)
df.drop(index=df[municipality_in_address].index, inplace=True)

In [169]:
def remove_special_characters(text):
    """Return ``text`` upper-cased with every non-alphanumeric character removed.

    Whitespace is not alphanumeric, so it is removed as well; callers that
    need to keep word boundaries must split the text first (see ``fix``).
    """
    return ''.join(char for char in text if char.isalnum()).upper()


def fix(text):
    """Normalize a value into upper-case alphanumeric words joined by single spaces.

    The value is converted with ``str``, split on whitespace, and each word is
    cleaned with ``remove_special_characters``. Words that become empty after
    cleaning (for example a stand-alone ``-``) are dropped so the result never
    contains consecutive spaces.

    Args:
        text: Any value; non-string values (e.g. integers) are stringified.

    Returns:
        The normalized string, or ``''`` when nothing alphanumeric remains.
    """
    cleaned_words = (remove_special_characters(word) for word in str(text).split())
    return ' '.join(word for word in cleaned_words if word)

In [170]:
# Build the three entity strings for every remaining person.
nome = [f'{fix(row.name)} {fix(row.surname)}' for row in df.itertuples()]
via = [f'{fix(row.address)} {fix(row.house_number)}' for row in df.itertuples()]
# Italian postal codes (CAP) have five digits, but the dataset preview shows
# values such as 146 or 6083, i.e. the leading zeros are lost. Pad them back
# so the entity text reads "00146 ROMA RM" rather than "146 ROMA RM".
indirizzo = [
    f'{fix(row.cap).zfill(5)} {fix(row.municipality)} {fix(row.province_code)}'
    for row in df.itertuples()
]

#### Using `csv`

In [171]:
import csv

# Write the de-duplicated entity list as a two-column CSV (Text, Type).
# newline="" is what the csv module requires for files passed to csv.writer;
# without it, platforms that translate "\n" end up with blank rows.
# Entries are sorted so the file content is identical from run to run
# (iteration order of a set of strings is not stable across interpreter runs).
with open(output_fname, "w", encoding="utf-8", newline="") as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(["Text", "Type"])
    for entity_type, texts in (("nome", nome), ("via", via), ("indirizzo", indirizzo)):
        csv_writer.writerows([text, entity_type] for text in sorted(set(texts)))

#### Using `Pandas`

In [37]:
# create dataframes
# One single-column frame per entity type. The values are de-duplicated and
# sorted so the row order is reproducible; the plain constructor is used
# because the input is a list of strings, not a dict (DataFrame.from_dict is
# meant for dict input). The column is labelled 0 and renamed after the concat.
df_nome = pd.DataFrame(sorted(set(nome)))
df_via = pd.DataFrame(sorted(set(via)))
df_indirizzo = pd.DataFrame(sorted(set(indirizzo)))

In [38]:
# add entity type
# Tag every frame with the entity label of the rows it holds.
for frame, entity_type in ((df_nome, 'nome'), (df_via, 'via'), (df_indirizzo, 'indirizzo')):
    frame['Type'] = entity_type

In [39]:
# concat dataframes
# Stack the three frames and renumber the rows from 0 in one step.
df_comprehend = pd.concat([df_nome, df_via, df_indirizzo], ignore_index=True)
# The text column is still labelled 0 at this point; give both columns their final names.
df_comprehend.columns = ['Text', 'Type']

print(f'# of rows: {df_comprehend.shape[0]}')

# of rows: 20475


In [40]:
# Save the combined entity list to `output_fname` without the DataFrame index column.
# This is the same path the `csv`-based cell writes to, so the file is overwritten.
df_comprehend.to_csv(path_or_buf=output_fname, index=False)

### Detecting custom entities using the AWS SDK for Python (`Boto3`)
- https://docs.aws.amazon.com/comprehend/latest/dg/get-started-cer.html

**Uncomment below to train your custom entity recognizer through the Python SDK**

In [32]:
# comprehend = boto3.client("comprehend", region_name="eu-west-1")

In [33]:
# # create entity recognizer
# response = comprehend.create_entity_recognizer(
#     RecognizerName="Recognizer-Name-Goes-Here-{}".format(str(uuid.uuid4())),
#     LanguageCode="en",
#     DataAccessRoleArn="Role ARN",
#     InputDataConfig={
#         "EntityTypes": [
#             {
#                 "Type": "ENTITY_TYPE"
#             }
#         ],
#         "Documents": {
#             "S3Uri": "s3://Bucket Name/Bucket Path/documents"
#         },
#         "Annotations": {
#             "S3Uri": "s3://Bucket Name/Bucket Path/annotations"
#         }
#     }
# )
# recognizer_arn = response["EntityRecognizerArn"]

In [34]:
# list all recognizers
# response = comprehend.list_entity_recognizers()

## Run your detection

### Detecting text in an image using Rekognition
- https://docs.aws.amazon.com/rekognition/latest/dg/text-detecting-text-procedure.html

In [None]:
def detect_text(photo, bucket, client=None):
    """Detect text in an S3-hosted image with Rekognition and print each detection.

    Args:
        photo: Name (key) of the image object inside ``bucket``.
        bucket: Name of the S3 bucket that holds the image.
        client: Optional object exposing a Rekognition-style ``detect_text``
            method. When omitted, a client is created with
            ``boto3.client('rekognition')``, which is the previous behavior.
            Passing one allows reuse across calls and custom configuration.

    Returns:
        A ``(text_detections, count)`` tuple: the ``TextDetections`` list taken
        from the response and the number of entries in it.
    """
    if client is None:
        client = boto3.client('rekognition')
    response = client.detect_text(Image={'S3Object': {'Bucket': bucket, 'Name': photo}})

    text_detections = response['TextDetections']
    print('Detected text\n----------')
    for detection in text_detections:
        print('Detected text:' + detection['DetectedText'])
        print('Confidence: ' + "{:.2f}".format(detection['Confidence']) + "%")
        print('Id: {}'.format(detection['Id']))
        # ParentId is treated as optional: it is printed only when present.
        if 'ParentId' in detection:
            print('Parent Id: {}'.format(detection['ParentId']))
        print('Type:' + detection['Type'])
        print()
    return text_detections, len(text_detections)

In [None]:
# S3 location of the image to analyse. Both values look like placeholders
# and need to be replaced with a real bucket name and object key.
photo = 'photo'
bucket = 'bucket'
response, text_count = detect_text(photo=photo, bucket=bucket)

In [None]:
# Report how many text detections the previous cell returned.
print(f"Text detected: {text_count}")

### Extract entities using Comprehend Custom Entity Recognizer

In [None]:
# Start an entities detection job that uses the custom recognizer.
# Prerequisites: `comprehend` (the Comprehend client) and `recognizer_arn` must
# already exist. In the visible notebook they are only assigned inside the
# commented-out training cells, so those cells have to be uncommented and run first.
# Nothing in this cell polls the recognizer: wait for it to reach `TRAINED`
# status before running it.
# NOTE(review): "Role ARN" and the "s3://Bucket Name/..." URIs look like
# placeholders to be replaced with real values; confirm that LanguageCode="en"
# is intended for Italian documents.
response = comprehend.start_entities_detection_job(
    EntityRecognizerArn=recognizer_arn,
    JobName="Detection-Job-Name-{}".format(str(uuid.uuid4())),
    LanguageCode="en",
    DataAccessRoleArn="Role ARN",
    InputDataConfig={
        "InputFormat": "ONE_DOC_PER_LINE",
        "S3Uri": "s3://Bucket Name/Bucket Path/documents"
    },
    OutputDataConfig={
        "S3Uri": "s3://Bucket Name/Bucket Path/output"
    }
)