# Extract names and addresses from Italian documents

## Setup environment

In [3]:
# !pip install random_italian_person

```
“Street” (Detected: VIA FERRUCCIO PARRI 2),
“Address” (Detected: 42049 SANT'ILARIO D'ENZA RE),
“Name” (Detected: FRANCO FANTE * this recognition is also incorrect; the picture shows “FANTI”)
```

## Create synthetic dataset of Italian entities (`nome`, `via`, `indirizzo`)

In [None]:
import re
import pandas as pd
from random_italian_person import RandomItalianPerson

In [71]:
# create synthetic data
N_PEOPLE = 10000  # how many synthetic people to attempt to generate

data = []
n_failed = 0
for _ in range(N_PEOPLE):
    try:
        data.append(RandomItalianPerson().data)
    except Exception:
        # Generation stays best-effort (a failing record is skipped), but the
        # failures are counted so a shrinking dataset does not go unnoticed.
        n_failed += 1

if n_failed:
    print(f'skipped {n_failed} of {N_PEOPLE} records that failed to generate')

# One row per successfully generated person.
df = pd.DataFrame(data)

In [84]:
# Preview the first rows of `df` (rendered as the cell output).
df.head()

Unnamed: 0,region,province,surname,name,sex,birth_municipality,birth_province,birth_region,birth_cap,birth_province_code,birthdate,address,house_number,cap,municipality,province_code,codice_fiscale
0,Campania,Salerno,Brumat,Nicola,M,Romans D'isonzo,Gorizia,Friuli Venezia Giulia,34076,GO,1951-01-17,Via T. De Cusatis,8,84070,San Mauro La Bruca,SA,BRMNCL51A17H514X
1,Sicilia,Catania,Babini,Luca,M,Cervia,Ravenna,Emilia Romagna,48015,RA,1930-12-21,Via Carlo Felice Gambino,52,95131,Catania,CT,BBNLCU30T21C553D
2,Lombardia,Brescia,Di Stefano,Giuseppina,F,Carapelle Calvisio,L'Aquila,Abruzzo,67020,AQ,1972-03-07,Via Rose Di Sotto,2/a,25126,Brescia,BS,DSTGPP72C47B725L
3,Campania,Salerno,Colombo,Elisabetta,F,Casaleggio Novara,Novara,Piemonte,28060,NO,1925-05-24,Via Guerrasio,32,84083,Castel San Giorgio,SA,CLMLBT25E64B883D
4,Lombardia,Brescia,Palladino,Rachid,M,San Polo Matese,Campobasso,Molise,86020,CB,1923-01-29,Via Europa,156,25062,Concesio,BS,PLLRHD23A29I122Y


In [72]:
def clean(string):
    """Normalize a value into lowercase text with punctuation collapsed to spaces.

    The value is converted with ``str()``, every run of non-word characters
    (anything other than letters, digits and underscore) is replaced by a
    single space, and the result is lowercased and stripped of leading and
    trailing whitespace.

    Args:
        string: Any value; non-string inputs (e.g. integers) are stringified.

    Returns:
        The normalized string.
    """
    # Raw string literal: '\W' in a normal literal is an invalid escape
    # sequence, which newer Python versions warn about.
    text = re.sub(r'\W+', ' ', str(string))
    return text.lower().strip()

In [73]:
# populate lists
# Each list collects one {'Text', 'Type'} record per row of `df`:
#   nome      -> "<name> <surname>"
#   via       -> "<address> <house_number>"
#   indirizzo -> "<cap> <municipality> <province_code>"
nome, via, indirizzo = [], [], []

for row in df.itertuples():
    # clean() already lowercases and strips, so the pieces only need joining.
    person_text = f'{clean(row.name)} {clean(row.surname)}'
    street_text = f'{clean(row.address)} {clean(row.house_number)}'
    locality_text = f'{clean(row.cap)} {clean(row.municipality)} {clean(row.province_code)}'

    nome.append(dict(Text=person_text, Type='nome'))
    via.append(dict(Text=street_text, Type='via'))
    indirizzo.append(dict(Text=locality_text, Type='indirizzo'))

In [74]:
# remove duplicates to avoid "overlapping entities" error
def _dedupe(records):
    """Return `records` without repeated dicts, keeping first-seen order.

    Each dict is turned into a hashable tuple of its items so it can be used
    as a dictionary key; `dict.fromkeys` then drops repeats while preserving
    insertion order. Going through a `set` instead would make the row order
    depend on string hashing, so the exported file could differ between
    kernel restarts.
    """
    return [dict(key) for key in dict.fromkeys(tuple(item.items()) for item in records)]


nome = _dedupe(nome)
via = _dedupe(via)
indirizzo = _dedupe(indirizzo)

In [75]:
# create dataframes
# A list of row dicts can be handed straight to the DataFrame constructor;
# the dict keys ('Text', 'Type') become the columns.
df_nome, df_via, df_indirizzo = (
    pd.DataFrame(records) for records in (nome, via, indirizzo)
)

In [80]:
# concat dataframes
# Stack the three per-type frames and renumber the rows 0..n-1 in one step.
entity_frames = [df_nome, df_via, df_indirizzo]
df_comprehend = pd.concat(entity_frames, ignore_index=True)
print(f'# of rows: {len(df_comprehend)}')

# of rows: 20530


In [81]:
# Write the combined Text/Type rows to data.csv in the working directory,
# omitting the DataFrame index column.
df_comprehend.to_csv('data.csv',index=False)

## Detecting custom entities using the AWS SDK for Python (Boto3)
- https://docs.aws.amazon.com/comprehend/latest/dg/get-started-cer.html

In [None]:
import boto3
import uuid
# Comprehend client pinned to the eu-west-1 region; it is reused by the cells below.
comprehend = boto3.client("comprehend", region_name="eu-west-1")

In [None]:
# create entity recognizer
# The role ARN, S3 URIs and entity type below look like template placeholders
# and presumably must be replaced with real values before running.
# NOTE(review): LanguageCode is "en" although this notebook targets Italian
# documents — confirm the intended language and keep it identical to the
# LanguageCode used by the detection job.
# NOTE(review): earlier cells export an entity list (data.csv) with the types
# nome / via / indirizzo, while this configuration points at "Annotations" and
# a single "ENTITY_TYPE" — confirm which training input is intended.

# A random UUID suffix gives every run a distinct recognizer name.
recognizer_name = "Recognizer-Name-Goes-Here-{}".format(str(uuid.uuid4()))

training_input = {
    "EntityTypes": [{"Type": "ENTITY_TYPE"}],
    "Documents": {"S3Uri": "s3://Bucket Name/Bucket Path/documents"},
    "Annotations": {"S3Uri": "s3://Bucket Name/Bucket Path/annotations"},
}

response = comprehend.create_entity_recognizer(
    RecognizerName=recognizer_name,
    LanguageCode="en",
    DataAccessRoleArn="Role ARN",
    InputDataConfig=training_input,
)

# Keep the ARN: later cells use it to refer to this recognizer.
recognizer_arn = response["EntityRecognizerArn"]

In [None]:
# list all recognizers
# The result is only stored in `response` (overwriting the previous value);
# this cell does not display it.
response = comprehend.list_entity_recognizers()

In [None]:
# wait for recognizer to reach `TRAINED` status, then start the detection job
import time  # only needed by the polling loop in this cell

# Training runs asynchronously after create_entity_recognizer returns, so poll
# the recognizer until it leaves the in-progress states.
while True:
    recognizer_props = comprehend.describe_entity_recognizer(
        EntityRecognizerArn=recognizer_arn
    )["EntityRecognizerProperties"]
    if recognizer_props["Status"] not in ("SUBMITTED", "TRAINING"):
        break
    time.sleep(60)

# Any final status other than TRAINED means the recognizer cannot be used as
# intended here; fail loudly instead of submitting a job against it.
if recognizer_props["Status"] != "TRAINED":
    raise RuntimeError(
        "Recognizer {} ended in status {}: {}".format(
            recognizer_arn,
            recognizer_props["Status"],
            recognizer_props.get("Message", ""),
        )
    )

# NOTE(review): LanguageCode is "en" although this notebook targets Italian
# documents — confirm, and keep it identical to the recognizer's language.
# The role ARN and S3 URIs look like template placeholders to be replaced.
response = comprehend.start_entities_detection_job(
    EntityRecognizerArn=recognizer_arn,
    JobName="Detection-Job-Name-{}".format(str(uuid.uuid4())),
    LanguageCode="en",
    DataAccessRoleArn="Role ARN",
    InputDataConfig={
        "InputFormat": "ONE_DOC_PER_LINE",
        "S3Uri": "s3://Bucket Name/Bucket Path/documents"
    },
    OutputDataConfig={
        "S3Uri": "s3://Bucket Name/Bucket Path/output"
    }
)

## Detecting text in an image using Rekognition
- https://docs.aws.amazon.com/rekognition/latest/dg/text-detecting-text-procedure.html

In [None]:
import boto3

def detect_text(photo, bucket):
    """Run Rekognition text detection on an image stored in S3 and print the results.

    Args:
        photo: Object key (name) of the image inside the bucket.
        bucket: Name of the S3 bucket holding the image.

    Returns:
        A tuple ``(detections, count)``: the ``TextDetections`` list from the
        Rekognition response and its length.
    """
    rekognition = boto3.client('rekognition')
    image_ref = {'S3Object': {'Bucket': bucket, 'Name': photo}}
    detections = rekognition.detect_text(Image=image_ref)['TextDetections']

    print('Detected text\n----------')
    for item in detections:
        print('Detected text:' + item['DetectedText'])
        print('Confidence: ' + "{:.2f}".format(item['Confidence']) + "%")
        print('Id: {}'.format(item['Id']))
        # ParentId is printed only when the detection carries one.
        if 'ParentId' in item:
            print('Parent Id: {}'.format(item['ParentId']))
        print('Type:' + item['Type'])
        print()

    return detections, len(detections)

In [None]:
# S3 bucket name and object key of the image to analyze
# (placeholder-looking values — presumably to be replaced before running).
bucket = 'bucket'
photo = 'photo'
# detect_text returns (detections, count), so `response` holds the list of
# text detections and `text_count` its length.
response, text_count = detect_text(photo,bucket)

In [None]:
# Report how many text detections the previous cell returned.
print("Text detected: " + str(text_count))

## Extract entities using Comprehend Custom

In [None]:
# Reference for the synthetic data generator: https://pypi.org/project/random-italian-person/