# Custom entity recognition with Comprehend
---
*Step 3: request inferences from both the predefined and the custom models*

This series of notebooks is a walkthrough on how to leverage Amazon Comprehend to recognize custom entities in documents. More details about the training process can be found here: https://docs.aws.amazon.com/comprehend/latest/dg/training-recognizers.html

## Initialization
---

In [None]:
%%sh
# Quietly (-q) install the Python packages this notebook relies on.
# Upgrade pip itself first.
pip -q install --upgrade pip
# Install or upgrade the SageMaker SDK, the AWS CLI and boto3.
pip -q install sagemaker awscli boto3 --upgrade
# spaCy is imported further down for its displacy entity renderer.
pip -q install spacy

In [None]:
from IPython.core.display import HTML

# Emit a script that asks the notebook front end to restart the kernel
# (presumably so the freshly installed packages are picked up).
restart_script = "<script>Jupyter.notebook.kernel.restart()</script>"
HTML(restart_script)

In [None]:
import boto3
import pandas as pd
import sagemaker
import time
import json
import os
import spacy
from spacy import displacy

# S3 key prefix under which this notebook uploads its inference input.
prefix = 'comprehend_workshop/inference'

# Work in the default bucket of the current SageMaker session.
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()

In [None]:
# Sample French document used throughout the notebook. It is kept on a single
# line (no newline characters) because the batch job further down reads the
# uploaded file with the ONE_DOC_PER_LINE input format.
text = ('M. Paul  MARTIN, un bon chrétien, est actuellement malade et a un cancer à ce '
        'titre je ne lui accorderai jamais le produit demandé. De plus c’est un hypocrite '
        'et n’ai aucune confiance en ce membre du parti LR. Traitement des métastases '
        'hépatiques des cancers colorectaux : jusqu\'où aller ? Les patients qui ne sont pas '
        'en mesure d ’ avaler des comprimés devraient utiliser la solution orale , ou alors '
        'ils peuvent écraser les comprimés et les mélanger à une petite quantité de nourriture '
        'ou de boisson , avant de prendre la dose immédiatement. Numéro de téléphone: +33 060 708 101')

# Write the document explicitly as UTF-8: the text contains accented
# characters, and relying on the platform default encoding could produce a
# file that is not valid UTF-8.
with open('test.txt', 'w', encoding='utf-8') as f:
    f.write(text)

In [None]:
# One boto3 handle per AWS service used in this notebook.
comprehend_client = boto3.client('comprehend')
medical_client = boto3.client('comprehendmedical')
translate_client = boto3.client('translate')

# S3 is accessed through the higher-level resource API.
s3 = boto3.resource('s3')

## Step 1: applying Amazon Comprehend in French
---

In [None]:
# Run the predefined Comprehend entity detection directly on the French text.
response_fr = comprehend_client.detect_entities(LanguageCode='fr', Text=text)

# Show the detected entities as a table.
pd.DataFrame(response_fr['Entities'])

## Step 2: translating content into English
---

In [None]:
# Translate the document to English; 'auto' asks Amazon Translate to detect
# the source language.
response = translate_client.translate_text(
    TargetLanguageCode='en',
    SourceLanguageCode='auto',
    Text=text,
)

# Keep the translated string for the English-only services below and show it.
translation = response['TranslatedText']
translation

## Step 3: extracting entities, PII, and medical entities
---

In [None]:
# Predefined entity detection applied to the English translation.
response_en = comprehend_client.detect_entities(LanguageCode='en', Text=translation)

# Show the detected entities as a table.
pd.DataFrame(response_en['Entities'])

In [None]:
# PII detection is requested for English, so it must be given the English
# translation rather than the original French text. This also keeps the
# returned offsets aligned with `translation`, like the other English results.
response_pii_en = comprehend_client.detect_pii_entities(
    Text=translation,
    LanguageCode='en'
)
pd.DataFrame(response_pii_en['Entities'])

In [None]:
# Comprehend Medical is called on the English translation.
response_en_med = medical_client.detect_entities_v2(Text=translation)

# Show the detected medical entities as a table.
pd.DataFrame(response_en_med['Entities'])

## Step 4: leverage custom entities in French
---
Before you start, make sure that your SageMaker execution role has the right permissions (see the initial notebooks for the IAM configuration).

The following yields the properties of the available entity recognizers:

In [None]:
# List the custom entity recognizers whose status is TRAINED.
response = comprehend_client.list_entity_recognizers(
    Filter={
        'Status': 'TRAINED',
    },
)

recognizers = response['EntityRecognizerPropertiesList']
print(len(recognizers), 'trained model(s) found')

# Fail with an explicit message instead of an opaque IndexError when no
# recognizer has been trained yet.
if not recognizers:
    raise RuntimeError(
        'No trained entity recognizer found: train a custom recognizer '
        'before running the rest of this notebook.'
    )

# Use the first recognizer returned by the service for the steps below.
model_arn = recognizers[0]['EntityRecognizerArn']
print(model_arn)

### Step 4.1 - Using a realtime endpoint:

In [None]:
# Settings of the realtime endpoint that will serve the custom recognizer.
endpoint_settings = {
    'EndpointName': 'FrenchMedicalEntityRekognizer',
    'ModelArn': model_arn,
    'DesiredInferenceUnits': 1,
}

# The response is read by the next cell to obtain the endpoint ARN.
response = comprehend_client.create_endpoint(**endpoint_settings)

In [None]:
endpoint_arn = response['EndpointArn']

# Poll once a minute until the endpoint leaves the CREATING state. The status
# is checked before sleeping, so no extra minute is spent waiting once the
# endpoint is ready.
while True:
    status_response = comprehend_client.describe_endpoint(EndpointArn=endpoint_arn)
    status = status_response['EndpointProperties']['Status']
    print(status)
    if status != 'CREATING':
        break
    time.sleep(60)

# Surface a failed creation here rather than in the inference call below.
if status == 'FAILED':
    raise RuntimeError(
        'Endpoint creation failed: {}'.format(
            status_response['EndpointProperties'].get('Message', 'no message returned')
        )
    )

In [None]:
# Query the custom recognizer through its realtime endpoint, on the original
# French text.
response_fr_med = comprehend_client.detect_entities(
    EndpointArn=endpoint_arn,
    LanguageCode='fr',
    Text=text,
)

# Show the custom entities as a table.
pd.DataFrame(response_fr_med['Entities'])

In [None]:
# Convert the predefined entities, then the custom (endpoint) entities, into
# the start/end/label dictionaries passed to displacy's manual mode.
data_fr = [
    {
        'start': entity['BeginOffset'],
        'end': entity['EndOffset'],
        'label': entity['Type'],
    }
    for result in (response_fr, response_fr_med)
    for entity in result['Entities']
]

data_fr

In [None]:
# Highlight colour for each entity label.
entity_colors = {
    'DISORDERS': '#A6E22D',
    'CHEMICALS': '#EF60B5',
    'PROCEDURE': '#43C8FF',
    'LIVING_BEING': '#A99DFB',
    'ANATOMY': '#FFCC00',
    'PHYSIOLOGY': '#2FBCAC',
    'OTHER': '#EF60B5',
    'ORGANIZATION': '#2FBCAC',
    'QUANTITY': '#FFCC00',
    'NAME': '#A99DFB',
    'GENERIC_NAME': '#A99DFB',
    'DX_NAME': '#EF60B5',
    'SYSTEM_ORGAN_SITE': '#FFCC00',
}

# Manual mode: the entity spans are supplied as offsets into the French text
# instead of being computed by a spaCy pipeline.
french_doc = {'text': text, 'ents': data_fr, 'title': 'Document traité en français'}

displacy.render(
    docs=french_doc,
    style='ent',
    jupyter=True,
    manual=True,
    options={'colors': entity_colors},
)

### Step 4.2 - Using an asynchronous batch job:

In [None]:
# Upload the local test document so the asynchronous job can read it from S3.
s3.meta.client.upload_file('test.txt', bucket, prefix + '/test.txt')

# Start an asynchronous entity detection job that uses the custom recognizer.
response = comprehend_client.start_entities_detection_job(
    InputDataConfig={
        'S3Uri': 's3://{}/{}/test.txt'.format(bucket, prefix),
        'InputFormat': 'ONE_DOC_PER_LINE'
    },
    JobName='GetFrenchMedicalEntitiesV2',
    OutputDataConfig={
        'S3Uri': 's3://{}/comprehend_data/output_v2/'.format(bucket),
    },
    # NOTE(review): this role ARN is tied to one specific AWS account; replace
    # it with the Comprehend data access role of your own account.
    DataAccessRoleArn='arn:aws:iam::123031033346:role/service-role/AmazonComprehendServiceRole-FrenchMedicalEntities',
    # The separating comma must come before the trailing comment: with the comma
    # inside the comment, the call was a SyntaxError.
    EntityRecognizerArn=model_arn,  # e.g. 'arn:aws:comprehend:eu-west-1:123031033346:entity-recognizer/FrenchMedicalEntities-v2'
    LanguageCode='en', # This is ignored for the custom entities recognition
)

In [None]:
jobId = response['JobId']

# Poll the job once a minute, for at most three hours.
max_time = time.time() + 3*60*60 # 3 hours
while time.time() < max_time:
    describe_job = comprehend_client.describe_entities_detection_job(JobId=jobId)
    status = describe_job["EntitiesDetectionJobProperties"]["JobStatus"]
    print("Job Status: {}".format(status))

    # Stop polling on any terminal status. STOPPED is included so that a
    # stopped job does not keep this loop spinning until the timeout.
    if status in ("COMPLETED", "FAILED", "STOPPED"):
        break

    time.sleep(60)

In [None]:
# Download the test output to local machine
describe_job = comprehend_client.describe_entities_detection_job(JobId = response['JobId'])
job_output = describe_job["EntitiesDetectionJobProperties"]["OutputDataConfig"]["S3Uri"]
path_prefix = 's3://{}/'.format(bucket)
job_key = os.path.relpath(job_output, path_prefix)

s3 = boto3.resource('s3')
s3.Bucket(bucket).download_file(job_key, 'output.tar.gz')

!tar xvzf output.tar.gz

In [None]:
# Load every entity found by the batch job into displacy's manual format.
data_fr = []

# The extracted job output holds one JSON document per line. The file is
# opened in a context manager so the handle is closed, and decoded as UTF-8.
with open('output-v2', 'r', encoding='utf-8') as job_results:
    for line in job_results:
        entities = json.loads(line)['Entities']
        # A missing (None) or empty entity list contributes nothing.
        for e in entities or []:
            data_fr.append({
                'start': e['BeginOffset'],
                'end': e['EndOffset'],
                'label': e['Type']
            })

# Add the entities found by the predefined French model.
for e in response_fr['Entities']:
    data_fr.append({
        'start': e['BeginOffset'],
        'end': e['EndOffset'],
        'label': e['Type']
    })

data_fr

In [None]:
# Convert the predefined entities, then the Comprehend Medical entities, found
# in the English translation into displacy's start/end/label dictionaries.
data_en = [
    {
        'start': entity['BeginOffset'],
        'end': entity['EndOffset'],
        'label': entity['Type'],
    }
    for result in (response_en, response_en_med)
    for entity in result['Entities']
]

data_en

In [None]:
# Highlight colour for each entity label expected in the French results.
french_colors = {
    'DISORDERS': '#A6E22D',
    'CHEMICALS': '#EF60B5',
    'PROCEDURE': '#43C8FF',
    'LIVING_BEING': '#A99DFB',
    'ANATOMY': '#FFCC00',
    'PHYSIOLOGY': '#2FBCAC',
    'OTHER': '#EF60B5',
    'ORGANIZATION': '#2FBCAC',
    'QUANTITY': '#FFCC00',
}

# Manual mode: the entity spans are supplied as offsets into the French text.
french_doc = {'text': text, 'ents': data_fr, 'title': 'Traitement du Français'}

displacy.render(
    docs=french_doc,
    style='ent',
    jupyter=True,
    manual=True,
    options={'colors': french_colors},
)

In [None]:
# Highlight colour for each entity label expected in the English results.
english_colors = {
    'DISORDERS': '#A6E22D',
    'CHEMICALS': '#EF60B5',
    'PROCEDURE': '#43C8FF',
    'LIVING_BEING': '#A99DFB',
    'ANATOMY': '#FFCC00',
    'PHYSIOLOGY': '#2FBCAC',
    'OTHER': '#EF60B5',
    'ORGANIZATION': '#2FBCAC',
    'QUANTITY': '#FFCC00',
    'NAME': '#A99DFB',
    'GENERIC_NAME': '#A99DFB',
    'DX_NAME': '#EF60B5',
    'SYSTEM_ORGAN_SITE': '#FFCC00',
}

# Manual mode: the entity spans are supplied as offsets into the translation.
english_doc = {'text': translation, 'ents': data_en, 'title': 'Après traduction en anglais'}

displacy.render(
    docs=english_doc,
    style='ent',
    jupyter=True,
    manual=True,
    options={'colors': english_colors},
)