In [None]:
import json

from google.colab import files

# Ask for the license JSON through the Colab upload widget.
uploaded_files = files.upload()

# Open the first uploaded file by name and parse it as JSON.
license_filename = list(uploaded_files)[0]
with open(license_filename) as license_file:
    license_keys = json.load(license_file)

In [None]:
%%capture
# The %%capture cell magic above silences the output of this whole cell.
# Export every key/value pair of the license JSON as an environment variable.
for k,v in license_keys.items(): 
    %set_env $k=$v

# Download the John Snow Labs Colab setup script from the workshop repository and run it.
!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/jsl_colab_setup.sh
!bash jsl_colab_setup.sh

# NOTE(review): installed without a version pin — consider pinning for reproducible runs.
! pip install spark-nlp-display

In [None]:
import os
import json
import pandas as pd

import sparknlp_jsl
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp_jsl.annotator import *

In [None]:
# Start the session through sparknlp_jsl, passing the value of the SECRET environment variable
# (presumably set from the uploaded license JSON — verify the key name).
spark = sparknlp_jsl.start(os.environ['SECRET'])

In [None]:
# First sample case: a Spanish clinical note plus a mapping of HPO code -> label.
# NOTE(review): "hpo_codes" is presumably the reference annotation for the note;
# it is not read by the code visible in this notebook.
example1 = {
    "src": (
        "Lactante de 9 meses con facies compatible con el síndrome de Noonan, "
        "hipertelorismo, implatación baja de las orejas, "
        "trastorno de la conducta e hiperbilirrubinemia."
    ),
    "hpo_codes": {
        "HP:0001999": "Abnormal facial shape",
        "HP:0000316": "Hypertelorism",
        "HP:0002904": "Hyperbilirubinemia",
        "HP:0000369": "Low-set ears",
        "HP:0000719": "Inappropriate behavior",
    },
}

# Second sample case: a Spanish clinical note plus a mapping of HPO code -> label.
# NOTE(review): "hpo_codes" is presumably the reference annotation for the note;
# it is not read by the code visible in this notebook.
example2 = {
    "src": (
        "Paciente de 21 años con distrofia de retina compatible con retinosis pigmentaria "
        "y distrofia de conos bastones, esclerosis múltiple y Ataxia Rendu-Osler."
    ),
    "hpo_codes": {
        "HP:0000510": "Rod-cone dystrophy",
        "HP:0000556": "Retinal dystrophy",
        "HP:0001251": "Ataxia",
        "HP:0032118": "Retinitis",
        "HP:0000548": "Cone/cone-rod dystrophy",
    },
}

In [None]:
# Empty DataFrame that is only used to fit the pipelines and obtain models.
emptyDF = spark.createDataFrame([[""]]).toDF("text")

# NER pipeline: extracts phenotype mentions from Spanish text.

document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

sentence_detector_dl = (
    SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx")
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# NOTE(review): the embeddings read "document" while the NER model below reads
# "sentence" — confirm this mismatch is intended.
emb_scielowiki_es = (
    WordEmbeddingsModel.pretrained("embeddings_scielowiki_300d", "es", "clinical/models")
    .setInputCols(["document", "token"])
    .setOutputCol("word_embeddings")
)

ner_diag_proc_es = (
    MedicalNerModel.pretrained("ner_diag_proc", "es", "clinical/models")
    .setInputCols("sentence", "token", "word_embeddings")
    .setOutputCol("ner")
)

# Both names are bound to the same converter instance.
ner_chunker = nerConverter = (
    NerConverter()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_chunk")
)

ner_es_pipeline = Pipeline(
    stages=[
        document_assembler,
        sentence_detector_dl,
        tokenizer,
        emb_scielowiki_es,
        ner_diag_proc_es,
        ner_chunker,
    ]
)

# Fit on the empty DataFrame and wrap the fitted model in a LightPipeline.
ner_es_model = ner_es_pipeline.fit(emptyDF)
l_ner_es_model = LightPipeline(ner_es_model)

# Marian ES->EN pipeline: translates Spanish into English.

marian_es_en = (
    MarianTransformer.pretrained("opus_mt_es_en", "xx")
    .setInputCols(["document"])
    .setOutputCol("translation")
)

marian_es_en_pipeline = Pipeline(
    stages=[
        document_assembler,  # the DocumentAssembler() defined earlier in this cell is reused
        marian_es_en,
    ]
)

# Fit on the shared empty DataFrame (same as the other pipelines in this cell)
# instead of building a second identical one.
l_marian_es_en = LightPipeline(marian_es_en_pipeline.fit(emptyDF))


# Resolver pipeline: given a phenotype in English, returns its HPO code.

sbert_embedder = (
    BertSentenceEmbeddings.pretrained("sbiobert_base_cased_mli", "en", "clinical/models")
    .setInputCols(["document"])
    .setOutputCol("sbert_embeddings")
)

# NOTE(review): the resolver reads "ner_chunk", but no stage of this pipeline
# outputs a column with that name — confirm this is intended.
resolver = (
    SentenceEntityResolverModel.pretrained("sbiobertresolve_HPO", "en", "clinical/models")
    .setInputCols(["ner_chunk", "sbert_embeddings"])
    .setOutputCol("resolution")
    .setDistanceFunction("EUCLIDEAN")
)

resolver_pipeline = Pipeline(
    stages=[
        document_assembler,  # the DocumentAssembler() defined earlier in this cell is reused
        sbert_embedder,
        resolver,
    ]
)

# Fit on the empty DataFrame and wrap the fitted model in a LightPipeline.
resolver_model = resolver_pipeline.fit(emptyDF)
l_resolver_model = LightPipeline(resolver_model)

sentence_detector_dl download started this may take some time.
Approximate size to download 514.9 KB
[OK!]
embeddings_scielowiki_300d download started this may take some time.
Approximate size to download 351.2 MB
[OK!]
ner_diag_proc download started this may take some time.
Approximate size to download 14.2 MB
[OK!]
opus_mt_es_en download started this may take some time.
Approximate size to download 395.5 MB
[OK!]
sbiobert_base_cased_mli download started this may take some time.
Approximate size to download 384.3 MB
[OK!]
sbiobertresolve_HPO download started this may take some time.
Approximate size to download 98.9 MB
[OK!]


In [None]:
def extract_hpo(src, threshold=0.5):
    """
    Extract the HPO codes for the entities found in a text.

    Every chunk returned by ``l_ner_es_model`` is translated with
    ``l_marian_es_en`` and resolved with ``l_resolver_model``. Only the first
    candidate returned by the resolver is kept. Chunks for which the
    translation or the resolver returns nothing are skipped, so one
    unresolvable entity no longer aborts the whole extraction with an
    ``IndexError``.

    Parameters
    ----------
    src : str
        Text to analyse.
    threshold : float, default 0.5
        Minimum confidence (inclusive) a resolution needs to be returned.

    Returns
    -------
    pandas.DataFrame
        Columns ``chunk``, ``en_chunk``, ``hpo_code``, ``hpo_label`` and
        ``confidence`` (float), restricted to rows whose confidence is at
        least ``threshold`` and sorted by confidence in descending order.
        The original row index is preserved.
    """
    columns = ['chunk', 'en_chunk', 'hpo_code', 'hpo_label', 'confidence']
    rdata = []
    # Extract the NER entities that are potential phenotypes
    ner_es_results = l_ner_es_model.annotate(src)
    for chunk in ner_es_results['ner_chunk']:
        # Translate each entity into English
        translations = l_marian_es_en.annotate(chunk)['translation']
        if not translations:
            # Nothing to resolve without a translation
            continue
        en_chunk = translations[0]
        # Resolve the translated entity to an HPO code
        annotated = l_resolver_model.fullAnnotate(en_chunk)
        resolutions = annotated[0]['resolution'] if annotated else []
        if not resolutions:
            # The resolver produced no candidate for this entity
            continue
        metadata = resolutions[0].metadata
        # The resolver returns several candidates separated by ":::";
        # keep only the first one
        resolution = metadata['all_k_resolutions'].split(":::")[0]
        hpo_code = metadata['all_k_results'].split(":::")[0]
        # Per the original comment, 'confidence' is the reliability of the
        # first (most likely) candidate
        confidence = metadata['confidence']
        rdata.append([chunk, en_chunk, hpo_code, resolution, confidence])
    results = pd.DataFrame(data=rdata, columns=columns)
    # Cast the metadata value to float so it can be compared and sorted
    results['confidence'] = results['confidence'].astype(float)
    above_threshold = results[results['confidence'] >= threshold]
    return above_threshold.sort_values('confidence', ascending=False)



In [None]:
# Run the HPO extraction on the first example text with the default threshold.
r1 = extract_hpo(example1['src'])

In [None]:
# Show the source text of the first example.
print(example1['src'])

Lactante de 9 meses con facies compatible con el síndrome de Noonan, hipertelorismo, implatación baja de las orejas, trastorno de la conducta e hiperbilirrubinemia.


In [None]:
# Display the table returned by extract_hpo for the first example.
r1

Unnamed: 0,chunk,en_chunk,hpo_code,hpo_label,confidence
1,hipertelorismo,hypertelorism,HP:0000316,hypertelorism,0.9945
2,trastorno de la conducta,behavior disorder,HP:0000708,behavioral disorders,0.8406
3,hiperbilirrubinemia,hyperbilirubinaemia,HP:0002904,hyperbilirubinemia,0.801


In [None]:
# Run the HPO extraction on the second example text with the default threshold.
r2 = extract_hpo(example2['src'])

In [None]:
# Show the source text of the second example.
print(example2['src'])

Paciente de 21 años con distrofia de retina compatible con retinosis pigmentaria y distrofia de conos bastones, esclerosis múltiple y Ataxia Rendu-Osler.


In [None]:
# Display the table returned by extract_hpo for the second example.
r2

Unnamed: 0,chunk,en_chunk,hpo_code,hpo_label,confidence
0,distrofia de retina,retinal dystrophy,HP:0000556,retinal dystrophy,0.928
4,Ataxia,Ataxia,HP:0001251,ataxia,0.6419
