In [3]:
#Imports
import random
import os
random.seed(30) # set random seed for reproducibility

import numpy as np
from itertools import chain
from collections import Counter


import nltk
nltk.download('punkt')
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

import pandas as pd

# Install java
! apt-get update -qq
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
! java -version

# Install pyspark
! pip install --ignore-installed pyspark==2.4.4

# Install Spark NLP
! pip install --ignore-installed spark-nlp

#Instal eli5
! pip install eli5

! pip install sklearn-crfsuite

import eli5

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
openjdk version "1.8.0_275"
OpenJDK Runtime Environment (build 1.8.0_275-8u275-b01-0ubuntu1~18.04-b01)
OpenJDK 64-Bit Server VM (build 25.275-b01, mixed mode)
Processing /root/.cache/pip/wheels/ab/09/4d/0d184230058e654eb1b04467dbc1292f00eaa186544604b471/pyspark-2.4.4-py2.py3-none-any.whl
Collecting py4j==0.10.7
  Using cached https://files.pythonhosted.org/packages/e3/53/c737818eb9a7dc32a7cd4f1396e787bd94200c3997c72c1dbe028587bd76/py4j-0.10.7-py2.py3-none-any.whl
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.7 pyspark-2.4.4
Collecting spark-nlp
  Using cached https://files.pythonhosted.org/packages/c6/1d/9a2a7c17fc3b3aa78b3921167feed4911d5a055833fea390e7741bba0870/spark_nlp-2.6.5-py2.py3-none-any.whl
Installing collected packages: spark-nlp
Successfully installed spark-nlp-2.6.5
Collecting eli5
[?25l  Downloading https://files.pythonhosted.org/



In [6]:
def parse_data(text, annotation):    
    # parse annotation-file
    bio_tag = []
    with open(annotation, encoding='utf-8') as ann:
        for l in ann:
            l = l.split('\t')
            tag = l[1].split()[0]
            if not tag == 'AnnotatorNotes':
                string = l[2].strip()
                bio_tag.append((tag, string))
                
    # parse text-file and add POS tags
    sents = []
    with open(text, encoding='utf-8') as fp:
        for line in fp:
            line = line.strip()
            line = nltk.word_tokenize(line)
            pos_line = add_pos_tag(line)
            sents.append(pos_line)
    
    # add BIO tags and format data as follows -> (word, pos, biotag)
    sents = add_bio_tag(sents, bio_tag)
    
    return sents


def add_pos_tag(sent):
    return nltk.pos_tag(sent)


def add_bio_tag(sents, bio_tag):
    """ We use 'cadec/original/' as annotation to add BIO-tags """
    msg = []
    
    df= pd.DataFrame()
    for sent in sents:
        bio_sent = []
        remaining = 0
        for i, word in enumerate(sent):
            BIOtag = 'O'
            for j, tag in enumerate(bio_tag):
                target = nltk.word_tokenize(tag[1])
                tag = tag[0]
                count = 0
                
                # changes the biotag to either 'B-' or 'I-' when necessary
                if word[0] == target[0]:
                    for k in range(len(target)):
                        if len(sent) > i+k and sent[i+k][0] == target[k]:
                            count += 1
                            
                    # if target is found, the current word gets a 'B-' tag assigned
                    if count == len(target):
                        definite_tag = tag
                        BIOtag = 'B-' + definite_tag
                        remaining = len(target) - 1
                        break
            
            # changes the biotag to 'I-' when necessary
            if remaining > 0 and BIOtag == 'O':
                BIOtag = 'I-' + definite_tag
                remaining -= 1
          
            bio_sent.append((word[0], word[1], BIOtag))
        msg.append(bio_sent)
    return msg

In [None]:
DIR_text = 'cadec/text/'
DIR_annotation = 'cadec/original/'

data = []
for f in os.listdir(DIR_text):
    text = DIR_text + f
    ann = DIR_annotation + f[:-4] + '.ann'
    m = parse_data(text, ann)
    data.append(m)

# shows first patient post
data[0]

In [None]:
#split in test train
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, test_size=0.33, random_state=42)

import pandas as pd
df = pd.DataFrame()

#generate in CONLL format
f = open("data_train2", "w")
f.write('-DOCSTART- -X- -X- O \n \n')

for i in train:
    for j in i:
        for k in j:
            line = k[0] + ' -X- ' + '-X- ' + k[2] + '\n'
            f.write(line)
        f.write('\n')
f.close()

g = open('data_test2', 'w')
g.write('-DOCSTART- -X- -X- O \n \n')
for i in test:
    for j in i:
        for k in j:
            line = k[0] + ' -X- ' + '-X- ' + k[2] + '\n'
            g.write(line)
        g.write('\n')

#Starting BERT

In [4]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *
import sparknlp

spark = sparknlp.start()
print("Spark NLP version: ", sparknlp.version())
print("Apache Spark version: ", spark.version)

Spark NLP version:  2.6.5
Apache Spark version:  2.4.4


In [6]:
from sparknlp.training import CoNLL
training_data = CoNLL().readDataset(spark, './data_train2')
training_data.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                 pos|               label|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Blured vision , c...|[[document, 0, 42...|[[document, 0, 42...|[[token, 0, 5, Bl...|[[pos, 0, 5, -X-,...|[[named_entity, 0...|
|        Very shaky ?|[[document, 0, 11...|[[document, 0, 11...|[[token, 0, 3, Ve...|[[pos, 0, 3, -X-,...|[[named_entity, 0...|
|After only two we...|[[document, 0, 21...|[[document, 0, 21...|[[token, 0, 4, Af...|[[pos, 0, 4, -X-,...|[[named_entity, 0...|
|Feels like a 'pan...|[[document, 0, 29...|[[document, 0, 29...|[[token, 0, 4, Fe...|[[pos, 0, 4, -X-,...|[[named_entity, 0...|
|I would not risk ...|[[document, 0, 14...|[[document, 0, 14...|[[token, 0, 0, I,...|[[pos, 0, 0, -X-,..

In [13]:
bert = BertEmbeddings.pretrained('bert_base_cased', 'en') \
 .setInputCols(["sentence",'token'])\
 .setOutputCol("bert")\
 .setCaseSensitive(False)

nerTagger = NerDLApproach()\
.setInputCols(["sentence", "token", "bert"])\
.setLabelColumn("label")\
.setOutputCol("ner")\
.setMaxEpochs(1)\
.setRandomSeed(0)\
.setVerbose(1)\
.setValidationSplit(0.2)\
.setEvaluationLogExtended(True)\
.setEnableOutputLogs(True)\
.setIncludeConfidence(True)\
.setTestDataset("test_withEmbeds.parquet")

test_data = CoNLL().readDataset(spark, './data_test2')
test_data = bert.transform(test_data)
test_data.write.parquet("test_withEmbeds.parquet")

bert_base_cased download started this may take some time.
Approximate size to download 389.1 MB
[OK!]


In [14]:
ner_pipeline = Pipeline(stages = [bert, nerTagger])
ner_model = ner_pipeline.fit(training_data)

In [15]:
ner_model.stages[1].write().save('NER_bert_cadac')

In [17]:
predictions = ner_model.transform(test_data)
predictions.show()
predictions.select('token.result','label.result','ner.result').show(truncate=40)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                 pos|               label|                bert|                 ner|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|After 1 1/2 years...|[[document, 0, 83...|[[document, 0, 83...|[[token, 0, 4, Af...|[[pos, 0, 4, -X-,...|[[named_entity, 0...|[[word_embeddings...|[[named_entity, 0...|
|Have stopped taki...|[[document, 0, 62...|[[document, 0, 62...|[[token, 0, 3, Ha...|[[pos, 0, 3, -X-,...|[[named_entity, 0...|[[word_embeddings...|[[named_entity, 0...|
|Excellent job of ...|[[document, 0, 43...|[[document, 0, 43...|[[token, 0, 8, Ex...|[[pos, 0, 8, -X-,...|[[named_entity, 0...|[[word_embeddings...|[[

In [18]:
import pyspark.sql.functions as F

predictions.select(F.explode(F.arrays_zip('token.result','label.result','ner.result')).alias("cols")) \
.select(F.expr("cols['0']").alias("token"),
        F.expr("cols['1']").alias("ground_truth"),
        F.expr("cols['2']").alias("prediction")).show(truncate=False)

+------------+------------+----------+
|token       |ground_truth|prediction|
+------------+------------+----------+
|After       |O           |O         |
|1           |O           |O         |
|1/2         |O           |O         |
|years       |O           |O         |
|of          |O           |O         |
|taking      |O           |O         |
|10          |O           |O         |
|mg/         |O           |O         |
|day         |O           |O         |
|I           |O           |O         |
|am          |O           |O         |
|experiencing|O           |O         |
|constant    |B-ADR       |O         |
|gas         |I-ADR       |B-ADR     |
|and         |O           |O         |
|diarrhea    |B-ADR       |B-ADR     |
|.           |O           |O         |
|Have        |O           |O         |
|stopped     |O           |O         |
|taking      |O           |O         |
+------------+------------+----------+
only showing top 20 rows

