In [17]:
import re

from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer

In [2]:
# Initialise Spark: local master with 4 worker threads and the Spark NLP jar
# pulled in through `spark.jars.packages`.
# NOTE(review): the NoClassDefFoundError (org/json4s) traceback further down in
# this notebook looks like a Spark / Spark NLP artifact mismatch - confirm that
# this package coordinate matches the installed Spark version.

spark = (
    SparkSession.builder
    .appName("Spark NLP")
    .master("local[4]")
    .config("spark.driver.memory", "16G")
    .config("spark.driver.maxResultSize", "0")
    .config("spark.kryoserializer.buffer.max", "2000M")
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:3.3.4")
    .getOrCreate()
)

In [None]:
# Load BERT fine tuned model

class BertClassifier(nn.Module):
    """BERT encoder followed by a small feed-forward classification head.

    With the default arguments the architecture is the one that used to be
    hard-coded here: `bert-base-uncased` -> Linear(768, 50) -> ReLU -> Linear(50, 17).
    """
    def __init__(self, freeze_bert=False, hidden_size=50, num_labels=17,
                 bert_model_name='bert-base-uncased'):
        """
        @param    freeze_bert (bool): Set `False` to fine-tune the BERT model,
                      `True` to exclude its parameters from gradient updates
        @param    hidden_size (int): width of the classifier's hidden layer
        @param    num_labels (int): number of output classes (size of the logits)
        @param    bert_model_name (str): name or path passed to
                      `BertModel.from_pretrained`
        """
        super().__init__()

        # Instantiate BERT model
        self.bert = BertModel.from_pretrained(bert_model_name)

        # The head's input width follows the encoder instead of a literal 768,
        # so a different checkpoint size does not silently break the shapes.
        bert_hidden_size = self.bert.config.hidden_size

        # Instantiate an one-layer feed-forward classifier.
        # Layer indices (0: Linear, 1: ReLU, 2: Linear) are unchanged.
        self.classifier = nn.Sequential(
            nn.Linear(bert_hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, num_labels)
        )

        # Freeze the BERT model
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False

    def forward(self, input_ids, attention_mask):
        """
        Feed input to BERT and the classifier to compute logits.
        @param    input_ids (torch.Tensor): an input tensor with shape (batch_size,
                      max_length)
        @param    attention_mask (torch.Tensor): a tensor that hold attention mask
                      information with shape (batch_size, max_length)
        @return   logits (torch.Tensor): an output tensor with shape (batch_size,
                      num_labels)
        """
        # Feed input to BERT
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)

        # Extract the last hidden state of the token `[CLS]` for classification task:
        # first element of the encoder output, position 0 of every sequence
        last_hidden_state_cls = outputs[0][:, 0, :]

        # Feed input to classifier to compute logits
        logits = self.classifier(last_hidden_state_cls)

        return logits

# NOTE: torch.load unpickles arbitrary Python objects - only load checkpoints
# that come from a trusted source.
# `map_location` places the tensors on the CPU so that a checkpoint saved on a
# GPU machine can still be loaded where no GPU is available.
model = torch.load('model/bert_trials.pth', map_location=torch.device('cpu'))

# Switch to inference mode
model.eval()

In [3]:
# Load data

CHEMBL_EVIDENCE_PATH = 'data/chembl-2021-08-23.json.gz'
SAMPLE_SEED = 42  # fixed seed so the same 1% test set is drawn on every run

stopReasons = (
        spark.read.json(CHEMBL_EVIDENCE_PATH)

        # Extract a (reproducible) test set
        .sample(fraction=0.01, seed=SAMPLE_SEED)

        # Extract studies with their reasons to stop:
        # one row per URL, keeping only the ClinicalTrials links
        .withColumn('urls', F.explode('urls'))
        .filter(F.col('urls.niceName').contains('ClinicalTrials'))
        # The URL is split on '%22' and the second-to-last piece is used as the trial id
        .withColumn('nct_id', F.element_at(F.split(F.col('urls.url'), '%22'), -2))
        .select('nct_id', 'studyStopReason')
        .filter(F.col('studyStopReason').isNotNull())
        .distinct()
    )

## Create Pipeline

What the Pipeline should consist of:
- Document Assembler: converts the raw string to documents that Spark NLP can handle.
- Tokenize each document with a series of constraints:
  I have to reproduce this in SparkNLP's built-in tokenizer.
  ```
  encoded_sent = ( 
            BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
            .encode_plus(
            text=text_preprocessing(sent),  # Preprocess sentence
            add_special_tokens=True,        # Add `[CLS]` and `[SEP]`
            max_length=MAX_LEN,             # Max length to truncate/pad
            pad_to_max_length=True,         # Pad sentence to max length
            #return_tensors='pt',           # Return PyTorch tensor
            return_attention_mask=True      # Return attention mask
            )
  )
  ```
- Create a DataLoader. The Transformers model is fed two tensors: `input_ids` (the id of each token) and the attention mask (which marks whether each token is padding).

In [16]:
# Turns the raw `studyStopReason` string into Spark NLP `document` annotations
document = DocumentAssembler().setInputCol('studyStopReason').setOutputCol('document')

# Spark NLP's built-in tokenizer, reading `document` and writing `token`
tokenizer = (
    Tokenizer()
    .setInputCols('document')
    .setOutputCol('token')
)

In [10]:
# Two-stage pipeline: document assembly followed by tokenization
pipeline = Pipeline(stages=[document, tokenizer])

# NOTE: this rebinds `model`, replacing whatever the name referred to before
model = pipeline.fit(stopReasons)

In [13]:
# Annotate the sample with the fitted pipeline and display its first row
annotated = model.transform(stopReasons)
annotated.first()

Row(nct_id='NCT00880373', studyStopReason='The funding withdrawal and early termination of the trial is based upon lack of suitable recruitment figures in order to reach the required trial endpoints.', document=[Row(annotatorType='document', begin=0, end=155, result='The funding withdrawal and early termination of the trial is based upon lack of suitable recruitment figures in order to reach the required trial endpoints.', metadata={'sentence': '0'}, embeddings=[])], token=[Row(annotatorType='token', begin=0, end=2, result='The', metadata={'sentence': '0'}, embeddings=[]), Row(annotatorType='token', begin=4, end=10, result='funding', metadata={'sentence': '0'}, embeddings=[]), Row(annotatorType='token', begin=12, end=21, result='withdrawal', metadata={'sentence': '0'}, embeddings=[]), Row(annotatorType='token', begin=23, end=25, result='and', metadata={'sentence': '0'}, embeddings=[]), Row(annotatorType='token', begin=27, end=31, result='early', metadata={'sentence': '0'}, embeddings=[

### Problem: you cannot add custom tokenizers to the pipeline. It'll have to be more manual

In [None]:
# Hugging Face tokenizer for `bert-base-uncased`, lower-casing its input
bert_tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

def clean_sentence(sentece:str) -> str:
    """
    Normalise a raw sentence before tokenization.
    - Remove entity mentions (eg. '@united') that are followed by whitespace
    - Correct errors (eg. '&amp;' to '&')
    - Collapse runs of whitespace and strip both ends
    @param    sentece (str): a string to be processed.
    @return   (str): the processed string.
    """
    # Applied in order; each entry is (regex pattern, replacement)
    substitutions = (
        (r'(@.*?)[\s]', ' '),  # '@name' plus the whitespace character after it
        (r'&amp;', '&'),       # HTML-escaped ampersand
        (r'\s+', ' '),         # any whitespace run becomes a single space
    )

    cleaned = sentece
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()

def apply_bert_tokenizer(sentence:str, bert_tokenizer, max_len:int=64):
    """
    Clean a sentence and encode it with the given tokenizer.

    The sentence goes through `clean_sentence` and is then encoded with
    `[CLS]`/`[SEP]` added, padded or truncated to exactly `max_len` tokens.

    @param    sentence (str): the raw sentence.
    @param    bert_tokenizer: a tokenizer exposing `encode_plus`
                  (presumably a Hugging Face `BertTokenizer` - confirm with callers).
    @param    max_len (int): target sequence length; defaults to 64.
    @return   the value returned by `bert_tokenizer.encode_plus`, requested as
                  PyTorch tensors and including the attention mask.
    """
    cleaned_sentence = clean_sentence(sentence)

    return bert_tokenizer.encode_plus(
        text=cleaned_sentence,          # Preprocessed sentence
        add_special_tokens=True,        # Add `[CLS]` and `[SEP]`
        max_length=max_len,             # Max length to truncate/pad
        padding='max_length',           # Pad sentence to max length
        truncation=True,                # Cut sentences longer than max length
        return_tensors='pt',            # Return PyTorch tensor
        return_attention_mask=True,     # Return attention mask
    )

## 2. Import HF models into Spark NLP

Following https://medium.com/spark-nlp/importing-huggingface-models-into-sparknlp-8c63bdea671d

### 2.1. Downloading tokenizer and classification models from the Hub

- Tokenizer: AutoTokenizer
- Sentence classifier: AutoModelForMaskedLM

In [1]:
import transformers
from transformers import TFRobertaModel
from transformers import AutoTokenizer, AutoModelForMaskedLM, BertModel, BertTokenizer, TFRobertaModel

import tensorflow

import json
import os
import shutil

import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [2]:
# Start the Spark session through Spark NLP's `start()` helper rather than a
# hand-written SparkSession builder.
spark = sparknlp.start()

# it works!

In [3]:
# Hub identifier shared by the tokenizer and the masked-LM model
HUB_MODEL_ID = "PlanTL-GOB-ES/roberta-base-biomedical-es"

tokenizer = AutoTokenizer.from_pretrained(HUB_MODEL_ID)
model = AutoModelForMaskedLM.from_pretrained(HUB_MODEL_ID)

# models successfully loaded

In [38]:
# Can I import these PT models directly into a TF model class (TFRobertaModel below)? No. There must be weights in TF format.

# model_tf = TFRobertaModel.from_pretrained("PlanTL-GOB-ES/roberta-base-biomedical-es")

In [4]:
# make an inference with a fill-mask pipeline - it works!

unmasker = transformers.pipeline(
    task='fill-mask',
    model="PlanTL-GOB-ES/roberta-base-biomedical-es",
)
unmasker("El único antecedente personal a reseñar era la <mask> arterial.")

[{'score': 0.9855023622512817,
  'token': 3529,
  'token_str': ' hipertensión',
  'sequence': ' El único antecedente personal a reseñar era la hipertensión arterial.'},
 {'score': 0.0039140768349170685,
  'token': 1945,
  'token_str': ' diabetes',
  'sequence': ' El único antecedente personal a reseñar era la diabetes arterial.'},
 {'score': 0.002484647324308753,
  'token': 11483,
  'token_str': ' hipotensión',
  'sequence': ' El único antecedente personal a reseñar era la hipotensión arterial.'},
 {'score': 0.0023484493140131235,
  'token': 12238,
  'token_str': ' Hipertensión',
  'sequence': ' El único antecedente personal a reseñar era la Hipertensión arterial.'},
 {'score': 0.0008009276352822781,
  'token': 2267,
  'token_str': ' presión',
  'sequence': ' El único antecedente personal a reseñar era la presión arterial.'}]

### 2.2. Saving the model in Tensorflow/Pytorch format

`save_pretrained` saves the model with its weights and configuration.

In this case the RoBERTa model was trained with PyTorch, but we can ask for it to be saved in TensorFlow format.
This is also the case for Olesya's model.

Spark NLP needs to import a TF specific model.

In [5]:
MODEL_NAME_TF = 'roberta_tf'

# Target folders for the exported tokenizer and model
tokenizer_dir = f'model/test/{MODEL_NAME_TF}_tokenizer/'
classificator_dir = f'model/test/{MODEL_NAME_TF}_classificator/'

tokenizer.save_pretrained(tokenizer_dir)
model.save_pretrained(classificator_dir, saved_model=True, save_format='tf')

# models exported - but roberta_tf_classificator does not include the weights in TF format?
# NOTE(review): `model` here is the AutoModelForMaskedLM loaded from the Hub;
# presumably the TF-specific keyword arguments have no effect on a PyTorch
# model - confirm against the transformers `save_pretrained` documentation.

In [25]:
# Second attempt at the export, with the same target folder and arguments
model.save_pretrained(
    f'model/test/{MODEL_NAME_TF}_classificator/',
    saved_model=True,
    save_format='tf',
)

# There's an issue from Nov 21 saying saved_model does not work as expected. Removing the parameter doesn't work either (https://github.com/huggingface/transformers/issues/14403).

### 2.3. Load the model into Spark NLP

In [11]:

# Load the exported checkpoint as a TF model; `from_pt=True` asks Transformers
# to read it from the PyTorch weights
loaded_classificator = TFRobertaModel.from_pretrained(
    f'model/test/{MODEL_NAME_TF}_classificator/',
    from_pt=True,
)

# Eureka!!!!!! Model can be used in Transformers

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaModel: ['lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'roberta.embeddings.position_ids', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing TFRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaModel were not initialized from the PyTorch model and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infe

In [13]:
# As expected, when the TF model is exported the TF structure is followed (.h5 format)

loaded_classificator.save_pretrained(
    f'model/test/{MODEL_NAME_TF}_classificator_tf/',
    saved_model=True,
    save_format='tf',
)





INFO:tensorflow:Assets written to: model/roberta_tf_classificator_tf/saved_model/1/assets


INFO:tensorflow:Assets written to: model/roberta_tf_classificator_tf/saved_model/1/assets


In [21]:
# Reload the TF export, this time without `from_pt` - presumably to check that
# TF weights were actually written to this folder.
TFRobertaModel.from_pretrained(f'model/test/{MODEL_NAME_TF}_classificator_tf/')

All model checkpoint layers were used when initializing TFRobertaModel.

All the layers of TFRobertaModel were initialized from the model checkpoint at model/test/roberta_tf_classificator_tf/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


<transformers.models.roberta.modeling_tf_roberta.TFRobertaModel at 0x7fe7b5123690>

In [16]:
# After saving the model, you also need to add the vocab.txt file to the assets directory of the saved model.
# The tokenizer exported its vocabulary as JSON (token -> id), so it is converted
# here to a text file with one token per line.

# Explicit UTF-8: the vocabulary can contain non-ASCII tokens, and the platform
# default encoding is not guaranteed to handle them.
with open(f'model/test/{MODEL_NAME_TF}_tokenizer/vocab.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Write tokens in ascending id order so the line order follows the token ids
# regardless of the key order inside the JSON file.
with open(f'model/test/{MODEL_NAME_TF}_tokenizer/vocab.txt', 'w', encoding='utf-8') as f:
    for token, _ in sorted(data.items(), key=lambda item: item[1]):
        f.write(token)
        f.write('\n')

# Now vocab.txt and merges.txt are added to the assets directory

vocab_pth = f"model/test/{MODEL_NAME_TF}_tokenizer/vocab.txt"
merges_pth = f"model/test/{MODEL_NAME_TF}_tokenizer/merges.txt"
saved_model_pth = f'model/test/{MODEL_NAME_TF}_classificator_tf/saved_model/1/assets'

# shutil.copy instead of `!cp`: no shell needed, and a failed copy raises an
# exception instead of only printing an error message.
shutil.copy(vocab_pth, saved_model_pth)
shutil.copy(merges_pth, saved_model_pth);

In [31]:
# Let's import it into Spark NLP: load the TF SavedModel export as a
# RoBertaEmbeddings annotator reading `document` and `token`

embeddings = (
    RoBertaEmbeddings
    .loadSavedModel('model/test/roberta_tf_classificator_tf/saved_model/1', spark)
    .setInputCols(["document", 'token'])
    .setOutputCol("embeddings")
)

In [32]:
# It is loaded into Spark 🥲
# Display the class of the object returned by `loadSavedModel`.

type(embeddings)

sparknlp.annotator.RoBertaEmbeddings

In [19]:
# embeddings is the Spark NLP version of the model - let's save it
# (overwriting any previous save at the same location)

spark_nlp_model_dir = f"model/test/{MODEL_NAME_TF}_spark_nlp"
embeddings.write().overwrite().save(spark_nlp_model_dir)

In [20]:
# Zip the saved Spark NLP model folder into model/test/roberta-base-biomedical-es.zip.
# The folder is passed as `root_dir=`: a positional argument placed after
# keyword arguments is a SyntaxError.
shutil.make_archive(
    base_name="model/test/roberta-base-biomedical-es",
    format="zip",
    root_dir="model/test/roberta_tf_spark_nlp",
)

'/Users/irene/MEGAsync/EBI/repos/evidence_datasource_parsers/exploration/stopReasons/roberta-base-biomedical-es.zip'

### 2.4. Make predictions within a Spark NLP Pipeline

In [36]:
# Stage 1: raw `text` column -> `document` annotations
documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document")

# Stage 2: split each document into tokens
tokenizer = Tokenizer().setInputCols("document").setOutputCol("token")

# Stage 3: the annotator held in `embeddings`, writing to a column named `class`.
# NOTE(review): despite the variable name this is the embeddings annotator, so
# the `class` column carries embeddings rather than predicted labels.
sequenceClassifier = embeddings \
    .setInputCols(["document", "token"]) \
    .setOutputCol("class") \
    .setCaseSensitive(True)

pipeline = Pipeline(stages=[documentAssembler, tokenizer, sequenceClassifier])

In [34]:
# Load data

CHEMBL_EVIDENCE_PATH = 'data/chembl-2021-08-23.json.gz'
SAMPLE_SEED = 42  # fixed seed so the same 1% test set is drawn on every run

stopReasons = (
        spark.read.json(CHEMBL_EVIDENCE_PATH)

        # Extract a (reproducible) test set
        .sample(fraction=0.01, seed=SAMPLE_SEED)

        # Extract studies with their reasons to stop:
        # one row per URL, keeping only the ClinicalTrials links
        .withColumn('urls', F.explode('urls'))
        .filter(F.col('urls.niceName').contains('ClinicalTrials'))
        # The URL is split on '%22' and the second-to-last piece is used as the trial id
        .withColumn('nct_id', F.element_at(F.split(F.col('urls.url'), '%22'), -2))

        # Drop studies without a stop reason before the column is renamed, so the
        # filter only refers to a column that exists at that point of the chain
        .filter(F.col('studyStopReason').isNotNull())
        .select('nct_id', F.col('studyStopReason').alias('text'))
        .distinct()
    )

stopReasons.first()

Row(nct_id='NCT01313390', text='Lack of recruitment')

In [37]:
# Fit the pipeline on the sample, then annotate that same sample
fitted_pipeline = pipeline.fit(stopReasons)
result = fitted_pipeline.transform(stopReasons)

In [47]:
# Print the schema of the annotated output, restricted to the id, the input
# text and the `class` annotation column.
result.select('nct_id', 'text', 'class').printSchema()

root
 |-- nct_id: string (nullable = true)
 |-- text: string (nullable = true)
 |-- class: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)



In [None]:
# TODO: Labels of the classification need to be prepared



## 3. Load Olesya's model with Transformers to follow the same pipeline

Olesya's model is just a .pth file, is this compatible with Transformers?

No. Making a model compatible with Transformers is a separate task, and a complex one.
Their docs describe what it entails: a PR must be opened that adds a new module, in which the code of the new model builds on the library's source code.


## 4. Can I train the model using Spark NLP?

I will follow this notebook as a reference: https://www.kaggle.com/pranavkasela/bert-vs-spark-nlp-use-embeddings

In [46]:
from pyspark.sql import SQLContext, SparkSession
from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark import SparkContext, SparkConf


import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.common import *
#from sparknlp.embeddings import *

In [4]:
#spark = sparknlp.start()

# SQLContext wrapper bound to the context of the active Spark session
sqlContext = SQLContext(
    sparkContext=spark.sparkContext,
    sparkSession=spark,
)
                    

### 4.1 Load training dataset

From `gs://ot-team/olesya/maintrain.txt`

In [9]:
# Tab-separated training set; the first line holds the column names
train = spark.read.csv(path='data/maintrain.txt', sep='\t', header=True)

# First 3 records, untruncated, printed one field per line
train.show(n=3, truncate=False, vertical=True)

-RECORD 0--------------------------------------------------------------------------------------------------
 text  | Novartis decided to terminate based on strategic and administrative reasons.                      
 label | Business_Administrative                                                                           
-RECORD 1--------------------------------------------------------------------------------------------------
 text  | Company decision has been taken in light of recent demands by certain national health authorities 
 label | Business_Administrative                                                                           
-RECORD 2--------------------------------------------------------------------------------------------------
 text  | Company decision taken in light of demands by certain national health authorities                 
 label | Business_Administrative                                                                           
only showing top 3 rows



In [37]:
import plotly.express as px

# Count the training examples per label on the pandas side ...
label_counts = train.toPandas().groupby('label').count().reset_index()

# ... then name the count column and order labels from most to least frequent
frequencies = (
    label_counts
    .rename(columns={'text': 'frequency'})
    .sort_values(by=['frequency'], ascending=False)
)

fig = px.bar(frequencies, x='label', y='frequency')
fig.show()

# Labels are quite unbalanced

### 4.2 Apply same cleaning process as Olesya before tokenization

Her function to implement: 

```
def text_preprocessing(text):
    """
    - Remove entity mentions (eg. '@united') # NOT NECESSARY
    - Correct errors (eg. '&amp;' to '&') # NOT NECESSARY
    @param    text (str): a string to be processed.
    @return   text (Str): the processed string.
    """
    # Remove '@name'
    text = re.sub(r'(@.*?)[\s]', ' ', text)

    # Replace '&amp;' with '&'
    text = re.sub(r'&amp;', '&', text)

    # Remove trailing whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text
```

Only the removal of trailing whitespace applies.

In [43]:
# Trim the `text` column in place of the original values
train = train.withColumn('text', F.trim(F.col('text')))

### 4.3. Create embeddings from text

1. Tokenize using BERT Tokenizer from HF

Olesya's code for tokenization:
```
encoded_sent = tokenizer.encode_plus(
            text=text_preprocessing(sent),  # Preprocess sentence
            add_special_tokens=True,        # Add `[CLS]` and `[SEP]`
            max_length=MAX_LEN,                  # Max length to truncate/pad
            pad_to_max_length=True,         # Pad sentence to max length
            #return_tensors='pt',           # Return PyTorch tensor
            return_attention_mask=True      # Return attention mask
            )
```

1. Load tokenizer into Spark NLP
2. Use tokenizer to create embeddings


In [None]:
from transformers import TFBertModel, BertTokenizer 

MODEL_NAME = 'bert-base-cased'

# Keep the tokenizer object itself: chaining `.save_pretrained(...)` onto the
# load would bind `tokenizer` to the return value of the save call instead.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
tokenizer.save_pretrained(f'data/{MODEL_NAME}_tokenizer')

# just in case if there is no TF/Keras file provided in the model
# we can just use `from_pt` and convert PyTorch to TensorFlow
try:
  print('try downloading TF weights')
  model = TFBertModel.from_pretrained(MODEL_NAME)
except OSError:
  # Only the "weights could not be loaded" failure triggers the fallback; a bare
  # `except` would also swallow KeyboardInterrupt and unrelated programming errors.
  print('try downloading PyTorch weights')
  model = TFBertModel.from_pretrained(MODEL_NAME, from_pt=True)

# NOTE(review): the model is written into the `_tokenizer` folder - confirm that
# sharing one folder with the tokenizer files is intended.
model.save_pretrained(f'data/{MODEL_NAME}_tokenizer', saved_model=True)

In [52]:
# Wrap the `text` column into `document` annotations
document_assembler = DocumentAssembler().setInputCol('text').setOutputCol('document')

# Pretrained English sentence-level BERT embeddings computed on each document
embeddings = (
    BertSentenceEmbeddings.pretrained("sent_bert_base_uncased", "en")
    .setInputCols("document")
    .setOutputCol("sentence_embeddings")
)

sent_bert_base_uncased download started this may take some time.


Py4JJavaError: An error occurred while calling z:com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader.getDownloadSize.
: java.lang.NoClassDefFoundError: org/json4s/package$MappingException
	at org.json4s.ext.EnumNameSerializer.deserialize(EnumSerializer.scala:53)
	at org.json4s.Formats$$anonfun$customDeserializer$1.applyOrElse(Formats.scala:66)
	at org.json4s.Formats$$anonfun$customDeserializer$1.applyOrElse(Formats.scala:66)
	at scala.collection.TraversableOnce.collectFirst(TraversableOnce.scala:180)
	at scala.collection.TraversableOnce.collectFirst$(TraversableOnce.scala:167)
	at scala.collection.AbstractTraversable.collectFirst(Traversable.scala:108)
	at org.json4s.Formats$.customDeserializer(Formats.scala:66)
	at org.json4s.Extraction$.customOrElse(Extraction.scala:775)
	at org.json4s.Extraction$.extract(Extraction.scala:454)
	at org.json4s.Extraction$.extract(Extraction.scala:56)
	at org.json4s.ExtractableJsonAstNode.extract(ExtractableJsonAstNode.scala:22)
	at com.johnsnowlabs.util.JsonParser$.parseObject(JsonParser.scala:28)
	at com.johnsnowlabs.nlp.pretrained.ResourceMetadata$.parseJson(ResourceMetadata.scala:109)
	at com.johnsnowlabs.nlp.pretrained.ResourceMetadata$$anonfun$readResources$1.applyOrElse(ResourceMetadata.scala:138)
	at com.johnsnowlabs.nlp.pretrained.ResourceMetadata$$anonfun$readResources$1.applyOrElse(ResourceMetadata.scala:137)
	at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:38)
	at scala.collection.Iterator$$anon$13.next(Iterator.scala:593)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ListBuffer.$plus$plus$eq(ListBuffer.scala:184)
	at scala.collection.mutable.ListBuffer.$plus$plus$eq(ListBuffer.scala:47)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at scala.collection.AbstractIterator.to(Iterator.scala:1431)
	at scala.collection.TraversableOnce.toList(TraversableOnce.scala:350)
	at scala.collection.TraversableOnce.toList$(TraversableOnce.scala:350)
	at scala.collection.AbstractIterator.toList(Iterator.scala:1431)
	at com.johnsnowlabs.nlp.pretrained.ResourceMetadata$.readResources(ResourceMetadata.scala:137)
	at com.johnsnowlabs.nlp.pretrained.ResourceMetadata$.readResources(ResourceMetadata.scala:132)
	at com.johnsnowlabs.client.aws.AWSGateway.getMetadata(AWSGateway.scala:78)
	at com.johnsnowlabs.nlp.pretrained.S3ResourceDownloader.downloadMetadataIfNeed(S3ResourceDownloader.scala:62)
	at com.johnsnowlabs.nlp.pretrained.S3ResourceDownloader.resolveLink(S3ResourceDownloader.scala:68)
	at com.johnsnowlabs.nlp.pretrained.S3ResourceDownloader.getDownloadSize(S3ResourceDownloader.scala:145)
	at com.johnsnowlabs.nlp.pretrained.ResourceDownloader$.getDownloadSize(ResourceDownloader.scala:445)
	at com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader$.getDownloadSize(ResourceDownloader.scala:584)
	at com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader.getDownloadSize(ResourceDownloader.scala)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:834)
Caused by: java.lang.ClassNotFoundException: org.json4s.package$MappingException
	at java.base/java.net.URLClassLoader.findClass(URLClassLoader.java:471)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:588)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:521)
	... 51 more


In [51]:
# Wrap the `text` column into `document` annotations
document_assembler = DocumentAssembler().setInputCol('text').setOutputCol('document')

# from https://nlp.johnsnowlabs.com/docs/en/annotators#tokenizer
# `.fit(train)` is called right away, so `tokenizer` holds the fitted result
tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token").fit(train)

# from https://nlp.johnsnowlabs.com/docs/en/annotators#sentencedetector
sentence = SentenceDetector().setInputCols(["document"]).setOutputCol("sentence")

# from https://nlp.johnsnowlabs.com/2020/08/25/bert_base_cased.html
encoder = BertEmbeddings.pretrained("bert_base_cased", "en") \
    .setInputCols("sentence", "token") \
    .setOutputCol("embeddings")

# Draft of the training pipeline, wrapped in a bare string so it is not executed.
# NOTE(review): before enabling it, check that (a) the label column name matches
# the training data, whose displayed columns are `text` and `label`, and (b) the
# stage list contains the `sentence` stage that `encoder` reads from.
'''
clf = ClassifierDLApproach()\
          .setInputCols(["embeddings"])\
          .setOutputCol("prediction")\
          .setLabelColumn("target")\
          .setMaxEpochs(30)\
          .setBatchSize(32)

pipeline = Pipeline(
    stages = [
        document_assembler,
        tokenizer,
        encoder,
        clf
    ])
'''



bert_base_cased download started this may take some time.


Py4JJavaError: An error occurred while calling z:com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader.getDownloadSize.
: java.lang.NoClassDefFoundError: org/json4s/package$MappingException
	at org.json4s.ext.EnumNameSerializer.deserialize(EnumSerializer.scala:53)
	at org.json4s.Formats$$anonfun$customDeserializer$1.applyOrElse(Formats.scala:66)
	at org.json4s.Formats$$anonfun$customDeserializer$1.applyOrElse(Formats.scala:66)
	at scala.collection.TraversableOnce.collectFirst(TraversableOnce.scala:180)
	at scala.collection.TraversableOnce.collectFirst$(TraversableOnce.scala:167)
	at scala.collection.AbstractTraversable.collectFirst(Traversable.scala:108)
	at org.json4s.Formats$.customDeserializer(Formats.scala:66)
	at org.json4s.Extraction$.customOrElse(Extraction.scala:775)
	at org.json4s.Extraction$.extract(Extraction.scala:454)
	at org.json4s.Extraction$.extract(Extraction.scala:56)
	at org.json4s.ExtractableJsonAstNode.extract(ExtractableJsonAstNode.scala:22)
	at com.johnsnowlabs.util.JsonParser$.parseObject(JsonParser.scala:28)
	at com.johnsnowlabs.nlp.pretrained.ResourceMetadata$.parseJson(ResourceMetadata.scala:109)
	at com.johnsnowlabs.nlp.pretrained.ResourceMetadata$$anonfun$readResources$1.applyOrElse(ResourceMetadata.scala:138)
	at com.johnsnowlabs.nlp.pretrained.ResourceMetadata$$anonfun$readResources$1.applyOrElse(ResourceMetadata.scala:137)
	at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:38)
	at scala.collection.Iterator$$anon$13.next(Iterator.scala:593)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ListBuffer.$plus$plus$eq(ListBuffer.scala:184)
	at scala.collection.mutable.ListBuffer.$plus$plus$eq(ListBuffer.scala:47)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at scala.collection.AbstractIterator.to(Iterator.scala:1431)
	at scala.collection.TraversableOnce.toList(TraversableOnce.scala:350)
	at scala.collection.TraversableOnce.toList$(TraversableOnce.scala:350)
	at scala.collection.AbstractIterator.toList(Iterator.scala:1431)
	at com.johnsnowlabs.nlp.pretrained.ResourceMetadata$.readResources(ResourceMetadata.scala:137)
	at com.johnsnowlabs.nlp.pretrained.ResourceMetadata$.readResources(ResourceMetadata.scala:132)
	at com.johnsnowlabs.client.aws.AWSGateway.getMetadata(AWSGateway.scala:78)
	at com.johnsnowlabs.nlp.pretrained.S3ResourceDownloader.downloadMetadataIfNeed(S3ResourceDownloader.scala:62)
	at com.johnsnowlabs.nlp.pretrained.S3ResourceDownloader.resolveLink(S3ResourceDownloader.scala:68)
	at com.johnsnowlabs.nlp.pretrained.S3ResourceDownloader.getDownloadSize(S3ResourceDownloader.scala:145)
	at com.johnsnowlabs.nlp.pretrained.ResourceDownloader$.getDownloadSize(ResourceDownloader.scala:445)
	at com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader$.getDownloadSize(ResourceDownloader.scala:584)
	at com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader.getDownloadSize(ResourceDownloader.scala)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:834)
Caused by: java.lang.ClassNotFoundException: org.json4s.package$MappingException
	at java.base/java.net.URLClassLoader.findClass(URLClassLoader.java:471)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:588)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:521)
	... 51 more
