<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Load-the-libraries" data-toc-modified-id="Load-the-libraries-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Load the libraries</a></span></li><li><span><a href="#Useful-Functions" data-toc-modified-id="Useful-Functions-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Useful Functions</a></span></li><li><span><a href="#Load-the-data" data-toc-modified-id="Load-the-data-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Load the data</a></span></li><li><span><a href="#Sparknlp-functions" data-toc-modified-id="Sparknlp-functions-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Sparknlp functions</a></span></li><li><span><a href="#Bert-using-sparknlp" data-toc-modified-id="Bert-using-sparknlp-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Bert using sparknlp</a></span></li><li><span><a href="#Modelling:-logistic-regression" data-toc-modified-id="Modelling:-logistic-regression-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Modelling: logistic regression</a></span></li></ul></div>

In [2]:
#=============== setup sparknlp
import os
import sys

sys.path.append("/Users/poudel/opt/miniconda3/envs/mysparknlp/lib/python3.7/site-packages")
os.environ["SPARK_HOME"] = "/Users/poudel/Softwares/Spark/spark-2.4.4-bin-hadoop2.7"
os.environ["PYSPARK_PYTHON"] = "/Users/poudel/opt/miniconda3/envs/mysparknlp/bin/python"
os.environ["PYSPARK_DRIVER_PYTHON"] = "jupyter"
os.environ["PYSPARK_DRIVER_PYTHON_OPTS"] = "notebook"
#================ setup sparknlp end

# Load the libraries

In [9]:
import numpy as np
import pandas as pd
import seaborn as sns

pd.set_option('max_columns',100)

import time,os,json
time_start_notebook = time.time()
home = os.path.expanduser('~')
SEED=100

import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

[(x.__name__,x.__version__) for x in [np,pd,sns]]

[('numpy', '1.19.1'), ('pandas', '0.23.4'), ('seaborn', '0.10.1')]

# Useful Functions

In [36]:
def show_methods(obj, ncols=7,
            start=None, inside=None,exclude=None,
            caps_only=False,lower_only=False,
                            printt=False):
    """ Show all the attributes of a given method.

    Parameters
    -----------
    obj: object
        Name of python object. eg. list, pd.DataFrame
    ncols: int
        Number of columns
    start: str
        Substring the attribute starsts with.
    inside: str or tuple or list
        Show only these attributes if given substring exists.
    exclude: str or tuple or list
        Exclude these exact elements
    caps_only: bool
        Show only Title case words
    lower_only: bool
        Show only lowercase case words
    printt: bool
        Print the dataframe or not.

    """

    # print(f'Object Type: {type(obj)}\n')
    lst = [I for I in dir(obj) if I[0]!='_' ]

    # exclude usual imports
    usual_imports = ['np','pd','os','sys','time','psycopg2',
                    'plt','string','px',
                    're','nltk','sklearn','spacy']
    lst = [I for I in lst
            if I not in usual_imports ]

    # capital only (for classes)
    if caps_only:
        lst = [I for I in lst if I[0].isupper()]

    # lowercase only (method attributes)
    if lower_only:
        lst = [I for I in lst if I[0].islower()]

    # starts with something
    if isinstance(start,str):
        lst = [I for I in lst if i.startswith(start)]

    if isinstance(start,tuple) or isinstance(start,list):
        lst = [I for I in lst for start_i in start
                if i.startswith(start_i)]

    # inside something
    if isinstance(inside,str):
        lst = [I for I in lst if inside in I]
    if isinstance(inside,tuple) or isinstance(inside,list):
        lst = [I for I in lst for inside_i in inside
                if inside_i in I]

    # exclude substring
    if isinstance(exclude,str):
        lst = [I for I in lst if I != exclude]

    if isinstance(exclude,tuple) or isinstance(exclude,list):
        lst = [I for I in lst if I not in exclude]

    # ouput dataframe
    df = pd.DataFrame(np.array_split(lst,ncols)).T.fillna('')

    # for terminal sometimes we need to print
    if printt:
        print(df)

    return df

# Load the data

In [3]:
import sparknlp

spark = sparknlp.start()
data = [
  ("New York is the greatest city in the world", 0),
  ("The beauty of Paris is vast", 1),
  ("The Centre Pompidou is in Paris", 1)
]

sdf = spark.createDataFrame(data, ["text","label"])
sdf.show()

+--------------------+-----+
|                text|label|
+--------------------+-----+
|New York is the g...|    0|
|The beauty of Par...|    1|
|The Centre Pompid...|    1|
+--------------------+-----+



# Sparknlp functions
sparknlp Annotators: https://nlp.johnsnowlabs.com/docs/en/annotators#documentassembler-getting-data-in  
sparknlp models: https://nlp.johnsnowlabs.com/docs/en/models

In [37]:
show_methods(sparknlp)

Unnamed: 0,0,1,2,3,4,5,6
0,Chunk2Doc,Finisher,annotation,annotators,common,internal,start
1,Doc2Chunk,SparkSession,annotator,base,embeddings,pretrained,version
2,DocumentAssembler,TokenAssembler,,,,,


In [38]:
show_methods(sparknlp.base,3)

Unnamed: 0,0,1,2
0,ABC,Finisher,RecursiveEstimator
1,Annotation,HasRecursiveFit,RecursivePipeline
2,AnnotatorProperties,HasRecursiveTransform,RecursivePipelineModel
3,AnnotatorTransformer,JavaEstimator,RecursiveTransformer
4,Chunk2Doc,LightPipeline,TokenAssembler
5,Doc2Chunk,Param,Transformer
6,DocumentAssembler,Params,TypeConverters
7,EmbeddingsFinisher,Pipeline,keyword_only
8,Estimator,PipelineModel,


In [39]:
show_methods(sparknlp.annotator,3)

Unnamed: 0,0,1,2
0,AlbertEmbeddings,NGramGenerator,Tokenizer
1,AnnotatorApproach,NerApproach,TokenizerModel
2,AnnotatorModel,NerConverter,TypeConverters
3,AnnotatorProperties,NerCrfApproach,TypedDependencyParserApproach
4,BertEmbeddings,NerCrfModel,TypedDependencyParserModel
5,BertSentenceEmbeddings,NerDLApproach,UniversalSentenceEncoder
6,BigTextMatcher,NerDLModel,ViveknSentimentApproach
7,BigTextMatcherModel,NerOverwriter,ViveknSentimentModel
8,ChunkEmbeddings,Normalizer,WordEmbeddings
9,ChunkTokenizer,NormalizerModel,WordEmbeddingsModel


In [40]:
show_methods(sparknlp.common,3)

Unnamed: 0,0,1,2
0,AnnotatorApproach,HasExcludableStorage,Param
1,AnnotatorModel,HasStorage,Params
2,AnnotatorProperties,HasStorageModel,ReadAs
3,CoverageResult,HasStorageRef,RecursiveAnnotatorApproach
4,ExternalResource,JavaEstimator,RegexRule
5,HasCaseSensitiveProperties,JavaMLWritable,TypeConverters
6,HasEmbeddingsProperties,JavaModel,keyword_only


In [41]:
show_methods(sparknlp.embeddings,3)

Unnamed: 0,0,1,2
0,AlbertEmbeddings,NGramGenerator,Tokenizer
1,AnnotatorApproach,NerApproach,TokenizerModel
2,AnnotatorModel,NerConverter,TypeConverters
3,AnnotatorProperties,NerCrfApproach,TypedDependencyParserApproach
4,BertEmbeddings,NerCrfModel,TypedDependencyParserModel
5,BertSentenceEmbeddings,NerDLApproach,UniversalSentenceEncoder
6,BigTextMatcher,NerDLModel,ViveknSentimentApproach
7,BigTextMatcherModel,NerOverwriter,ViveknSentimentModel
8,ChunkEmbeddings,Normalizer,WordEmbeddings
9,ChunkTokenizer,NormalizerModel,WordEmbeddingsModel


# Bert using sparknlp

In [21]:
document_assembler = sparknlp.DocumentAssembler()\
  .setInputCol("text")\
  .setOutputCol("document")

tokenizer = sparknlp.embeddings.Tokenizer().setInputCols(["document"])\
  .setOutputCol("token")
 
word_embeddings = sparknlp.embeddings.BertEmbeddings.pretrained('bert_base_cased', 'en')\
  .setInputCols(["document", "token"])\
  .setOutputCol("embeddings")


bert_pipeline = sparknlp.base.Pipeline().setStages(
  [
    document_assembler,
    tokenizer,
    word_embeddings
  ]
)

sdf_bert = bert_pipeline.fit(sdf).transform(sdf)
display(sdf_bert)

bert_base_cased download started this may take some time.
Approximate size to download 389.1 MB
[OK!]


DataFrame[text: string, label: bigint, document: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>, token: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>, embeddings: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>]

In [23]:
sdf_bert.show()

+--------------------+-----+--------------------+--------------------+--------------------+
|                text|label|            document|               token|          embeddings|
+--------------------+-----+--------------------+--------------------+--------------------+
|New York is the g...|    0|[[document, 0, 41...|[[token, 0, 2, Ne...|[[word_embeddings...|
|The beauty of Par...|    1|[[document, 0, 26...|[[token, 0, 2, Th...|[[word_embeddings...|
|The Centre Pompid...|    1|[[document, 0, 30...|[[token, 0, 2, Th...|[[word_embeddings...|
+--------------------+-----+--------------------+--------------------+--------------------+



# Modelling: logistic regression

In [24]:
import pyspark.sql.functions as F
import pyspark.sql.types as T


from pyspark.ml.classification import LogisticRegression
from pyspark.ml.linalg import Vectors, VectorUDT

def avg_vectors(bert_vectors):
    length = len(bert_vectors[0]["embeddings"])
    avg_vec = [0] * length
    for vec in bert_vectors:
        for i, x in enumerate(vec["embeddings"]):
            avg_vec[i] += x
        avg_vec[i] = avg_vec[i] / length
    return avg_vec


#create a udf
avg_vectors_udf = F.udf(avg_vectors, T.ArrayType(T.DoubleType()))
df_doc_vec = df_bert.withColumn("doc_vector", avg_vectors_udf(F.col("embeddings")))
display(df_doc_vec)

DataFrame[text: string, label: bigint, document: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>, token: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>, embeddings: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>, doc_vector: array<double>]

In [28]:
def dense_vector(vec):
    return Vectors.dense(vec)

dense_vector_udf = F.udf(dense_vector, VectorUDT())
training = df_doc_vec.withColumn("features", dense_vector_udf(F.col("doc_vector")))


model = LogisticRegression(labelCol="label", featuresCol="features",
                        maxIter=10, regParam=0.3, elasticNetParam=0.8)
model = model.fit(training)
# print("Coefficients: " + str(model.coefficients))
print("Intercept: " + str(model.intercept))

Intercept: 0.6936644983903933
