In [1]:
!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash

--2021-12-01 22:05:33--  http://setup.johnsnowlabs.com/colab.sh
Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125
Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://setup.johnsnowlabs.com/colab.sh [following]
--2021-12-01 22:05:34--  https://setup.johnsnowlabs.com/colab.sh
Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]
--2021-12-01 22:05:34--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:44

In [2]:
import sys

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.sql.functions import array_contains
from pyspark.ml import Pipeline, PipelineModel

from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline

In [3]:
import sparknlp

# Start the Spark session through Spark NLP's helper.
spark = sparknlp.start()

# Report the library versions in use so the run can be reproduced later.
for label, value in (
    ("Spark NLP version: ", sparknlp.version()),
    ("Apache Spark version: ", spark.version),
):
    print(label, value)

Spark NLP version:  3.3.4
Apache Spark version:  3.0.3


# Sentiment Analysis

In [9]:
# Load the pretrained English "analyze_sentiment" pipeline
# (downloaded on first use, see the log printed below the cell).
pipeline = PretrainedPipeline(name="analyze_sentiment", lang="en")

analyze_sentiment download started this may take some time.
Approx size to download 4.9 MB
[OK!]


In [17]:
# Sample input for the sentiment pipeline: a list holding one short sentence.
testDocs = [
    "I don't feel alone",
]

In [18]:
# Run the sentiment pipeline on the sample documents; one result dict per document.
result_sentiment = pipeline.annotate(testDocs)

# Show only the sentence text next to its predicted sentiment label.
[(row["sentence"], row["sentiment"]) for row in result_sentiment]

[(["I don't feel alone"], ['negative'])]

In [16]:
# Display the full annotation result (all output columns of the pipeline).
# NOTE(review): the saved output below shows 'I feel alone' / 'positive', which
# does not match testDocs ("I don't feel alone"), and this cell's execution
# count (16) is lower than the cells that define and annotate testDocs (17, 18).
# The output looks stale -- restart the kernel and run all cells to refresh it.
result_sentiment

[{'checked': ['I', 'feel', 'alone'],
  'document': ['I feel alone'],
  'sentence': ['I feel alone'],
  'sentiment': ['positive'],
  'token': ['I', 'feel', 'alone']}]

# Language Detector Pipeline

In [4]:
from sparknlp.base import *

In [5]:
# Stage 1: read the raw "text" column and emit it as the "document" annotation column.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: pretrained language detector reading "document" and writing "lang",
# configured with a 0.8 threshold and sentence coalescing enabled.
language_detector = (
    LanguageDetectorDL.pretrained("ld_wiki_tatoeba_cnn_21")
    .setInputCols(["document"])
    .setOutputCol("lang")
    .setThreshold(0.8)
    .setCoalesceSentences(True)
)

# Chain both stages into a single Spark ML pipeline.
languagePipeline = Pipeline(stages=[documentAssembler, language_detector])

ld_wiki_tatoeba_cnn_21 download started this may take some time.
Approximate size to download 7.1 MB
[OK!]


In [6]:
# Two single-column rows -- the same sentence in English and in French --
# exposed under the "text" column that the document assembler reads.
test_df = spark.createDataFrame(
    [
        ['Spark NLP is an open-source text processing library for advanced natural language processing for the Python, Java and Scala programming languages.'],
        ['Spark NLP est une bibliothèque de traitement de texte open source pour le traitement avancé du langage naturel pour les langages de programmation Python, Java et Scala.'],
    ]
).toDF("text")

# Fit the pipeline on the sample frame, then apply the fitted model to that same frame.
results = (
    languagePipeline
    .fit(test_df)
    .transform(test_df)
)

In [7]:
# Print the detected language code(s) for each input row.
(
    results
    .select("lang.result")
    .show()
)

+------+
|result|
+------+
|  [en]|
|  [fr]|
+------+



In [8]:
# Print the detector's metadata for the first two rows without truncating the
# cell contents; this is where the probabilities for the other languages appear.
results.select("lang.metadata").show(n=2, truncate=False)

+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|metadata                                                                                                                                                                                                                                                                                                                                                                                                                            |
+---------------------------------------------------------------------------------------------------------------------------------------------------------