In [3]:
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.sql import SparkSession

In [4]:
# Build the Spark session by hand, requesting the Spark NLP package through
# spark.jars.packages.
spark = (
    SparkSession.builder
    .appName("Spark NLP")
    .master("local[4]")
    .config("spark.driver.memory", "16G")
    # NOTE(review): "0" presumably lifts the driver result-size cap -- confirm against the Spark configuration docs.
    .config("spark.driver.maxResultSize", "0")
    .config("spark.kryoserializer.buffer.max", "2000M")
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:3.4.2")
    .getOrCreate()
)

22/05/05 12:57:50 WARN Utils: Your hostname, hp-HP-Notebook resolves to a loopback address: 127.0.1.1; using 192.168.1.185 instead (on interface wlp13s0)
22/05/05 12:57:50 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/home/hp/.local/lib/python3.8/site-packages/pyspark/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/hp/.ivy2/cache
The jars for the packages stored in: /home/hp/.ivy2/jars
com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-dd8ead37-eaba-4ca8-8b6b-9dcc8ee5c8fb;1.0
	confs: [default]
	found com.johnsnowlabs.nlp#spark-nlp_2.12;3.4.2 in central
	found com.typesafe#config;1.4.1 in central
	found org.rocksdb#rocksdbjni;6.5.3 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.603 in central
	found com.github.universal-automata#liblevenshtein;3.0.0 in central
	found com.google.code.findbugs#annotations;3.0.1 in central
	found net.jcip#jcip-annotations;1.0 in central
	found com.google.code.findbugs#jsr305;3.0.1 in central
	found com.google.protobuf#protobuf-java-util;3.0.0-beta-3 in central
	found com.google.protobuf#protobuf-java;3.0.0-beta-3 in central
	found com.google.code.gson#gson;2.3 in central
	found it.unimi.dsi#fastutil;7.0.12 in central
	found org.projectlombok#lombok;1.16.

In [5]:
# Obtain the session through Spark NLP's own helper and report the versions in use.
# The Spark 3.2.x flag (spark32=True) is intentionally not passed: the runtime
# reports Apache Spark 3.1.2, which is served by the default Spark NLP build.
# NOTE(review): a session was already built by hand in an earlier cell; start()
# presumably hands that same session back -- confirm, and keep only one of the two cells.
spark = sparknlp.start()

print("Spark NLP version: ", sparknlp.version())
print("Apache Spark version: ", spark.version)

# Bare expression so the notebook renders the session summary.
spark

Spark NLP version:  3.4.2
Apache Spark version:  3.1.2


Create Spark Dataframe

In [6]:
# Wrap a single sentence in a one-row, one-column DataFrame whose column is named "text".
text = 'Peter Parker is a nice guy and lives in New York'

spark_df = (
    spark
    .createDataFrame([[text]])
    .toDF("text")
)

# truncate=False prints the full sentence instead of clipping it.
spark_df.show(truncate=False)


                                                                                

+------------------------------------------------+
|text                                            |
+------------------------------------------------+
|Peter Parker is a nice guy and lives in New York|
+------------------------------------------------+



In [7]:
from pyspark.sql.types import StringType, IntegerType

# Build a Spark DataFrame from a plain list of strings: each string becomes one row.
text_list = [
    'Peter Parker is a nice guy and lives in New York.',
    'Bruce Wayne is also a nice guy and lives in Gotham City.',
]

(
    spark
    .createDataFrame(text_list, StringType())
    .toDF("text")
    .show(truncate=80)
)


+--------------------------------------------------------+
|                                                    text|
+--------------------------------------------------------+
|       Peter Parker is a nice guy and lives in New York.|
|Bruce Wayne is also a nice guy and lives in Gotham City.|
+--------------------------------------------------------+



In [9]:
from pyspark.sql import Row

# Same table built from Row objects: the keyword used in Row(...) becomes the column name.
spark.createDataFrame([Row(text=sentence) for sentence in text_list]).show(truncate=80)


+--------------------------------------------------------+
|                                                    text|
+--------------------------------------------------------+
|       Peter Parker is a nice guy and lives in New York.|
|Bruce Wayne is also a nice guy and lives in Gotham City.|
+--------------------------------------------------------+



In [15]:
# Print the raw contents of the sample file.
# The encoding is spelled out because open() otherwise falls back to the
# platform's locale encoding, which can garble non-ASCII text on some machines.
with open('sample.txt', encoding='utf-8') as sample_file:
    print(sample_file.read())


Peter is a very good person.
My life in Russia is very interesting.
John and Peter are brothers. However they don't support each other that much.
Lucas Nogal Dunbercker is no longer happy. He has a good car though.
Europe is very culture rich. There are huge churches! and big houses!




In [16]:
# Load the file into a DataFrame (one row per line of the file) and name its
# single column 'text'.
spark_df = (
    spark.read
    .text('sample.txt')
    .toDF('text')
)

spark_df.show(truncate=False)

+-----------------------------------------------------------------------------+
|text                                                                         |
+-----------------------------------------------------------------------------+
|Peter is a very good person.                                                 |
|My life in Russia is very interesting.                                       |
|John and Peter are brothers. However they don't support each other that much.|
|Lucas Nogal Dunbercker is no longer happy. He has a good car though.         |
|Europe is very culture rich. There are huge churches! and big houses!        |
|                                                                             |
+-----------------------------------------------------------------------------+



In [17]:
# Project only the 'text' column and print it without truncation.
(
    spark_df
    .select('text')
    .show(truncate=False)
)


+-----------------------------------------------------------------------------+
|text                                                                         |
+-----------------------------------------------------------------------------+
|Peter is a very good person.                                                 |
|My life in Russia is very interesting.                                       |
|John and Peter are brothers. However they don't support each other that much.|
|Lucas Nogal Dunbercker is no longer happy. He has a good car though.         |
|Europe is very culture rich. There are huge churches! and big houses!        |
|                                                                             |
+-----------------------------------------------------------------------------+



In [18]:
# Read every .txt file matched by the glob as a whole: each record pairs a
# file's path with its complete contents.
textFiles = spark.sparkContext.wholeTextFiles("./*.txt", minPartitions=4)

# Name the two fields of each record so the DataFrame has readable columns.
spark_df_folder = textFiles.toDF(schema=['path', 'text'])

# Clip each cell to 30 characters to keep the table narrow.
spark_df_folder.show(truncate=30)


+------------------------------+------------------------------+
|                          path|                          text|
+------------------------------+------------------------------+
|file:/home/hp/Projects/samp...|Peter is a very good person...|
+------------------------------+------------------------------+



In [21]:
# Keep only the 'text' column and return its first row as a list of Row objects.
(
    spark_df_folder
    .select('text')
    .take(1)
)

[Row(text="Peter is a very good person.\nMy life in Russia is very interesting.\nJohn and Peter are brothers. However they don't support each other that much.\nLucas Nogal Dunbercker is no longer happy. He has a good car though.\nEurope is very culture rich. There are huge churches! and big houses!\n\n")]