In [27]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark import SparkContext

# Build (or reuse) the notebook's Spark session: 4 local cores, an 8G driver,
# and the Spark NLP 2.4.4 package resolved through spark.jars.packages.
spark = (
    SparkSession.builder
    .appName("project")
    .master("local[4]")
    .config("spark.driver.memory", "8G")
    .config("spark.driver.maxResultSize", "2G")
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.4.4")
    .config("spark.kryoserializer.buffer.max", "500m")
    .getOrCreate()
)

# Expose the underlying SparkContext and display it as the cell output.
sc = spark.sparkContext
sc

In [28]:
# Display the active SparkSession as this cell's output.
spark

In [29]:
from sparknlp.base import Finisher, DocumentAssembler
from sparknlp.annotator import (Tokenizer, Normalizer, 
                                LemmatizerModel, StopWordsCleaner)
from pyspark.ml import Pipeline

In [30]:
import nltk

# Fetch the NLTK corpora requested by this notebook.
for corpus_name in ('stopwords', 'words'):
    nltk.download(corpus_name)

from nltk.corpus import stopwords

# English stop words plus the literal token 'xxxx'
# (presumably a redaction placeholder in the data -- TODO confirm).
eng_stopwords = stopwords.words('english') + ['xxxx']

[nltk_data] Downloading package stopwords to /home/hadoop/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to /home/hadoop/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [31]:
# Annotator stages for the text-cleaning pipeline. Each stage reads the column(s)
# produced by the previous one: context -> document -> token -> normalized ->
# lemma -> clean_lemma.
# The chains are wrapped in parentheses rather than joined with trailing
# backslashes, so no statement depends on a blank line to terminate it.
documentAssembler = (
    DocumentAssembler()
    .setInputCol('context')
    .setOutputCol('document')
)

tokenizer = (
    Tokenizer()
    .setInputCols(['document'])
    .setOutputCol('token')
)

# Lowercasing is requested explicitly here.
# Use .setLowercase(False) to maintain input case.
normalizer = (
    Normalizer()
    .setInputCols(['token'])
    .setOutputCol('normalized')
    .setLowercase(True)
)

# note that lemmatizer needs a dictionary. So I used the pre-trained
# model (note that it defaults to english)
lemmatizer = (
    LemmatizerModel.pretrained()
    .setInputCols(['normalized'])
    .setOutputCol('lemma')
)

# Drop stop words (the NLTK English list plus 'xxxx'), ignoring case.
stopwords_cleaner = (
    StopWordsCleaner()
    .setInputCols(['lemma'])
    .setOutputCol('clean_lemma')
    .setCaseSensitive(False)
    .setStopWords(eng_stopwords)
)

# finisher converts tokens to human-readable output;
# setCleanAnnotations(False) is passed so the annotation columns are not cleaned up.
finisher = (
    Finisher()
    .setInputCols(['clean_lemma'])
    .setCleanAnnotations(False)
)

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]


In [32]:
# Stages run in list order; each consumes the output column of the one before it.
nlp_stages = [
    documentAssembler,
    tokenizer,
    normalizer,
    lemmatizer,
    stopwords_cleaner,
    finisher,
]
pipeline = Pipeline().setStages(nlp_stages)

In [33]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Column order of the tab-separated input; every field is a nullable string.
column_names = [
    "article_type",
    "np1",
    "np2",
    "context",
    "source",
    "category",
    "location",
    "time",
]
schema = StructType([StructField(name, StringType(), True) for name in column_names])

In [34]:
# Both partitions are tab-separated, have no header row, and use the explicit schema.
# NOTE(review): the recorded run of this cell failed with
# "No FileSystem for scheme: s3" -- the session apparently has no S3 filesystem
# implementation available; confirm the environment before re-running.
tsv_options = dict(sep="\t", header=False, schema=schema)
df = spark.read.csv("s3://anly502project/data/part-r-00000", **tsv_options)
df_2 = spark.read.csv("s3://anly502project/data/part-r-00001", **tsv_options)

Py4JJavaError: An error occurred while calling o451.csv.
: java.io.IOException: No FileSystem for scheme: s3
	at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2660)
	at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2667)
	at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94)
	at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2703)
	at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2685)
	at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:373)
	at org.apache.hadoop.fs.Path.getFileSystem(Path.java:295)
	at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$org$apache$spark$sql$execution$datasources$DataSource$$checkAndGlobPathIfNecessary$1.apply(DataSource.scala:547)
	at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$org$apache$spark$sql$execution$datasources$DataSource$$checkAndGlobPathIfNecessary$1.apply(DataSource.scala:545)
	at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)
	at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)
	at scala.collection.immutable.List.foreach(List.scala:392)
	at scala.collection.TraversableLike$class.flatMap(TraversableLike.scala:241)
	at scala.collection.immutable.List.flatMap(List.scala:355)
	at org.apache.spark.sql.execution.datasources.DataSource.org$apache$spark$sql$execution$datasources$DataSource$$checkAndGlobPathIfNecessary(DataSource.scala:545)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:359)
	at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:223)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:211)
	at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:619)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


In [100]:
# Print the schema of each loaded partition, first df and then df_2.
for frame in (df, df_2):
    frame.printSchema()

root
 |-- article_type: string (nullable = true)
 |-- np1: string (nullable = true)
 |-- np2: string (nullable = true)
 |-- context: string (nullable = true)
 |-- source: string (nullable = true)
 |-- category: string (nullable = true)
 |-- location: string (nullable = true)
 |-- time: string (nullable = true)

root
 |-- article_type: string (nullable = true)
 |-- np1: string (nullable = true)
 |-- np2: string (nullable = true)
 |-- context: string (nullable = true)
 |-- source: string (nullable = true)
 |-- category: string (nullable = true)
 |-- location: string (nullable = true)
 |-- time: string (nullable = true)



In [101]:
import functools 

def unionAll(dfs):
    """Fold a sequence of DataFrames into one by repeated union.

    Each later frame is first reduced to the accumulated frame's columns,
    in that frame's column order, and then unioned onto it.

    Args:
        dfs: non-empty iterable of DataFrames. Every frame after the first
            must contain at least the columns of the first.

    Returns:
        The combined DataFrame; a single-element input is returned as is.

    Raises:
        TypeError: if ``dfs`` is empty (raised by ``functools.reduce``).
    """
    def _append(stacked, incoming):
        # Align the incoming frame to the accumulated column list before the union.
        aligned = incoming.select(stacked.columns)
        return stacked.union(aligned)

    return functools.reduce(_append, dfs)

In [102]:
# Combine the two loaded partitions into a single DataFrame.
frames_to_merge = [df, df_2]
unioned_df = unionAll(frames_to_merge)

In [103]:
# Preview the first 10 rows of the combined frame.
unioned_df.show(n=10)

+------------+---------------+---+--------------------+-------------+--------------------+--------------------+-----+
|article_type|            np1|np2|             context|       source|            category|            location| time|
+------------+---------------+---+--------------------+-------------+--------------------+--------------------+-----+
|     article|    Dark Knight|  E|  arg1 and Wall arg2|             |intlnews topstor ...|      , kerala india|14299|
|     article|    Carotenoids|  E|arg1 and caroteno...|             |topstor,health,sc...|                   ,|14660|
|     article|    Communities|  E|arg1 mobilised in...|             |    politics topstor|                   ,|14026|
|     article|    Carotenoids|  E|arg1 and caroteno...|             |topstor,health,sc...|                   ,|14660|
|            |     Coast bias|  E|arg2 is for East ...|             |      sports topstor| columbus, ohio u...|13956|
|     article|Commerce office|  E|arg1 at DDD Linco...| 

In [104]:
from pyspark.sql.functions import split

# Reduce 'category' to its leading label: the text before the first ',', ' ',
# '_' or '-'. split() takes a regular expression, so a single character class
# gives the same result as splitting on each delimiter in turn and keeping the
# first piece every time.
split_col = split(unioned_df['category'], '[, _-]')
unioned_df = unioned_df.withColumn('category', split_col.getItem(0))

In [108]:
# Preview the first 10 rows once 'category' has been reduced to its leading label.
unioned_df.show(n=10)

+------------+---------------+---+--------------------+-------------+---------+--------------------+-----+
|article_type|            np1|np2|             context|       source| category|            location| time|
+------------+---------------+---+--------------------+-------------+---------+--------------------+-----+
|     article|    Dark Knight|  E|  arg1 and Wall arg2|             | intlnews|      , kerala india|14299|
|     article|    Carotenoids|  E|arg1 and caroteno...|             |  topstor|                   ,|14660|
|     article|    Communities|  E|arg1 mobilised in...|             | politics|                   ,|14026|
|     article|    Carotenoids|  E|arg1 and caroteno...|             |  topstor|                   ,|14660|
|            |     Coast bias|  E|arg2 is for East ...|             |   sports| columbus, ohio u...|13956|
|     article|Commerce office|  E|arg1 at DDD Linco...|             |  topstor| canton, ohio uni...|14363|
|     article| 75-minute mark|  E|arg

In [35]:
# Fit the pipeline on the combined frame, then apply the fitted model to that same frame.
fitted_pipeline = pipeline.fit(unioned_df)
df_pip = fitted_pipeline.transform(unioned_df)

NameError: name 'unioned_df' is not defined

In [36]:
# Preview the pipeline output.
df_pip.show()

NameError: name 'df_pip' is not defined

In [111]:
# Shut down through the session rather than the bare context: SparkSession.stop()
# stops the underlying SparkContext and lets PySpark reset its own session
# bookkeeping, so a later getOrCreate() starts from a clean state.
spark.stop()

In [26]:
# Forget PySpark's cached session so the next SparkSession.builder.getOrCreate()
# builds a new one instead of handing back a stopped session.
# The cache lives in `_instantiatedSession` on the Spark 2.4 line this notebook
# targets; `_instantiatedContext` was its name in Spark 2.0.x and is still
# cleared so the cell keeps its previous effect there.
# NOTE(review): both are private attributes -- prefer spark.stop() where possible.
SparkSession._instantiatedSession = None
SparkSession._instantiatedContext = None