**Setup**
1. conda install pyspark

Note: the code in this notebook does not require a separate Spark installation beyond the `pyspark` package installed above.

In [32]:
# Notebook setup: display configuration, imports, and the shared Spark handles
# (`sc` for the RDD API, `spark` for the DataFrame / SQL APIs).
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
import shutil
import pyspark
# Imported here, before its first use, so this cell succeeds on a fresh kernel
# instead of relying on an import executed by a later cell.
from pyspark.sql import SparkSession
# Reuse the SparkContext that is already running, or start one if there is none.
sc = pyspark.SparkContext.getOrCreate()
# The builder returns the existing session (or creates one) and is safe to re-run.
spark = SparkSession.builder.getOrCreate()

In [23]:
# --- Word count with the RDD API ---
# An RDD is split into partitions that are distributed across the nodes of a cluster.
# It is processed with higher-order functions: each one takes a function describing
# the business logic and returns a new, transformed RDD.
#
# Read the text file(s) at the given location and return an RDD with one string per line.
lines = sc.textFile("Essential-PySpark-for-Scalable-Data-Analytics/README.md")
# flatMap() applies the lambda to every line and flattens the results, giving an RDD
# of individual words. The lambda is serialized and shipped to the executors, which
# apply it to their own partitions in parallel.
words = lines.flatMap(lambda line: line.split(" "))
# Pair every word with an initial count of 1.
word_tuples = words.map(lambda word: (word, 1))
# Merge the counts of identical words; the reduce function must be associative
# and commutative.
word_count = word_tuples.reduceByKey(lambda left, right: left + right)
# Peek at ten (word, count) pairs.
word_count.take(10)
# Remove output from a previous run first, so that saving to the same path can succeed.
shutil.rmtree("/tmp/wordcount.txt", ignore_errors=True)
word_count.saveAsTextFile("/tmp/wordcount.txt")

In [25]:
# --- Word count with the DataFrame API: load the input ---
# The Spark SQL engine is layered on top of the RDD API and provides the DataFrame API.
# DataFrames are immutable: transformations such as select, where, filter, join and
# groupBy return another DataFrame, while actions such as write, count and show
# produce a result.
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, explode

# Read the README as text: one row per line, in a single string column named `value`.
linesDF = spark.read.text("Essential-PySpark-for-Scalable-Data-Analytics/README.md")
# Print the first rows as a quick sanity check.
linesDF.show()

+--------------------+
|               value|
+--------------------+
|# Essential PySpa...|
|                    |
|<a href="https://...|
|                    |
|This is the code ...|
|                    |
|**A beginner's gu...|
|                    |
|## What is this b...|
|Apache Spark is a...|
|                    |
|This book covers ...|
|Understand the ro...|
|Gain an appreciat...|
|Scale out your da...|
|Build data pipeli...|
|Leverage the clou...|
|Explore the appli...|
|Integrate your cl...|
|                    |
+--------------------+
only showing top 20 rows



In [42]:
# --- Word count with the DataFrame API ---
# Split every line on single spaces; the result is a DataFrame with one column,
# `words`, holding the list of tokens of that line.
wordListDf = linesDF.select(split("value", " ").alias("words"))
wordListDf.take(5)
# Explode each list so that every token gets its own row, in a column named `word`.
wordsDf = wordListDf.select(explode("words").alias("word"))
wordsDf.take(5)
# Count how many times each distinct word occurs.
wordCountDf = wordsDf.groupBy("word").count()
wordCountDf.take(20)
# Overwrite the output of a previous run: with the default save mode the write
# fails when the target path already exists, which made this cell fail on re-run.
wordCountDf.write.mode("overwrite").csv("/tmp/wordcounts.csv")

[Row(words=['#', 'Essential', 'PySpark', 'for', 'Scalable', 'Data', 'Analytics']),
 Row(words=['']),
 Row(words=['<a', 'href="https://www.packtpub.com/product/essential-pyspark-for-scalable-data-analytics/9781800568877?utm_source=github&utm_medium=repository&utm_campaign=9781800568877"><img', 'src="https://static.packt-cdn.com/products/9781800568877/cover/smaller"', 'alt="Essential', 'PySpark', 'for', 'Scalable', 'Data', 'Analytics"', 'height="256px"', 'align="right"></a>']),
 Row(words=['']),
 Row(words=['This', 'is', 'the', 'code', 'repository', 'for', '[Essential', 'PySpark', 'for', 'Scalable', 'Data', 'Analytics](https://www.packtpub.com/product/essential-pyspark-for-scalable-data-analytics/9781800568877?utm_source=github&utm_medium=repository&utm_campaign=9781800568877),', 'published', 'by', 'Packt.'])]

[Row(word='#'),
 Row(word='Essential'),
 Row(word='PySpark'),
 Row(word='for'),
 Row(word='Scalable')]

[Row(word='lakes,', count=1),
 Row(word='[copy](https://www.amazon.com/dp/1800568878)', count=1),
 Row(word='today!', count=1),
 Row(word='Dashboards', count=1),
 Row(word='API', count=1),
 Row(word='If', count=1),
 Row(word=".add('Description',", count=1),
 Row(word='used', count=1),
 Row(word='**Following', count=1),
 Row(word='analysts,', count=1),
 Row(word='practicing', count=1),
 Row(word='PDF', count=1),
 Row(word='Data', count=5),
 Row(word='present', count=1),
 Row(word='=', count=1),
 Row(word='helping', count=1),
 Row(word="beginner's", count=1),
 Row(word='go-to', count=1),
 Row(word='Databricks,', count=1),
 Row(word='border="5"', count=1)]

In [45]:
# --- Word count with Spark SQL ---
# Drop any previous definition of the table so this cell can be re-run.
spark.sql("drop table if exists word_counts")
# Define a one-column table over the files at /tmp/hive, parsed as CSV with a
# single space as the delimiter.
# NOTE(review): no cell visible in this notebook writes to /tmp/hive — confirm how
# that location is populated before relying on a fresh run.
# NOTE(review): with only one `word` column declared, this presumably keeps just the
# first space-separated token of each line — verify that this is intended.
spark.sql('CREATE TABLE word_counts (word STRING) USING csv OPTIONS("delimiter"=" ") LOCATION "/tmp/hive"')

# Count the rows per word with SQL and print the first rows of the result.
spark.sql("SELECT word, COUNT(word) AS count FROM word_counts GROUP BY word").show()

DataFrame[]

DataFrame[]

+--------------------+-----+
|                word|count|
+--------------------+-----+
|                  If|    1|
|         **Following|    1|
|                   *|    2|
|alt="https://www....|    1|
|                null|    0|
|            Leverage|    1|
|              Apache|    1|
|                  <a|    2|
|                  is|    1|
|                 All|    1|
|                   ||    3|
|                Gain|    1|
|           **Sreeram|    1|
|               Scale|    1|
|           Integrate|    1|
|                   )|    1|
|             Explore|    1|
|                With|    1|
|                 The|    1|
|                 ```|    2|
+--------------------+-----+
only showing top 20 rows

