In [1]:
import pyspark
sc = pyspark.SparkContext('local[*]')

In [3]:
lines = sc.textFile("C://Users//nolfonzo//Spark//README.md")
words = lines.flatMap(lambda s: s.split(" "))
word_tuples = words.map(lambda s: (s, 1))
word_count = word_tuples.reduceByKey(lambda x, y: x + y)
word_count.take(10)

[('#', 1),
 ('Apache', 1),
 ('Spark', 14),
 ('', 73),
 ('is', 7),
 ('unified', 1),
 ('analytics', 1),
 ('engine', 2),
 ('It', 2),
 ('provides', 1)]

In [10]:
from pyspark.sql.functions import split, explode
from pyspark.sql import SparkSession
spark = SparkSession(sc)
linesDF = spark.read.text("C://Users//nolfonzo//Spark//README.md")

In [12]:
linesDF.show()

+--------------------+
|               value|
+--------------------+
|      # Apache Spark|
|                    |
|Spark is a unifie...|
|high-level APIs i...|
|supports general ...|
|rich set of highe...|
|MLlib for machine...|
|and Structured St...|
|                    |
|<https://spark.ap...|
|                    |
|[![GitHub Action ...|
|[![Jenkins Build]...|
|[![AppVeyor Build...|
|[![PySpark Covera...|
|                    |
|                    |
|## Online Documen...|
|                    |
|You can find the ...|
+--------------------+
only showing top 20 rows



In [13]:
wordListDf = linesDF.select(split("value", " ").alias("words"))
wordListDf.show()
wordsDf = wordListDf.select(explode("words").alias("word"))
wordsDf.show()

+--------------------+
|               words|
+--------------------+
|  [#, Apache, Spark]|
|                  []|
|[Spark, is, a, un...|
|[high-level, APIs...|
|[supports, genera...|
|[rich, set, of, h...|
|[MLlib, for, mach...|
|[and, Structured,...|
|                  []|
|[<https://spark.a...|
|                  []|
|[[![GitHub, Actio...|
|[[![Jenkins, Buil...|
|[[![AppVeyor, Bui...|
|[[![PySpark, Cove...|
|                  []|
|                  []|
|[##, Online, Docu...|
|                  []|
|[You, can, find, ...|
+--------------------+
only showing top 20 rows

+-----------+
|       word|
+-----------+
|          #|
|     Apache|
|      Spark|
|           |
|      Spark|
|         is|
|          a|
|    unified|
|  analytics|
|     engine|
|        for|
|large-scale|
|       data|
|processing.|
|         It|
|   provides|
| high-level|
|       APIs|
|         in|
|     Scala,|
+-----------+
only showing top 20 rows



In [14]:
wordCountDf = wordsDf.groupBy("word").count()
wordCountDf.show()

+---------------+-----+
|           word|count|
+---------------+-----+
|     [![PySpark|    1|
|         online|    1|
|         graphs|    1|
|     ["Building|    1|
|  documentation|    3|
|       command,|    2|
|    abbreviated|    1|
|       overview|    1|
|           rich|    1|
|            set|    2|
|    -DskipTests|    1|
| 1,000,000,000:|    2|
|           name|    1|
|   ["Specifying|    1|
|         stream|    1|
|           run:|    1|
|            not|    1|
|       programs|    2|
|          tests|    2|
|./dev/run-tests|    1|
+---------------+-----+
only showing top 20 rows



In [25]:
spark.sql("drop table if exists word_counts")
spark.sql('CREATE TABLE word_counts (word STRING) USING csv OPTIONS("delimiter"=" ") LOCATION "C://Users//nolfonzo//Spark//README.md"')

spark.sql("SELECT word, COUNT(word) AS count FROM word_counts GROUP BY word").show()

+--------------------+-----+
|                word|count|
+--------------------+-----+
|          [![PySpark|    1|
|          ["Building|    1|
|                rich|    1|
|        ["Specifying|    1|
|                will|    1|
|                [run|    1|
|      Alternatively,|    1|
|               MLlib|    1|
|                 can|    2|
|                 for|    2|
|                null|    0|
|                  in|    1|
|            building|    1|
|             locally|    1|
|         [![AppVeyor|    1|
|<https://spark.ap...|    1|
|                 You|    2|
|                  To|    2|
|          [![Jenkins|    1|
|                More|    1|
+--------------------+-----+
only showing top 20 rows

