In [1]:
from pyspark.sql import SparkSession

# 4 thread cluster
spark = (SparkSession
         .builder.master("local[4]")
         .appName("Analyzing vocabulary of Pride and Prejudice")
         .getOrCreate())


23/11/16 21:27:06 WARN Utils: Your hostname, nyck33-tt resolves to a loopback address: 127.0.1.1; using 192.168.3.16 instead (on interface wlo1)
23/11/16 21:27:06 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/16 21:27:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
spark.sparkContext

In [3]:
spark.sparkContext.setLogLevel("WARN")


In [4]:
spark.read

<pyspark.sql.readwriter.DataFrameReader at 0x7f7edc7c91e0>

In [5]:
print(dir(spark.read))

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_df', '_jreader', '_set_opts', '_spark', 'csv', 'format', 'jdbc', 'json', 'load', 'option', 'options', 'orc', 'parquet', 'schema', 'table', 'text']


In [6]:
book_dir ="/home/nyck33/Documents/DataEngineering/DataAnalysisWithPythonAndPySpark"

directory = book_dir + "/code/data/gutenberg_books/1342-0.txt"
print(f'directory: {directory}')

book = spark.read.text(directory)
print(book)
book.printSchema()
print(book.dtypes)
book.head(10)

directory: /home/nyck33/Documents/DataEngineering/DataAnalysisWithPythonAndPySpark/code/data/gutenberg_books/1342-0.txt
DataFrame[value: string]
root
 |-- value: string (nullable = true)

[('value', 'string')]


[Row(value='The Project Gutenberg EBook of Pride and Prejudice, by Jane Austen'),
 Row(value=''),
 Row(value='This eBook is for the use of anyone anywhere at no cost and with'),
 Row(value='almost no restrictions whatsoever.  You may copy it, give it away or'),
 Row(value='re-use it under the terms of the Project Gutenberg License included'),
 Row(value='with this eBook or online at www.gutenberg.org'),
 Row(value=''),
 Row(value=''),
 Row(value='Title: Pride and Prejudice'),
 Row(value='')]

In [7]:
# docs
print(spark.__doc__)

The entry point to programming Spark with the Dataset and DataFrame API.

    A SparkSession can be used to create :class:`DataFrame`, register :class:`DataFrame` as
    tables, execute SQL over tables, cache tables, and read parquet files.
    To create a :class:`SparkSession`, use the following builder pattern:

    .. versionchanged:: 3.4.0
        Supports Spark Connect.

    .. autoattribute:: builder
       :annotation:

    Examples
    --------
    Create a Spark session.

    >>> spark = (
    ...     SparkSession.builder
    ...         .master("local")
    ...         .appName("Word Count")
    ...         .config("spark.some.config.option", "some-value")
    ...         .getOrCreate()
    ... )

    Create a Spark session with Spark Connect.

    >>> spark = (
    ...     SparkSession.builder
    ...         .remote("sc://localhost")
    ...         .appName("Word Count")
    ...         .config("spark.some.config.option", "some-value")
    ...         .getOrCreate()
    ..

In [8]:
# show first 10 records, truncated at 50 characters
book.show(10, truncate=50)

+--------------------------------------------------+
|                                             value|
+--------------------------------------------------+
|The Project Gutenberg EBook of Pride and Prejud...|
|                                                  |
|This eBook is for the use of anyone anywhere at...|
|almost no restrictions whatsoever.  You may cop...|
|re-use it under the terms of the Project Gutenb...|
|    with this eBook or online at www.gutenberg.org|
|                                                  |
|                                                  |
|                        Title: Pride and Prejudice|
|                                                  |
+--------------------------------------------------+
only showing top 10 rows



In [9]:
# eager execution available
'''
from pyspark.sql import SparkSession
spark = (SparkSession.builder.config("spark.sql.repl.eagerEval.enabled", True).getOrCreate())
'''

'\nfrom pyspark.sql import SparkSession\nspark = (SparkSession.builder.config("spark.sql.repl.eagerEval.enabled", True).getOrCreate())\n'

In [10]:
# transform from sentence to list of words
from pyspark.sql.functions import split
lines = book.select(split(book.value, " ")).alias("line")
print(lines)
# show executes the lazy evaluation
lines.show(5)

DataFrame[split(value,  , -1): array<string>]
+--------------------+
| split(value,  , -1)|
+--------------------+
|[The, Project, Gu...|
|                  []|
|[This, eBook, is,...|
|[almost, no, rest...|
|[re-use, it, unde...|
+--------------------+
only showing top 5 rows



In [11]:
# select statement
from pyspark.sql.functions import col
book.select(book.value)
book.select(book["value"])
book.select(col("value"))
book.select("value")


DataFrame[value: string]

In [12]:
from pyspark.sql.functions import col, split

lines = book.select(split(col("value"), " "))

lines

DataFrame[split(value,  , -1): array<string>]

In [13]:
lines.printSchema()

root
 |-- split(value,  , -1): array (nullable = true)
 |    |-- element: string (containsNull = false)



In [14]:
lines.show(5)

+--------------------+
| split(value,  , -1)|
+--------------------+
|[The, Project, Gu...|
|                  []|
|[This, eBook, is,...|
|[almost, no, rest...|
|[re-use, it, unde...|
+--------------------+
only showing top 5 rows



In [15]:
# ugly result
book.select(split(col("value"), " ")).printSchema()

root
 |-- split(value,  , -1): array (nullable = true)
 |    |-- element: string (containsNull = false)



In [16]:
# use alias pretty result
book.select(split(col("value"), " ").alias("line")).printSchema()

root
 |-- line: array (nullable = true)
 |    |-- element: string (containsNull = false)



In [17]:
lines = book.select(split(book.value, " ").alias("line"))
lines.show(10)
#lines[0].show()

+--------------------+
|                line|
+--------------------+
|[The, Project, Gu...|
|                  []|
|[This, eBook, is,...|
|[almost, no, rest...|
|[re-use, it, unde...|
|[with, this, eBoo...|
|                  []|
|                  []|
|[Title:, Pride, a...|
|                  []|
+--------------------+
only showing top 10 rows



In [18]:
# each row with container like arrays is exploded into multiple rows
from pyspark.sql.functions import split, explode, col

words = lines.select(explode(col("line")).alias("word"))
words.show(15)

+----------+
|      word|
+----------+
|       The|
|   Project|
| Gutenberg|
|     EBook|
|        of|
|     Pride|
|       and|
|Prejudice,|
|        by|
|      Jane|
|    Austen|
|          |
|      This|
|     eBook|
|        is|
+----------+
only showing top 15 rows



In [19]:
from pyspark.sql.functions import lower
words_lower = words.select(lower(col("word")).alias("word_lower"))
words_lower.show(15)

+----------+
|word_lower|
+----------+
|       the|
|   project|
| gutenberg|
|     ebook|
|        of|
|     pride|
|       and|
|prejudice,|
|        by|
|      jane|
|    austen|
|          |
|      this|
|     ebook|
|        is|
+----------+
only showing top 15 rows



In [20]:
from pyspark.sql.functions import regexp_extract

words_clean = words_lower.select(regexp_extract(col("word_lower"), "[a-z]+", 0).alias("word"))

words_clean.show(20)

+---------+
|     word|
+---------+
|      the|
|  project|
|gutenberg|
|    ebook|
|       of|
|    pride|
|      and|
|prejudice|
|       by|
|     jane|
|   austen|
|         |
|     this|
|    ebook|
|       is|
|      for|
|      the|
|      use|
|       of|
|   anyone|
+---------+
only showing top 20 rows



In [21]:
words_nonull = words_clean.filter(col("word") != "")

words_nonull.show()

+---------+
|     word|
+---------+
|      the|
|  project|
|gutenberg|
|    ebook|
|       of|
|    pride|
|      and|
|prejudice|
|       by|
|     jane|
|   austen|
|     this|
|    ebook|
|       is|
|      for|
|      the|
|      use|
|       of|
|   anyone|
| anywhere|
+---------+
only showing top 20 rows



In [22]:
words_nonull = words_clean.filter(~col("word").isin(""))
words_nonull.show()


+---------+
|     word|
+---------+
|      the|
|  project|
|gutenberg|
|    ebook|
|       of|
|    pride|
|      and|
|prejudice|
|       by|
|     jane|
|   austen|
|     this|
|    ebook|
|       is|
|      for|
|      the|
|      use|
|       of|
|   anyone|
| anywhere|
+---------+
only showing top 20 rows



In [23]:
groups = words_nonull.groupby(col("word"))
print(groups)
#results = groups.count()
#results.show()



GroupedData[grouping expressions: [word], value: [word: string], type: GroupBy]


In [24]:
results = words_nonull.groupby(col("word")).count()
print(results)
results.show()


DataFrame[word: string, count: bigint]
+-------------+-----+
|         word|count|
+-------------+-----+
|       online|    4|
|         some|  209|
|        still|   72|
|          few|   72|
|         hope|  122|
|        those|   60|
|     cautious|    4|
|    imitation|    1|
|          art|    3|
|      solaced|    1|
|       poetry|    2|
|    arguments|    5|
| premeditated|    1|
|      elevate|    1|
|       doubts|    2|
|    destitute|    1|
|    solemnity|    5|
|   lieutenant|    1|
|gratification|    1|
|    connected|   14|
+-------------+-----+
only showing top 20 rows



In [25]:
results.orderBy("count", ascending=False).show(10)
results.orderBy(col("count").desc()).show(10)



+----+-----+
|word|count|
+----+-----+
| the| 4496|
|  to| 4235|
|  of| 3719|
| and| 3602|
| her| 2223|
|   i| 2052|
|   a| 1997|
|  in| 1920|
| was| 1844|
| she| 1703|
+----+-----+
only showing top 10 rows

+----+-----+
|word|count|
+----+-----+
| the| 4496|
|  to| 4235|
|  of| 3719|
| and| 3602|
| her| 2223|
|   i| 2052|
|   a| 1997|
|  in| 1920|
| was| 1844|
| she| 1703|
+----+-----+
only showing top 10 rows



In [29]:
results.write.csv("simple_count.csv")
!ls -l simple_count.csv

total 72
-rw-r--r-- 1 nyck33 nyck33 69768 Nov 16 21:27 part-00000-11962f4f-3380-4045-900a-62d21e1cb107-c000.csv
-rw-r--r-- 1 nyck33 nyck33     0 Nov 16 21:27 _SUCCESS


In [27]:
results.coalesce(1).write.csv("simple_count_coalesced.csv")

In [30]:
results.rdd.getNumPartitions()

1

In [28]:
from pyspark.sql.functions import length
letters = words_nonull.select(length(col("word")).alias("length")).groupby("length").count()
print(letters)
letters.show()


DataFrame[length: int, count: bigint]
+------+-----+
|length|count|
+------+-----+
|    12|  812|
|     1| 4116|
|    13|  393|
|     6| 9276|
|    16|    5|
|     3|28831|
|     5|11998|
|    15|   32|
|     9| 5165|
|    17|    3|
|     4|22213|
|     8| 5121|
|     7| 8679|
|    10| 2455|
|    11| 1386|
|    14|  107|
|     2|23856|
+------+-----+

