In [40]:
import os
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

Define file paths

In [41]:
# Locations of the input book and of the result file, both written relative
# to the current working directory.
words_file_name = '../datasources/gutenberg_books/30254-0.txt'
output_file_name = '../output_files/english_count_words_analytic.csv'

# Resolve the input location against the current working directory.
# The output location is joined with `current_dir` later, at write time.
current_dir = os.getcwd()
words_file_path = os.path.join(current_dir, words_file_name)

Ingest the data and start the analysis

In [42]:
# Obtain a Spark session named after this analysis; getOrCreate() hands back
# an already-running session when one exists instead of starting a second one.
session_builder = SparkSession.builder.appName("English words analysis")
spark = session_builder.getOrCreate()

# Set Spark's log level to WARN.
spark.sparkContext.setLogLevel("WARN")

# Read the book as plain text: one row per line, in a single string column
# called `value` (see the schema printed below).
words_df = spark.read.text(words_file_path)
words_df.printSchema()

root
 |-- value: string (nullable = true)



In [43]:
# Peek at the first 10 raw lines without shortening long values.
words_df.show(n=10, truncate=False)

+-----------------------------------------------------------------------+
|value                                                                  |
+-----------------------------------------------------------------------+
|                                                                       |
|The Project Gutenberg EBook of The Romance of Lust, by Anonymous       |
|                                                                       |
|This eBook is for the use of anyone anywhere at no cost and with almost|
|no restrictions whatsoever.  You may copy it, give it away or re-use it|
|under the terms of the Project Gutenberg License included with this    |
|eBook or online at www.gutenberg.org                                   |
|                                                                       |
|Title: The Romance of Lust A classic Victorian erotic novel            |
|                                                                       |
+-----------------------------------------------------------------------+

Tokenize the lines into words and clean them

In [44]:
# Column expressions used by the pipeline below.
# 1) Split each line on single spaces and emit one row per resulting token.
raw_tokens = F.explode(F.split(F.col("value"), " "))
# 2) Lower-case the token and keep only its leading run of a-z letters.
#    regexp_extract yields "" when the token does not START with a letter,
#    so such tokens (e.g. ones beginning with a quote mark, digit or bracket)
#    are removed entirely by the filter below.
#    NOTE(review): confirm that dropping those tokens is intended.
leading_letters = F.regexp_extract(F.lower(F.col("word")), "^[a-z]+", 0)

# NOTE: `words_df` is rebound here from the raw lines (column `value`) to the
# cleaned words (column `final_word`). Re-running this cell therefore requires
# re-running the cell that loads the text first.
words_df = (
    words_df
    .select(raw_tokens.alias("word"))
    .select(leading_letters.alias("final_word"))
    .where(F.col("final_word") != "")
)
words_df.show(20, truncate=False)

+----------+
|final_word|
+----------+
|the       |
|project   |
|gutenberg |
|ebook     |
|of        |
|the       |
|romance   |
|of        |
|lust      |
|by        |
|anonymous |
|this      |
|ebook     |
|is        |
|for       |
|the       |
|use       |
|of        |
|anyone    |
|anywhere  |
+----------+
only showing top 20 rows



Count how often each word occurs

In [45]:
# One row per distinct cleaned word together with its number of occurrences.
# The frame itself is left unordered; only the preview below is sorted.
count_analytic = words_df.groupBy("final_word").count()

# Display the 20 most frequent words, highest count first.
count_analytic.orderBy(F.desc("count")).show(20)

+----------+-----+
|final_word|count|
+----------+-----+
|       the| 8105|
|       and| 7186|
|        to| 5860|
|        of| 4637|
|         i| 4505|
|       her| 4347|
|        my| 3399|
|        in| 3282|
|         a| 3018|
|       was| 2662|
|       she| 2640|
|        it| 2465|
|      that| 2143|
|       had| 2079|
|        me| 2051|
|      with| 1907|
|        as| 1904|
|       you| 1614|
|        we| 1359|
|       for| 1339|
+----------+-----+
only showing top 20 rows



Write the word counts to a CSV file

In [46]:
# Destination of the word counts, resolved against the working directory.
output_path = os.path.join(current_dir, output_file_name)

# Collapse to a single partition so that Spark emits one part file, replace
# any previous result, and include a header row. Spark creates a directory at
# `output_path` and places the part file inside it.
(
    count_analytic
    .coalesce(1)
    .write
    .mode("overwrite")
    .option("header", True)
    .csv(output_path)
)