In [18]:
import os
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

Define file paths

In [19]:
# Base directory for every path below: the process's current working directory.
current_dir = os.getcwd()

# Locations relative to current_dir: the target name for the word-count
# output and a glob pattern over the input .txt files.
output_file_name = '../output_files/english_count_words_analytic.csv'
words_file_name = '../datasources/gutenberg_books/*.txt'

# Input glob pattern anchored at current_dir.
words_file_path = os.path.join(current_dir, words_file_name)

Start the Spark session and ingest the text data

In [20]:
# Obtain a Spark session registered under this application name
# (getOrCreate returns an already-active session when one exists).
session_builder = SparkSession.builder.appName("English words analysis")
spark = session_builder.getOrCreate()

# Restrict Spark's log output to WARN and above.
spark.sparkContext.setLogLevel("WARN")

# Load every file matched by the glob as plain text and print the resulting
# schema (a single string column named `value`).
text_reader = spark.read
words_df = text_reader.text(words_file_path)
words_df.printSchema()

root
 |-- value: string (nullable = true)



24/12/26 20:43:23 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/12/26 20:43:23 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
24/12/26 20:43:23 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
24/12/26 20:43:23 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.
24/12/26 20:43:23 WARN Utils: Service 'SparkUI' could not bind on port 4044. Attempting port 4045.
24/12/26 20:43:23 WARN Utils: Service 'SparkUI' could not bind on port 4045. Attempting port 4046.
24/12/26 20:43:23 WARN Utils: Service 'SparkUI' could not bind on port 4046. Attempting port 4047.
24/12/26 20:43:23 WARN Utils: Service 'SparkUI' could not bind on port 4047. Attempting port 4048.
24/12/26 20:43:23 WARN Utils: Service 'SparkUI' could not bind on port 4048. Attempting port 4049.


In [21]:
# Preview the first 10 raw rows, printing each value in full (no truncation).
words_df.show(n=10, truncate=False)

+-----------------------------------------------------------------------+
|value                                                                  |
+-----------------------------------------------------------------------+
|                                                                       |
|The Project Gutenberg EBook of Moby Dick; or The Whale, by Herman      |
|Melville                                                               |
|                                                                       |
|This eBook is for the use of anyone anywhere at no cost and with almost|
|no restrictions whatsoever.  You may copy it, give it away or re-use it|
|under the terms of the Project Gutenberg License included with this    |
|eBook or online at www.gutenberg.org                                   |
|                                                                       |
|                                                                       |
+-------------------------------------

Tokenize the lines into words and clean them

In [22]:
# Column expressions for each stage of the pipeline:
#   line of text -> array of space-separated tokens -> one row per token
#   -> lower-cased token -> leading run of a-z letters.
tokens_per_line = F.split(F.col("value"), " ").alias("line")
one_token_per_row = F.explode("line").alias("word")
lower_cased_token = F.lower(F.col("word")).alias("word_lower")
# NOTE(review): the pattern is anchored with ^, so a token that begins with a
# non-letter (e.g. an opening quote or parenthesis) extracts as "" and is then
# removed by the filter below -- confirm this is the intended behaviour.
leading_letters = F.regexp_extract(F.col("word_lower"), "^[a-z]+", 0).alias("final_word")

# NOTE(review): `words_df` is rebound from the raw lines to the cleaned words,
# so this cell depends on the read cell having been run immediately before it.
words_df = (
    words_df
    .select(tokens_per_line)
    .select(one_token_per_row)
    .select(lower_cased_token)
    .select(leading_letters)
)

# Discard rows for which no letters were extracted.
words_df = words_df.where(F.col("final_word") != "")
words_df.show(n=20, truncate=False)

+----------+
|final_word|
+----------+
|the       |
|project   |
|gutenberg |
|ebook     |
|of        |
|moby      |
|dick      |
|or        |
|the       |
|whale     |
|by        |
|herman    |
|melville  |
|this      |
|ebook     |
|is        |
|for       |
|the       |
|use       |
|of        |
+----------+
only showing top 20 rows



Count how often each word occurs

In [23]:
# One row per distinct word, with its number of occurrences in a `count` column.
words_grouped = words_df.groupBy("final_word")
count_analytic = words_grouped.count()

# Display the 20 most frequent words, highest count first.
most_frequent_first = F.col("count").desc()
count_analytic.orderBy(most_frequent_first).show(n=20)

+----------+-----+
|final_word|count|
+----------+-----+
|       the|38895|
|       and|23919|
|        of|21199|
|        to|20526|
|         a|14464|
|         i|13974|
|        in|12777|
|      that| 9623|
|        it| 9099|
|       was| 8920|
|       her| 7923|
|        my| 7385|
|       his| 6642|
|      with| 6575|
|        he| 6444|
|        as| 6439|
|       you| 6297|
|       had| 5718|
|       she| 5617|
|       for| 5425|
+----------+-----+
only showing top 20 rows



Write the word counts to a CSV file

In [24]:
# Destination for the word counts, resolved against current_dir.
output_path = os.path.join(current_dir, output_file_name)

# Collapse to a single partition before writing, include a header row, and
# replace any existing output at that path.
# NOTE(review): Spark normally treats the path as an output directory holding
# part file(s) rather than a single .csv file -- confirm downstream expectations.
single_partition_counts = count_analytic.coalesce(1)
single_partition_counts.write.csv(output_path, mode="overwrite", header=True)

In [25]:
# Shut down the Spark session; this is the last cell, after the results have been written.
spark.stop()