<a href="https://colab.research.google.com/github/mosesyhc/de300-wn2024-notes/blob/main/examples/ex-word-count.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Mounting Google Drive for a permanent venv

In [None]:
# Mount Google Drive under /content/drive; the article is later read from
# /content/drive/MyDrive/DATA_ENG300/.
from google.colab import drive
drive.mount('/content/drive')

Retrieving Spark and `findspark` in Python (Java is not downloaded here; `JAVA_HOME` is pointed at a JDK 11 path below, which must already exist)

In [None]:
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar xf spark-3.1.1-bin-hadoop3.2.tgz

In [None]:
!pip install -q findspark

In [None]:
# Tell Spark where the JDK and the Spark distribution live.
import os

os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-11-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.1.1-bin-hadoop3.2",
    }
)

In [None]:
# findspark makes the Spark installation usable from this Python session
# (presumably located through the SPARK_HOME variable set above — see findspark docs)
import findspark
findspark.init()

In [None]:
# Create the Spark session (or reuse the one already running in this kernel).
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Analyzing an unknown article.").getOrCreate()


In [None]:
spark

In [None]:
spark.sparkContext

In [None]:
# Keep a handle on the SparkContext and set its log level to WARN.
sc = spark.sparkContext
sc.setLogLevel('WARN')

In [None]:
# Location of the article on the mounted drive.
dir_path = '/content/drive/MyDrive/DATA_ENG300/'
file_path = f"{dir_path}sur.txt"

## Read

In [None]:
# documentation of read
spark.read??

In [None]:
# read the text file at file_path into a dataframe through the Spark session
article = spark.read.text(file_path)

In [None]:
article

In [None]:
article.printSchema()

In [None]:
article.show()

In [None]:
# show() prints 20 rows and truncates long strings by default;
# truncation is switched off here so each line is printed in full
article.show(n=20, truncate=False)

In [None]:
article

In [None]:
# print the dataframe's schema (column names and types)
article.printSchema()

In [None]:
# select() returns a new dataframe holding only the requested column(s);
# display the first 10 rows of that selection without truncating them
# (previously the selection was discarded and the original dataframe was shown)
article.select(article.value).show(10, truncate=False)

In [None]:
from pyspark.sql.functions import col

# all of the following return the same df (four ways to refer to a column)
article.select(article.value)     # attribute access on the dataframe
article.select(article['value'])  # item access on the dataframe
article.select(col('value'))      # col() function with the column name
article.select('value')           # plain column-name string

## Split

In [None]:
# splitting the lines
from pyspark.sql.functions import col, split

In [None]:
lines = article.select(split(col('value'), " ").alias('line'))

In [None]:
lines.show(10, truncate=False)

In [None]:
article.select(split(col('value'), " ")).printSchema()

In [None]:
article.select(split(col('value'), " ").alias("line")).printSchema()

In [None]:
lines = article.select(split(article.value, " ").alias("line"))

In [None]:
lines.printSchema()

In [None]:
lines.show()

## Explode / Tokenize

In [None]:
from pyspark.sql.functions import explode

In [None]:
words = lines.select(explode(col("line")).alias("word"))

In [None]:
words.show()

In [None]:
words.printSchema()

## Clean

In [None]:
from pyspark.sql.functions import lower

# Lower-case every token and expose it as the `word_lower` column.
words_lower = words.select(lower(words["word"]).alias("word_lower"))

In [None]:
words_lower.show(3)

In [None]:
from pyspark.sql.functions import regexp_extract

# Group 1 optionally consumes leading non-word characters; group 2 is the run of
# lowercase letters that follows and is kept as the cleaned `word`.
words_clean = words_lower.select(
    regexp_extract(words_lower["word_lower"], r"(\W+)?([a-z]+)", 2).alias("word")
)

In [None]:
words_clean.show(30)

In [None]:
# Keep only the rows whose cleaned word is not the empty string.
words_nonull = words_clean.filter(words_clean["word"] != "")

words_nonull.show(n=100)

## Count / Group

In [None]:
groups = words_nonull.groupBy(col("word"))

groups

In [None]:
counts = words_nonull.groupBy(col("word")).count()

In [None]:
counts

In [None]:
counts.orderBy('count', ascending=False).show(15)

In [None]:
counts.orderBy("count", ascending=False).show(10)

## All in one go!

In [None]:
import pyspark.sql.functions as F

# The whole pipeline as a single chain:
# read -> split -> explode -> lower -> clean -> drop empties -> count per word.
counts = (
    spark.read.text(file_path)
    .select(F.split('value', ' ').alias('line'))
    .select(F.explode('line').alias('word'))
    .select(F.lower('word').alias('word'))
    .select(F.regexp_extract('word', r"(\W+)?([a-z]+)", 2).alias('word'))
    .filter(F.col('word') != "")
    .groupBy('word')
    .count()
)

In [None]:
counts.orderBy('count', ascending=False).show(10)

## Exercises

1. Return the number of words by word length.
  - `F.length()` returns the length of a word
2. Return the number of each vowel used in the article.
  - `filter` / `where`
  - `where(F.col('column').isin(['x', 'y', 'z']))` keeps only the rows whose value equals x, y, or z

In [None]:
import pyspark.sql.functions as F

# Exercise 1: same cleaning chain as before, then count words per word length.
counts = (
    spark.read.text(file_path)
    .select(F.split('value', ' ').alias('line'))
    .select(F.explode('line').alias('word'))
    .select(F.lower('word').alias('word'))
    .select(F.regexp_extract('word', r"(\W+)?([a-z]+)", 2).alias('word'))
    .filter(F.col('word') != "")
    .select(F.length('word').alias('length'))
    .groupBy('length')
    .count()
)

In [None]:
counts.orderBy('length', ascending=True).show(20)

In [None]:
import pyspark.sql.functions as F

# Exercise 2: count how often each vowel occurs in the cleaned words.
counts = (
    spark.read.text(file_path)
     .select(F.split(F.col('value'), ' ').alias('line'))
     .select(F.explode(F.col('line')).alias('word'))
     .select(F.lower(F.col('word')).alias('word'))
     .select(F.regexp_extract(F.col('word'), r"(\W+)?([a-z]+)", 2).alias('word'))
     # splitting on the empty pattern breaks each word into its characters
     .select(F.split(F.col('word'), '').alias('letters'))
     .select(F.explode(F.col('letters')).alias('letter'))
     # isin() already rejects empty strings, so no separate != "" filter is needed
     .where(F.col('letter').isin(['a', 'e', 'i', 'o', 'u']))
     .groupby('letter')
     .count()
)

In [None]:
counts.show()

## Batch processing

In [None]:
## analyze-article.py
# Batch version of the word count: read the article, tokenize and clean it,
# count the occurrences of each word, print the 10 most frequent words and
# save all counts as CSV.

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

dir_path = r'/content/drive/MyDrive/DATA_ENG300/'
file_path = dir_path + "sur.txt"  ## "*.txt"

spark = SparkSession.builder.appName(
    "Counting word occurences from a book."
).getOrCreate()

spark.sparkContext.setLogLevel("ERROR")

# If you need to read multiple text files, replace `sur.txt` by `*.txt` above.
results = (
    spark.read.text(file_path)
    .select(F.split(F.col("value"), " ").alias("line"))
    .select(F.explode(F.col("line")).alias("word"))
    .select(F.lower(F.col("word")).alias("word"))
    # `+` rather than `*`: with `*` the first match is the empty string whenever
    # a token starts with punctuation (e.g. `"hello`), and the word would be lost.
    .select(F.regexp_extract(F.col("word"), r"[a-z']+", 0).alias("word"))
    .where(F.col("word") != "")
    .groupby(F.col("word"))
    .count()
)

results.orderBy("count", ascending=False).show(10)
# Overwrite mode lets the job be re-run without failing on the existing output
# directory; coalesce(1) keeps the result in a single partition.
results.coalesce(1).write.mode("overwrite").csv("./results-analyze-article.csv")

To run the same code in batch (with the cell above saved as `analyze-article.py` in the working directory):

In [None]:
!rm -r results-analyze-article.csv/

In [None]:
!spark-3.1.1-bin-hadoop3.2/bin/spark-submit analyze-article.py