In [26]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, length

In [45]:
# Path of the input text file read by the chained pipeline further down.
# NOTE(review): despite the `csv_` prefix this points at a plain .txt file, and it is an
# absolute path — confirm it exists on the machine running the notebook.
csv_path = "/sparkdata/1342-0.txt" 

In [2]:
# Build the notebook's Spark session (getOrCreate returns the active one if it exists).
spark = (
    SparkSession.builder
    .appName("Analyzing the vocabulary of Pride and Prejudice")
    .getOrCreate()
)

In [16]:
# Read the CSV output directory with default reader options; with no header the
# single column comes back under Spark's default name `_c0` (renamed in the next cell).
word_null = spark.read.csv(path='word_count.csv/')

In [18]:
# Give the auto-named `_c0` column a meaningful name.
word_null = word_null.withColumnRenamed(existing='_c0', new='word')

### Grouping records

In [19]:
# Group the rows by word; nothing is aggregated until a method such as count() is called.
groups = word_null.groupBy('word')

In [20]:
# Printing the grouped object only describes the grouping expression and the
# underlying columns; it holds no counts yet.
print(groups)

GroupedData[grouping expressions: [word], value: [word: string], type: GroupBy]


In [21]:
# Count the rows in each group: the result is a DataFrame with the grouping
# column `word` plus a `count` column.
results = groups.count()

In [22]:
# Printing a DataFrame shows its schema (column names and types), not its rows;
# use .show() to look at the data.
print(results)

DataFrame[word: string, count: bigint]


In [38]:
# Peek at five rows; no ordering has been applied yet.
results.show(n=5)

+------+-----+
|  word|count|
+------+-----+
|online|    4|
|  some|  209|
| still|   72|
|   few|   72|
|  hope|  122|
+------+-----+
only showing top 5 rows



### Ordering results

In [40]:
# Sort by the `count` column, largest first, and display the top five words.
(
    results
    .orderBy('count', ascending=False)
    .show(5)
)

+----+-----+
|word|count|
+----+-----+
| the| 4496|
|  to| 4235|
|  of| 3719|
| and| 3602|
| her| 2223|
+----+-----+
only showing top 5 rows



In [41]:
# Equivalent ordering expressed on a Column object: col("count").desc()
# replaces the `ascending=False` keyword used above.
(
    results
    .orderBy(col("count").desc())
    .show(5)
)

+----+-----+
|word|count|
+----+-----+
| the| 4496|
|  to| 4235|
|  of| 3719|
| and| 3602|
| her| 2223|
+----+-----+
only showing top 5 rows



### Writing data from a data frame


In [42]:
# Write the word counts as CSV. 'simple_count.csv' is created as a directory of
# part files, not a single file.
# mode="overwrite" keeps this cell re-runnable: with the default save mode Spark
# raises an error when the target path already exists (e.g. on Restart & Run All).
results.write.csv('simple_count.csv', mode="overwrite")

In [None]:
## Changing the number of partitions with DataFrame.coalesce():
## coalesce(3) reduces the DataFrame to at most three partitions before writing.
## mode="overwrite" keeps the cell re-runnable; the default save mode raises an
## error when the target path already exists.
results.coalesce(3).write.csv('simple_count_partition.csv', mode="overwrite")

#### all together

In [None]:
# Stand-alone version of the whole word-count pipeline, from reading the book
# to writing the counts.
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col,
    explode,
    lower,
    regexp_extract,
    split,  # was garbled as "sp" / "lit," across two lines, which is a syntax error
)

spark = SparkSession.builder.appName(
    "Analyzing the vocabulary of Pride and Prejudice."
).getOrCreate()

# Read the text file; the text is exposed through a column named `value`.
book = spark.read.text("./data/gutenberg_books/1342-0.txt")

# Split every line on single spaces into an array column named `line`.
lines = book.select(split(book.value, " ").alias("line"))

# One row per array element: each token becomes its own `word` row.
words = lines.select(explode(col("line")).alias("word"))

# Normalise case so "The" and "the" are counted together.
words_lower = words.select(lower(col("word")).alias("word"))

# Keep only the first match of [a-z']* in each token; the match can be the
# empty string, which is filtered out in the next step.
words_clean = words_lower.select(
    regexp_extract(col("word"), "[a-z']*", 0).alias("word")
)

# Drop the empty strings left behind by the extraction.
words_nonull = words_clean.where(col("word") != "")

# Count occurrences of each distinct word.
results = words_nonull.groupby(col("word")).count()

# Show the ten most frequent words.
results.orderBy("count", ascending=False).show(10)

# coalesce(1) collapses the result to one partition before writing.
# mode="overwrite" keeps the script re-runnable; the default save mode raises an
# error when the target path already exists.
results.coalesce(1).write.csv(
    "./simple_count_single_partition.csv", mode="overwrite"
)

### Simplifying PySpark function imports

In [None]:
## Since we usually use a lot of functions from pyspark.sql, instead of doing this:
from pyspark.sql.functions import col, explode, lower

## it is useful to import the whole module:
import pyspark.sql.functions as F

### Method Chaining

In [None]:
# Before: every transformation is stored in its own intermediate variable.
# NOTE(review): this half calls split/explode/lower/regexp_extract/col by bare name,
# so it relies on those having been imported in an earlier cell — confirm on a fresh kernel.
book = spark.read.text("./data/gutenberg_books/1342-0.txt")

# Split each line on single spaces into an array column `line`.
lines = book.select(split(book.value, " ").alias("line"))

# One row per array element.
words = lines.select(explode(col("line")).alias("word"))

# Lower-case every token.
words_lower = words.select(lower(col("word")).alias("word"))

# Keep the first match of [a-z']* in each token (possibly the empty string).
words_clean = words_lower.select(
    regexp_extract(col("word"), "[a-z']*", 0).alias("word")
)

# Remove the empty strings.
words_nonull = words_clean.where(col("word") != "")

# Count occurrences per word.
results = words_nonull.groupby("word").count()

# After: the same sequence of steps written as one method chain, using the
# module alias F instead of individually imported functions. `results` is
# reassigned here, replacing the value computed above.
# NOTE(review): this version reads `csv_path`, whereas the "Before" version reads a
# hard-coded relative path — confirm both are meant to point at the same file.
import pyspark.sql.functions as F

results = (
    spark.read.text(csv_path)
    .select(F.split(F.col("value"), " ").alias("line"))
    .select(F.explode(F.col("line")).alias("word"))
    .select(F.lower(F.col("word")).alias("word"))
    .select(F.regexp_extract(F.col("word"), "[a-z']*", 0).alias("word"))
    .where(F.col("word") != "")
    .groupby("word")
    .count()
)

### Reading Multiple Files

In [None]:
# The path is a glob pattern: `*` stands in for the file name, so every .txt file
# in the directory is matched by this single read call.
# The resulting DataFrame is not assigned to a variable here.
spark.read.text('./data/gutenberg_books/*.txt') ## note the glob pattern at *