In [53]:
import pandas as pd
from pyspark.sql import SparkSession

In [54]:
spark = SparkSession \
    .builder \
    .appName("Analyzing the vocabulary of Pride and Prejudice") \
    .getOrCreate()

Adjusting log-level

In [55]:
spark.sparkContext.setLogLevel('OFF')

## Ingest the data: spark.read

Main structures for storing data: RDD and data frame
 * RDD: "bag that you give orders to"
 * data frame: stricter thant RDD

In [56]:
pwd

'/home/jovyan'

In [57]:
csv_path = "/sparkdata/1342-0.txt" 

## Reading

In [119]:
book = spark.read.text(csv_path)

In [120]:
book

DataFrame[value: string]

In [121]:
book.printSchema()

root
 |-- value: string (nullable = true)



In [122]:
print(book.dtypes)

[('value', 'string')]


In [123]:
book.show(n=10, truncate=50)

+--------------------------------------------------+
|                                             value|
+--------------------------------------------------+
|The Project Gutenberg EBook of Pride and Prejud...|
|                                                  |
|This eBook is for the use of anyone anywhere at...|
|almost no restrictions whatsoever.  You may cop...|
|re-use it under the terms of the Project Gutenb...|
|    with this eBook or online at www.gutenberg.org|
|                                                  |
|                                                  |
|                        Title: Pride and Prejudice|
|                                                  |
+--------------------------------------------------+
only showing top 10 rows



# Simple column transformations

In [124]:
from pyspark.sql.functions import split

In [125]:
lines = book.select(split(book.value, " ").alias("line"))

lines.show(5)

+--------------------+
|                line|
+--------------------+
|[The, Project, Gu...|
|                  []|
|[This, eBook, is,...|
|[almost, no, rest...|
|[re-use, it, unde...|
+--------------------+
only showing top 5 rows



### select()

select one or more columns from df.

In [65]:
book.select(book.value)

DataFrame[value: string]

In [66]:
## different ways to select a column
from pyspark.sql.functions import col

book.select(book.value)
book.select(book['value'])
book.select(col('value'))
book.select('value')


DataFrame[value: string]

### Renaming columns

In [67]:

lines = book.select(split(book.value, " ").alias("line"))

lines = book.select(split(book.value, " "))
lines = lines.withColumnRenamed("split(value, , -1)", "line")

In [76]:
lines = lines.withColumnRenamed("split(value,  , -1)", "line")

In [77]:
lines.show()

+--------------------+
|                line|
+--------------------+
|[The, Project, Gu...|
|                  []|
|[This, eBook, is,...|
|[almost, no, rest...|
|[re-use, it, unde...|
|[with, this, eBoo...|
|                  []|
|                  []|
|[Title:, Pride, a...|
|                  []|
|[Author:, Jane, A...|
|                  []|
|[Posting, Date:, ...|
|[Release, Date:, ...|
|[Last, Updated:, ...|
|                  []|
|[Language:, English]|
|                  []|
|[Character, set, ...|
|                  []|
+--------------------+
only showing top 20 rows



### Explode

In [78]:
from pyspark.sql.functions import explode

In [79]:
words = lines.select(explode(col=('line')).alias("word"))

words.show(15)

+----------+
|      word|
+----------+
|       The|
|   Project|
| Gutenberg|
|     EBook|
|        of|
|     Pride|
|       and|
|Prejudice,|
|        by|
|      Jane|
|    Austen|
|          |
|      This|
|     eBook|
|        is|
+----------+
only showing top 15 rows



### Lower

In [80]:
from pyspark.sql.functions import lower

words_lower = words.select(lower(col('word')).alias("word_lower"))

words_lower.show(15)

+----------+
|word_lower|
+----------+
|       the|
|   project|
| gutenberg|
|     ebook|
|        of|
|     pride|
|       and|
|prejudice,|
|        by|
|      jane|
|    austen|
|          |
|      this|
|     ebook|
|        is|
+----------+
only showing top 15 rows



### RegExp functions

In [81]:
from pyspark.sql.functions import regexp_extract
words_clean = words_lower.select(
    regexp_extract(col("word_lower"), "[a-z]+",0).alias("word")
)

words_clean.show(15)


+---------+
|     word|
+---------+
|      the|
|  project|
|gutenberg|
|    ebook|
|       of|
|    pride|
|      and|
|prejudice|
|       by|
|     jane|
|   austen|
|         |
|     this|
|    ebook|
|       is|
+---------+
only showing top 15 rows



###  Filter

In [82]:
words_null = words_clean.filter(col("word")!='')

words_null.show()

+---------+
|     word|
+---------+
|      the|
|  project|
|gutenberg|
|    ebook|
|       of|
|    pride|
|      and|
|prejudice|
|       by|
|     jane|
|   austen|
|     this|
|    ebook|
|       is|
|      for|
|      the|
|      use|
|       of|
|   anyone|
| anywhere|
+---------+
only showing top 20 rows



### Writing file

In [128]:
words_null.write.csv('word_count.csv')

### Exercises

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

#### 2.2 
Given the following data frame, programmatically count the number of columns that
aren’t strings (answer = only one column isn’t a string).
createDataFrame() allows you to create a data frame from a variety of sources,
such as a pandas data frame or (in this case) a list of lists.

In [3]:
exo2_2_df = spark.createDataFrame(
[["test", "more test", 10_000_000_000]], ["one", "two", "three"]
)

In [17]:
len(exo2_2_df[[i[0] for i in exo2_2_df.dtypes if i[1] != 'string']].columns)

1

In [24]:
exo2_2_df.select(col('one'),col('two'))

DataFrame[one: string, two: string]

#### 2.3
Rewrite the following code snippet, removing the withColumnRenamed method. Which
version is clearer and easier to read?

In [32]:
from pyspark.sql.functions import col, length


In [None]:
from pyspark.sql.functions import col, length
# The `length` function returns the number of characters in a string column.
exo2_3_df = (
spark.read.text("./data/gutenberg_books/1342-0.txt")
.select(length(col("value")))
.withColumnRenamed("length(value)", "number_of_char")
)

In [37]:
exo2_3_df = (
spark.read.text(csv_path)
.select(length(col("value")).alias("number_of_char")))


In [38]:
exo2_3_df.show()

+--------------+
|number_of_char|
+--------------+
|            66|
|             0|
|            64|
|            68|
|            67|
|            46|
|             0|
|             0|
|            26|
|             0|
|            19|
|             0|
|            43|
|            24|
|            28|
|             0|
|            17|
|             0|
|            29|
|             0|
+--------------+
only showing top 20 rows



#### 2.4 
Assume a data frame exo2_4_df. The following code block gives an error. What is the
problem, and how can you solve it?

In [39]:
from pyspark.sql.functions import col, greatest
exo2_4_df = spark.createDataFrame(
[["key", 10_000, 20_000]], ["key", "value1", "value2"]
)
exo2_4_df.printSchema()

root
 |-- key: string (nullable = true)
 |-- value1: long (nullable = true)
 |-- value2: long (nullable = true)



In [42]:
# `greatest` will return the greatest value of the list of column names,
# skipping null value
# The following statement will return an error
from pyspark.sql.utils import AnalysisException
try:
    exo2_4_mod = exo2_4_df.select(
    greatest(col("value1"), col("value2")).alias("maximum_value")
    ).select("key", "max_value")
except AnalysisException as err:
    print(err)

[UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `key` cannot be resolved. Did you mean one of the following? [`maximum_value`].;
'Project ['key, 'max_value]
+- Project [greatest(value1#77L, value2#78L) AS maximum_value#82L]
   +- LogicalRDD [key#76, value1#77L, value2#78L], false



We are defining a dataframe which has the greatest as a single column, so there is no 'key' column to select from it:

In [48]:
exo2_4_df.select(
    greatest(col("value1"), col("value2")).alias("maximum_value")).show()

+-------------+
|maximum_value|
+-------------+
|        20000|
+-------------+



We need to select the rest of the columns with the greatest function:

In [49]:
exo2_4_mod = exo2_4_df.select(col('key'),
    greatest(col("value1"), col("value2")).alias("maximum_value")
    )

In [52]:
exo2_4_mod.show()

+---+-------------+
|key|maximum_value|
+---+-------------+
|key|        20000|
+---+-------------+



####  2.5
Let’s take our words_nonull data frame, available in the next listing. You can use the
code from the repository (code/Ch02/end_of_chapter.py) in your REPL to get the
data frame loaded.

a) Remove all of the occurrences of the word is.

b) (Challenge) Using the length function, keep only the words with more than three
characters.

In [90]:
from pyspark.sql.functions import filter, length

In [86]:
words_no_is = words_null.filter(col('word')!='is')

In [93]:
words_3 = words_null.filter(length(col('word'))<=3)

In [94]:
words_3.show()

+----+
|word|
+----+
| the|
|  of|
| and|
|  by|
|  is|
| for|
| the|
| use|
|  of|
|  at|
|  no|
| and|
|  no|
| you|
| may|
|  it|
|  it|
|  or|
|  re|
|  it|
+----+
only showing top 20 rows



#### 2.6
The where clause takes a Boolean expression over one or many columns to filter the
data frame. Beyond the usual Boolean operators (>, <, ==, <=, >=, !=), PySpark provides
other functions that return Boolean columns in the pyspark.sql.functions
module.
A good example is the isin() method (applied on a Column object, like
col(…).isin(…)), which takes a list of values as a parameter, and will return only the
records where the value in the column equals a member of the list.
Let’s say you want to remove the words is, not, the and if from your list of words,
using a single where() method on the words_nonull data frame. Write the code to
do so.

In [96]:
filtered_words = words_null.where(~col('word').isin('is','not','the','if'))

In [97]:
filtered_words.show()

+---------+
|     word|
+---------+
|  project|
|gutenberg|
|    ebook|
|       of|
|    pride|
|      and|
|prejudice|
|       by|
|     jane|
|   austen|
|     this|
|    ebook|
|      for|
|      use|
|       of|
|   anyone|
| anywhere|
|       at|
|       no|
|     cost|
+---------+
only showing top 20 rows



#### 2.7
One of your friends comes to you with the following code. They have no idea why it
doesn’t work. Can you diagnose the problem in the try block, explain why it is an
error, and provide a fix?

In [100]:
from pyspark.sql.functions import col, split
try:
    book = spark.read.text(csv_path)
    book = book.printSchema()
    lines = book.select(split(book.value, " ").alias("line"))
    words = lines.select(explode(col("line")).alias("word"))
except AnalysisException as err:
    print(err)

root
 |-- value: string (nullable = true)



AttributeError: 'NoneType' object has no attribute 'select'

The problem comes from assigning book as book.printSchema(), since printing returns nothing, book becomes None

In [126]:
book = spark.read.text(csv_path)
book.printSchema()
lines = book.select(split(book.value, " ").alias("line"))
words = lines.select(explode(col("line")).alias("word"))

root
 |-- value: string (nullable = true)



In [127]:
words.show()

+----------+
|      word|
+----------+
|       The|
|   Project|
| Gutenberg|
|     EBook|
|        of|
|     Pride|
|       and|
|Prejudice,|
|        by|
|      Jane|
|    Austen|
|          |
|      This|
|     eBook|
|        is|
|       for|
|       the|
|       use|
|        of|
|    anyone|
+----------+
only showing top 20 rows

