In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
plt.style.use('dark_background')

In [2]:
from pyspark import SparkContext
sc = SparkContext.getOrCreate()
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

23/05/02 19:01:38 WARN Utils: Your hostname, rig resolves to a loopback address: 127.0.1.1; using 192.168.0.102 instead (on interface enp6s0)
23/05/02 19:01:38 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/05/02 19:01:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Dataframe column operations

## Filtering column content with Python

In [3]:
from pyspark.sql.types import *
voter_schema = StructType([
  # Define a StructField for each field
  StructField('DATE', DateType(), False),
  StructField('TITLE', StringType(), False),
  StructField('VOTER_NAME', StringType(), False)
])
voter_df = spark.read.format('csv').options(Header=True).option("dateFormat", "mm/dd/yyyy")\
    .load('DallasCouncilVoters.csv.gz', schema=voter_schema)

In [4]:
voter_df.show()

+----------+-------------+-------------------+
|      DATE|        TITLE|         VOTER_NAME|
+----------+-------------+-------------------+
|2017-01-08|Councilmember|  Jennifer S. Gates|
|2017-01-08|Councilmember| Philip T. Kingston|
|2017-01-08|        Mayor|Michael S. Rawlings|
|2017-01-08|Councilmember|       Adam Medrano|
|2017-01-08|Councilmember|       Casey Thomas|
|2017-01-08|Councilmember|Carolyn King Arnold|
|2017-01-08|Councilmember|       Scott Griggs|
|2017-01-08|Councilmember|   B. Adam  McGough|
|2017-01-08|Councilmember|       Lee Kleinman|
|2017-01-08|Councilmember|      Sandy Greyson|
|2017-01-08|Councilmember|  Jennifer S. Gates|
|2017-01-08|Councilmember| Philip T. Kingston|
|2017-01-08|        Mayor|Michael S. Rawlings|
|2017-01-08|Councilmember|       Adam Medrano|
|2017-01-08|Councilmember|       Casey Thomas|
|2017-01-08|Councilmember|Carolyn King Arnold|
|2017-01-08|Councilmember| Rickey D. Callahan|
|2017-01-11|Councilmember|  Jennifer S. Gates|
|2018-01-25|C

In [5]:
from pyspark.sql import functions as F

In [6]:
# Show the distinct VOTER_NAME entries
voter_df.select('VOTER_NAME').distinct().show(40, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|VOTER_NAME                                                                                                                                                                                                                                                                                                                                                                                                                 |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [7]:
# Filter voter_df where the VOTER_NAME is 1-20 characters in length
voter_df = voter_df.filter('length(VOTER_NAME) > 0 and length(VOTER_NAME) < 20')
# Filter out voter_df where the VOTER_NAME contains an underscore
voter_df = voter_df.filter(~ F.col('VOTER_NAME').contains('_'))
# or voter_df = voter_df.filter(~ voter_df.VOTER_NAME.contains('_'))
# Show the distinct VOTER_NAME entries again
voter_df.select('VOTER_NAME').distinct().show(40, truncate=False)

+-------------------+
|VOTER_NAME         |
+-------------------+
|Tennell Atkins     |
|Scott Griggs       |
|Scott  Griggs      |
|Sandy Greyson      |
|Michael S. Rawlings|
|Kevin Felder       |
|Adam Medrano       |
|Casey  Thomas      |
|Mark  Clayton      |
|Casey Thomas       |
|Sandy  Greyson     |
|Mark Clayton       |
|Jennifer S.  Gates |
|Tiffinni A. Young  |
|B. Adam  McGough   |
|Omar Narvaez       |
|Philip T. Kingston |
|Rickey D. Callahan |
|Dwaine R. Caraway  |
|Philip T.  Kingston|
|Jennifer S. Gates  |
|Lee M. Kleinman    |
|Monica R. Alonzo   |
|Rickey D.  Callahan|
|Carolyn King Arnold|
|Erik Wilson        |
|Lee Kleinman       |
+-------------------+



## Returning only names without nullls

In [8]:
voter_df.filter(voter_df.VOTER_NAME.isNotNull()).take(1)

[Row(DATE=datetime.date(2017, 1, 8), TITLE='Councilmember', VOTER_NAME='Jennifer S. Gates')]

In [9]:
voter_df.filter(~F.col('VOTER_NAME').isNull()).take(1)

[Row(DATE=datetime.date(2017, 1, 8), TITLE='Councilmember', VOTER_NAME='Jennifer S. Gates')]

In [10]:
voter_df.filter(F.col('VOTER_NAME').isNotNull()).take(1)

[Row(DATE=datetime.date(2017, 1, 8), TITLE='Councilmember', VOTER_NAME='Jennifer S. Gates')]

## Returning rows before 2018

In [11]:
voter_df.filter('DATE < "2018"').show()

+----------+-------------+-------------------+
|      DATE|        TITLE|         VOTER_NAME|
+----------+-------------+-------------------+
|2017-01-08|Councilmember|  Jennifer S. Gates|
|2017-01-08|Councilmember| Philip T. Kingston|
|2017-01-08|        Mayor|Michael S. Rawlings|
|2017-01-08|Councilmember|       Adam Medrano|
|2017-01-08|Councilmember|       Casey Thomas|
|2017-01-08|Councilmember|Carolyn King Arnold|
|2017-01-08|Councilmember|       Scott Griggs|
|2017-01-08|Councilmember|   B. Adam  McGough|
|2017-01-08|Councilmember|       Lee Kleinman|
|2017-01-08|Councilmember|      Sandy Greyson|
|2017-01-08|Councilmember|  Jennifer S. Gates|
|2017-01-08|Councilmember| Philip T. Kingston|
|2017-01-08|        Mayor|Michael S. Rawlings|
|2017-01-08|Councilmember|       Adam Medrano|
|2017-01-08|Councilmember|       Casey Thomas|
|2017-01-08|Councilmember|Carolyn King Arnold|
|2017-01-08|Councilmember| Rickey D. Callahan|
|2017-01-11|Councilmember|  Jennifer S. Gates|
|2017-01-04|C

## Modifying DataFrame columns

In [12]:
# Add a new column called splits separated on whitespace
voter_df = voter_df.withColumn('splits', F.split(voter_df.VOTER_NAME, '\s+'))
# Create a new column called first_name based on the first item in splits
voter_df = voter_df.withColumn('first_name', voter_df.splits.getItem(0))
# Get the last entry of the splits list and create a column called last_name
# voter_df = voter_df.withColumn('last_name', voter_df.splits.getItem(F.size('splits') - 1))
voter_df = voter_df.withColumn('last_name', F.element_at(F.col('splits'), -1))
# Drop the splits column
voter_df = voter_df.drop('splits')
# Show the voter_df DataFrame
voter_df.show()

+----------+-------------+-------------------+----------+---------+
|      DATE|        TITLE|         VOTER_NAME|first_name|last_name|
+----------+-------------+-------------------+----------+---------+
|2017-01-08|Councilmember|  Jennifer S. Gates|  Jennifer|    Gates|
|2017-01-08|Councilmember| Philip T. Kingston|    Philip| Kingston|
|2017-01-08|        Mayor|Michael S. Rawlings|   Michael| Rawlings|
|2017-01-08|Councilmember|       Adam Medrano|      Adam|  Medrano|
|2017-01-08|Councilmember|       Casey Thomas|     Casey|   Thomas|
|2017-01-08|Councilmember|Carolyn King Arnold|   Carolyn|   Arnold|
|2017-01-08|Councilmember|       Scott Griggs|     Scott|   Griggs|
|2017-01-08|Councilmember|   B. Adam  McGough|        B.|  McGough|
|2017-01-08|Councilmember|       Lee Kleinman|       Lee| Kleinman|
|2017-01-08|Councilmember|      Sandy Greyson|     Sandy|  Greyson|
|2017-01-08|Councilmember|  Jennifer S. Gates|  Jennifer|    Gates|
|2017-01-08|Councilmember| Philip T. Kingston|  

# Conditional DataFrame column operations

## Conditional example

In [13]:
voter_df.select('DATE', F.when(F.col('DATE')>'2018', 'latest').alias('tip')).show()

+----------+------+
|      DATE|   tip|
+----------+------+
|2017-01-08|  null|
|2017-01-08|  null|
|2017-01-08|  null|
|2017-01-08|  null|
|2017-01-08|  null|
|2017-01-08|  null|
|2017-01-08|  null|
|2017-01-08|  null|
|2017-01-08|  null|
|2017-01-08|  null|
|2017-01-08|  null|
|2017-01-08|  null|
|2017-01-08|  null|
|2017-01-08|  null|
|2017-01-08|  null|
|2017-01-08|  null|
|2017-01-08|  null|
|2017-01-11|  null|
|2018-01-25|latest|
|2018-01-25|latest|
+----------+------+
only showing top 20 rows



In [14]:
voter_df.select('DATE',
                F.when(F.col('DATE')>'2018', 'latest')\
                .when(F.col('DATE')<'2018', 'older').alias('tip')
               ).show()

+----------+------+
|      DATE|   tip|
+----------+------+
|2017-01-08| older|
|2017-01-08| older|
|2017-01-08| older|
|2017-01-08| older|
|2017-01-08| older|
|2017-01-08| older|
|2017-01-08| older|
|2017-01-08| older|
|2017-01-08| older|
|2017-01-08| older|
|2017-01-08| older|
|2017-01-08| older|
|2017-01-08| older|
|2017-01-08| older|
|2017-01-08| older|
|2017-01-08| older|
|2017-01-08| older|
|2017-01-11| older|
|2018-01-25|latest|
|2018-01-25|latest|
+----------+------+
only showing top 20 rows



In [15]:
voter_df.select('DATE',
                F.when(F.col('DATE')>'2018', 'latest')\
                .otherwise('older').alias('tip')
               ).show()

+----------+------+
|      DATE|   tip|
+----------+------+
|2017-01-08| older|
|2017-01-08| older|
|2017-01-08| older|
|2017-01-08| older|
|2017-01-08| older|
|2017-01-08| older|
|2017-01-08| older|
|2017-01-08| older|
|2017-01-08| older|
|2017-01-08| older|
|2017-01-08| older|
|2017-01-08| older|
|2017-01-08| older|
|2017-01-08| older|
|2017-01-08| older|
|2017-01-08| older|
|2017-01-08| older|
|2017-01-11| older|
|2018-01-25|latest|
|2018-01-25|latest|
+----------+------+
only showing top 20 rows



## when() example

In [16]:
# Add a column to voter_df for any voter with the title **Councilmember**
voter_df = voter_df.withColumn('random_val',
                               F.when(voter_df.TITLE=='Councilmember', F.rand(seed=1)))
# voter_df = voter_df.withColumn('random_val',
#                                F.when(F.col('TITLE')=='Councilmember', F.rand(seed=1)))
# Show some of the DataFrame rows, noting whether the when clause worked
voter_df.show()

+----------+-------------+-------------------+----------+---------+-------------------+
|      DATE|        TITLE|         VOTER_NAME|first_name|last_name|         random_val|
+----------+-------------+-------------------+----------+---------+-------------------+
|2017-01-08|Councilmember|  Jennifer S. Gates|  Jennifer|    Gates| 0.6363787615254752|
|2017-01-08|Councilmember| Philip T. Kingston|    Philip| Kingston| 0.5993846534021868|
|2017-01-08|        Mayor|Michael S. Rawlings|   Michael| Rawlings|               null|
|2017-01-08|Councilmember|       Adam Medrano|      Adam|  Medrano|  0.134842710012538|
|2017-01-08|Councilmember|       Casey Thomas|     Casey|   Thomas|0.07684163905460906|
|2017-01-08|Councilmember|Carolyn King Arnold|   Carolyn|   Arnold| 0.8539211111755448|
|2017-01-08|Councilmember|       Scott Griggs|     Scott|   Griggs| 0.7167704217972344|
|2017-01-08|Councilmember|   B. Adam  McGough|        B.|  McGough|0.24739024075979754|
|2017-01-08|Councilmember|      

## When / Otherwise

In [17]:
# Add a column to voter_df for a voter based on their position
voter_df = voter_df.withColumn('random_val',
                               F.when(voter_df.TITLE == 'Councilmember', F.rand())
                               .when(voter_df.TITLE == 'Mayor', 2)
                               .otherwise(0))

# Show some of the DataFrame rows
voter_df.show()

# Use the .filter() clause with random_val
voter_df.filter(voter_df.random_val == 0).show()

+----------+-------------+-------------------+----------+---------+-------------------+
|      DATE|        TITLE|         VOTER_NAME|first_name|last_name|         random_val|
+----------+-------------+-------------------+----------+---------+-------------------+
|2017-01-08|Councilmember|  Jennifer S. Gates|  Jennifer|    Gates|0.01756797668845078|
|2017-01-08|Councilmember| Philip T. Kingston|    Philip| Kingston| 0.8928190978260016|
|2017-01-08|        Mayor|Michael S. Rawlings|   Michael| Rawlings|                2.0|
|2017-01-08|Councilmember|       Adam Medrano|      Adam|  Medrano|0.30704684751189415|
|2017-01-08|Councilmember|       Casey Thomas|     Casey|   Thomas| 0.4096070682426126|
|2017-01-08|Councilmember|Carolyn King Arnold|   Carolyn|   Arnold|  0.163249579156727|
|2017-01-08|Councilmember|       Scott Griggs|     Scott|   Griggs|0.46390769378598706|
|2017-01-08|Councilmember|   B. Adam  McGough|        B.|  McGough| 0.1515774617769755|
|2017-01-08|Councilmember|      

# User defined functions

> The return type from a UDF can be any defined type, even a full `StructType()` schema object.

## Reverse string UDF

In [18]:
from pyspark.sql.functions import udf

#Define a Python method
def reverseString(mystr):
    return mystr[::-1]

# Wrap the function and store as a variable
udfReverseString = udf(reverseString, StringType())

#Use with Spark
voter_df.withColumn('ReverseName', udfReverseString(voter_df.VOTER_NAME)).show()

+----------+-------------+-------------------+----------+---------+-------------------+-------------------+
|      DATE|        TITLE|         VOTER_NAME|first_name|last_name|         random_val|        ReverseName|
+----------+-------------+-------------------+----------+---------+-------------------+-------------------+
|2017-01-08|Councilmember|  Jennifer S. Gates|  Jennifer|    Gates|0.01756797668845078|  setaG .S refinneJ|
|2017-01-08|Councilmember| Philip T. Kingston|    Philip| Kingston| 0.8928190978260016| notsgniK .T pilihP|
|2017-01-08|        Mayor|Michael S. Rawlings|   Michael| Rawlings|                2.0|sgnilwaR .S leahciM|
|2017-01-08|Councilmember|       Adam Medrano|      Adam|  Medrano|0.30704684751189415|       onardeM madA|
|2017-01-08|Councilmember|       Casey Thomas|     Casey|   Thomas| 0.4096070682426126|       samohT yesaC|
|2017-01-08|Councilmember|Carolyn King Arnold|   Carolyn|   Arnold|  0.163249579156727|dlonrA gniK nyloraC|
|2017-01-08|Councilmember|  

## Argument-less example

In [19]:
def sortingCap():
    # needs str() because spark doesn't understand numpy.str_ returned by choice
    return str(np.random.choice(['G', 'H', 'R', 'S']))
udfSortingCap = udf(sortingCap, StringType())
voter_df.withColumn('Class', udfSortingCap()).show()

+----------+-------------+-------------------+----------+---------+-------------------+-----+
|      DATE|        TITLE|         VOTER_NAME|first_name|last_name|         random_val|Class|
+----------+-------------+-------------------+----------+---------+-------------------+-----+
|2017-01-08|Councilmember|  Jennifer S. Gates|  Jennifer|    Gates|0.01756797668845078|    S|
|2017-01-08|Councilmember| Philip T. Kingston|    Philip| Kingston| 0.8928190978260016|    H|
|2017-01-08|        Mayor|Michael S. Rawlings|   Michael| Rawlings|                2.0|    G|
|2017-01-08|Councilmember|       Adam Medrano|      Adam|  Medrano|0.30704684751189415|    G|
|2017-01-08|Councilmember|       Casey Thomas|     Casey|   Thomas| 0.4096070682426126|    S|
|2017-01-08|Councilmember|Carolyn King Arnold|   Carolyn|   Arnold|  0.163249579156727|    S|
|2017-01-08|Councilmember|       Scott Griggs|     Scott|   Griggs|0.46390769378598706|    H|
|2017-01-08|Councilmember|   B. Adam  McGough|        B.|  M

## Using user defined functions in Spark 

In [20]:
def getFirstAndMiddle(names):
  # Return a space separated string of names
  return ' '.join(names.split()[:-1])

# Define the method as a UDF
udfFirstAndMiddle = F.udf(getFirstAndMiddle, StringType())

# Create a new column using your UDF
voter_df = voter_df.withColumn('first_and_middle_name', udfFirstAndMiddle(voter_df.VOTER_NAME))

# Show the DataFrame
voter_df.show()

+----------+-------------+-------------------+----------+---------+-------------------+---------------------+
|      DATE|        TITLE|         VOTER_NAME|first_name|last_name|         random_val|first_and_middle_name|
+----------+-------------+-------------------+----------+---------+-------------------+---------------------+
|2017-01-08|Councilmember|  Jennifer S. Gates|  Jennifer|    Gates|0.01756797668845078|          Jennifer S.|
|2017-01-08|Councilmember| Philip T. Kingston|    Philip| Kingston| 0.8928190978260016|            Philip T.|
|2017-01-08|        Mayor|Michael S. Rawlings|   Michael| Rawlings|                2.0|           Michael S.|
|2017-01-08|Councilmember|       Adam Medrano|      Adam|  Medrano|0.30704684751189415|                 Adam|
|2017-01-08|Councilmember|       Casey Thomas|     Casey|   Thomas| 0.4096070682426126|                Casey|
|2017-01-08|Councilmember|Carolyn King Arnold|   Carolyn|   Arnold|  0.163249579156727|         Carolyn King|
|2017-01-0

# Partitioning and lazy processing

## Partitioning
* DataFrames are broken up into partitions
* Partition size can vary
* Each partition is handled independently

> This is part of what provides the performance levels and horizontal scaling ability in Spark.  
If a Spark node doesn't need to compete for resources, nor consult with other Spark nodes for answers, it can reliably schedule the processing for the best performance. 

## Lazy processing
* Transformations are lazy
    * .withColumn(...)
    * .select(...)
* Nothing is actually done until an action is performed
    * .count()
    * .write(...)
* Transformations can be re-ordered for best performance
* Sometimes causes unexpected behavior

## Adding IDs
Normal ID fields:
* Common in relational databases
* Most usually an integer increasing, sequential and unique
* Not very parallel

> The problem with these IDs is they're not very parallel in nature. Given that the values are given out sequentially, if there are multiple workers, they must all refer to a common source for the next entry. This is OK in a single server environment, but in a distributed platform such as Spark, it creates some undue bottlenecks. Let's take a look at how to generate ID's in Spark.

## Monotonically increasing IDs
pyspark.sql.functions.monotonically_increasing_id()
* Integer (64-bit), increases in value, unique
* Not necessarily sequential (gaps exist)
* Completely parallel

> Unlike a normal relational ID, Spark's is completely parallel - each partition is allocated up to 8 billion IDs that can be assigned. Notice that the ID fields in the sample table are integers, increasing in value, but are not sequential. It's a little out scope, but the IDs are a 64-bit number effectively split into groups based on the Spark partition. Each group contains 8.4 billion IDs, and there are 2.1 billion possible groups, none of which overlap. 

## Notes
Remember, Spark is lazy!
* Occasionally out of order
* If performing a join, ID may be assigned after a join
* Test your transformations

## Adding an ID Field

In [21]:
# Select all the unique council voters
voter_df = voter_df.select(F.col("VOTER_NAME")).distinct()

# Count the rows in voter_df
print("\nThere are %d rows in the voter_df DataFrame.\n" % voter_df.count())

# Add a ROW_ID
voter_df = voter_df.withColumn('ROW_ID', F.monotonically_increasing_id())

# Show the rows with 10 highest IDs in the set
voter_df.orderBy(voter_df.ROW_ID.desc()).show(10)


There are 27 rows in the voter_df DataFrame.

+-------------------+------+
|         VOTER_NAME|ROW_ID|
+-------------------+------+
|       Lee Kleinman|    26|
|        Erik Wilson|    25|
|Carolyn King Arnold|    24|
|Rickey D.  Callahan|    23|
|   Monica R. Alonzo|    22|
|    Lee M. Kleinman|    21|
|  Jennifer S. Gates|    20|
|Philip T.  Kingston|    19|
|  Dwaine R. Caraway|    18|
| Rickey D. Callahan|    17|
+-------------------+------+
only showing top 10 rows



## IDs with different partitions

In [22]:
voter_df = spark.read.format('csv').options(Header=True).option("dateFormat", "mm/dd/yyyy")\
    .load('DallasCouncilVoters.csv.gz', schema=voter_schema)

In [23]:
# Print the number of partitions in each DataFrame
print("\nThere are %d partitions in the voter_df DataFrame.\n" % voter_df.rdd.getNumPartitions())

# Add a ROW_ID field to each DataFrame
voter_df = voter_df.withColumn('ROW_ID', F.monotonically_increasing_id())

# Show the top 10 IDs in each DataFrame 
voter_df.orderBy(F.col('ROW_ID').desc()).show(10)


There are 1 partitions in the voter_df DataFrame.

+----------+--------------------+-------------------+------+
|      DATE|               TITLE|         VOTER_NAME|ROW_ID|
+----------+--------------------+-------------------+------+
|2018-01-20|       Councilmember|      Mark  Clayton| 44624|
|2018-01-20|       Councilmember|     Tennell Atkins| 44623|
|2018-01-20|       Councilmember|       Kevin Felder| 44622|
|2018-01-20|       Councilmember|       Omar Narvaez| 44621|
|2018-01-20|       Councilmember|Rickey D.  Callahan| 44620|
|2018-01-20|              Vacant|               null| 44619|
|2018-01-20|       Mayor Pro Tem|      Casey  Thomas| 44618|
|2018-01-20|Deputy Mayor Pro Tem|       Adam Medrano| 44617|
|2018-01-20|               Mayor|Michael S. Rawlings| 44616|
|2018-01-20|       Councilmember|Philip T.  Kingston| 44615|
+----------+--------------------+-------------------+------+
only showing top 10 rows



## More ID tricks

In [24]:
# Determine the highest ROW_ID and save it in previous_max_ID
previous_max_ID = voter_df.select('ROW_ID').rdd.max()[0]

# Add a ROW_ID column to voter_df_april starting at the desired value
voter_df_new = voter_df.withColumn('ROW_ID', F.monotonically_increasing_id() + previous_max_ID)

# Show the ROW_ID from both DataFrames and compare
voter_df.select('ROW_ID').show()
voter_df_new.select('ROW_ID').show()

+------+
|ROW_ID|
+------+
|     0|
|     1|
|     2|
|     3|
|     4|
|     5|
|     6|
|     7|
|     8|
|     9|
|    10|
|    11|
|    12|
|    13|
|    14|
|    15|
|    16|
|    17|
|    18|
|    19|
+------+
only showing top 20 rows

+------+
|ROW_ID|
+------+
| 44624|
| 44625|
| 44626|
| 44627|
| 44628|
| 44629|
| 44630|
| 44631|
| 44632|
| 44633|
| 44634|
| 44635|
| 44636|
| 44637|
| 44638|
| 44639|
| 44640|
| 44641|
| 44642|
| 44643|
+------+
only showing top 20 rows



It's easy to forget that the output of a Spark method can often be modified before being assigned. This provides a lot of power and flexibility, especially when trying to migrate tasks from various technologies. Consider how you could use everything we've learned in this chapter to create a combination ID containing a name, a new ID, and perhaps a conditional value.