In [1]:
import pyspark

In [2]:
import pandas as pd
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder.getOrCreate()

In [4]:
df = spark.read.csv('./example_data.csv', header=True)

In [7]:
# remember not to pull a .show() at the end of this line.  .show() tells lazy spark to perform the task, but it outputs to the console.  
# assigning it back into df would result in df being a nonetype.
df = df.select(df.n.cast('float'), df.g)

In [8]:
type(df)

pyspark.sql.dataframe.DataFrame

In [11]:
(df\
      .withColumnRenamed('n', 'number')\
     .withColumnRenamed('g', 'group')).show()

+------+-----+
|number|group|
+------+-----+
|  15.0|    b|
|  23.0|    c|
|   6.0|    c|
|   NaN|    c|
|  26.0|    b|
|  12.0|    b|
|   8.0|    a|
|  18.0|    c|
|  14.0|    a|
|  20.0|    c|
|  22.0|    a|
|  21.0|    a|
|   1.0|    c|
|   0.0|    a|
|  17.0|    b|
|   2.0|    a|
|   7.0|    a|
|  16.0|    b|
|  24.0|    b|
|  10.0|    a|
+------+-----+
only showing top 20 rows



In [12]:
df = (df\
      .withColumnRenamed('n', 'number')\
     .withColumnRenamed('g', 'group'))

In [13]:
# performing a transformation on a column and broadcasting it as a new column:
df.withColumn('n_is_even', df.number % 2 == 0).show()

+------+-----+---------+
|number|group|n_is_even|
+------+-----+---------+
|  15.0|    b|    false|
|  23.0|    c|    false|
|   6.0|    c|     true|
|   NaN|    c|    false|
|  26.0|    b|     true|
|  12.0|    b|     true|
|   8.0|    a|     true|
|  18.0|    c|     true|
|  14.0|    a|     true|
|  20.0|    c|     true|
|  22.0|    a|     true|
|  21.0|    a|    false|
|   1.0|    c|    false|
|   0.0|    a|     true|
|  17.0|    b|    false|
|   2.0|    a|     true|
|   7.0|    a|    false|
|  16.0|    b|     true|
|  24.0|    b|     true|
|  10.0|    a|     true|
+------+-----+---------+
only showing top 20 rows



In [14]:
# alternatively:
from pyspark.sql.functions import col, expr
df.withColumn('n_is_even', col('number') % 2 == 0).show()

+------+-----+---------+
|number|group|n_is_even|
+------+-----+---------+
|  15.0|    b|    false|
|  23.0|    c|    false|
|   6.0|    c|     true|
|   NaN|    c|    false|
|  26.0|    b|     true|
|  12.0|    b|     true|
|   8.0|    a|     true|
|  18.0|    c|     true|
|  14.0|    a|     true|
|  20.0|    c|     true|
|  22.0|    a|     true|
|  21.0|    a|    false|
|   1.0|    c|    false|
|   0.0|    a|     true|
|  17.0|    b|    false|
|   2.0|    a|     true|
|   7.0|    a|    false|
|  16.0|    b|     true|
|  24.0|    b|     true|
|  10.0|    a|     true|
+------+-----+---------+
only showing top 20 rows



In [15]:
# doing it by using a string literal:
df.selectExpr('number + 1', 'number % 2 = 0').show()

+------------+------------------+
|(number + 1)|((number % 2) = 0)|
+------------+------------------+
|        16.0|             false|
|        24.0|             false|
|         7.0|              true|
|         NaN|             false|
|        27.0|              true|
|        13.0|              true|
|         9.0|              true|
|        19.0|              true|
|        15.0|              true|
|        21.0|              true|
|        23.0|              true|
|        22.0|             false|
|         2.0|             false|
|         1.0|              true|
|        18.0|             false|
|         3.0|              true|
|         8.0|             false|
|        17.0|              true|
|        25.0|              true|
|        11.0|              true|
+------------+------------------+
only showing top 20 rows



In [16]:
# another tactic for column creation:
df.selectExpr('*', 'number % 2 = 0 as number_is_even').show()

+------+-----+--------------+
|number|group|number_is_even|
+------+-----+--------------+
|  15.0|    b|         false|
|  23.0|    c|         false|
|   6.0|    c|          true|
|   NaN|    c|         false|
|  26.0|    b|          true|
|  12.0|    b|          true|
|   8.0|    a|          true|
|  18.0|    c|          true|
|  14.0|    a|          true|
|  20.0|    c|          true|
|  22.0|    a|          true|
|  21.0|    a|         false|
|   1.0|    c|         false|
|   0.0|    a|          true|
|  17.0|    b|         false|
|   2.0|    a|          true|
|   7.0|    a|         false|
|  16.0|    b|          true|
|  24.0|    b|          true|
|  10.0|    a|          true|
+------+-----+--------------+
only showing top 20 rows



In [17]:
# NOTE: remember, spark dataframes are immutable.
# you cannot make a new column as in pandas like df['new_col'] and broadcast a new column that way
# because you cannot merely adapt your current dataframe.
# you can only perform transformations and output a new dataframe.

In [18]:
# drop a column: 
f.drop('group').show()

+------+
|number|
+------+
|  15.0|
|  23.0|
|   6.0|
|   NaN|
|  26.0|
|  12.0|
|   8.0|
|  18.0|
|  14.0|
|  20.0|
|  22.0|
|  21.0|
|   1.0|
|   0.0|
|  17.0|
|   2.0|
|   7.0|
|  16.0|
|  24.0|
|  10.0|
+------+
only showing top 20 rows



In [19]:
# where clause:
df.where(df.number < 10).show()

+------+-----+
|number|group|
+------+-----+
|   6.0|    c|
|   8.0|    a|
|   1.0|    c|
|   0.0|    a|
|   2.0|    a|
|   7.0|    a|
|   4.0|    c|
|   5.0|    a|
+------+-----+



In [20]:
# alternatively:
df.where('number < 10').show()

+------+-----+
|number|group|
+------+-----+
|   6.0|    c|
|   8.0|    a|
|   1.0|    c|
|   0.0|    a|
|   2.0|    a|
|   7.0|    a|
|   4.0|    c|
|   5.0|    a|
+------+-----+



In [21]:
df.where('number < 10').where(col('number') > 4).show()

+------+-----+
|number|group|
+------+-----+
|   6.0|    c|
|   8.0|    a|
|   7.0|    a|
|   5.0|    a|
+------+-----+



In [23]:
df.where(df.number.between(4, 10)).show()

+------+-----+
|number|group|
+------+-----+
|   6.0|    c|
|   8.0|    a|
|   7.0|    a|
|  10.0|    a|
|   4.0|    c|
|   5.0|    a|
+------+-----+



In [24]:
# filter method:
df.filter(df.group == 'c').filter(df.number > 10).show()

+------+-----+
|number|group|
+------+-----+
|  23.0|    c|
|   NaN|    c|
|  18.0|    c|
|  20.0|    c|
|  27.0|    c|
|  19.0|    c|
|  29.0|    c|
+------+-----+



In [26]:
# take a sample of your dataframe:
df.sample(0.5).show()

# we will elaborate on this later to do a train/test split with your data

+------+-----+
|number|group|
+------+-----+
|  23.0|    c|
|   6.0|    c|
|  12.0|    b|
|  18.0|    c|
|  14.0|    a|
|  20.0|    c|
|  22.0|    a|
|  17.0|    b|
|   7.0|    a|
|  27.0|    c|
|  19.0|    c|
|  13.0|    b|
+------+-----+



In [27]:
# lets move on to some potentially more expensive operations.
# so far everything we have done has been something can be done row by row.  
# Let's attempt some operations that require full breadth of the data

df.orderBy(df.group.desc(), df.number).show()

+------+-----+
|number|group|
+------+-----+
|   1.0|    c|
|   4.0|    c|
|   6.0|    c|
|  18.0|    c|
|  19.0|    c|
|  20.0|    c|
|  23.0|    c|
|  27.0|    c|
|  29.0|    c|
|   NaN|    c|
|  12.0|    b|
|  13.0|    b|
|  15.0|    b|
|  16.0|    b|
|  17.0|    b|
|  24.0|    b|
|  25.0|    b|
|  26.0|    b|
|   0.0|    a|
|   2.0|    a|
+------+-----+
only showing top 20 rows



In [30]:
from pyspark.sql.functions import desc

df.orderBy(df.group.desc(), desc('number')).show()

+------+-----+
|number|group|
+------+-----+
|   NaN|    c|
|  29.0|    c|
|  27.0|    c|
|  23.0|    c|
|  20.0|    c|
|  19.0|    c|
|  18.0|    c|
|   6.0|    c|
|   4.0|    c|
|   1.0|    c|
|  26.0|    b|
|  25.0|    b|
|  24.0|    b|
|  17.0|    b|
|  16.0|    b|
|  15.0|    b|
|  13.0|    b|
|  12.0|    b|
|   NaN|    a|
|  28.0|    a|
+------+-----+
only showing top 20 rows



In [37]:
# This doesn't seem to be working at the moment here, but there are ways to arrange the nulls in spark.
df.orderBy(df.group.desc(), df.number.asc_nulls_last()).show()

+------+-----+
|number|group|
+------+-----+
|   1.0|    c|
|   4.0|    c|
|   6.0|    c|
|  18.0|    c|
|  19.0|    c|
|  20.0|    c|
|  23.0|    c|
|  27.0|    c|
|  29.0|    c|
|   NaN|    c|
|  12.0|    b|
|  13.0|    b|
|  15.0|    b|
|  16.0|    b|
|  17.0|    b|
|  24.0|    b|
|  25.0|    b|
|  26.0|    b|
|   0.0|    a|
|   2.0|    a|
+------+-----+
only showing top 20 rows



In [None]:
# missed this one, check back later. df.sort('number')

In [38]:
df.replace('a', 'D').show()

+------+-----+
|number|group|
+------+-----+
|  15.0|    b|
|  23.0|    c|
|   6.0|    c|
|   NaN|    c|
|  26.0|    b|
|  12.0|    b|
|   8.0|    D|
|  18.0|    c|
|  14.0|    D|
|  20.0|    c|
|  22.0|    D|
|  21.0|    D|
|   1.0|    c|
|   0.0|    D|
|  17.0|    b|
|   2.0|    D|
|   7.0|    D|
|  16.0|    b|
|  24.0|    b|
|  10.0|    D|
+------+-----+
only showing top 20 rows



In [40]:
# replacements need to be of the same type to be performed.
df.replace(['a', 'b'], ['A', 'B'], ['group']).show()

+------+-----+
|number|group|
+------+-----+
|  15.0|    B|
|  23.0|    c|
|   6.0|    c|
|   NaN|    c|
|  26.0|    B|
|  12.0|    B|
|   8.0|    A|
|  18.0|    c|
|  14.0|    A|
|  20.0|    c|
|  22.0|    A|
|  21.0|    A|
|   1.0|    c|
|   0.0|    A|
|  17.0|    B|
|   2.0|    A|
|   7.0|    A|
|  16.0|    B|
|  24.0|    B|
|  10.0|    A|
+------+-----+
only showing top 20 rows



In [41]:
df.na.drop().show()

+------+-----+
|number|group|
+------+-----+
|  15.0|    b|
|  23.0|    c|
|   6.0|    c|
|  26.0|    b|
|  12.0|    b|
|   8.0|    a|
|  18.0|    c|
|  14.0|    a|
|  20.0|    c|
|  22.0|    a|
|  21.0|    a|
|   1.0|    c|
|   0.0|    a|
|  17.0|    b|
|   2.0|    a|
|   7.0|    a|
|  16.0|    b|
|  24.0|    b|
|  10.0|    a|
|   4.0|    c|
+------+-----+
only showing top 20 rows



In [42]:
df.count(), df.na.drop().count()

(30, 28)

In [43]:
df.na.fill(0).show()

+------+-----+
|number|group|
+------+-----+
|  15.0|    b|
|  23.0|    c|
|   6.0|    c|
|   0.0|    c|
|  26.0|    b|
|  12.0|    b|
|   8.0|    a|
|  18.0|    c|
|  14.0|    a|
|  20.0|    c|
|  22.0|    a|
|  21.0|    a|
|   1.0|    c|
|   0.0|    a|
|  17.0|    b|
|   2.0|    a|
|   7.0|    a|
|  16.0|    b|
|  24.0|    b|
|  10.0|    a|
+------+-----+
only showing top 20 rows



In [44]:
df.na.fill(0, ['number']).show()

+------+-----+
|number|group|
+------+-----+
|  15.0|    b|
|  23.0|    c|
|   6.0|    c|
|   0.0|    c|
|  26.0|    b|
|  12.0|    b|
|   8.0|    a|
|  18.0|    c|
|  14.0|    a|
|  20.0|    c|
|  22.0|    a|
|  21.0|    a|
|   1.0|    c|
|   0.0|    a|
|  17.0|    b|
|   2.0|    a|
|   7.0|    a|
|  16.0|    b|
|  24.0|    b|
|  10.0|    a|
+------+-----+
only showing top 20 rows



In [47]:
# fill the nas with the average value of the column:

row = df.na.drop().agg(expr('avg(number)')).first()
the_average = row['avg(number)']
type(the_average)

float

In [48]:
df.na.fill(the_average, ['number']).sort('number').show()

+---------+-----+
|   number|group|
+---------+-----+
|      0.0|    a|
|      1.0|    c|
|      2.0|    a|
|      4.0|    c|
|      5.0|    a|
|      6.0|    c|
|      7.0|    a|
|      8.0|    a|
|     10.0|    a|
|     11.0|    a|
|     12.0|    b|
|     13.0|    b|
|     14.0|    a|
|     15.0|    b|
|15.107142|    c|
|15.107142|    a|
|     16.0|    b|
|     17.0|    b|
|     18.0|    c|
|     19.0|    c|
+---------+-----+
only showing top 20 rows



In [50]:
for row in df.head(5):
    print('type(row): {}'.format(type(row)))
    print('type(row.group): {}'.format(type(row.group)))
    print('row.group: {}'.format(row.group))
    print('-------')

type(row): <class 'pyspark.sql.types.Row'>
type(row.group): <class 'str'>
row.group: b
-------
type(row): <class 'pyspark.sql.types.Row'>
type(row.group): <class 'str'>
row.group: c
-------
type(row): <class 'pyspark.sql.types.Row'>
type(row.group): <class 'str'>
row.group: c
-------
type(row): <class 'pyspark.sql.types.Row'>
type(row.group): <class 'str'>
row.group: c
-------
type(row): <class 'pyspark.sql.types.Row'>
type(row.group): <class 'str'>
row.group: b
-------


## Built-in Functions:

In [54]:
import pyspark.sql.functions as F

In [53]:
# if you do this:
# from pyspark.sql.functions import *
# this wont work:
# sum([1,2,3,4])
# because you have overwritten the conventional python built-in sum function with the pysprak.sql sum function.

In [57]:
df = (spark.read.csv('./sa311/case.csv', header=True, inferSchema=True))

In [60]:
# remember string formatting?
# remember padding?
# dead memes remember.
df.select(format_string('%010d', df.council_district)).show()

+--------------------------------------+
|format_string(%010d, council_district)|
+--------------------------------------+
|                            0000000005|
|                            0000000003|
|                            0000000003|
|                            0000000003|
|                            0000000007|
|                            0000000007|
|                            0000000004|
|                            0000000004|
|                            0000000004|
|                            0000000004|
|                            0000000004|
|                            0000000004|
|                            0000000004|
|                            0000000004|
|                            0000000004|
|                            0000000004|
|                            0000000004|
|                            0000000004|
|                            0000000004|
|                            0000000004|
+--------------------------------------+
only showing top

In [63]:
# so this is a column:
type(format_string('%010d', df.council_district))

pyspark.sql.column.Column

In [65]:
# we can assign it!:
formatted_district = format_string('%010d', df.council_district)
## oops missed the other stuff

In [71]:
(df\
 .select('request_address')\
 .show(truncate=False))

+----------------------------------------+
|request_address                         |
+----------------------------------------+
|2315  EL PASO ST, San Antonio, 78207    |
|2215  GOLIAD RD, San Antonio, 78223     |
|102  PALFREY ST W, San Antonio, 78223   |
|114  LA GARDE ST, San Antonio, 78223    |
|734  CLEARVIEW DR, San Antonio, 78228   |
|BANDERA RD and BRESNAHAN                |
|10133  FIGARO CANYON, San Antonio, 78251|
|10133  FIGARO CANYON, San Antonio, 78251|
|10133  FIGARO CANYON, San Antonio, 78251|
|10133  FIGARO CANYON, San Antonio, 78251|
|10133  FIGARO CANYON, San Antonio, 78251|
|10133  FIGARO CANYON, San Antonio, 78251|
|10129  BOXING PASS, San Antonio, 78251  |
|10129  BOXING PASS, San Antonio, 78251  |
|10129  BOXING PASS, San Antonio, 78251  |
|834  BARREL POINT, San Antonio, 78251   |
|834  BARREL POINT, San Antonio, 78251   |
|834  BARREL POINT, San Antonio, 78251   |
|834  BARREL POINT, San Antonio, 78251   |
|834  BARREL POINT, San Antonio, 78251   |
+----------

In [72]:
(df\
 .select(upper(df.request_address))\
 .show(truncate=False))# if you just pass 'request_address' in this one its gonna break.

+----------------------------------------+
|upper(request_address)                  |
+----------------------------------------+
|2315  EL PASO ST, SAN ANTONIO, 78207    |
|2215  GOLIAD RD, SAN ANTONIO, 78223     |
|102  PALFREY ST W, SAN ANTONIO, 78223   |
|114  LA GARDE ST, SAN ANTONIO, 78223    |
|734  CLEARVIEW DR, SAN ANTONIO, 78228   |
|BANDERA RD AND BRESNAHAN                |
|10133  FIGARO CANYON, SAN ANTONIO, 78251|
|10133  FIGARO CANYON, SAN ANTONIO, 78251|
|10133  FIGARO CANYON, SAN ANTONIO, 78251|
|10133  FIGARO CANYON, SAN ANTONIO, 78251|
|10133  FIGARO CANYON, SAN ANTONIO, 78251|
|10133  FIGARO CANYON, SAN ANTONIO, 78251|
|10129  BOXING PASS, SAN ANTONIO, 78251  |
|10129  BOXING PASS, SAN ANTONIO, 78251  |
|10129  BOXING PASS, SAN ANTONIO, 78251  |
|834  BARREL POINT, SAN ANTONIO, 78251   |
|834  BARREL POINT, SAN ANTONIO, 78251   |
|834  BARREL POINT, SAN ANTONIO, 78251   |
|834  BARREL POINT, SAN ANTONIO, 78251   |
|834  BARREL POINT, SAN ANTONIO, 78251   |
+----------

In [75]:
# substrings: first 20 chars of each address: 
# note substring is starting character and then number charcters to take.  Start on index 0, go for 20 chars.
(df\
 .select(substring(df.request_address, 0, 20))\
 .show(3, truncate=False))

+---------------------------------+
|substring(request_address, 0, 20)|
+---------------------------------+
|2315  EL PASO ST, Sa             |
|2215  GOLIAD RD, San             |
|102  PALFREY ST W, S             |
+---------------------------------+
only showing top 3 rows



In [78]:
(df\
.select(regexp_extract(df.request_address, r'\d+$',0).alias('zip'))\
.show(10, truncate=False))

+-----+
|zip  |
+-----+
|78207|
|78223|
|78223|
|78223|
|78228|
|     |
|78251|
|78251|
|78251|
|78251|
+-----+
only showing top 10 rows



In [89]:
address_re = r'^.+,\s(.+),\s+(\d+)$'

(df
.select(df.request_address, 
        regexp_extract(df.request_address, address_re, 1).alias('city'), 
        regexp_extract(df.request_address, address_re, 2).alias('zip')).show(10, truncate=False))




+----------------------------------------+-----------+-----+
|request_address                         |city       |zip  |
+----------------------------------------+-----------+-----+
|2315  EL PASO ST, San Antonio, 78207    |San Antonio|78207|
|2215  GOLIAD RD, San Antonio, 78223     |San Antonio|78223|
|102  PALFREY ST W, San Antonio, 78223   |San Antonio|78223|
|114  LA GARDE ST, San Antonio, 78223    |San Antonio|78223|
|734  CLEARVIEW DR, San Antonio, 78228   |San Antonio|78228|
|BANDERA RD and BRESNAHAN                |           |     |
|10133  FIGARO CANYON, San Antonio, 78251|San Antonio|78251|
|10133  FIGARO CANYON, San Antonio, 78251|San Antonio|78251|
|10133  FIGARO CANYON, San Antonio, 78251|San Antonio|78251|
|10133  FIGARO CANYON, San Antonio, 78251|San Antonio|78251|
+----------------------------------------+-----------+-----+
only showing top 10 rows



In [90]:
df.printSchema()

root
 |-- case_id: integer (nullable = true)
 |-- case_opened_date: string (nullable = true)
 |-- case_closed_date: string (nullable = true)
 |-- SLA_due_date: string (nullable = true)
 |-- case_late: string (nullable = true)
 |-- num_days_late: double (nullable = true)
 |-- case_closed: string (nullable = true)
 |-- dept_division: string (nullable = true)
 |-- service_request_type: string (nullable = true)
 |-- SLA_days: double (nullable = true)
 |-- case_status: string (nullable = true)
 |-- source_id: string (nullable = true)
 |-- request_address: string (nullable = true)
 |-- council_district: integer (nullable = true)



In [94]:
df.select(df.case_opened_date).show()

+----------------+
|case_opened_date|
+----------------+
|     1/1/18 0:42|
|     1/1/18 0:46|
|     1/1/18 0:48|
|     1/1/18 1:29|
|     1/1/18 1:34|
|     1/1/18 6:28|
|     1/1/18 6:57|
|     1/1/18 6:58|
|     1/1/18 6:58|
|     1/1/18 6:59|
|     1/1/18 7:00|
|     1/1/18 7:02|
|     1/1/18 7:02|
|     1/1/18 7:03|
|     1/1/18 7:04|
|     1/1/18 7:04|
|     1/1/18 7:05|
|     1/1/18 7:06|
|     1/1/18 7:06|
|     1/1/18 7:07|
+----------------+
only showing top 20 rows



In [92]:
# let's fix the columns that are represented as strings that should not be that way.
(df.select(df.case_opened_date.cast('timestamp'))).show()

+----------------+
|case_opened_date|
+----------------+
|            null|
|            null|
|            null|
|            null|
|            null|
|            null|
|            null|
|            null|
|            null|
|            null|
|            null|
|            null|
|            null|
|            null|
|            null|
|            null|
|            null|
|            null|
|            null|
|            null|
+----------------+
only showing top 20 rows



In [163]:
(df
.select(to_timestamp(df.case_opened_date, 'M/d/y H:mm').alias('ts'))
.show(10))

AttributeError: 'NoneType' object has no attribute '_jvm'

In [96]:
# Go back and get this stuff.  Spent too much time trying to troubleshoot those nulls here.

In [97]:
df.printSchema()

root
 |-- case_id: integer (nullable = true)
 |-- case_opened_date: string (nullable = true)
 |-- case_closed_date: string (nullable = true)
 |-- SLA_due_date: string (nullable = true)
 |-- case_late: string (nullable = true)
 |-- num_days_late: double (nullable = true)
 |-- case_closed: string (nullable = true)
 |-- dept_division: string (nullable = true)
 |-- service_request_type: string (nullable = true)
 |-- SLA_days: double (nullable = true)
 |-- case_status: string (nullable = true)
 |-- source_id: string (nullable = true)
 |-- request_address: string (nullable = true)
 |-- council_district: integer (nullable = true)



In [98]:
df.select(df.num_days_late).show()

+-------------------+
|      num_days_late|
+-------------------+
| -998.5087616000001|
|-2.0126041669999997|
|       -3.022337963|
|       -15.01148148|
|0.37216435200000003|
|       -29.74398148|
|       -14.70673611|
|       -14.70662037|
|       -14.70662037|
|       -14.70649306|
|       -14.70649306|
|       -14.70636574|
|          -14.70625|
|       -14.70636574|
|       -14.70623843|
|-14.705891199999998|
|       -14.70600694|
|       -14.70576389|
|       -14.70576389|
|       -14.70564815|
+-------------------+
only showing top 20 rows



In [100]:
my_col = when(df.num_days_late >= 0, df.num_days_late).otherwise(0)

(df
.select(df.num_days_late)
.select(my_col)
.show())

+------------------------------------------------------------+
|CASE WHEN (num_days_late >= 0) THEN num_days_late ELSE 0 END|
+------------------------------------------------------------+
|                                                         0.0|
|                                                         0.0|
|                                                         0.0|
|                                                         0.0|
|                                         0.37216435200000003|
|                                                         0.0|
|                                                         0.0|
|                                                         0.0|
|                                                         0.0|
|                                                         0.0|
|                                                         0.0|
|                                                         0.0|
|                                                      

In [104]:
(df
.select(df.case_closed, df.case_late, df.num_days_late)
.select((df.case_closed =='YES').alias('case_closed'), 
       (df.case_late =='YES').alias('case_late'), 
       df.num_days_late)
.filter(col('case_late'))
.show())

+-----------+---------+-------------------+
|case_closed|case_late|      num_days_late|
+-----------+---------+-------------------+
|       true|     true|0.37216435200000003|
|       true|     true|         0.03150463|
|       true|     true|        80.74537037|
|       true|     true|0.38280092600000004|
|       true|     true|        0.376655093|
|       true|     true|        46.41153935|
|       true|     true|        0.048368056|
|       true|     true|         36.1630787|
|       true|     true|        25.36005787|
|       true|     true| 1.8262268519999998|
|       true|     true|         46.3819213|
|       true|     true|        46.38175926|
|       true|     true|        72.39403935|
|       true|     true| 113.73300929999999|
|       true|     true|        79.13157407|
|       true|     true|        3.450983796|
|       true|     true|        73.16055556|
|       true|     true|        1.339675926|
|       true|     true|        68.02585648|
|       true|     true|        7

In [106]:
(df
.select(df.case_closed, df.case_late, df.num_days_late)
.withColumn('case_closed', df.case_closed == 'YES')
 .withColumn('case_late', df.case_late == 'YES')
 .where(col('case_late'))
 .where(col('case_closed'))
.show())

+-----------+---------+-------------------+
|case_closed|case_late|      num_days_late|
+-----------+---------+-------------------+
|       true|     true|0.37216435200000003|
|       true|     true|         0.03150463|
|       true|     true|        80.74537037|
|       true|     true|0.38280092600000004|
|       true|     true|        0.376655093|
|       true|     true|        46.41153935|
|       true|     true|        0.048368056|
|       true|     true|         36.1630787|
|       true|     true|        25.36005787|
|       true|     true| 1.8262268519999998|
|       true|     true|         46.3819213|
|       true|     true|        46.38175926|
|       true|     true|        72.39403935|
|       true|     true| 113.73300929999999|
|       true|     true|        79.13157407|
|       true|     true|        3.450983796|
|       true|     true|        73.16055556|
|       true|     true|        1.339675926|
|       true|     true|        68.02585648|
|       true|     true|        7

In [107]:
(df
.select(df.case_closed, df.case_late, df.num_days_late)
.withColumn('case_closed', df.case_closed == 'YES')
 .withColumn('case_late', df.case_late == 'YES')
 .filter(col('case_late') | col('case_closed'))
.show())

+-----------+---------+-------------------+
|case_closed|case_late|      num_days_late|
+-----------+---------+-------------------+
|       true|    false| -998.5087616000001|
|       true|    false|-2.0126041669999997|
|       true|    false|       -3.022337963|
|       true|    false|       -15.01148148|
|       true|     true|0.37216435200000003|
|       true|    false|       -29.74398148|
|       true|    false|       -14.70673611|
|       true|    false|       -14.70662037|
|       true|    false|       -14.70662037|
|       true|    false|       -14.70649306|
|       true|    false|       -14.70649306|
|       true|    false|       -14.70636574|
|       true|    false|          -14.70625|
|       true|    false|       -14.70636574|
|       true|    false|       -14.70623843|
|       true|    false|-14.705891199999998|
|       true|    false|       -14.70600694|
|       true|    false|       -14.70576389|
|       true|    false|       -14.70576389|
|       true|    false|       -1

## Joins:

In [109]:
users = (spark.read.csv('./users.csv', header=True, inferSchema=True))
roles = (spark.read.csv('./roles.csv', header=True, inferSchema=True))

In [110]:
users.join(roles, users.role_id == roles.id, 'outer').show()

+----+-----+-----------------+-------+----+---------+
|  id| name|            email|role_id|  id|     name|
+----+-----+-----------------+-------+----+---------+
|   5| jane| jane@example.com|   NULL|null|     null|
|   6| mike| mike@example.com|   NULL|null|     null|
|   1|  bob|  bob@example.com|      1|   1|    admin|
|   3|sally|sally@example.com|      3|   3| reviewer|
|   4| adam| adam@example.com|      3|   3| reviewer|
|null| null|             null|   null|   4|commenter|
|   2|  joe|  joe@example.com|      2|   2|   author|
+----+-----+-----------------+-------+----+---------+



In [111]:
users.join(roles, users.role_id == roles.id, 'left').show()

+---+-----+-----------------+-------+----+--------+
| id| name|            email|role_id|  id|    name|
+---+-----+-----------------+-------+----+--------+
|  1|  bob|  bob@example.com|      1|   1|   admin|
|  2|  joe|  joe@example.com|      2|   2|  author|
|  3|sally|sally@example.com|      3|   3|reviewer|
|  4| adam| adam@example.com|      3|   3|reviewer|
|  5| jane| jane@example.com|   NULL|null|    null|
|  6| mike| mike@example.com|   NULL|null|    null|
+---+-----+-----------------+-------+----+--------+



In [112]:
users.join(roles, users.role_id == roles.id, 'right').show()

+----+-----+-----------------+-------+---+---------+
|  id| name|            email|role_id| id|     name|
+----+-----+-----------------+-------+---+---------+
|   1|  bob|  bob@example.com|      1|  1|    admin|
|   2|  joe|  joe@example.com|      2|  2|   author|
|   4| adam| adam@example.com|      3|  3| reviewer|
|   3|sally|sally@example.com|      3|  3| reviewer|
|null| null|             null|   null|  4|commenter|
+----+-----+-----------------+-------+---+---------+



In [114]:
# lots of weird join methods we can use here.  anti, semi, cross... play around with it

In [116]:
users.join(roles, users.role_id == roles.id, 'left_anti').show()

+---+----+----------------+-------+
| id|name|           email|role_id|
+---+----+----------------+-------+
|  5|jane|jane@example.com|   NULL|
|  6|mike|mike@example.com|   NULL|
+---+----+----------------+-------+



In [118]:
users.crossJoin(roles).show()

+---+-----+-----------------+-------+---+---------+
| id| name|            email|role_id| id|     name|
+---+-----+-----------------+-------+---+---------+
|  1|  bob|  bob@example.com|      1|  1|    admin|
|  1|  bob|  bob@example.com|      1|  2|   author|
|  1|  bob|  bob@example.com|      1|  3| reviewer|
|  1|  bob|  bob@example.com|      1|  4|commenter|
|  2|  joe|  joe@example.com|      2|  1|    admin|
|  2|  joe|  joe@example.com|      2|  2|   author|
|  2|  joe|  joe@example.com|      2|  3| reviewer|
|  2|  joe|  joe@example.com|      2|  4|commenter|
|  3|sally|sally@example.com|      3|  1|    admin|
|  3|sally|sally@example.com|      3|  2|   author|
|  3|sally|sally@example.com|      3|  3| reviewer|
|  3|sally|sally@example.com|      3|  4|commenter|
|  4| adam| adam@example.com|      3|  1|    admin|
|  4| adam| adam@example.com|      3|  2|   author|
|  4| adam| adam@example.com|      3|  3| reviewer|
|  4| adam| adam@example.com|      3|  4|commenter|
|  5| jane| 

In [119]:
# unions work like they do in sql
users.union(users).show()

+---+-----+-----------------+-------+
| id| name|            email|role_id|
+---+-----+-----------------+-------+
|  1|  bob|  bob@example.com|      1|
|  2|  joe|  joe@example.com|      2|
|  3|sally|sally@example.com|      3|
|  4| adam| adam@example.com|      3|
|  5| jane| jane@example.com|   NULL|
|  6| mike| mike@example.com|   NULL|
|  1|  bob|  bob@example.com|      1|
|  2|  joe|  joe@example.com|      2|
|  3|sally|sally@example.com|      3|
|  4| adam| adam@example.com|      3|
|  5| jane| jane@example.com|   NULL|
|  6| mike| mike@example.com|   NULL|
+---+-----+-----------------+-------+



In [121]:
train, test = df.randomSplit([.6, .4], seed=123)

In [122]:
train, validate, test = df.randomSplit([.6, .2, .2], seed=3)

In [123]:
train.count()

504429

In [127]:
# you can pull this with just pure ratios, just make sure your numbers are in floats
train, test = df.randomSplit([3.0, 1.0], seed=2)

## Aggregation

In [155]:
df = spark.read.csv('./example_data.csv', header=True)

In [156]:
df.show()

+----+---+
|   n|  g|
+----+---+
|15.0|  b|
|23.0|  c|
| 6.0|  c|
| NaN|  c|
|26.0|  b|
|12.0|  b|
| 8.0|  a|
|18.0|  c|
|14.0|  a|
|20.0|  c|
|22.0|  a|
|21.0|  a|
| 1.0|  c|
| 0.0|  a|
|17.0|  b|
| 2.0|  a|
| 7.0|  a|
|16.0|  b|
|24.0|  b|
|10.0|  a|
+----+---+
only showing top 20 rows



In [157]:
df = df.na.drop()

In [158]:
df.show()

+----+---+
|   n|  g|
+----+---+
|15.0|  b|
|23.0|  c|
| 6.0|  c|
| NaN|  c|
|26.0|  b|
|12.0|  b|
| 8.0|  a|
|18.0|  c|
|14.0|  a|
|20.0|  c|
|22.0|  a|
|21.0|  a|
| 1.0|  c|
| 0.0|  a|
|17.0|  b|
| 2.0|  a|
| 7.0|  a|
|16.0|  b|
|24.0|  b|
|10.0|  a|
+----+---+
only showing top 20 rows



In [151]:
df.describe().show()

df.agg(stddev(df.n)).show()

+-------+---+----+
|summary|  n|   g|
+-------+---+----+
|  count| 30|  30|
|   mean|NaN|null|
| stddev|NaN|null|
|    min|0.0|   a|
|    max|NaN|   c|
+-------+---+----+

+--------------+
|stddev_samp(n)|
+--------------+
|           NaN|
+--------------+



In [152]:
df.groupBy('g').agg(count(df.n)).show()
df.groupBy('g').agg(expr('count(n)')).show()

+---+--------+
|  g|count(n)|
+---+--------+
|  c|      10|
|  b|       8|
|  a|      12|
+---+--------+

+---+--------+
|  g|count(n)|
+---+--------+
|  c|      10|
|  b|       8|
|  a|      12|
+---+--------+



In [None]:
## OOPS
# missed stuff
# df.groupBy('g')
# .agg(expr('count(n)'))

In [159]:
df = (spark.read.csv('./sa311/case.csv', header=True, inferSchema=True))

In [160]:
(df
.select('case_late', 'case_closed', 'num_days_late')
.crosstab('case_late', 'case_closed')
.show())

+---------------------+-----+------+
|case_late_case_closed|   NO|   YES|
+---------------------+-----+------+
|                  YES| 6525| 87978|
|                   NO|11585|735616|
+---------------------+-----+------+



In [161]:
# lets try making this crosstab manually using pivot tables
(df
.select('case_late', 'case_closed', 'num_days_late')
 .groupBy('case_late') # rows
 .pivot('case_closed') #columns
 .agg(avg(col('num_days_late'))) # values
 .show())

+---------+-------------------+-------------------+
|case_late|                 NO|                YES|
+---------+-------------------+-------------------+
|      YES|  70.02952998931113| 22.111760411319253|
|       NO|-53.775669814418364|-58.571301683913845|
+---------+-------------------+-------------------+



In [162]:
spark.stop()