In [2]:
from pyspark.sql import SparkSession

In [4]:
spark = (
    SparkSession.builder.appName('4_fiter').getOrCreate()
)
spark

24/03/15 05:14:27 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


## Load Df

In [18]:
df = spark.read.csv('./taxables.csv', header=True, inferSchema=True)
df.show(3)

+-----+--------------------+---------------+------+--------+
|Index|              "Item"|         "Cost"| "Tax"| "Total"|
+-----+--------------------+---------------+------+--------+
|  1.0| "Fruit of the Lo...|           7.97|   0.6|    8.57|
|  2.0| "Rawlings Little...|           2.97|  0.22|    3.19|
|  3.0| "Secret Antipers...|           1.29|   0.1|    1.39|
+-----+--------------------+---------------+------+--------+
only showing top 3 rows



In [19]:
df.printSchema()

root
 |-- Index: double (nullable = true)
 |--  "Item": string (nullable = true)
 |--  "Cost": string (nullable = true)
 |--  "Tax": double (nullable = true)
 |--  "Total": double (nullable = true)



In [22]:
for i in df.columns:
    strip_col_name = i.replace('"', '').strip()
    df = df.withColumnRenamed(i, strip_col_name)

In [23]:
df.show()

+-----+--------------------+--------------------+----+-----+
|Index|                Item|                Cost| Tax|Total|
+-----+--------------------+--------------------+----+-----+
|  1.0| "Fruit of the Lo...|                7.97| 0.6| 8.57|
|  2.0| "Rawlings Little...|                2.97|0.22| 3.19|
|  3.0| "Secret Antipers...|                1.29| 0.1| 1.39|
|  4.0|      "Deadpool DVD"|                 ...|1.12|16.08|
|  5.0| "Maxwell House C...|                7.28|0.55| 7.83|
|  6.0| "Banana Boat Sun...|               8 oz"|6.68|  0.5|
|  7.0|         "Wrench Set|          18 pieces"|10.0| 0.75|
|  8.0|            "M and M|              42 oz"|8.98| 0.67|
|  9.0| "Bertoli Alfredo...|                2.12|0.16| 2.28|
| 10.0|   "Large Paperclips|           10 boxes"|6.19| 0.46|
+-----+--------------------+--------------------+----+-----+



## Filter operations

In [29]:
## Tax less than 5
df.filter('Tax>=5.0').show()

+-----+--------------------+-----------+----+-----+
|Index|                Item|       Cost| Tax|Total|
+-----+--------------------+-----------+----+-----+
|  6.0| "Banana Boat Sun...|      8 oz"|6.68|  0.5|
|  7.0|         "Wrench Set| 18 pieces"|10.0| 0.75|
|  8.0|            "M and M|     42 oz"|8.98| 0.67|
| 10.0|   "Large Paperclips|  10 boxes"|6.19| 0.46|
+-----+--------------------+-----------+----+-----+



In [30]:
df.filter('Tax>=5.').select(['Item', 'Cost']).show()

+--------------------+-----------+
|                Item|       Cost|
+--------------------+-----------+
| "Banana Boat Sun...|      8 oz"|
|         "Wrench Set| 18 pieces"|
|            "M and M|     42 oz"|
|   "Large Paperclips|  10 boxes"|
+--------------------+-----------+



In [32]:
df.filter(df['Tax']>=5.0).show()

+-----+--------------------+-----------+----+-----+
|Index|                Item|       Cost| Tax|Total|
+-----+--------------------+-----------+----+-----+
|  6.0| "Banana Boat Sun...|      8 oz"|6.68|  0.5|
|  7.0|         "Wrench Set| 18 pieces"|10.0| 0.75|
|  8.0|            "M and M|     42 oz"|8.98| 0.67|
| 10.0|   "Large Paperclips|  10 boxes"|6.19| 0.46|
+-----+--------------------+-----------+----+-----+



In [34]:
# And
df.filter((df['Tax']>=5.0) & (df['Total']>=.5)).show()

+-----+--------------------+-----------+----+-----+
|Index|                Item|       Cost| Tax|Total|
+-----+--------------------+-----------+----+-----+
|  6.0| "Banana Boat Sun...|      8 oz"|6.68|  0.5|
|  7.0|         "Wrench Set| 18 pieces"|10.0| 0.75|
|  8.0|            "M and M|     42 oz"|8.98| 0.67|
+-----+--------------------+-----------+----+-----+



In [35]:
# OR
df.filter((df['Tax']>=5.0) | (df['Total']>=.5)).show()

+-----+--------------------+--------------------+----+-----+
|Index|                Item|                Cost| Tax|Total|
+-----+--------------------+--------------------+----+-----+
|  1.0| "Fruit of the Lo...|                7.97| 0.6| 8.57|
|  2.0| "Rawlings Little...|                2.97|0.22| 3.19|
|  3.0| "Secret Antipers...|                1.29| 0.1| 1.39|
|  4.0|      "Deadpool DVD"|                 ...|1.12|16.08|
|  5.0| "Maxwell House C...|                7.28|0.55| 7.83|
|  6.0| "Banana Boat Sun...|               8 oz"|6.68|  0.5|
|  7.0|         "Wrench Set|          18 pieces"|10.0| 0.75|
|  8.0|            "M and M|              42 oz"|8.98| 0.67|
|  9.0| "Bertoli Alfredo...|                2.12|0.16| 2.28|
| 10.0|   "Large Paperclips|           10 boxes"|6.19| 0.46|
+-----+--------------------+--------------------+----+-----+



In [36]:
# Inverse
df.filter(~(df['Tax']>=5.0)).show()

+-----+--------------------+--------------------+----+-----+
|Index|                Item|                Cost| Tax|Total|
+-----+--------------------+--------------------+----+-----+
|  1.0| "Fruit of the Lo...|                7.97| 0.6| 8.57|
|  2.0| "Rawlings Little...|                2.97|0.22| 3.19|
|  3.0| "Secret Antipers...|                1.29| 0.1| 1.39|
|  4.0|      "Deadpool DVD"|                 ...|1.12|16.08|
|  5.0| "Maxwell House C...|                7.28|0.55| 7.83|
|  9.0| "Bertoli Alfredo...|                2.12|0.16| 2.28|
+-----+--------------------+--------------------+----+-----+

