## Transformations
- No immediate execution
- immutable
- Lazy evaluation

In [1]:
from pyspark.sql import SparkSession;
from pyspark.context import SparkContext;
spark = SparkSession \
    .builder \
    .appName("Transformations vs actions") \
    .getOrCreate()

sc = spark.sparkContext

### Narrow Transformations :
Narrow transformations are the result of map() and filter() functions and these compute data that live on a single partition meaning there will not be any data movement between partitions to execute narrow transformations.

- map: applies a function to each element in the RDD and returns a new RDD containing the - results.
- flatMap: applies a function to each element in the RDD and returns a new RDD containing the  concatenated results.
- filter: returns a new RDD containing only the elements that satisfy a given predicate.
- union: returns a new RDD containing the union of two RDDs.
- distinct: returns a new RDD containing the distinct elements of an RDD.
- sample: returns a random sample of the elements in an RDD.
- sortBy: sorts the elements of an RDD based on one or more key functions.


In [2]:
data =  spark.read.csv('./data/SalesAnalysis.csv',inferSchema=True,header=True)

In [3]:
data.show(5)

+--------+--------------------+----------------+----------+--------------+--------------------+
|Order ID|             Product|Quantity Ordered|Price Each|    Order Date|    Purchase Address|
+--------+--------------------+----------------+----------+--------------+--------------------+
|  176558|USB-C Charging Cable|               2|     11.95|04/19/19 08:46|917 1st St, Dalla...|
|    NULL|                NULL|            NULL|      NULL|          NULL|                NULL|
|  176559|Bose SoundSport H...|               1|     99.99|04/07/19 22:30|682 Chestnut St, ...|
|  176560|        Google Phone|               1|     600.0|04/12/19 14:38|669 Spruce St, Lo...|
|  176560|    Wired Headphones|               1|     11.99|04/12/19 14:38|669 Spruce St, Lo...|
+--------+--------------------+----------------+----------+--------------+--------------------+
only showing top 5 rows



In [4]:
data = data.dropna()
data.show(5)

+--------+--------------------+----------------+----------+--------------+--------------------+
|Order ID|             Product|Quantity Ordered|Price Each|    Order Date|    Purchase Address|
+--------+--------------------+----------------+----------+--------------+--------------------+
|  176558|USB-C Charging Cable|               2|     11.95|04/19/19 08:46|917 1st St, Dalla...|
|  176559|Bose SoundSport H...|               1|     99.99|04/07/19 22:30|682 Chestnut St, ...|
|  176560|        Google Phone|               1|     600.0|04/12/19 14:38|669 Spruce St, Lo...|
|  176560|    Wired Headphones|               1|     11.99|04/12/19 14:38|669 Spruce St, Lo...|
|  176561|    Wired Headphones|               1|     11.99|04/30/19 09:27|333 8th St, Los A...|
+--------+--------------------+----------------+----------+--------------+--------------------+
only showing top 5 rows



In [16]:
price_rdd = data.select("Price Each").rdd.map(lambda row: row[0]*84)
price_rdd.collect()

[1003.8,
 8399.16,
 50400.0,
 1007.16,
 1007.16,
 1003.8,
 8399.16,
 1003.8,
 142800.0,
 1007.16,
 50400.0,
 1255.8,
 32759.16,
 322.56,
 1255.8,
 12600.0,
 1003.8,
 50400.0,
 1003.8,
 251.16000000000003,
 12600.0,
 12600.0,
 12600.0,
 322.56,
 1003.8,
 58800.0,
 8399.16,
 251.16000000000003,
 25200.0,
 8399.16,
 8399.16,
 251.16000000000003,
 50400.0,
 12599.16,
 9239.16,
 1255.8,
 50400.0,
 12600.0,
 1003.8,
 1255.8,
 1007.16,
 1007.16,
 12600.0,
 1003.8,
 251.16000000000003,
 1255.8,
 32759.16,
 1003.8,
 1255.8,
 1003.8,
 1003.8,
 1003.8,
 50400.0,
 251.16000000000003,
 58800.0,
 12600.0,
 251.16000000000003,
 8399.16,
 12600.0,
 1003.8,
 322.56,
 1255.8,
 1003.8,
 12600.0,
 1255.8,
 25200.0,
 1003.8,
 1255.8,
 322.56,
 12599.16,
 322.56,
 251.16000000000003,
 8399.16,
 1007.16,
 1255.8,
 1003.8,
 50400.0,
 1003.8,
 251.16000000000003,
 83999.16,
 1255.8,
 33600.0,
 322.56,
 251.16000000000003,
 1003.8,
 142800.0,
 1007.16,
 1255.8,
 322.56,
 142800.0,
 1003.8,
 251.16000000000003,


In [17]:
# map cannot be used directly with data frames 
data = data.withColumn("Price_used",col("Price Each")*84)
data.show(5)


+--------+--------------------+----------------+----------+--------------+--------------------+----------+
|Order ID|             Product|Quantity Ordered|Price Each|    Order Date|    Purchase Address|Price_used|
+--------+--------------------+----------------+----------+--------------+--------------------+----------+
|  176558|USB-C Charging Cable|               2|     11.95|04/19/19 08:46|917 1st St, Dalla...|    1003.8|
|  176559|Bose SoundSport H...|               1|     99.99|04/07/19 22:30|682 Chestnut St, ...|   8399.16|
|  176560|        Google Phone|               1|     600.0|04/12/19 14:38|669 Spruce St, Lo...|   50400.0|
|  176560|    Wired Headphones|               1|     11.99|04/12/19 14:38|669 Spruce St, Lo...|   1007.16|
|  176561|    Wired Headphones|               1|     11.99|04/30/19 09:27|333 8th St, Los A...|   1007.16|
+--------+--------------------+----------------+----------+--------------+--------------------+----------+
only showing top 5 rows



In [18]:
data = data.withColumnRenamed("Price_used","Price_usd")
data.show(5)

+--------+--------------------+----------------+----------+--------------+--------------------+---------+
|Order ID|             Product|Quantity Ordered|Price Each|    Order Date|    Purchase Address|Price_usd|
+--------+--------------------+----------------+----------+--------------+--------------------+---------+
|  176558|USB-C Charging Cable|               2|     11.95|04/19/19 08:46|917 1st St, Dalla...|   1003.8|
|  176559|Bose SoundSport H...|               1|     99.99|04/07/19 22:30|682 Chestnut St, ...|  8399.16|
|  176560|        Google Phone|               1|     600.0|04/12/19 14:38|669 Spruce St, Lo...|  50400.0|
|  176560|    Wired Headphones|               1|     11.99|04/12/19 14:38|669 Spruce St, Lo...|  1007.16|
|  176561|    Wired Headphones|               1|     11.99|04/30/19 09:27|333 8th St, Los A...|  1007.16|
+--------+--------------------+----------------+----------+--------------+--------------------+---------+
only showing top 5 rows



In [None]:
product_rdd = data.select("Product").rdd.map(lambda x: x[0].split(" "))
product_rdd.collect()
# map returns a list after mapping so list of lists

[['USB-C', 'Charging', 'Cable'],
 ['Bose', 'SoundSport', 'Headphones'],
 ['Google', 'Phone'],
 ['Wired', 'Headphones'],
 ['Wired', 'Headphones'],
 ['USB-C', 'Charging', 'Cable'],
 ['Bose', 'SoundSport', 'Headphones'],
 ['USB-C', 'Charging', 'Cable'],
 ['Macbook', 'Pro', 'Laptop'],
 ['Wired', 'Headphones'],
 ['Google', 'Phone'],
 ['Lightning', 'Charging', 'Cable'],
 ['27in', '4K', 'Gaming', 'Monitor'],
 ['AA', 'Batteries', '(4-pack)'],
 ['Lightning', 'Charging', 'Cable'],
 ['Apple', 'Airpods', 'Headphones'],
 ['USB-C', 'Charging', 'Cable'],
 ['Google', 'Phone'],
 ['USB-C', 'Charging', 'Cable'],
 ['AAA', 'Batteries', '(4-pack)'],
 ['Apple', 'Airpods', 'Headphones'],
 ['Apple', 'Airpods', 'Headphones'],
 ['Apple', 'Airpods', 'Headphones'],
 ['AA', 'Batteries', '(4-pack)'],
 ['USB-C', 'Charging', 'Cable'],
 ['iPhone'],
 ['Bose', 'SoundSport', 'Headphones'],
 ['AAA', 'Batteries', '(4-pack)'],
 ['Flatscreen', 'TV'],
 ['Bose', 'SoundSport', 'Headphones'],
 ['Bose', 'SoundSport', 'Headphones']

In [49]:
product_rdd = data.select("Product").rdd.flatMap(lambda x: x[0].split(" "))
product_rdd.collect()
product_rdd.count()
# Flatmap returns a list after mapping

510249

In [31]:
products = sc.parallelize(set(product_rdd.collect()))
products.collect()
products.count()

36

In [None]:
price_rdd.filter(lambda x: x> 1000).collect()

[1003.8,
 8399.16,
 50400.0,
 1007.16,
 1007.16,
 1003.8,
 8399.16,
 1003.8,
 142800.0,
 1007.16,
 50400.0,
 1255.8,
 32759.16,
 1255.8,
 12600.0,
 1003.8,
 50400.0,
 1003.8,
 12600.0,
 12600.0,
 12600.0,
 1003.8,
 58800.0,
 8399.16,
 25200.0,
 8399.16,
 8399.16,
 50400.0,
 12599.16,
 9239.16,
 1255.8,
 50400.0,
 12600.0,
 1003.8,
 1255.8,
 1007.16,
 1007.16,
 12600.0,
 1003.8,
 1255.8,
 32759.16,
 1003.8,
 1255.8,
 1003.8,
 1003.8,
 1003.8,
 50400.0,
 58800.0,
 12600.0,
 8399.16,
 12600.0,
 1003.8,
 1255.8,
 1003.8,
 12600.0,
 1255.8,
 25200.0,
 1003.8,
 1255.8,
 12599.16,
 8399.16,
 1007.16,
 1255.8,
 1003.8,
 50400.0,
 1003.8,
 83999.16,
 1255.8,
 33600.0,
 1003.8,
 142800.0,
 1007.16,
 1255.8,
 142800.0,
 1003.8,
 142800.0,
 50400.0,
 12599.16,
 1003.8,
 1255.8,
 58800.0,
 50400.0,
 1003.8,
 12600.0,
 50400.0,
 83999.16,
 9239.16,
 1007.16,
 31919.16,
 8399.16,
 83999.16,
 1007.16,
 32759.16,
 12599.16,
 12600.0,
 1255.8,
 1003.8,
 9239.16,
 50400.0,
 31919.16,
 12600.0,
 1003.8,
 

In [40]:
data.filter(data.Price_usd> 1000).show()

+--------+--------------------+----------------+----------+--------------+--------------------+---------+--------------------+
|Order ID|             Product|Quantity Ordered|Price Each|    Order Date|    Purchase Address|Price_usd|      products_split|
+--------+--------------------+----------------+----------+--------------+--------------------+---------+--------------------+
|  176558|USB-C Charging Cable|               2|     11.95|04/19/19 08:46|917 1st St, Dalla...|   1003.8|[USB-C, Charging,...|
|  176559|Bose SoundSport H...|               1|     99.99|04/07/19 22:30|682 Chestnut St, ...|  8399.16|[Bose, SoundSport...|
|  176560|        Google Phone|               1|     600.0|04/12/19 14:38|669 Spruce St, Lo...|  50400.0|     [Google, Phone]|
|  176560|    Wired Headphones|               1|     11.99|04/12/19 14:38|669 Spruce St, Lo...|  1007.16| [Wired, Headphones]|
|  176561|    Wired Headphones|               1|     11.99|04/30/19 09:27|333 8th St, Los A...|  1007.16| [Wire

In [37]:
from pyspark.sql.functions import explode, split
data = data.withColumn("products_split",split(data.Product," "))
data.select("products_split").show(5)

+--------------------+
|      products_split|
+--------------------+
|[USB-C, Charging,...|
|[Bose, SoundSport...|
|     [Google, Phone]|
| [Wired, Headphones]|
| [Wired, Headphones]|
+--------------------+
only showing top 5 rows



In [41]:
## Union
price_rdd.union(product_rdd).collect()

[1003.8,
 8399.16,
 50400.0,
 1007.16,
 1007.16,
 1003.8,
 8399.16,
 1003.8,
 142800.0,
 1007.16,
 50400.0,
 1255.8,
 32759.16,
 322.56,
 1255.8,
 12600.0,
 1003.8,
 50400.0,
 1003.8,
 251.16000000000003,
 12600.0,
 12600.0,
 12600.0,
 322.56,
 1003.8,
 58800.0,
 8399.16,
 251.16000000000003,
 25200.0,
 8399.16,
 8399.16,
 251.16000000000003,
 50400.0,
 12599.16,
 9239.16,
 1255.8,
 50400.0,
 12600.0,
 1003.8,
 1255.8,
 1007.16,
 1007.16,
 12600.0,
 1003.8,
 251.16000000000003,
 1255.8,
 32759.16,
 1003.8,
 1255.8,
 1003.8,
 1003.8,
 1003.8,
 50400.0,
 251.16000000000003,
 58800.0,
 12600.0,
 251.16000000000003,
 8399.16,
 12600.0,
 1003.8,
 322.56,
 1255.8,
 1003.8,
 12600.0,
 1255.8,
 25200.0,
 1003.8,
 1255.8,
 322.56,
 12599.16,
 322.56,
 251.16000000000003,
 8399.16,
 1007.16,
 1255.8,
 1003.8,
 50400.0,
 1003.8,
 251.16000000000003,
 83999.16,
 1255.8,
 33600.0,
 322.56,
 251.16000000000003,
 1003.8,
 142800.0,
 1007.16,
 1255.8,
 322.56,
 142800.0,
 1003.8,
 251.16000000000003,


### Wide Transformations :
Wide transformations are transformations in Spark that require shuffling of data between partitions.

- groupByKey: groups the values for each key in an RDD and returns a new RDD containing the - grouped values.
- reduceByKey: aggregates the values for each key in an RDD and returns a new RDD containing the - reduced values.
- aggregateByKey: aggregates the values for each key in an RDD using a user-defined aggregation - function and returns a new RDD containing the aggregated values.
- join: joins two RDDs on a key and returns a new RDD containing the joined values.
- cogroup: groups the values for each key in two RDDs and returns a new RDD containing the - grouped values.
- repartition: rearranges the partitions of an RDD and returns a new RDD with the desired number - of partitions.
- sortByKey: sorts the elements of an RDD based on the keys and returns a new RDD sorted by the - keys.
- coalesce:

In [50]:
product_rdd.collect()

['USB-C',
 'Charging',
 'Cable',
 'Bose',
 'SoundSport',
 'Headphones',
 'Google',
 'Phone',
 'Wired',
 'Headphones',
 'Wired',
 'Headphones',
 'USB-C',
 'Charging',
 'Cable',
 'Bose',
 'SoundSport',
 'Headphones',
 'USB-C',
 'Charging',
 'Cable',
 'Macbook',
 'Pro',
 'Laptop',
 'Wired',
 'Headphones',
 'Google',
 'Phone',
 'Lightning',
 'Charging',
 'Cable',
 '27in',
 '4K',
 'Gaming',
 'Monitor',
 'AA',
 'Batteries',
 '(4-pack)',
 'Lightning',
 'Charging',
 'Cable',
 'Apple',
 'Airpods',
 'Headphones',
 'USB-C',
 'Charging',
 'Cable',
 'Google',
 'Phone',
 'USB-C',
 'Charging',
 'Cable',
 'AAA',
 'Batteries',
 '(4-pack)',
 'Apple',
 'Airpods',
 'Headphones',
 'Apple',
 'Airpods',
 'Headphones',
 'Apple',
 'Airpods',
 'Headphones',
 'AA',
 'Batteries',
 '(4-pack)',
 'USB-C',
 'Charging',
 'Cable',
 'iPhone',
 'Bose',
 'SoundSport',
 'Headphones',
 'AAA',
 'Batteries',
 '(4-pack)',
 'Flatscreen',
 'TV',
 'Bose',
 'SoundSport',
 'Headphones',
 'Bose',
 'SoundSport',
 'Headphones',
 'AAA'

In [51]:
## groupByKey require key value pairs 
product_rdd = product_rdd.map(lambda x: (x,1))
product_rdd.collect()

[('USB-C', 1),
 ('Charging', 1),
 ('Cable', 1),
 ('Bose', 1),
 ('SoundSport', 1),
 ('Headphones', 1),
 ('Google', 1),
 ('Phone', 1),
 ('Wired', 1),
 ('Headphones', 1),
 ('Wired', 1),
 ('Headphones', 1),
 ('USB-C', 1),
 ('Charging', 1),
 ('Cable', 1),
 ('Bose', 1),
 ('SoundSport', 1),
 ('Headphones', 1),
 ('USB-C', 1),
 ('Charging', 1),
 ('Cable', 1),
 ('Macbook', 1),
 ('Pro', 1),
 ('Laptop', 1),
 ('Wired', 1),
 ('Headphones', 1),
 ('Google', 1),
 ('Phone', 1),
 ('Lightning', 1),
 ('Charging', 1),
 ('Cable', 1),
 ('27in', 1),
 ('4K', 1),
 ('Gaming', 1),
 ('Monitor', 1),
 ('AA', 1),
 ('Batteries', 1),
 ('(4-pack)', 1),
 ('Lightning', 1),
 ('Charging', 1),
 ('Cable', 1),
 ('Apple', 1),
 ('Airpods', 1),
 ('Headphones', 1),
 ('USB-C', 1),
 ('Charging', 1),
 ('Cable', 1),
 ('Google', 1),
 ('Phone', 1),
 ('USB-C', 1),
 ('Charging', 1),
 ('Cable', 1),
 ('AAA', 1),
 ('Batteries', 1),
 ('(4-pack)', 1),
 ('Apple', 1),
 ('Airpods', 1),
 ('Headphones', 1),
 ('Apple', 1),
 ('Airpods', 1),
 ('Headpho

In [53]:

for key, values in product_rdd.groupByKey().collect():
    print(f"{key}: {list(values)}")

Lightning: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [57]:
product_rdd.reduceByKey(lambda a,b: a+b).collect()

[('Lightning', 21658),
 ('Monitor', 24019),
 ('Airpods', 15549),
 ('AAA', 20641),
 ('LG', 1312),
 ('ThinkPad', 4128),
 ('Bose', 13325),
 ('SoundSport', 13325),
 ('4K', 6230),
 ('Gaming', 6230),
 ('Batteries', 41218),
 ('(4-pack)', 41218),
 ('TV', 4800),
 ('Vareebadd', 2065),
 ('Washing', 666),
 ('Machine', 666),
 ('Headphones', 47756),
 ('Phone', 7590),
 ('Macbook', 4724),
 ('Laptop', 8852),
 ('27in', 13737),
 ('iPhone', 6842),
 ('20in', 4101),
 ('Ultrawide', 6181),
 ('USB-C', 21903),
 ('Charging', 43561),
 ('Cable', 43561),
 ('Google', 5525),
 ('Wired', 18882),
 ('Pro', 4724),
 ('AA', 20577),
 ('Apple', 15549),
 ('Flatscreen', 4800),
 ('FHD', 7507),
 ('Dryer', 646),
 ('34in', 6181)]

In [58]:
for key, value in product_rdd.reduceByKey(lambda a,b: a+b).collect():
    print(f"{key}: {value}")

Lightning: 21658
Monitor: 24019
Airpods: 15549
AAA: 20641
LG: 1312
ThinkPad: 4128
Bose: 13325
SoundSport: 13325
4K: 6230
Gaming: 6230
Batteries: 41218
(4-pack): 41218
TV: 4800
Vareebadd: 2065
Washing: 666
Machine: 666
Headphones: 47756
Phone: 7590
Macbook: 4724
Laptop: 8852
27in: 13737
iPhone: 6842
20in: 4101
Ultrawide: 6181
USB-C: 21903
Charging: 43561
Cable: 43561
Google: 5525
Wired: 18882
Pro: 4724
AA: 20577
Apple: 15549
Flatscreen: 4800
FHD: 7507
Dryer: 646
34in: 6181


## Actions 
- 

In [7]:
from pyspark.sql.functions import col
from pyspark.sql.functions import sum,min,max,avg,count
from pyspark.sql.functions import upper,round,to_date,date_format,to_timestamp

In [8]:
sales = spark.read.csv('./data/SalesAnalysis.csv',header=True,inferSchema=True)
sales.show(5)

+--------+--------------------+----------------+----------+--------------+--------------------+
|Order ID|             Product|Quantity Ordered|Price Each|    Order Date|    Purchase Address|
+--------+--------------------+----------------+----------+--------------+--------------------+
|  176558|USB-C Charging Cable|               2|     11.95|04/19/19 08:46|917 1st St, Dalla...|
|    NULL|                NULL|            NULL|      NULL|          NULL|                NULL|
|  176559|Bose SoundSport H...|               1|     99.99|04/07/19 22:30|682 Chestnut St, ...|
|  176560|        Google Phone|               1|     600.0|04/12/19 14:38|669 Spruce St, Lo...|
|  176560|    Wired Headphones|               1|     11.99|04/12/19 14:38|669 Spruce St, Lo...|
+--------+--------------------+----------------+----------+--------------+--------------------+
only showing top 5 rows



In [9]:
type(sales)

pyspark.sql.dataframe.DataFrame

In [59]:
sales.createOrReplaceTempView('sales_view')

In [60]:
spark.sql('select lower(Product) from sales_view').show()

+--------------------+
|      lower(Product)|
+--------------------+
|usb-c charging cable|
|                NULL|
|bose soundsport h...|
|        google phone|
|    wired headphones|
|    wired headphones|
|usb-c charging cable|
|bose soundsport h...|
|usb-c charging cable|
|  macbook pro laptop|
|    wired headphones|
|        google phone|
|lightning chargin...|
|27in 4k gaming mo...|
|aa batteries (4-p...|
|lightning chargin...|
|apple airpods hea...|
|usb-c charging cable|
|        google phone|
|usb-c charging cable|
+--------------------+
only showing top 20 rows



In [11]:
spark.sql('Select distinct(Product) from sales_view;').show()

+--------------------+
|             Product|
+--------------------+
|    Wired Headphones|
|  Macbook Pro Laptop|
|Apple Airpods Hea...|
|              iPhone|
|Lightning Chargin...|
|Bose SoundSport H...|
|USB-C Charging Cable|
|AAA Batteries (4-...|
|        20in Monitor|
|    27in FHD Monitor|
|     Vareebadd Phone|
|34in Ultrawide Mo...|
|            LG Dryer|
|AA Batteries (4-p...|
|        Google Phone|
|       Flatscreen TV|
|  LG Washing Machine|
|             Product|
|27in 4K Gaming Mo...|
|     ThinkPad Laptop|
+--------------------+
only showing top 20 rows



In [61]:
from pyspark.sql.functions import regexp_extract
spark.sql("Select distinct(Product) from sales_view where lower(product) rlike 'usb'").show()

+--------------------+
|             Product|
+--------------------+
|USB-C Charging Cable|
+--------------------+



In [None]:
spark.sql("Select distinct(Product) from sales_view where lower(product) rlike 'usb'").show()

In [None]:
data.withColumn("first_word", regexp_extract("Product", r"(\w+)", 0)).show()


+--------+--------------------+----------------+----------+--------------+--------------------+------------------+--------------------+----------+
|Order ID|             Product|Quantity Ordered|Price Each|    Order Date|    Purchase Address|         Price_usd|      products_split|first_word|
+--------+--------------------+----------------+----------+--------------+--------------------+------------------+--------------------+----------+
|  176558|USB-C Charging Cable|               2|     11.95|04/19/19 08:46|917 1st St, Dalla...|            1003.8|[USB-C, Charging,...|       USB|
|  176559|Bose SoundSport H...|               1|     99.99|04/07/19 22:30|682 Chestnut St, ...|           8399.16|[Bose, SoundSport...|      Bose|
|  176560|        Google Phone|               1|     600.0|04/12/19 14:38|669 Spruce St, Lo...|           50400.0|     [Google, Phone]|    Google|
|  176560|    Wired Headphones|               1|     11.99|04/12/19 14:38|669 Spruce St, Lo...|           1007.16| [Wi

In [67]:
data.withColumn("Door",regexp_extract("Purchase Address", r"(\d+)", 0)).show()

+--------+--------------------+----------------+----------+--------------+--------------------+------------------+--------------------+----+
|Order ID|             Product|Quantity Ordered|Price Each|    Order Date|    Purchase Address|         Price_usd|      products_split|Door|
+--------+--------------------+----------------+----------+--------------+--------------------+------------------+--------------------+----+
|  176558|USB-C Charging Cable|               2|     11.95|04/19/19 08:46|917 1st St, Dalla...|            1003.8|[USB-C, Charging,...| 917|
|  176559|Bose SoundSport H...|               1|     99.99|04/07/19 22:30|682 Chestnut St, ...|           8399.16|[Bose, SoundSport...| 682|
|  176560|        Google Phone|               1|     600.0|04/12/19 14:38|669 Spruce St, Lo...|           50400.0|     [Google, Phone]| 669|
|  176560|    Wired Headphones|               1|     11.99|04/12/19 14:38|669 Spruce St, Lo...|           1007.16| [Wired, Headphones]| 669|
|  176561|   

In [68]:
sales.groupBy('Product').agg(sum('Price Each')/count('Product')).show()

+--------------------+----------------------------------+
|             Product|(sum(Price Each) / count(Product))|
+--------------------+----------------------------------+
|    Wired Headphones|                11.989999999999549|
|  Macbook Pro Laptop|                            1700.0|
|Apple Airpods Hea...|                             150.0|
|              iPhone|                             700.0|
|                NULL|                              NULL|
|Lightning Chargin...|                14.949999999998477|
|Bose SoundSport H...|                 99.98999999999525|
|USB-C Charging Cable|                11.949999999998806|
|AAA Batteries (4-...|                2.9899999999998124|
|        20in Monitor|                 109.9900000000018|
|    27in FHD Monitor|                 149.9899999999961|
|     Vareebadd Phone|                             400.0|
|34in Ultrawide Mo...|                379.98999999999336|
|            LG Dryer|                             600.0|
|AA Batteries 

In [69]:
spark.sql("select * from sales_view where 'Quantity Ordered' > 2 ;").show()

+--------+-------+----------------+----------+----------+----------------+
|Order ID|Product|Quantity Ordered|Price Each|Order Date|Purchase Address|
+--------+-------+----------------+----------+----------+----------------+
+--------+-------+----------------+----------+----------+----------------+



In [70]:
sales  = sales.withColumnRenamed('Quantity Ordered','Quantity_Ordered')

In [71]:
sales.show()

+--------+--------------------+----------------+----------+--------------+--------------------+
|Order ID|             Product|Quantity_Ordered|Price Each|    Order Date|    Purchase Address|
+--------+--------------------+----------------+----------+--------------+--------------------+
|  176558|USB-C Charging Cable|               2|     11.95|04/19/19 08:46|917 1st St, Dalla...|
|    NULL|                NULL|            NULL|      NULL|          NULL|                NULL|
|  176559|Bose SoundSport H...|               1|     99.99|04/07/19 22:30|682 Chestnut St, ...|
|  176560|        Google Phone|               1|     600.0|04/12/19 14:38|669 Spruce St, Lo...|
|  176560|    Wired Headphones|               1|     11.99|04/12/19 14:38|669 Spruce St, Lo...|
|  176561|    Wired Headphones|               1|     11.99|04/30/19 09:27|333 8th St, Los A...|
|  176562|USB-C Charging Cable|               1|     11.95|04/29/19 13:03|381 Wilson St, Sa...|
|  176563|Bose SoundSport H...|         

In [73]:
sales.createOrReplaceTempView('sales_view')

In [74]:
spark.sql("select * from sales_view where Quantity_Ordered > 1 limit 5;").show()

+--------+--------------------+----------------+----------+--------------+--------------------+
|Order ID|             Product|Quantity_Ordered|Price Each|    Order Date|    Purchase Address|
+--------+--------------------+----------------+----------+--------------+--------------------+
|  176558|USB-C Charging Cable|               2|     11.95|04/19/19 08:46|917 1st St, Dalla...|
|  176583|AAA Batteries (4-...|               2|      2.99|04/20/19 12:00|146 Jackson St, P...|
|  176586|AAA Batteries (4-...|               2|      2.99|04/10/19 17:00|365 Center St, Sa...|
|  176593|Lightning Chargin...|               2|     14.95|04/15/19 13:45|906 7th St, Portl...|
|  176595|    Wired Headphones|               3|     11.99|04/02/19 09:11|383 6th St, Los A...|
+--------+--------------------+----------------+----------+--------------+--------------------+



In [75]:
sales.filter(sales.Quantity_Ordered> 2).show(5)

+--------+--------------------+----------------+----------+--------------+--------------------+
|Order ID|             Product|Quantity_Ordered|Price Each|    Order Date|    Purchase Address|
+--------+--------------------+----------------+----------+--------------+--------------------+
|  176595|    Wired Headphones|               3|     11.99|04/02/19 09:11|383 6th St, Los A...|
|  176645|AAA Batteries (4-...|               3|      2.99|04/26/19 18:38|514 Lake St, Dall...|
|  176674|AAA Batteries (4-...|               3|      2.99|04/20/19 20:53|907 West St, Aust...|
|  176841|AAA Batteries (4-...|               3|      2.99|04/26/19 22:50|177 Highland St, ...|
|  176847|AA Batteries (4-p...|               3|      3.84|04/03/19 16:00|616 9th St, Austi...|
+--------+--------------------+----------------+----------+--------------+--------------------+
only showing top 5 rows



- The problem is that you are using the filter method of the DataFrame class, which expects a SQL expression as a string or a Column object as the condition. However, you are passing a lambda function as the condition, which is not a valid argument for this method.

In [76]:
sales.filter(col('Quantity_Ordered') > 2).show(5)

+--------+--------------------+----------------+----------+--------------+--------------------+
|Order ID|             Product|Quantity_Ordered|Price Each|    Order Date|    Purchase Address|
+--------+--------------------+----------------+----------+--------------+--------------------+
|  176595|    Wired Headphones|               3|     11.99|04/02/19 09:11|383 6th St, Los A...|
|  176645|AAA Batteries (4-...|               3|      2.99|04/26/19 18:38|514 Lake St, Dall...|
|  176674|AAA Batteries (4-...|               3|      2.99|04/20/19 20:53|907 West St, Aust...|
|  176841|AAA Batteries (4-...|               3|      2.99|04/26/19 22:50|177 Highland St, ...|
|  176847|AA Batteries (4-p...|               3|      3.84|04/03/19 16:00|616 9th St, Austi...|
+--------+--------------------+----------------+----------+--------------+--------------------+
only showing top 5 rows



In [77]:
# order by
sales.filter(col('Quantity_Ordered') > 2).orderBy('Product',ascending=False).show(5)

+--------+----------------+----------------+----------+--------------+--------------------+
|Order ID|         Product|Quantity_Ordered|Price Each|    Order Date|    Purchase Address|
+--------+----------------+----------------+----------+--------------+--------------------+
|  291169|Wired Headphones|               3|     11.99|11/01/19 21:30|228 Highland St, ...|
|  315983|Wired Headphones|               4|     11.99|12/12/19 12:59|520 Wilson St, Sa...|
|  182857|Wired Headphones|               3|     11.99|04/24/19 12:12|817 Lake St, New ...|
|  146446|Wired Headphones|               3|     11.99|01/01/19 18:20|678 2nd St, New Y...|
|  221301|Wired Headphones|               3|     11.99|06/15/19 13:15|785 Hill St, New ...|
+--------+----------------+----------------+----------+--------------+--------------------+
only showing top 5 rows



In [78]:
# order by
sales.filter(col('Quantity_Ordered') > 2).orderBy(col('Product').desc(),col('Order ID').asc()).show(5)

+--------+----------------+----------------+----------+--------------+--------------------+
|Order ID|         Product|Quantity_Ordered|Price Each|    Order Date|    Purchase Address|
+--------+----------------+----------------+----------+--------------+--------------------+
|  146446|Wired Headphones|               3|     11.99|01/01/19 18:20|678 2nd St, New Y...|
|  146495|Wired Headphones|               3|     11.99|01/18/19 10:17|296 7th St, Bosto...|
|  147101|Wired Headphones|               3|     11.99|01/26/19 10:08|898 Center St, Sa...|
|  150417|Wired Headphones|               3|     11.99|01/23/19 22:08|351 13th St, Atla...|
|  150439|Wired Headphones|               3|     11.99|01/27/19 20:08|193 9th St, San F...|
+--------+----------------+----------------+----------+--------------+--------------------+
only showing top 5 rows



In [79]:
sales = sales.withColumnRenamed('Order ID','Order_ID')

In [80]:
sales.select('Product','Order_ID').filter(col('Quantity_Ordered') > 2).orderBy(col('Product').desc(),col('Order ID').asc()).show(5)

+----------------+--------+
|         Product|Order_ID|
+----------------+--------+
|Wired Headphones|  146446|
|Wired Headphones|  146495|
|Wired Headphones|  147101|
|Wired Headphones|  150417|
|Wired Headphones|  150439|
+----------------+--------+
only showing top 5 rows



In [81]:
sales = sales.withColumnRenamed('Price Each','Price_Each')

In [82]:
sales.select('Product','Order_ID').filter((col('Quantity_Ordered') > 2) & (col('Price_Each') == 11.99 )).orderBy(col('Product').desc(),col('Order ID').asc()).show(5)

+----------------+--------+
|         Product|Order_ID|
+----------------+--------+
|Wired Headphones|  146446|
|Wired Headphones|  146495|
|Wired Headphones|  147101|
|Wired Headphones|  150417|
|Wired Headphones|  150439|
+----------------+--------+
only showing top 5 rows



In [83]:
sales.groupBy('Product').agg(sum('Quantity_Ordered')).show(5)

+--------------------+---------------------+
|             Product|sum(Quantity_Ordered)|
+--------------------+---------------------+
|    Wired Headphones|                20557|
|  Macbook Pro Laptop|                 4728|
|Apple Airpods Hea...|                15661|
|              iPhone|                 6849|
|                NULL|                 NULL|
+--------------------+---------------------+
only showing top 5 rows



In [84]:
#having
sales.groupBy('Product') \
    .agg(sum('Quantity_Ordered').alias('total_quantity')) \
    .filter(col('total_quantity') > 100) \
    .show(5)

+--------------------+--------------+
|             Product|total_quantity|
+--------------------+--------------+
|    Wired Headphones|         20557|
|  Macbook Pro Laptop|          4728|
|Apple Airpods Hea...|         15661|
|              iPhone|          6849|
|Lightning Chargin...|         23217|
+--------------------+--------------+
only showing top 5 rows



### functions

In [85]:
#upper
from pyspark.sql.functions import upper
sales.withColumn("product_upp",upper(col('Product'))).select(col('product'),col('product_upp')).show(5)

+--------------------+--------------------+
|             product|         product_upp|
+--------------------+--------------------+
|USB-C Charging Cable|USB-C CHARGING CABLE|
|                NULL|                NULL|
|Bose SoundSport H...|BOSE SOUNDSPORT H...|
|        Google Phone|        GOOGLE PHONE|
|    Wired Headphones|    WIRED HEADPHONES|
+--------------------+--------------------+
only showing top 5 rows



In [86]:
sales.show()

+--------+--------------------+----------------+----------+--------------+--------------------+
|Order_ID|             Product|Quantity_Ordered|Price_Each|    Order Date|    Purchase Address|
+--------+--------------------+----------------+----------+--------------+--------------------+
|  176558|USB-C Charging Cable|               2|     11.95|04/19/19 08:46|917 1st St, Dalla...|
|    NULL|                NULL|            NULL|      NULL|          NULL|                NULL|
|  176559|Bose SoundSport H...|               1|     99.99|04/07/19 22:30|682 Chestnut St, ...|
|  176560|        Google Phone|               1|     600.0|04/12/19 14:38|669 Spruce St, Lo...|
|  176560|    Wired Headphones|               1|     11.99|04/12/19 14:38|669 Spruce St, Lo...|
|  176561|    Wired Headphones|               1|     11.99|04/30/19 09:27|333 8th St, Los A...|
|  176562|USB-C Charging Cable|               1|     11.95|04/29/19 13:03|381 Wilson St, Sa...|
|  176563|Bose SoundSport H...|         

In [87]:
sales = sales.withColumnRenamed('Purchase Address','Purchase_Address')

In [88]:
#substring
sales.withColumn('subs',col('Purchase_Address').substr(1,4)).show(5)

+--------+--------------------+----------------+----------+--------------+--------------------+----+
|Order_ID|             Product|Quantity_Ordered|Price_Each|    Order Date|    Purchase_Address|subs|
+--------+--------------------+----------------+----------+--------------+--------------------+----+
|  176558|USB-C Charging Cable|               2|     11.95|04/19/19 08:46|917 1st St, Dalla...|917 |
|    NULL|                NULL|            NULL|      NULL|          NULL|                NULL|NULL|
|  176559|Bose SoundSport H...|               1|     99.99|04/07/19 22:30|682 Chestnut St, ...|682 |
|  176560|        Google Phone|               1|     600.0|04/12/19 14:38|669 Spruce St, Lo...|669 |
|  176560|    Wired Headphones|               1|     11.99|04/12/19 14:38|669 Spruce St, Lo...|669 |
+--------+--------------------+----------------+----------+--------------+--------------------+----+
only showing top 5 rows



In [89]:
sales.printSchema()

root
 |-- Order_ID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Quantity_Ordered: integer (nullable = true)
 |-- Price_Each: double (nullable = true)
 |-- Order Date: string (nullable = true)
 |-- Purchase_Address: string (nullable = true)



In [90]:
sales = sales.withColumnRenamed('Price Each','Price_Each')

In [91]:
sales = sales.withColumnRenamed('Order Date','Order_Date')

In [100]:
sales.withColumn('Revenue',round(col('Quantity_Ordered')*col('Price_Each'),2)).select('Product','Revenue').show()

+--------------------+-------+
|             Product|Revenue|
+--------------------+-------+
|USB-C Charging Cable|   23.9|
|                NULL|   NULL|
|Bose SoundSport H...|  99.99|
|        Google Phone|  600.0|
|    Wired Headphones|  11.99|
|    Wired Headphones|  11.99|
|USB-C Charging Cable|  11.95|
|Bose SoundSport H...|  99.99|
|USB-C Charging Cable|  11.95|
|  Macbook Pro Laptop| 1700.0|
|    Wired Headphones|  11.99|
|        Google Phone|  600.0|
|Lightning Chargin...|  14.95|
|27in 4K Gaming Mo...| 389.99|
|AA Batteries (4-p...|   3.84|
|Lightning Chargin...|  14.95|
|Apple Airpods Hea...|  150.0|
|USB-C Charging Cable|  11.95|
|        Google Phone|  600.0|
|USB-C Charging Cable|  11.95|
+--------------------+-------+
only showing top 20 rows



- Date functions

In [101]:
sales.printSchema()

root
 |-- Order_ID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Quantity_Ordered: integer (nullable = true)
 |-- Price_Each: double (nullable = true)
 |-- Order_Date: string (nullable = true)
 |-- Purchase_Address: string (nullable = true)
 |-- std_fmt: date (nullable = true)



In [102]:
sales = sales.withColumn('std_fmt', to_date(col('Order_Date'), 'MM/dd/yy HH:mm'))

In [95]:
#spark.conf.set('spark.sql.legacy.timeParserPolicy', 'LEGACY')
#spark.conf.set('spark.sql.legacy.timeParserPolicy', 'CORRECTED')

In [96]:
sales.show(4)

+--------+--------------------+----------------+----------+--------------+--------------------+----------+
|Order_ID|             Product|Quantity_Ordered|Price_Each|    Order_Date|    Purchase_Address|   std_fmt|
+--------+--------------------+----------------+----------+--------------+--------------------+----------+
|  176558|USB-C Charging Cable|               2|     11.95|04/19/19 08:46|917 1st St, Dalla...|2019-04-19|
|    NULL|                NULL|            NULL|      NULL|          NULL|                NULL|      NULL|
|  176559|Bose SoundSport H...|               1|     99.99|04/07/19 22:30|682 Chestnut St, ...|2019-04-07|
|  176560|        Google Phone|               1|     600.0|04/12/19 14:38|669 Spruce St, Lo...|2019-04-12|
+--------+--------------------+----------------+----------+--------------+--------------------+----------+
only showing top 4 rows



In [97]:
sales.withColumn('k_fmt', date_format(to_timestamp(col('Order_Date'),'MM/dd/yy HH:mm'),'dd/MM/yy HH:mm')).show(4)

+--------+--------------------+----------------+----------+--------------+--------------------+----------+--------------+
|Order_ID|             Product|Quantity_Ordered|Price_Each|    Order_Date|    Purchase_Address|   std_fmt|         k_fmt|
+--------+--------------------+----------------+----------+--------------+--------------------+----------+--------------+
|  176558|USB-C Charging Cable|               2|     11.95|04/19/19 08:46|917 1st St, Dalla...|2019-04-19|19/04/19 08:46|
|    NULL|                NULL|            NULL|      NULL|          NULL|                NULL|      NULL|          NULL|
|  176559|Bose SoundSport H...|               1|     99.99|04/07/19 22:30|682 Chestnut St, ...|2019-04-07|07/04/19 22:30|
|  176560|        Google Phone|               1|     600.0|04/12/19 14:38|669 Spruce St, Lo...|2019-04-12|12/04/19 14:38|
+--------+--------------------+----------------+----------+--------------+--------------------+----------+--------------+
only showing top 4 rows


In [98]:
sales.agg(min('Price_Each')).show()

+---------------+
|min(Price_Each)|
+---------------+
|           2.99|
+---------------+



### Joins

In [99]:
j_df = df1.join(df2, df1['d']==df2['k',"inner"])

NameError: name 'df1' is not defined

In [None]:
df.na.fill("",["column"])

### Creating new columns

In [None]:
from pyspark.sql.functions import lit
from pyspark.sql.types import IntegerType
sales = sales.withColumn('new',lit(None).cast(IntegerType()))
#None can be replaced with default values

In [None]:
sales.printSchema()

root
 |-- Order_ID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Quantity_Ordered: integer (nullable = true)
 |-- Price_Each: double (nullable = true)
 |-- Order_Date: string (nullable = true)
 |-- Purchase_Address: string (nullable = true)
 |-- std_fmt: date (nullable = true)
 |-- date_fmt: string (nullable = true)
 |-- new: integer (nullable = true)



### drop columns

In [None]:
sales = sales.drop('new')

In [None]:
sales.printSchema()

root
 |-- Order_ID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Quantity_Ordered: integer (nullable = true)
 |-- Price_Each: double (nullable = true)
 |-- Order_Date: string (nullable = true)
 |-- Purchase_Address: string (nullable = true)
 |-- std_fmt: date (nullable = true)
 |-- date_fmt: string (nullable = true)

