In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *


# Reuse the active SparkSession if there is one, otherwise create it.
spark = SparkSession.builder.getOrCreate()

# The SparkContext is the entry point used below to build RDDs.
sc = spark.sparkContext

Low Level Transformations (map, flatMap, filter)<br>
map: map(f, preservesPartitioning=False)<br>
- Perform row level transformations where one record transforms into another record.
- The number of records in the output is equal to the number of records in the input.
- Return a new RDD by applying a function to each element of this RDD
- When we apply a map function to an RDD, a `PipelinedRDD` is produced. It is a subclass of RDD, so it has all the APIs defined in the RDD.

**Obs: Function MAP returns each line in a list.**


Creating RDD

In [3]:
# Base directory of the practice data set. Kept in one place so the notebook
# can be pointed at another copy of the files by editing a single line.
DATA_DIR = '/home/phillipefs/spark_dev/pyspark-end-to-end-developer/0 - PracticeFiles'

# Each RDD element is one raw text line of the input.
# NOTE: the name `ord` shadows Python's built-in ord(); it is kept because the
# cells below refer to it.
ord = sc.textFile(DATA_DIR + '/Orders')
ord_items = sc.textFile(DATA_DIR + '/Order_items')

#### Project all the order ids

In [18]:
# Peek at the first five raw lines of the orders RDD before projecting fields.
ord.take(5)

['1,2013-07-25 00:00:00.0,11599,CLOSED',
 '2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT',
 '3,2013-07-25 00:00:00.0,12111,COMPLETE',
 '4,2013-07-25 00:00:00.0,8827,CLOSED',
 '5,2013-07-25 00:00:00.0,11318,COMPLETE']

Using Function

In [63]:
def get_ids(line):
    """Return the first comma-separated field of ``line`` (the order id).

    If ``line`` contains no comma, the whole line is returned.
    """
    # Everything before the first comma is the first field.
    first_field, _, _ = line.partition(",")
    return first_field

# Apply get_ids to every line and show the first five results.
ord.map(get_ids).take(5)

['1', '2', '3', '4', '5']

Using Lambda

In [62]:
# Inline version: keep only the first comma-separated field of each line.
ord.map(lambda record: record.split(',')[0]).take(5)

['1', '2', '3', '4', '5']

#### Project all the Orders and their status

Using Function

In [67]:
def get_order_status(line):
    """Return fields 0 and 3 of a comma-separated ``line`` joined by '#'.

    Raises IndexError when the line has fewer than four fields.
    """
    fields = line.split(",")
    order_id = fields[0]
    status = fields[3]
    return order_id + '#' + status

# Apply get_order_status to every line and show the first five results.
ord.map(get_order_status).take(5)

['1#CLOSED', '2#PENDING_PAYMENT', '3#COMPLETE', '4#CLOSED', '5#COMPLETE']

Using Lambda

In [49]:
# Two equivalent lambdas: string concatenation vs. "#".join.
# A notebook cell only displays its last expression, so both results are kept
# in variables; otherwise the first job would run and its result be discarded.
status_concat = ord.map(lambda x: x.split(',')[0] + "#" + x.split(',')[3]).take(5)
status_join = ord.map(lambda x: "#".join((x.split(',')[0], x.split(',')[3]))).take(5)

# Both formulations must give the same answer.
assert status_concat == status_join
status_join

['1#CLOSED', '2#PENDING_PAYMENT', '3#COMPLETE', '4#CLOSED', '5#COMPLETE']

#### Convert the Order date into YYYY/MM/DD Format

Using Lambda

In [51]:
# Take field 1, keep the part before the first space, and replace '-' with '/'.
ord.map(
    lambda record: record.split(',')[1].split(" ")[0].replace("-", "/")
).take(5)

['2013/07/25', '2013/07/25', '2013/07/25', '2013/07/25', '2013/07/25']

#### Convert the Order date into YYYY/MM/DD Format (keeping the time part) and Return Full Line

Using Function

In [69]:
def convert_date(line):
    """Return ``line`` with every '-' in its second field replaced by '/'.

    The line is split on ',', only field 1 is rewritten, and the fields are
    joined back with ','. Everything else in the field (for example a time
    part) is left as is. Raises IndexError when there is no second field.
    """
    fields = line.split(',')
    fields[1] = fields[1].replace("-", "/")
    return ",".join(fields)

# Apply convert_date to every line and show the first five results.
ord.map(convert_date).take(5)

['1,2013/07/25 00:00:00.0,11599,CLOSED',
 '2,2013/07/25 00:00:00.0,256,PENDING_PAYMENT',
 '3,2013/07/25 00:00:00.0,12111,COMPLETE',
 '4,2013/07/25 00:00:00.0,8827,CLOSED',
 '5,2013/07/25 00:00:00.0,11318,COMPLETE']

Using Lambda

In [70]:
# Inline version: rebuild the line from its first four fields, replacing '-'
# with '/' in field 1 only.
ord.map(
    lambda record: ",".join((
        record.split(',')[0],
        record.split(',')[1].replace("-", "/"),
        record.split(',')[2],
        record.split(',')[3],
    ))
).take(5)

['1,2013/07/25 00:00:00.0,11599,CLOSED',
 '2,2013/07/25 00:00:00.0,256,PENDING_PAYMENT',
 '3,2013/07/25 00:00:00.0,12111,COMPLETE',
 '4,2013/07/25 00:00:00.0,8827,CLOSED',
 '5,2013/07/25 00:00:00.0,11318,COMPLETE']

#### Create key-value pairs key as Order Id and values as whole records

Using Lambda

In [13]:
# Build (key, value) tuples: key is the first field, value is the whole line.
ord.map(lambda record: (record.split(',')[0], record)).take(5)

[('1', '1,2013-07-25 00:00:00.0,11599,CLOSED'),
 ('2', '2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT'),
 ('3', '3,2013-07-25 00:00:00.0,12111,COMPLETE'),
 ('4', '4,2013-07-25 00:00:00.0,8827,CLOSED'),
 ('5', '5,2013-07-25 00:00:00.0,11318,COMPLETE')]

Using function

In [16]:
def get_order_value(line):
    """Return a ``(key, value)`` tuple for ``line``.

    The key is the first comma-separated field; the value is the unmodified
    line.
    """
    order_key = line.split(',')[0]
    return order_key, line


# Apply get_order_value to every line and show the first five (key, line) pairs.
ord.map(get_order_value).take(5)

[('1', '1,2013-07-25 00:00:00.0,11599,CLOSED'),
 ('2', '2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT'),
 ('3', '3,2013-07-25 00:00:00.0,12111,COMPLETE'),
 ('4', '4,2013-07-25 00:00:00.0,8827,CLOSED'),
 ('5', '5,2013-07-25 00:00:00.0,11318,COMPLETE')]

#### Project all the Order_item_id and their subtotal

In [19]:
# Peek at the first five raw lines of the order-items RDD.
ord_items.take(5)

['1,1,957,1,299.98,299.98',
 '2,2,1073,1,199.99,199.99',
 '3,2,502,5,250.0,50.0',
 '4,2,403,1,129.99,129.99',
 '5,4,897,2,49.98,24.99']

Using Lambda

In [21]:
# Pair field 0 with field 4 of every order-item line (both stay strings).
ord_items.map(
    lambda item: (item.split(',')[0], item.split(',')[4])
).take(5)

[('1', '299.98'),
 ('2', '199.99'),
 ('3', '250.0'),
 ('4', '129.99'),
 ('5', '49.98')]

#### Apply a user-defined function to convert the status to lowercase

Using Function

In [24]:
def status_lower_case(line):
    """Return ``line`` with its fourth comma-separated field lower-cased.

    All other fields are passed through unchanged. Raises IndexError when the
    line has fewer than four fields.
    """
    fields = line.split(',')
    fields[3] = fields[3].lower()
    return ",".join(fields)

# Apply status_lower_case to every line and show the first five results.
ord.map(status_lower_case).take(5)

['1,2013-07-25 00:00:00.0,11599,closed',
 '2,2013-07-25 00:00:00.0,256,pending_payment',
 '3,2013-07-25 00:00:00.0,12111,complete',
 '4,2013-07-25 00:00:00.0,8827,closed',
 '5,2013-07-25 00:00:00.0,11318,complete']

In [25]:
# Show the source RDD again: the map above produced a new RDD, so `ord` itself
# still holds the original lines.
ord.take(5)

['1,2013-07-25 00:00:00.0,11599,CLOSED',
 '2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT',
 '3,2013-07-25 00:00:00.0,12111,COMPLETE',
 '4,2013-07-25 00:00:00.0,8827,CLOSED',
 '5,2013-07-25 00:00:00.0,11318,COMPLETE']