In [1]:
# findspark.init() makes the local Spark installation's pyspark package
# importable, so it must run before the pyspark import in the next cell.
import findspark
findspark.init()

In [2]:
from pyspark import SparkConf, SparkContext

In [3]:
# Spark configuration: run against a local master ('local[4]') under the
# application name 'RDD-Join'.
conf = (
    SparkConf()
    .setMaster('local[4]')
    .setAppName('RDD-Join')
)

In [4]:
# getOrCreate() returns the already-running SparkContext when this cell is
# executed again; calling SparkContext(conf=conf) a second time in the same
# kernel fails with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)

# Read Data

In [5]:
# Read order_items.csv as an RDD of raw text lines and drop every line that
# contains the header column title 'orderItemName'.
order_items_rdd = (
    sc.textFile('order_items.csv')
    .filter(lambda row: 'orderItemName' not in row)
)

In [6]:
# Peek at the first five raw lines; the header row should no longer appear.
order_items_rdd.take(5)

['1,1,957,1,299.98,299.98',
 '2,2,1073,1,199.99,199.99',
 '3,2,502,5,250.0,50.0',
 '4,2,403,1,129.99,129.99',
 '5,4,897,2,49.98,24.99']

In [9]:
# Read products.csv as an RDD of raw text lines and drop every line that
# contains the header column title 'productCategoryId'.
products_rdd = (
    sc.textFile('products.csv')
    .filter(lambda row: 'productCategoryId' not in row)
)

In [10]:
# Peek at the first five raw product lines; the header row should no longer appear.
products_rdd.take(5)

['1,2,Quest Q64 10 FT. x 10 FT. Slant Leg Instant U,,59.98,http://images.acmesports.sports/Quest+Q64+10+FT.+x+10+FT.+Slant+Leg+Instant+Up+Canopy',
 "2,2,Under Armour Men's Highlight MC Football Clea,,129.99,http://images.acmesports.sports/Under+Armour+Men%27s+Highlight+MC+Football+Cleat",
 "3,2,Under Armour Men's Renegade D Mid Football Cl,,89.99,http://images.acmesports.sports/Under+Armour+Men%27s+Renegade+D+Mid+Football+Cleat",
 "4,2,Under Armour Men's Renegade D Mid Football Cl,,89.99,http://images.acmesports.sports/Under+Armour+Men%27s+Renegade+D+Mid+Football+Cleat",
 '5,2,Riddell Youth Revolution Speed Custom Footbal,,199.99,http://images.acmesports.sports/Riddell+Youth+Revolution+Speed+Custom+Football+Helmet']

# Convert Data to Pair RDD

In [11]:
def make_order_items_pair_rdd(line):
    """Turn one raw order-items CSV line into a (key, value) pair for joining.

    Args:
        line: One comma-separated record with at least six fields, e.g.
            '1,1,957,1,299.98,299.98'.

    Returns:
        A tuple ``(product_id, (order_item_id, order_id, quantity, subtotal,
        product_price))``. Every element is kept as the string found in the
        line; no numeric conversion is performed.

    Raises:
        IndexError: If the line has fewer than six comma-separated fields.
    """
    # Split once and reuse the result instead of re-splitting for every field.
    fields = line.split(',')

    # Index (rather than unpack) so extra trailing fields are ignored and a
    # short line still raises IndexError.
    order_item_id = fields[0]  # first column; the sample rows hold sequential integers
    order_id = fields[1]
    product_id = fields[2]
    quantity = fields[3]
    subtotal = fields[4]
    product_price = fields[5]

    # Foreign key => product_id (used as the pair key).
    return (product_id, (order_item_id, order_id, quantity, subtotal, product_price))

In [12]:
# Key every order-item line by the value returned as the pair key (its product id).
order_items_pair_rdd = order_items_rdd.map(make_order_items_pair_rdd)

In [13]:
# Inspect the first five (key, value) pairs to confirm the pair layout.
order_items_pair_rdd.take(5)

[('957', ('1', '1', '1', '299.98', '299.98')),
 ('1073', ('2', '2', '1', '199.99', '199.99')),
 ('502', ('3', '2', '5', '250.0', '50.0')),
 ('403', ('4', '2', '1', '129.99', '129.99')),
 ('897', ('5', '4', '2', '49.98', '24.99'))]

In [18]:
def make_products_pair_rdd(line):
    """Turn one raw products CSV line into a (key, value) pair for joining.

    The line is parsed with the csv module rather than ``str.split(',')`` so
    that a double-quoted field containing a comma (e.g. a product name) stays
    one field instead of shifting the price and image columns. Lines without
    double quotes parse exactly as a plain comma split would.

    Args:
        line: One comma-separated record with at least six fields.

    Returns:
        A tuple ``(product_id, (category_id, name, description, price,
        image))``. Every element is kept as a string; no numeric conversion
        is performed.

    Raises:
        IndexError: If the line has fewer than six fields.
    """
    # Imported here so the function is self-contained wherever it is executed.
    import csv

    # Parse once; csv.reader yields a single row for a single input line.
    fields = next(csv.reader([line]), [])

    # Index (rather than unpack) so extra trailing fields are ignored and a
    # short line still raises IndexError.
    product_id = fields[0]
    category_id = fields[1]
    name = fields[2]
    description = fields[3]
    price = fields[4]
    image = fields[5]

    # Join key => product_id (the products table's own id, matched against
    # the product id carried by each order item).
    return (product_id, (category_id, name, description, price, image))

In [19]:
# Key every product line by the value returned as the pair key (its product id).
products_pair_rdd = products_rdd.map(make_products_pair_rdd)

In [20]:
# Inspect the first five (key, value) pairs to confirm the pair layout.
products_pair_rdd.take(5)

[('1',
  ('2',
   'Quest Q64 10 FT. x 10 FT. Slant Leg Instant U',
   '',
   '59.98',
   'http://images.acmesports.sports/Quest+Q64+10+FT.+x+10+FT.+Slant+Leg+Instant+Up+Canopy')),
 ('2',
  ('2',
   "Under Armour Men's Highlight MC Football Clea",
   '',
   '129.99',
   'http://images.acmesports.sports/Under+Armour+Men%27s+Highlight+MC+Football+Cleat')),
 ('3',
  ('2',
   "Under Armour Men's Renegade D Mid Football Cl",
   '',
   '89.99',
   'http://images.acmesports.sports/Under+Armour+Men%27s+Renegade+D+Mid+Football+Cleat')),
 ('4',
  ('2',
   "Under Armour Men's Renegade D Mid Football Cl",
   '',
   '89.99',
   'http://images.acmesports.sports/Under+Armour+Men%27s+Renegade+D+Mid+Football+Cleat')),
 ('5',
  ('2',
   'Riddell Youth Revolution Speed Custom Footbal',
   '',
   '199.99',
   'http://images.acmesports.sports/Riddell+Youth+Revolution+Speed+Custom+Football+Helmet'))]

# Join process

In [24]:
# Join the two pair RDDs on their shared key (product id). Each result has the
# shape (product_id, (order_item_fields, product_fields)).
order_items_products_pair_rdd = order_items_pair_rdd.join(products_pair_rdd)

In [26]:
# Inspect the first five joined records.
order_items_products_pair_rdd.take(5)

[('957',
  (('1', '1', '1', '299.98', '299.98'),
   ('43',
    "Diamondback Women's Serene Classic Comfort Bi",
    '',
    '299.98',
    'http://images.acmesports.sports/Diamondback+Women%27s+Serene+Classic+Comfort+Bike+2014'))),
 ('957',
  (('9', '5', '1', '299.98', '299.98'),
   ('43',
    "Diamondback Women's Serene Classic Comfort Bi",
    '',
    '299.98',
    'http://images.acmesports.sports/Diamondback+Women%27s+Serene+Classic+Comfort+Bike+2014'))),
 ('957',
  (('12', '5', '1', '299.98', '299.98'),
   ('43',
    "Diamondback Women's Serene Classic Comfort Bi",
    '',
    '299.98',
    'http://images.acmesports.sports/Diamondback+Women%27s+Serene+Classic+Comfort+Bike+2014'))),
 ('957',
  (('15', '7', '1', '299.98', '299.98'),
   ('43',
    "Diamondback Women's Serene Classic Comfort Bi",
    '',
    '299.98',
    'http://images.acmesports.sports/Diamondback+Women%27s+Serene+Classic+Comfort+Bike+2014'))),
 ('957',
  (('34', '12', '1', '299.98', '299.98'),
   ('43',
    "Diamondb

# Check the operation / Row-count sanity check

In [27]:
# Number of order-item records going into the join.
order_items_pair_rdd.count()

172198

In [28]:
# Number of product records going into the join.
products_pair_rdd.count()

1345

In [30]:
# Number of joined records. If this equals the order-item count, the join
# neither dropped nor duplicated order items overall: a smaller number would
# mean unmatched product ids, a larger one would mean duplicate product ids.
order_items_products_pair_rdd.count()

172198