In [0]:
from pyspark.sql import SparkSession
from pyspark import SparkContext

SpSession = SparkSession \
  .builder \
  .appName("DBA - Spark Key Value") \
  .getOrCreate()

SpContext = SpSession.sparkContext

In [0]:
collData = SpContext.parallelize([4,3,8,5,8])
inputPath = "/FileStore/tables/auto_data.csv"
autoData = SpContext.textFile(inputPath)
autoData.cache()

Out[2]: /FileStore/tables/auto_data.csv MapPartitionsRDD[56] at textFile at NativeMethodAccessorImpl.java:0

In [0]:
cylData = autoData.map(lambda x: (x.split(",")[0], x.split(",")[7]))
print("map_rddr")
for i in cylData.take(5):
  print(i)

map_rddr
('MAKE', 'HP')
('subaru', '69')
('chevrolet', '48')
('mazda', '68')
('toyota', '62')


In [0]:
cylData.keys().collect()

Out[4]: ['MAKE',
 'subaru',
 'chevrolet',
 'mazda',
 'toyota',
 'mitsubishi',
 'honda',
 'nissan',
 'dodge',
 'plymouth',
 'mazda',
 'mitsubishi',
 'dodge',
 'plymouth',
 'chevrolet',
 'toyota',
 'dodge',
 'honda',
 'toyota',
 'honda',
 'chevrolet',
 'nissan',
 'mitsubishi',
 'dodge',
 'plymouth',
 'mazda',
 'isuzu',
 'mazda',
 'nissan',
 'honda',
 'toyota',
 'toyota',
 'mitsubishi',
 'subaru',
 'nissan',
 'subaru',
 'honda',
 'toyota',
 'honda',
 'honda',
 'nissan',
 'nissan',
 'mazda',
 'subaru',
 'nissan',
 'subaru',
 'dodge',
 'plymouth',
 'mitsubishi',
 'toyota',
 'subaru',
 'volkswagen',
 'toyota',
 'nissan',
 'honda',
 'toyota',
 'toyota',
 'dodge',
 'plymouth',
 'volkswagen',
 'volkswagen',
 'nissan',
 'subaru',
 'toyota',
 'mitsubishi',
 'volkswagen',
 'toyota',
 'nissan',
 'toyota',
 'toyota',
 'mazda',
 'volkswagen',
 'mitsubishi',
 'toyota',
 'honda',
 'mazda',
 'dodge',
 'plymouth',
 'toyota',
 'nissan',
 'honda',
 'subaru',
 'toyota',
 'mitsubishi',
 'mitsubishi',
 'toyot

In [0]:
header = cylData.first()
cylHPData = cylData.filter(lambda line: line!=header)
cylHPData.take(5)

Out[5]: [('subaru', '69'),
 ('chevrolet', '48'),
 ('mazda', '68'),
 ('toyota', '62'),
 ('mitsubishi', '68')]

In [0]:
# Encuentre HP promedio por marca
# Agregue un recuento 1 a cada registro y luego reduzca para encontrar totales de HP y recuentos
addOne = cylHPData.mapValues(lambda x: (x,1))

print("map_count")
for i in addOne.take(5):
  print(i)

map_count
('subaru', ('69', 1))
('chevrolet', ('48', 1))
('mazda', ('68', 1))
('toyota', ('62', 1))
('mitsubishi', ('68', 1))


In [0]:
addOne.collect()

Out[7]: [('subaru', ('69', 1)),
 ('chevrolet', ('48', 1)),
 ('mazda', ('68', 1)),
 ('toyota', ('62', 1)),
 ('mitsubishi', ('68', 1)),
 ('honda', ('60', 1)),
 ('nissan', ('69', 1)),
 ('dodge', ('68', 1)),
 ('plymouth', ('68', 1)),
 ('mazda', ('68', 1)),
 ('mitsubishi', ('68', 1)),
 ('dodge', ('68', 1)),
 ('plymouth', ('68', 1)),
 ('chevrolet', ('70', 1)),
 ('toyota', ('62', 1)),
 ('dodge', ('68', 1)),
 ('honda', ('58', 1)),
 ('toyota', ('62', 1)),
 ('honda', ('76', 1)),
 ('chevrolet', ('70', 1)),
 ('nissan', ('69', 1)),
 ('mitsubishi', ('68', 1)),
 ('dodge', ('68', 1)),
 ('plymouth', ('68', 1)),
 ('mazda', ('68', 1)),
 ('isuzu', ('78', 1)),
 ('mazda', ('68', 1)),
 ('nissan', ('69', 1)),
 ('honda', ('76', 1)),
 ('toyota', ('62', 1)),
 ('toyota', ('70', 1)),
 ('mitsubishi', ('88', 1)),
 ('subaru', ('73', 1)),
 ('nissan', ('55', 1)),
 ('subaru', ('82', 1)),
 ('honda', ('76', 1)),
 ('toyota', ('70', 1)),
 ('honda', ('76', 1)),
 ('honda', ('76', 1)),
 ('nissan', ('69', 1)),
 ('nissan', ('69'

In [0]:
type(addOne)

Out[8]: pyspark.rdd.PipelinedRDD

In [0]:
brandValues = addOne \
  .reduceByKey(lambda x,y: (int(x[0]) + int(y[0]), x[1] + y[1]))

print("brandvalues")
for i in brandValues.take(5):
  print(i)

brandvalues
('chevrolet', (188, 3))
('mazda', (1390, 16))
('mitsubishi', (1353, 13))
('nissan', (1846, 18))
('dodge', (675, 8))


In [0]:
brandValues.collect()

Out[10]: [('chevrolet', (188, 3)),
 ('mazda', (1390, 16)),
 ('mitsubishi', (1353, 13)),
 ('nissan', (1846, 18)),
 ('dodge', (675, 8)),
 ('plymouth', (607, 7)),
 ('saab', (760, 6)),
 ('volvo', (1408, 11)),
 ('alfa-romero', (376, 3)),
 ('mercedes-benz', (1170, 8)),
 ('jaguar', (614, 3)),
 ('subaru', (1035, 12)),
 ('toyota', (2969, 32)),
 ('honda', (1043, 13)),
 ('isuzu', (168, 2)),
 ('volkswagen', (973, 12)),
 ('peugot', (1098, 11)),
 ('audi', (687, 6)),
 ('bmw', (1111, 8)),
 ('mercury', ('175', 1)),
 ('porsche', (764, 4))]

In [0]:
promedio = brandValues.mapValues(lambda x: int(x[0])/int(x[1]))
promedio.collect()

Out[11]: [('chevrolet', 62.666666666666664),
 ('mazda', 86.875),
 ('mitsubishi', 104.07692307692308),
 ('nissan', 102.55555555555556),
 ('dodge', 84.375),
 ('plymouth', 86.71428571428571),
 ('saab', 126.66666666666667),
 ('volvo', 128.0),
 ('alfa-romero', 125.33333333333333),
 ('mercedes-benz', 146.25),
 ('jaguar', 204.66666666666666),
 ('subaru', 86.25),
 ('toyota', 92.78125),
 ('honda', 80.23076923076923),
 ('isuzu', 84.0),
 ('volkswagen', 81.08333333333333),
 ('peugot', 99.81818181818181),
 ('audi', 114.5),
 ('bmw', 138.875),
 ('mercury', 175.0),
 ('porsche', 191.0)]