In [1]:
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[1]") \
                    .appName('Acumuladores') \
                    .getOrCreate()

In [2]:
SpContext = spark.sparkContext

#### Carga de Datos

In [3]:
inputPath = "auto-data.csv"
autoData = SpContext.textFile(inputPath)
autoData.cache()

auto-data.csv MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0

#### Trabajar con RDD de clave / valor

In [4]:
cylData = autoData.map(lambda x: (x.split(",")[0],x.split(",")[7]))
for i in cylData.take(5):
    print(i)

('MAKE', 'HP')
('subaru', '69')
('chevrolet', '48')
('mazda', '68')
('toyota', '62')


In [5]:
cylData.keys().collect()

['MAKE',
 'subaru',
 'chevrolet',
 'mazda',
 'toyota',
 'mitsubishi',
 'honda',
 'nissan',
 'dodge',
 'plymouth',
 'mazda',
 'mitsubishi',
 'dodge',
 'plymouth',
 'chevrolet',
 'toyota',
 'dodge',
 'honda',
 'toyota',
 'honda',
 'chevrolet',
 'nissan',
 'mitsubishi',
 'dodge',
 'plymouth',
 'mazda',
 'isuzu',
 'mazda',
 'nissan',
 'honda',
 'toyota',
 'toyota',
 'mitsubishi',
 'subaru',
 'nissan',
 'subaru',
 'honda',
 'toyota',
 'honda',
 'honda',
 'nissan',
 'nissan',
 'mazda',
 'subaru',
 'nissan',
 'subaru',
 'dodge',
 'plymouth',
 'mitsubishi',
 'toyota',
 'subaru',
 'volkswagen',
 'toyota',
 'nissan',
 'honda',
 'toyota',
 'toyota',
 'dodge',
 'plymouth',
 'volkswagen',
 'volkswagen',
 'nissan',
 'subaru',
 'toyota',
 'mitsubishi',
 'volkswagen',
 'toyota',
 'nissan',
 'toyota',
 'toyota',
 'mazda',
 'volkswagen',
 'mitsubishi',
 'toyota',
 'honda',
 'mazda',
 'dodge',
 'plymouth',
 'toyota',
 'nissan',
 'honda',
 'subaru',
 'toyota',
 'mitsubishi',
 'mitsubishi',
 'toyota',
 'vo

In [6]:
# Eliminar fila del encabezado
header = cylData.first()
cylHPData = cylData.filter(lambda x: x!=header)

In [7]:
# Encuentre HP promedio por marca
addOne = cylHPData.mapValues(lambda x: (x,1))
addOne.collect()

[('subaru', ('69', 1)),
 ('chevrolet', ('48', 1)),
 ('mazda', ('68', 1)),
 ('toyota', ('62', 1)),
 ('mitsubishi', ('68', 1)),
 ('honda', ('60', 1)),
 ('nissan', ('69', 1)),
 ('dodge', ('68', 1)),
 ('plymouth', ('68', 1)),
 ('mazda', ('68', 1)),
 ('mitsubishi', ('68', 1)),
 ('dodge', ('68', 1)),
 ('plymouth', ('68', 1)),
 ('chevrolet', ('70', 1)),
 ('toyota', ('62', 1)),
 ('dodge', ('68', 1)),
 ('honda', ('58', 1)),
 ('toyota', ('62', 1)),
 ('honda', ('76', 1)),
 ('chevrolet', ('70', 1)),
 ('nissan', ('69', 1)),
 ('mitsubishi', ('68', 1)),
 ('dodge', ('68', 1)),
 ('plymouth', ('68', 1)),
 ('mazda', ('68', 1)),
 ('isuzu', ('78', 1)),
 ('mazda', ('68', 1)),
 ('nissan', ('69', 1)),
 ('honda', ('76', 1)),
 ('toyota', ('62', 1)),
 ('toyota', ('70', 1)),
 ('mitsubishi', ('88', 1)),
 ('subaru', ('73', 1)),
 ('nissan', ('55', 1)),
 ('subaru', ('82', 1)),
 ('honda', ('76', 1)),
 ('toyota', ('70', 1)),
 ('honda', ('76', 1)),
 ('honda', ('76', 1)),
 ('nissan', ('69', 1)),
 ('nissan', ('69', 1)),
 

In [8]:
brandValues = addOne\
    .reduceByKey(lambda x,y: (int(x[0])+int(y[0]), int(x[1])+int(y[1])))
brandValues.collect()

[('chevrolet', (188, 3)),
 ('mazda', (1390, 16)),
 ('mitsubishi', (1353, 13)),
 ('nissan', (1846, 18)),
 ('dodge', (675, 8)),
 ('plymouth', (607, 7)),
 ('saab', (760, 6)),
 ('volvo', (1408, 11)),
 ('alfa-romero', (376, 3)),
 ('mercedes-benz', (1170, 8)),
 ('jaguar', (614, 3)),
 ('subaru', (1035, 12)),
 ('toyota', (2969, 32)),
 ('honda', (1043, 13)),
 ('isuzu', (168, 2)),
 ('volkswagen', (973, 12)),
 ('peugot', (1098, 11)),
 ('audi', (687, 6)),
 ('bmw', (1111, 8)),
 ('mercury', ('175', 1)),
 ('porsche', (764, 4))]

In [9]:
promedio = brandValues.mapValues(lambda x: int(x[0])/int(x[1])).collect()

In [10]:
promedio

[('chevrolet', 62.666666666666664),
 ('mazda', 86.875),
 ('mitsubishi', 104.07692307692308),
 ('nissan', 102.55555555555556),
 ('dodge', 84.375),
 ('plymouth', 86.71428571428571),
 ('saab', 126.66666666666667),
 ('volvo', 128.0),
 ('alfa-romero', 125.33333333333333),
 ('mercedes-benz', 146.25),
 ('jaguar', 204.66666666666666),
 ('subaru', 86.25),
 ('toyota', 92.78125),
 ('honda', 80.23076923076923),
 ('isuzu', 84.0),
 ('volkswagen', 81.08333333333333),
 ('peugot', 99.81818181818181),
 ('audi', 114.5),
 ('bmw', 138.875),
 ('mercury', 175.0),
 ('porsche', 191.0)]

In [11]:
brandValues.mapValues(lambda x: int(x[0])/int(x[1])).sortBy(lambda x: x[1]).collect()

[('chevrolet', 62.666666666666664),
 ('honda', 80.23076923076923),
 ('volkswagen', 81.08333333333333),
 ('isuzu', 84.0),
 ('dodge', 84.375),
 ('subaru', 86.25),
 ('plymouth', 86.71428571428571),
 ('mazda', 86.875),
 ('toyota', 92.78125),
 ('peugot', 99.81818181818181),
 ('nissan', 102.55555555555556),
 ('mitsubishi', 104.07692307692308),
 ('audi', 114.5),
 ('alfa-romero', 125.33333333333333),
 ('saab', 126.66666666666667),
 ('volvo', 128.0),
 ('bmw', 138.875),
 ('mercedes-benz', 146.25),
 ('mercury', 175.0),
 ('porsche', 191.0),
 ('jaguar', 204.66666666666666)]

#### Acumuladores y variables de difusión (broadcast)

In [12]:
sedanCount = SpContext.accumulator(0)
hatchbackCount = SpContext.accumulator(0)

In [13]:
print("sedanCount: ",sedanCount)
print("hatchbackCount: ",hatchbackCount)

sedanCount:  0
hatchbackCount:  0


In [14]:
sedanText = SpContext.broadcast("sedan")
hatchbackText = SpContext.broadcast("hatchback")

In [15]:
def splitLines(line):
    
    global sedanCount
    global hatchbackCount
    
    if sedanText.value in line:
        sedanCount+=1
    if hatchbackText.value in line:
        hatchbackCount+=1
        
    return line.split(",")

In [16]:
splitData = autoData.map(splitLines)
splitData.take(3)

[['MAKE',
  'FUELTYPE',
  'ASPIRE',
  'DOORS',
  'BODY',
  'DRIVE',
  'CYLINDERS',
  'HP',
  'RPM',
  'MPG-CITY',
  'MPG-HWY',
  'PRICE'],
 ['subaru',
  'gas',
  'std',
  'two',
  'hatchback',
  'fwd',
  'four',
  '69',
  '4900',
  '31',
  '36',
  '5118'],
 ['chevrolet',
  'gas',
  'std',
  'two',
  'hatchback',
  'fwd',
  'three',
  '48',
  '5100',
  '47',
  '53',
  '5151']]

In [17]:
print("splitData.count(): ",splitData.count())
print("sedanCount: ",sedanCount)
print("hatchbackCount: ",hatchbackCount)

splitData.count():  198
sedanCount:  92
hatchbackCount:  69


#### Particiones

In [18]:
collData = SpContext.parallelize([4,3,8,5,8])
collData.cache()
collData.getNumPartitions()
conteo = collData.count()
print("conteo: ",conteo)
particiones = collData.getNumPartitions()
print("particiones: ",particiones)

conteo:  5
particiones:  12
