# Apache Beam 

* With Beam we can process both streaming and batch data
* We can choose our runner, such as Spark or Dataflow
* Beam processes data in parallel

## Showing the results

To show the elements, we can use `beam.LogElements()` or `beam.Map(print)`.

In [5]:
#!pip install apache-beam
#!pip install apache-beam[interactive]

In [6]:
import apache_beam as beam

# Build a single-element collection and log it; the pipeline is executed
# when the `with` block exits.
with beam.Pipeline() as pipeline:
  greeting = pipeline | beam.Create(['Hello Beam'])
  _ = greeting | beam.LogElements()
  


Hello Beam


## Branching

In [7]:
import matplotlib.pyplot as plt

In [8]:
import apache_beam as beam

with beam.Pipeline() as p:

  # Single source collection that both branches below read from.
  hello_beam = (p
     | beam.Create(['Hello Beam']))

  # Branch 1: keep the first word of each element and print it.
  hello = (hello_beam
    | "Extract Hello" >> beam.Map(lambda x: x.split()[0])
    | "Print Hello" >> beam.Map(print))

  # Branch 2: keep the second word of each element and print it.
  # The result is deliberately NOT named `beam`: that would rebind the
  # `apache_beam` module alias to a PCollection and break every later
  # `beam.<something>` call that runs without re-importing the module.
  beam_word = (hello_beam
    | "Extract Beam" >> beam.Map(lambda x: x.split()[1])
    | "Print Beam" >> beam.Map(print))


Beam
Hello


## Combiners

### Count

In [35]:
import apache_beam as beam

# Sample (fruit, quantity) pairs. Some keys repeat, and one whole pair
# ("laranja", 1) appears twice, so the three Count variants give
# different answers.
my_array_of_fruits = [
    ("laranja", 1),
    ("maça", 1),
    ("laranja", 1),
    ("maça", 2),
    ("banana", 4)
  ]

print("How many itens has in my array")

# Count.Globally: total number of elements in the collection.
with beam.Pipeline() as p:

  numbers = (p | beam.Create(my_array_of_fruits))

  count = (numbers
         | beam.combiners.Count.Globally()
         | "count" >> beam.LogElements())

print("\nCalculating repetead keys")

# Count.PerKey: number of elements seen for each key (the first item of
# each pair); the quantity values themselves are not summed.
# `Count` is only a namespace for its nested transforms, so it is used
# as `Count.PerKey()` rather than instantiated first.
with beam.Pipeline() as p:
  numbers = (p | beam.Create(my_array_of_fruits))

  count_by_key = (numbers
           | beam.combiners.Count.PerKey()
           | beam.LogElements())

print("\nCalculatind repetead elements")

# Count.PerElement: number of occurrences of each distinct element,
# where an element is the whole (fruit, quantity) pair.
with beam.Pipeline() as p:
  numbers = (p | beam.Create(my_array_of_fruits))

  count_by_elements = (numbers
           | beam.combiners.Count.PerElement()
           | beam.LogElements())

How many itens has in my array
5

Calculating repetead keys
('laranja', 2)
('maça', 2)
('banana', 1)

Calculatind repetead elements
(('laranja', 1), 2)
(('maça', 1), 1)
(('maça', 2), 1)
(('banana', 4), 1)


### Some metrics

.Top
.Mean
.ToSet
.ToDict
.Sample

In [59]:
# Several built-in combiners applied to the same source collection.
# Top also has a per-key variant (Top.PerKey).
with beam.Pipeline() as p:

  numbers = (p | beam.Create(range(1, 11)))

  # The 3 smallest values, emitted as a single list.
  smallest = (numbers
     | beam.combiners.Top.Smallest(3)
     | "smallest" >> beam.LogElements())

  # The 3 largest values, emitted as a single list (largest first).
  largest = (numbers
     | beam.combiners.Top.Largest(3)
     | "largest" >> beam.LogElements())

  # Arithmetic mean of all values. `Mean` is only a namespace for its
  # nested transforms, so it is used as `Mean.Globally()` rather than
  # instantiated first.
  average = (numbers
     | beam.combiners.Mean.Globally()
     | "average" >> beam.LogElements())

  # All values gathered into one Python set.
  transform_in_set = (numbers
      | beam.combiners.ToSet()
      | "transform in set" >> beam.LogElements())

{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}
5.5
[1, 2, 3]
[10, 9, 8]


### Our own functions
We can write our own combiner by subclassing `beam.CombineFn`.

In [74]:
class OddEvenCounter(beam.CombineFn):
    """Combiner that tallies how many input numbers are odd and how many are even.

    The accumulator and the final output share one shape: a dict with the
    keys ``'odd_count'`` and ``'even_count'``.
    """

    def create_accumulator(self):
        """Return a fresh tally with both counters at zero."""
        return dict.fromkeys(('odd_count', 'even_count'), 0)

    def add_input(self, accumulator, element):
        """Increment the counter matching ``element``'s parity and return the tally."""
        bucket = 'even_count' if element % 2 == 0 else 'odd_count'
        accumulator[bucket] += 1
        return accumulator

    def merge_accumulators(self, accumulators):
        """Add several partial tallies together into one new tally."""
        merged = self.create_accumulator()
        # Single pass over the partial tallies, adding each counter in turn.
        for partial in accumulators:
            for key in merged:
                merged[key] += partial[key]
        return merged

    def extract_output(self, accumulator):
        """Return the final tally unchanged; no post-processing is needed."""
        return accumulator

    
# Tally odd and even numbers across the whole input and log the resulting dict.
with beam.Pipeline() as pipeline:
  values = pipeline | beam.Create([10, 3, 5, 70, 90])
  tally = values | beam.CombineGlobally(OddEvenCounter())
  _ = tally | beam.LogElements()

{'odd_count': 2, 'even_count': 3}
