https://play.beam.apache.org/

!pip install -r requirements.txt

In [1]:
import apache_beam as beam

In [2]:
from IPython.display import clear_output

In [3]:
with beam.Pipeline() as p:

  (p | beam.Create(range(0, 11,2)[1:])
     | beam.combiners.Mean.Globally()
     | beam.io.textio.WriteToText('example-output')
    )
clear_output()
!cat example-output*

6.0


In [101]:

import apache_beam as beam

with beam.Pipeline() as p:

  (p | beam.Create(range(1, 11))
     | beam.Filter(lambda num: num % 2 == 0)
     | 'Write' >> beam.io.textio.WriteToText('example-output'))
    
clear_output()
!cat example-output*

2
4
6
8
10


In [103]:

import apache_beam as beam

with beam.Pipeline() as p:

  (p | beam.Create(range(1, 11))
     | beam.Map(lambda num: num * 2)
     | 'Write' >> beam.io.textio.WriteToText('example-output'))
    
clear_output()
!cat example-output*

2
4
6
8
10
12
14
16
18
20


In [106]:
class FilterOutEvenNumber(beam.DoFn):

    def process(self, element):
        if element % 2 == 0:
            yield element


with beam.Pipeline() as p:
  (p | beam.Create(range(1, 11))
     | beam.ParDo(FilterOutEvenNumber())
    | 'Write' >> beam.io.textio.WriteToText('example-output'))
clear_output()
!cat example-output*

2
4
6
8
10


In [4]:
import numpy as np

In [5]:
list(range(0, 11,2)),np.mean(list(range(0, 11,2)))

([0, 2, 4, 6, 8, 10], 5.0)

In [6]:


with beam.Pipeline() as p:

  (p | beam.Create(range(1, 11))
     | beam.combiners.Top.Largest(4)
     | beam.io.textio.WriteToText('example-output'))
clear_output()
!cat example-output*

[10, 9, 8, 7]


In [7]:


with beam.Pipeline() as p:

  (p | beam.Create(range(1, 11))
     | beam.combiners.Top.Smallest(2)
     | beam.io.textio.WriteToText('example-output'))
clear_output()
!cat example-output*

[1, 2]


In [8]:


with beam.Pipeline() as p:

  (p | beam.Create(range(1, 11))
     | beam.CombineGlobally(min)
     | beam.io.textio.WriteToText('example-output'))
clear_output()
!cat example-output*

1


In [9]:
%%writefile inputtext.txt
10
20
30
40
100

Overwriting inputtext.txt


In [10]:
import apache_beam as beam


with beam.Pipeline() as p:
    # | beam.Create([10, 20, 30, 40, 50])
  (p | beam.io.ReadFromText('inputtext.txt')
     | beam.Map(lambda num: 2*int(num) + 5)
     | beam.io.textio.WriteToText('example-output'))
clear_output()
!cat example-output*

25
45
65
85
205


In [11]:
%%writefile inputtext2.txt
10,20,30
20,30,35
30,35,40
40,45,50
100,105,110

Overwriting inputtext2.txt


In [12]:
import apache_beam as beam


with beam.Pipeline() as p:
    # | beam.Create([10, 20, 30, 40, 50])
  (p | beam.io.ReadFromText('inputtext2.txt')
     | beam.Map(lambda line: line.split(","))
     | beam.Map(lambda nums: [2*int(num) + 5 for num in nums])
     | beam.io.textio.WriteToText('example-output'))
clear_output()
!cat example-output*

[25, 45, 65]
[45, 65, 75]
[65, 75, 85]
[85, 95, 105]
[205, 215, 225]


In [13]:
%%writefile inputtext3.txt
10,20,30,B,C
20,30,35,A,D
30,35,40,K,E
40,45,50,N,C
100,105,110,M,C

Overwriting inputtext3.txt


In [14]:
def strint(line):
    col=[]
    for x in line:
        try:
            col.append(int(x))
        except:
            col.append(x)
    return col

In [15]:
import apache_beam as beam


with beam.Pipeline() as p:
    # | beam.Create([10, 20, 30, 40, 50])
  (p | beam.io.ReadFromText('inputtext3.txt')
     | beam.Map(lambda line : line.split(","))
     | beam.Map(lambda num : strint(num))
     | beam.io.textio.WriteToText('example-output'))
clear_output()
!cat example-output*

[10, 20, 30, 'B', 'C']
[20, 30, 35, 'A', 'D']
[30, 35, 40, 'K', 'E']
[40, 45, 50, 'N', 'C']
[100, 105, 110, 'M', 'C']


In [16]:
import datetime
import time

In [17]:
datetime.datetime.fromtimestamp(time.time())

datetime.datetime(2022, 12, 22, 8, 15, 37, 677568)

In [18]:
datetime.datetime.strptime("22-12-21 17:57:43","%y-%m-%d %H:%M:%S")

datetime.datetime(2022, 12, 21, 17, 57, 43)

In [19]:
%%writefile inputtext4.txt
10,20,30,B,C,22-12-21 17:57:43
20,30,35,A,D,22-12-21 17:57:43
30,35,40,K,E,22-12-21 17:57:43
40,45,50,N,C,22-12-21 17:57:43
100,105,110,M,C,22-12-21 17:57:43
10,20,30,B,C,22-12-21 17:57:43
20,30,35,A,D,22-12-21 17:57:43
30,35,40,K,E,22-12-21 17:57:43
40,45,50,N,C,22-12-21 17:57:43
100,105,110,M,C,22-12-21 17:57:43
10,20,30,B,C,22-12-21 17:57:43
20,30,35,A,D,22-12-21 17:57:43
30,35,40,K,E,22-12-21 17:57:43
40,45,50,N,C,22-12-21 17:57:43
100,105,110,M,C,22-12-21 17:57:43
10,20,30,B,C,22-12-21 17:57:43
20,30,35,A,D,22-12-21 17:57:43
30,35,40,K,E,22-12-21 17:57:43
40,45,50,N,C,22-12-21 17:57:43
100,105,110,M,C,22-12-21 17:57:43


Overwriting inputtext4.txt


In [41]:
def strintdate(line):
    col=[]
    for x in line:
        try:
            try:
                try:
                    col.append(int(x))
                except:
                    col.append(datetime.datetime.strptime(x,"%y-%m-%d %H:%M:%S"))
            except:
                 col.append(x)
        except:pass
    return col

In [42]:
def stats(line):
    col=[]
    sums=0
    for x in line:
        col.append(x)
    arr=[]
    for i,x in enumerate(line):
        if i<=2:
            arr.append(int(x))
        else:
            pass
    col.append(np.round(np.mean(arr)))    
    col.append(np.round(np.std(arr)))
    col.append(np.round(np.var(arr))) 
    col.append(np.round(np.min(arr)))    
    col.append(np.round(np.max(arr)))    
    col.append(np.round(np.ptp(arr)))      
    return col

In [43]:
import apache_beam as beam

with beam.Pipeline() as p:
    # | beam.Create([10, 20, 30, 40, 50])
  (p | beam.io.ReadFromText('inputtext4.txt')
     | beam.Map(lambda line : line.split(","))
     | beam.Map(lambda line : strintdate(line))
     | beam.Map(lambda line : stats(line))
     | beam.io.textio.WriteToText('example-output'))
clear_output()
!cat example-output*

[10, 20, 30, 'B', 'C', datetime.datetime(2022, 12, 21, 17, 57, 43), 20.0, 8.0, 67.0, 10, 30, 20]
[20, 30, 35, 'A', 'D', datetime.datetime(2022, 12, 21, 17, 57, 43), 28.0, 6.0, 39.0, 20, 35, 15]
[30, 35, 40, 'K', 'E', datetime.datetime(2022, 12, 21, 17, 57, 43), 35.0, 4.0, 17.0, 30, 40, 10]
[40, 45, 50, 'N', 'C', datetime.datetime(2022, 12, 21, 17, 57, 43), 45.0, 4.0, 17.0, 40, 50, 10]
[100, 105, 110, 'M', 'C', datetime.datetime(2022, 12, 21, 17, 57, 43), 105.0, 4.0, 17.0, 100, 110, 10]
[10, 20, 30, 'B', 'C', datetime.datetime(2022, 12, 21, 17, 57, 43), 20.0, 8.0, 67.0, 10, 30, 20]
[20, 30, 35, 'A', 'D', datetime.datetime(2022, 12, 21, 17, 57, 43), 28.0, 6.0, 39.0, 20, 35, 15]
[30, 35, 40, 'K', 'E', datetime.datetime(2022, 12, 21, 17, 57, 43), 35.0, 4.0, 17.0, 30, 40, 10]
[40, 45, 50, 'N', 'C', datetime.datetime(2022, 12, 21, 17, 57, 43), 45.0, 4.0, 17.0, 40, 50, 10]
[100, 105, 110, 'M', 'C', datetime.datetime(2022, 12, 21, 17, 57, 43), 105.0, 4.0, 17.0, 100, 110, 10]
[10, 20, 30, 'B', 

In [23]:
#CoGroupByKey

class WordsAlphabet:

    def __init__(self, alphabet, fruit, country,nationality):
        self.alphabet = alphabet
        self.fruit = fruit
        self.country = country
        self.nationality = nationality

    def __str__(self):
        return "WordsAlphabet(alphabet:'%s', fruit='%s', country='%s', nationality='%s')" % (self.alphabet, self.fruit, self.country,self.nationality)


def apply_transforms(fruits, countries,nationalities):
    def map_to_alphabet_kv(word):
        return (word[0], word)

    def cogbk_result_to_wordsalphabet(cgbk_result):
        (alphabet, words) = cgbk_result
        return WordsAlphabet(alphabet, words['fruits'][0], words['countries'][0],words['nationalities'][0])

    fruits_kv = (fruits | 'Fruit to KV' >> beam.Map(map_to_alphabet_kv))
    countries_kv = (countries | 'Country to KV' >> beam.Map(map_to_alphabet_kv))
    nationalities_kv = (nationalities | 'nationality to KV' >> beam.Map(map_to_alphabet_kv))
    return ({'fruits': fruits_kv, 'countries': countries_kv,'nationalities': nationalities_kv}
            | beam.CoGroupByKey()
            | beam.Map(cogbk_result_to_wordsalphabet))


with beam.Pipeline() as p:

    fruits = p | 'Fruits' >> beam.Create(['apple', 'banana', 'cherry','doctor'])
    countries = p | 'Countries' >> beam.Create(['australia', 'brazil', 'canada','duchland'])
    nationalities = p | 'Nationalities' >> beam.Create(['australian', 'brazilian', 'canadian','duch'])

    (apply_transforms(fruits, countries,nationalities)
    | beam.io.textio.WriteToText('example-output'))
clear_output()
!cat example-output*

WordsAlphabet(alphabet:'a', fruit='apple', country='australia', nationality='australian')
WordsAlphabet(alphabet:'b', fruit='banana', country='brazil', nationality='brazilian')
WordsAlphabet(alphabet:'c', fruit='cherry', country='canada', nationality='canadian')
WordsAlphabet(alphabet:'d', fruit='doctor', country='duchland', nationality='duch')


In [24]:
import numpy as np

example_list=[10, 20, 50, 70, 90]


class AverageFn(beam.CombineFn):

    def create_accumulator(self):
        return 0.0, 0

    def add_input(self, accumulator, element):
        (sum, i) = accumulator
        # print(accumulator)
        # print(element)
        return sum + element, i + 1

    def merge_accumulators(self, accumulators):
        sums, counts = zip(*accumulators)
        print(accumulators)
        return sum(sums), sum(counts)

    def extract_output(self, accumulator):
        (sum, count) = accumulator
        return sum / count if count else float('NaN')


with beam.Pipeline() as p:
    clear_output()
    (p | beam.Create(example_list)
     | beam.CombineGlobally(AverageFn())
     | beam.io.textio.WriteToText('example-output'))
clear_output()
!cat example-output*
np.mean(example_list)

48.0


48.0

In [25]:
with beam.Pipeline() as p:
  (p | beam.Create(range(1, 11))
     | beam.combiners.Mean.Globally()
     | beam.io.textio.WriteToText('example-output'))
clear_output()
!cat example-output*

5.5


In [26]:
with beam.Pipeline() as p:
  (p | beam.Create(example_list)
     | beam.combiners.Mean.Globally()
     | beam.io.textio.WriteToText('example-output'))
clear_output()
!cat example-output*

48.0


In [27]:
with beam.Pipeline() as p:
  (p | beam.Create(range(1, 11))
     | beam.combiners.Count.Globally()
     | beam.io.textio.WriteToText('example-output'))
clear_output()
!cat example-output*

10


In [28]:
with beam.Pipeline() as p:
  (p | beam.Create(range(1, 11))
     | beam.combiners.Latest.Globally()
     | beam.io.textio.WriteToText('example-output'))
clear_output()
!cat example-output*

10


In [29]:
with beam.Pipeline() as p:
  (p | beam.Create(range(1, 11))
     | beam.combiners.ToList()
     | beam.io.textio.WriteToText('example-output'))
clear_output()
!cat example-output*

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]


In [30]:
#   Licensed to the Apache Software Foundation (ASF) under one
#   or more contributor license agreements.  See the NOTICE file
#   distributed with this work for additional information
#   regarding copyright ownership.  The ASF licenses this file
#   to you under the Apache License, Version 2.0 (the
#   "License"); you may not use this file except in compliance
#   with the License.  You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.

import datetime
import pytz

import apache_beam as beam
from apache_beam.transforms import window




class Event:
    def __init__(self, id, event, timestamp):
        self.id = id
        self.event = event
        self.timestamp = timestamp

    def __str__(self) -> str:
        return f'Event({self.id}, {self.event}, {self.timestamp})'


class AddTimestampDoFn(beam.DoFn):

    def process(self, element, **kwargs):
        unix_timestamp = element.timestamp.timestamp()
        yield window.TimestampedValue(element, unix_timestamp)


with beam.Pipeline() as p:

  (p | beam.Create(
      [Event(str(x), f'book-order_{x}', datetime.datetime(2020, 3, x, x, 20-x, 0, 0, tzinfo=pytz.UTC)) \
       for x in range(1,19) ] 
  )
     | beam.ParDo(AddTimestampDoFn())
    | beam.io.textio.WriteToText('example-output'))
clear_output()
!cat example-output*



Event(1, book-order_1, 2020-03-01 01:19:00+00:00)
Event(2, book-order_2, 2020-03-02 02:18:00+00:00)
Event(3, book-order_3, 2020-03-03 03:17:00+00:00)
Event(4, book-order_4, 2020-03-04 04:16:00+00:00)
Event(5, book-order_5, 2020-03-05 05:15:00+00:00)
Event(6, book-order_6, 2020-03-06 06:14:00+00:00)
Event(7, book-order_7, 2020-03-07 07:13:00+00:00)
Event(8, book-order_8, 2020-03-08 08:12:00+00:00)
Event(9, book-order_9, 2020-03-09 09:11:00+00:00)
Event(10, book-order_10, 2020-03-10 10:10:00+00:00)
Event(11, book-order_11, 2020-03-11 11:09:00+00:00)
Event(12, book-order_12, 2020-03-12 12:08:00+00:00)
Event(13, book-order_13, 2020-03-13 13:07:00+00:00)
Event(14, book-order_14, 2020-03-14 14:06:00+00:00)
Event(15, book-order_15, 2020-03-15 15:05:00+00:00)
Event(16, book-order_16, 2020-03-16 16:04:00+00:00)
Event(17, book-order_17, 2020-03-17 17:03:00+00:00)
Event(18, book-order_18, 2020-03-18 18:02:00+00:00)


In [31]:
%%writefile  wordcount.py
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""A word-counting workflow."""

# pytype: skip-file

import argparse
import logging
import re

import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions


class WordExtractingDoFn(beam.DoFn):
  """Parse each line of input text into words."""
  def process(self, element):
    """Returns an iterator over the words of this element.

    The element is a line of text.  If the line is blank, note that, too.

    Args:
      element: the element being processed

    Returns:
      The processed element.
    """
    # return re.findall(r'[\w\']+', element, re.UNICODE)
    return element.split()
    


def run(argv=None, save_main_session=True):
  """Main entry point; defines and runs the wordcount pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='gs://dataflow-samples/shakespeare/kinglear.txt',
      
      help='Input file to process.')
  parser.add_argument(
      '--output',
      dest='output',
      required=True,
      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = save_main_session

  # The pipeline will be run on exiting the with block.
  with beam.Pipeline(options=pipeline_options) as p:

    # Read the text file[pattern] into a PCollection.
    lines = p | 'Read' >> ReadFromText(known_args.input)

    counts = (
        lines
        | 'Split' >> (beam.ParDo(WordExtractingDoFn()).with_output_types(str))
        | 'PairWithOne' >> beam.Map(lambda x: (x, 1))
        | 'GroupAndSum' >> beam.CombinePerKey(sum))

    # Format the counts into a PCollection of strings.
    def format_result(word, count):
      return '%s: %d' % (word, count)

    output = counts | 'Format' >> beam.MapTuple(format_result)

    # Write the output using a "Write" transform that has side effects.
    # pylint: disable=expression-not-assigned
    output | 'Write' >> WriteToText(known_args.output)


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  run()


Overwriting wordcount.py


In [32]:
!python3 wordcount.py --output wordcount_output
clear_output()
!head -n 5 wordcount_output*

==> wordcount_output-00000-of-00001 <==
KING: 242
LEAR: 222
DRAMATIS: 1
PERSONAE: 1
king: 29

==> wordcount_output2-00000-of-00001 <==
KING: 243
LEAR: 236
DRAMATIS: 1
PERSONAE: 1
king: 65


In [33]:
%%writefile wordcount_with_metric.py
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""A word-counting workflow."""

# pytype: skip-file

import argparse
import logging
import re

import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.metrics import Metrics
from apache_beam.metrics.metric import MetricsFilter
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions


class WordExtractingDoFn(beam.DoFn):
  """Parse each line of input text into words."""
  def __init__(self):
    # TODO(BEAM-6158): Revert the workaround once we can pickle super() on py3.
    # super().__init__()
    beam.DoFn.__init__(self)
    self.words_counter = Metrics.counter(self.__class__, 'words')
    self.word_lengths_counter = Metrics.counter(self.__class__, 'word_lengths')
    self.word_lengths_dist = Metrics.distribution(
        self.__class__, 'word_len_dist')
    self.empty_line_counter = Metrics.counter(self.__class__, 'empty_lines')

  def process(self, element):
    """Returns an iterator over the words of this element.

    The element is a line of text.  If the line is blank, note that, too.

    Args:
      element: the element being processed

    Returns:
      The processed element.
    """
    text_line = element.strip()
    if not text_line:
      self.empty_line_counter.inc(1)
    words = re.findall(r'[\w\']+', text_line, re.UNICODE)
    for w in words:
      self.words_counter.inc()
      self.word_lengths_counter.inc(len(w))
      self.word_lengths_dist.update(len(w))
    return words


def main(argv=None, save_main_session=True):
  """Main entry point; defines and runs the wordcount pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='gs://dataflow-samples/shakespeare/kinglear.txt',
      help='Input file to process.')
  parser.add_argument(
      '--output',
      dest='output',
      required=True,
      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
  p = beam.Pipeline(options=pipeline_options)

  # Read the text file[pattern] into a PCollection.
  lines = p | 'read' >> ReadFromText(known_args.input)

  # Count the occurrences of each word.
  def count_ones(word_ones):
    (word, ones) = word_ones
    return (word, sum(ones))

  counts = (
      lines
      | 'split' >> (beam.ParDo(WordExtractingDoFn()).with_output_types(str))
      | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
      | 'group' >> beam.GroupByKey()
      | 'count' >> beam.Map(count_ones))

  # Format the counts into a PCollection of strings.
  def format_result(word_count):
    (word, count) = word_count
    return '%s: %d' % (word, count)

  output = counts | 'format' >> beam.Map(format_result)

  # Write the output using a "Write" transform that has side effects.
  # pylint: disable=expression-not-assigned
  output | 'write' >> WriteToText(known_args.output)

  result = p.run()
  result.wait_until_finish()

  # Do not query metrics when creating a template which doesn't run
  if (not hasattr(result, 'has_job')  # direct runner
      or result.has_job):  # not just a template creation
    empty_lines_filter = MetricsFilter().with_name('empty_lines')
    query_result = result.metrics().query(empty_lines_filter)
    if query_result['counters']:
      empty_lines_counter = query_result['counters'][0]
      logging.info('number of empty lines: %d', empty_lines_counter.result)

    word_lengths_filter = MetricsFilter().with_name('word_len_dist')
    query_result = result.metrics().query(word_lengths_filter)
    if query_result['distributions']:
      word_lengths_dist = query_result['distributions'][0]
      logging.info('average word length: %d', word_lengths_dist.result.mean)


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  main()


Overwriting wordcount_with_metric.py


In [34]:
!python3 wordcount_with_metric.py --output wordcount_output2
clear_output()
!head -n 5 wordcount_output2*

KING: 243
LEAR: 236
DRAMATIS: 1
PERSONAE: 1
king: 65


In [35]:
%%writefile wordcounttest.py
# -*- coding: utf-8 -*-
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Test for the wordcount example."""

# pytype: skip-file

import collections
import logging
import re
import tempfile
import unittest

import pytest

from apache_beam.examples import wordcount
from apache_beam.testing.util import open_shards


@pytest.mark.examples_postcommit
class WordCountTest(unittest.TestCase):

  SAMPLE_TEXT = (
      u'a b c a b a\nacento gráfico\nJuly 30, 2018\n\n aa bb cc aa bb aa')

  def create_temp_file(self, contents):
    with tempfile.NamedTemporaryFile(delete=False) as f:
      f.write(contents.encode('utf-8'))
      return f.name

  def test_basics(self):
    temp_path = self.create_temp_file(self.SAMPLE_TEXT)
    expected_words = collections.defaultdict(int)
    for word in re.findall(r'[\w\']+', self.SAMPLE_TEXT, re.UNICODE):
      expected_words[word] += 1
    wordcount.run(['--input=%s*' % temp_path, '--output=%s.result' % temp_path],
                  save_main_session=False)
    # Parse result file and compare.
    results = []
    with open_shards(temp_path + '.result-*-of-*') as result_file:
      for line in result_file:
        match = re.search(r'(\S+): ([0-9]+)', line)
        if match is not None:
          results.append((match.group(1), int(match.group(2))))
    self.assertEqual(sorted(results), sorted(expected_words.items()))


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  unittest.main()


Overwriting wordcounttest.py


In [36]:
!python3 wordcounttest.py

INFO:root:Missing pipeline option (runner). Executing pipeline using the default runner: DirectRunner.
INFO:root:Default Python SDK image for environment is apache/beam_python3.9_sdk:2.43.0
INFO:apache_beam.runners.worker.statecache:Creating state cache with size 104857600
INFO:apache_beam.runners.portability.fn_api_runner.worker_handlers:Created Worker handler <apache_beam.runners.portability.fn_api_runner.worker_handlers.EmbeddedWorkerHandler object at 0x7f761be898b0> for environment ref_Environment_default_environment_1 (beam:env:embedded_python:v1, b'')
INFO:apache_beam.io.filebasedsink:Starting finalize_write threads with num_shards: 1 (skipped: 0), batches: 1, num_threads: 1
INFO:apache_beam.io.filebasedsink:Renamed 1 shards in 0.00 seconds.
.
----------------------------------------------------------------------
Ran 1 test in 0.553s

OK


In [37]:
import apache_beam as beam

with beam.Pipeline() as p:

  (p | beam.Create(['apple', 'ball', 'car', 'bear', 'cheetah', 'ant'])
     | beam.Map(lambda word: (word[0], word))
     | beam.GroupByKey()
     | beam.io.textio.WriteToText('example-output'))
clear_output()
!cat example-output*



('a', ['apple', 'ant'])
('b', ['ball', 'bear'])
('c', ['car', 'cheetah'])


In [38]:
#   Licensed to the Apache Software Foundation (ASF) under one
#   or more contributor license agreements.  See the NOTICE file
#   distributed with this work for additional information
#   regarding copyright ownership.  The ASF licenses this file
#   to you under the Apache License, Version 2.0 (the
#   "License"); you may not use this file except in compliance
#   with the License.  You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.

import apache_beam as beam

class WordsAlphabet:

    def __init__(self, alphabet, fruit, country):
        self.alphabet = alphabet
        self.fruit = fruit
        self.country = country

    def __str__(self):
        return "WordsAlphabet(alphabet:'%s', fruit='%s', country='%s')" % (self.alphabet, self.fruit, self.country)


def apply_transforms(fruits, countries):
    def map_to_alphabet_kv(word):
        return (word[0], word)

    def cogbk_result_to_wordsalphabet(cgbk_result):
        (alphabet, words) = cgbk_result
        return WordsAlphabet(alphabet, words['fruits'][0], words['countries'][0])

    fruits_kv = (fruits | 'Fruit to KV' >> beam.Map(map_to_alphabet_kv))
    countries_kv = (countries | 'Country to KV' >> beam.Map(map_to_alphabet_kv))

    return ({'fruits': fruits_kv, 'countries': countries_kv}
            | beam.CoGroupByKey()
            | beam.Map(cogbk_result_to_wordsalphabet))


with beam.Pipeline() as p:

  fruits = p | 'Fruits' >> beam.Create(['apple', 'banana', 'cherry'])
  countries = p | 'Countries' >> beam.Create(['australia', 'brazil', 'canada'])

  (apply_transforms(fruits, countries)
     | beam.io.textio.WriteToText('example-output'))
clear_output()
!cat example-output*



WordsAlphabet(alphabet:'a', fruit='apple', country='australia')
WordsAlphabet(alphabet:'b', fruit='banana', country='brazil')
WordsAlphabet(alphabet:'c', fruit='cherry', country='canada')


In [39]:
#combine per key
import apache_beam as beam

PLAYER_1 = 'Player 1'
PLAYER_2 = 'Player 2'
PLAYER_3 = 'Player 3'

with beam.Pipeline() as p:

  (p | beam.Create([(PLAYER_1, 15), (PLAYER_2, 10), (PLAYER_1, 100),
                    (PLAYER_3, 25), (PLAYER_2, 75)])
     | beam.CombinePerKey(sum)
     | beam.io.textio.WriteToText('example-output'))
clear_output()
!cat example-output*

('Player 1', 115)
('Player 2', 85)
('Player 3', 25)


In [40]:
#combine simple function
import apache_beam as beam

def sum(numbers):
    total = 0

    for num in numbers:
        total += num

    return total


with beam.Pipeline() as p:

  (p | beam.Create([1, 2, 3, 4, 5])
     | beam.CombineGlobally(sum)
     | beam.io.textio.WriteToText('example-output'))
clear_output()
!cat example-output*

15


In [94]:
#side input
import apache_beam as beam

class Person:
    def __init__(self, name, city, country='',continent='',num1="",num2="",avg=""):
        self.name = name
        self.city = city
        self.country = country
        self.continent= continent
        self.num1=num1
        self.num2=num2
        self.avg=avg      

    def __str__(self):
        return 'Person[' + self.name + ',' + self.city + ',' + self.country + ',' + self.continent +',' + self.num1+',' + self.num2+','  +self.avg+']'


class EnrichCountryDoFn(beam.DoFn):

    def process(self, element, cities_to_countries):
        yield Person(element.name, element.city,
                     cities_to_countries[element.city])
        
class EnrichContinentDoFn(beam.DoFn):

    def process(self, element, cities_to_continents):
        yield Person(element.name, element.city,element.country,
                     cities_to_continents[element.city])
        
class NumDoFn(beam.DoFn):

    def process(self, element, name_to_num1):
        yield Person(element.name, element.city,element.country,element.continent,
                     name_to_num1[element.name])
        
class NumDoFn2(beam.DoFn):

    def process(self, element, name_to_num1):
        yield Person(element.name, element.city,element.country,element.continent,element.num1,
                     name_to_num2[element.name])

class AvgDoFn(beam.DoFn):

    def process(self, element):
        yield Person(element.name, element.city,element.country,element.continent,element.num1,element.num2,
                     str(round((int(element.num1)+int(element.num2))/2,2)))
        
with beam.Pipeline() as p:

    cities_to_countries = {
      'Beijing': 'China',
      'London': 'United Kingdom',
      'San Francisco': 'United States',
      'Singapore': 'Singapore',
      'Sydney': 'Australia'
    }

    cities_to_continents = {
      'Beijing': 'Asia',
      'London': 'Europe',
      'San Francisco': 'America',
      'Singapore': 'Asia',
      'Sydney': 'Australia'
    }
    
    name_to_num1 = {
      'Henry': "25",
      'Jane': "30",
      'Lee': "35",
      'John': "45",
      'Alfred': "50"
    }
    
    name_to_num2 = {
      'Henry': "50",
      'Jane': "45",
      'Lee': "35",
      'John': "30",
      'Alfred': "25"
    }
    
    persons = [
      Person('Henry', 'Singapore'),
      Person('Jane', 'San Francisco'),
      Person('Lee', 'Beijing'),
      Person('John', 'Sydney'),
      Person('Alfred', 'London')
    ]
    
    
    (p  | 'Create' >> beam.Create(persons)
        | 'Add-countries' >> beam.ParDo(EnrichCountryDoFn(), cities_to_countries)
        | 'Add-continents' >> beam.ParDo(EnrichContinentDoFn(), cities_to_continents)
        | 'Add-num' >> beam.ParDo(NumDoFn(), name_to_num1)
        | 'Add-num2' >> beam.ParDo(NumDoFn2(), name_to_num2)
        | 'avg' >> beam.ParDo(AvgDoFn())     
        | 'Write' >> beam.io.textio.WriteToText('example-output'))
    
clear_output()
!cat example-output*

Person[Henry,Singapore,Singapore,Asia,25,50,37.5]
Person[Jane,San Francisco,United States,America,30,45,37.5]
Person[Lee,Beijing,China,Asia,35,35,35.0]
Person[John,Sydney,Australia,Australia,45,30,37.5]
Person[Alfred,London,United Kingdom,Europe,50,25,37.5]


# TESTS

In [108]:
%%writefile filtertest.py
#filter test
import logging
import time
import unittest

import pytest
from hamcrest.core.core.allof import all_of

import apache_beam as beam
from apache_beam.examples.cookbook import filters
from apache_beam.io.gcp.tests import utils
from apache_beam.io.gcp.tests.bigquery_matcher import BigqueryMatcher
from apache_beam.testing.pipeline_verifiers import PipelineStateMatcher
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that
from apache_beam.testing.util import equal_to


class FiltersTest(unittest.TestCase):

  # The default checksum is a SHA-1 hash generated from sorted rows reading
  # from expected Bigquery table.
  DEFAULT_CHECKSUM = '813b6da1624334732fad4467d74a7c8a62559c6b'

  # Note that 'removed' should be projected away by the pipeline
  input_data = [
      {
          'year': 2010, 'month': 1, 'day': 1, 'mean_temp': 3, 'removed': 'a'
      },
      {
          'year': 2012, 'month': 1, 'day': 2, 'mean_temp': 3, 'removed': 'a'
      },
      {
          'year': 2011, 'month': 1, 'day': 3, 'mean_temp': 5, 'removed': 'a'
      },
      {
          'year': 2013, 'month': 2, 'day': 1, 'mean_temp': 3, 'removed': 'a'
      },
      {
          'year': 2011, 'month': 3, 'day': 3, 'mean_temp': 5, 'removed': 'a'
      },
  ]

  def _get_result_for_month(self, pipeline, month):
    rows = (pipeline | 'create' >> beam.Create(self.input_data))
    results = filters.filter_cold_days(rows, month)
    return results

  def test_basics(self):
    """Test that the correct result is returned for a simple dataset."""
    with TestPipeline() as p:
      results = self._get_result_for_month(p, 1)
      assert_that(
          results,
          equal_to([{
              'year': 2010, 'month': 1, 'day': 1, 'mean_temp': 3
          }, {
              'year': 2012, 'month': 1, 'day': 2, 'mean_temp': 3
          }]))

  def test_basic_empty(self):
    """Test that the correct empty result is returned for a simple dataset."""
    with TestPipeline() as p:
      results = self._get_result_for_month(p, 3)
      assert_that(results, equal_to([]))

  def test_basic_empty_missing(self):
    """Test that the correct empty result is returned for a missing month."""
    with TestPipeline() as p:
      results = self._get_result_for_month(p, 4)
      assert_that(results, equal_to([]))

  @pytest.mark.examples_postcommit
  def test_filters_output_bigquery_matcher(self):
    test_pipeline = TestPipeline(is_integration_test=True)

    # Set extra options to the pipeline for test purpose
    project = test_pipeline.get_option('project')

    dataset = 'FiltersTestIT'
    table = 'cold_days_%s' % int(round(time.time() * 1000))
    output_table = '.'.join([dataset, table])
    query = 'SELECT year, month, day, mean_temp FROM `%s`' % output_table

    pipeline_verifiers = [
        PipelineStateMatcher(),
        BigqueryMatcher(
            project=project, query=query, checksum=self.DEFAULT_CHECKSUM)
    ]
    extra_opts = {
        'output': output_table,
        'on_success_matcher': all_of(*pipeline_verifiers)
    }

    # Register cleanup before pipeline execution.
    # Note that actual execution happens in reverse order.
    self.addCleanup(utils.delete_bq_table, project, dataset, table)

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    filters.run(test_pipeline.get_full_options_as_args(**extra_opts))


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  unittest.main()

Writing filtertest.py


In [110]:
!python3 filtertest.py

INFO:root:Missing pipeline option (runner). Executing pipeline using the default runner: DirectRunner.
INFO:root:Default Python SDK image for environment is apache/beam_python3.9_sdk:2.43.0
INFO:apache_beam.runners.worker.statecache:Creating state cache with size 104857600
INFO:apache_beam.runners.portability.fn_api_runner.worker_handlers:Created Worker handler <apache_beam.runners.portability.fn_api_runner.worker_handlers.EmbeddedWorkerHandler object at 0x7f3e960b7d30> for environment ref_Environment_default_environment_1 (beam:env:embedded_python:v1, b'')
.INFO:root:Missing pipeline option (runner). Executing pipeline using the default runner: DirectRunner.
INFO:root:Default Python SDK image for environment is apache/beam_python3.9_sdk:2.43.0
INFO:apache_beam.runners.worker.statecache:Creating state cache with size 104857600
INFO:apache_beam.runners.portability.fn_api_runner.worker_handlers:Created Worker handler <apache_beam.runners.portability.fn_api_runner.worker_handlers.Embedded