# Apache Beam Word Count Example

- パッケージインストール後、ランタイムを再起動すること（Colab）

The example is adapted from https://beam.apache.org/get-started/wordcount-example/ for Google Colab.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Building-ML-Pipelines/building-machine-learning-pipelines/blob/master/chapters/intro_tfx/Apache_beam_example_notebook.ipynb)


In [1]:
!pip install google-cloud-core==1.6.0 apache_beam[gcp]==2.28.0

Collecting google-cloud-core==1.6.0
  Downloading google_cloud_core-1.6.0-py2.py3-none-any.whl (28 kB)
Collecting apache_beam[gcp]==2.28.0
  Downloading apache_beam-2.28.0-cp37-cp37m-manylinux2010_x86_64.whl (9.0 MB)
[K     |████████████████████████████████| 9.0 MB 7.7 MB/s 
Collecting requests<3.0.0,>=2.24.0
  Downloading requests-2.26.0-py2.py3-none-any.whl (62 kB)
[K     |████████████████████████████████| 62 kB 1.0 MB/s 
Collecting dill<0.3.2,>=0.3.1.1
  Downloading dill-0.3.1.1.tar.gz (151 kB)
[K     |████████████████████████████████| 151 kB 18.0 MB/s 
Collecting future<1.0.0,>=0.18.2
  Downloading future-0.18.2.tar.gz (829 kB)
[K     |████████████████████████████████| 829 kB 17.7 MB/s 
Collecting fastavro<2,>=0.21.4
  Downloading fastavro-1.4.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.3 MB)
[K     |████████████████████████████████| 2.3 MB 792 kB/s 
[?25hCollecting pyarrow<3.0.0,>=0.15.1
  Downloading pyarrow-2.0.0-cp37-cp37m-manylinux2014_x86_64.whl (17.7

In [1]:
import re

import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions

# Source text (public GCS object) and the local output path handed to WriteToText.
# NOTE(review): the inspection cell globs `output.txt*`, so the files actually
# written appear to carry a suffix after this name.
input_file = "gs://dataflow-samples/shakespeare/kinglear.txt"
output_file = "/content/output.txt"


def split_into_words(line):
    """Return every run of ASCII letters/apostrophes found in one line of text."""
    return re.findall(r'[A-Za-z\']+', line)


def format_result(word_count):
    """Render a ``(word, count)`` pair as the text line ``"<word>: <count>"``."""
    word, count = word_count
    return '%s: %s' % (word, count)


# No flags are passed, so the pipeline runs with Beam's default options.
# NOTE(review): when submitting to a remote runner, the main-session state
# (imports, globals defined in this notebook) may need to be shipped to the
# workers by setting `save_main_session = True` on
# `pipeline_options.view_as(SetupOptions)`; it is left off here -- confirm
# against the Beam docs before enabling.
pipeline_options = PipelineOptions()

# The pipeline is executed when the `with` block exits.
with beam.Pipeline(options=pipeline_options) as p:
    # Read the text file[pattern] into a PCollection, one element per line.
    lines = p | ReadFromText(input_file)

    # Count the occurrences of each word, one labelled transform per step.
    words = lines | 'Split' >> beam.FlatMap(split_into_words)
    word_ones = words | 'PairWithOne' >> beam.Map(lambda word: (word, 1))
    counts = word_ones | 'GroupAndSum' >> beam.CombinePerKey(sum)

    # Format the counts into a PCollection of strings.
    output = counts | 'Format' >> beam.Map(format_result)

    # Write the output using a "Write" transform that has side effects.
    output | WriteToText(output_file)

Connecting anonymously.




In [2]:
!head /content/output.txt*

KING: 243
LEAR: 236
DRAMATIS: 1
PERSONAE: 1
king: 65
of: 447
Britain: 2
OF: 15
FRANCE: 10
DUKE: 3
