In [None]:
#pip install apache-beam
#Make sure you have python 2.
#https://github.com/apache/beam/blob/master/sdks/python/apache_beam/examples/wordcount_minimal.py#L116

In [1]:
from __future__ import absolute_import

import argparse
import logging
import re
from past.builtins import unicode

import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
import apache_beam.transforms.window as window
from apache_beam.examples.wordcount import WordExtractingDoFn
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
from apache_beam.options.pipeline_options import StandardOptions

In [6]:
def run(argv=None): #Main entry point; defines and runs the wordcount pipeline
    parser = argparse.ArgumentParser()    
    parser.add_argument('--input',
                      dest='input',                      
                      default='data/kinglear.txt',
                      help='Input file to process.')
    parser.add_argument('--output',
                      dest='output',
                      default='data/output.txt', # for outputting the results
                      help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend(['--runner=DirectRunner',
                              '--project=PROJECTID',
                              '--staging_location=gs://YOUR_BUCKET_NAME/AND_STAGING_DIRECTORY',
                              '--temp_location=gs://YOUR_BUCKET_NAME/AND_TEMP_DIRECTORY',
                              '--job_name=your-wordcount-job',])
    # We use the save_main_session option because one or more DoFn's in this workflow rely on global context 
    # (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    pipeline_options.view_as(StandardOptions).streaming = True    
    with beam.Pipeline(options=pipeline_options) as p:
        lines = p | ReadFromText(known_args.input) # Read the text file[pattern] into a PCollection.
        counts = (  # Count the occurrences of each word.
            lines | 'Split' >> (beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x)).with_output_types(unicode))
            | 'PairWithOne' >> beam.Map(lambda x: (x, 1))
            | 'GroupAndSum' >> beam.CombinePerKey(sum))

In [7]:
# Format the counts into a PCollection of strings.
def format_result(word_count):
    (word, count) = word_count
    return '%s: %s' % (word, count)
    output = counts | 'Format' >> beam.Map(format_result)
    # Write the output using a "Write" transform that has side effects.
    # pylint: disable=expression-not-assigned
    output | WriteToText(known_args.output)

In [8]:
if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    run()

INFO:root:Running ((ref_AppliedPTransform_ReadFromText/Read_3)+((ref_AppliedPTransform_Split_4)+((ref_AppliedPTransform_PairWithOne_5)+(GroupAndSum/Precombine))))+(GroupAndSum/Group/Write)
INFO:root:start <DataOutputOperation GroupAndSum/Group/Write >
INFO:root:start <PGBKCVOperation GroupAndSum/Precombine >
INFO:root:start <DoOperation PairWithOne output_tags=['out']>
INFO:root:start <DoOperation Split output_tags=['out']>
INFO:root:start <ReadOperation ReadFromText/Read source=SourceBundle(weight=1.0, source=<apache_beam.io.textio._TextSource object at 0x0000000008EE2EB8>, start_position=None, stop_position=None)>
INFO:root:finish <ReadOperation ReadFromText/Read source=SourceBundle(weight=1.0, source=<apache_beam.io.textio._TextSource object at 0x0000000008EE2EB8>, start_position=None, stop_position=None), receivers=[ConsumerSet[ReadFromText/Read.out0, coder=WindowedValueCoder[FastPrimitivesCoder], len(consumers)=1]]>
INFO:root:finish <DoOperation Split output_tags=['out'], receiver

In [40]:
import pandas as pd
Test= pd.read_csv('data/beam-temp-output.txt-2c48bfe120b711e99b0474e6e20db312\\549f2477-f9da-487d-9b45-d6a5f85a4f90.output.txt')

In [41]:
Test.head()

               Appear: 1
0              pardon: 6
1           justicers: 1
2          ungovern'd: 1
3              vermin: 1
4             needful: 2
5               foul: 15
6                four: 1
7               hath: 52
8             protest: 1
9             nursery: 1
10              sleep: 8
11            hanging: 1
12          conjuring: 1
13            garters: 1
14           appetite: 2
15               use: 14
16               hate: 5
17              Until: 2
18           marching: 1
19              Plate: 1
20                gad: 1
21               Obey: 1
22              Thou: 47
23            poorest: 2
24            starv'd: 1
25            Enforce: 1
26               His: 18
27          Vengeance: 1
28           mutinies: 1
29              under: 9
...                  ...
4870             star: 3
4871             Like: 3
4872         respects: 1
4873       oppression: 1
4874        dismantle: 1
4875        frustrate: 1
4876           numb'd: 1
4877           chance: 5
