In [None]:
import apache_beam as beam


class SplitRow(beam.DoFn):
    """Turn one comma-separated text line into a list of its fields."""

    def process(self, element):
        """Split ``element`` on commas and emit the fields as one row.

        Args:
            element: A single line of text.

        Returns:
            A one-item list whose only item is the list of fields, so the
            whole row is emitted as a single output element.
        """
        fields = element.split(",")
        return [fields]


class FilterByVirginia(beam.DoFn):
    """Keep rows whose index-4 field is "Virginia", dropping that field."""

    def process(self, element):
        """Emit ``element`` without its index-4 field when it is "Virginia".

        Args:
            element: A list of field values for one row (for example, the
                result of splitting a CSV line).

        Returns:
            A one-item list holding a new row list with the index-4 field
            removed, or ``None`` (no output) when the row does not match or
            has fewer than five fields.
        """
        # A row too short to have an index-4 field cannot match; skip it
        # rather than raising IndexError.
        if len(element) <= 4 or element[4] != "Virginia":
            return None
        # Build a new list instead of calling element.remove("Virginia"):
        # remove() mutates the input element (which a DoFn must not do) and
        # deletes the *first* "Virginia" in the row, which is not necessarily
        # the index-4 field that was just tested.
        return [element[:4] + element[5:]]


# Build the pipeline inside a context manager: leaving the ``with`` block
# runs the pipeline and waits for it to finish, whereas a bare ``p.run()``
# only returns a result handle and does not wait for completion.
with beam.Pipeline() as p:
    (
        p
        # Read the CSV file line by line, skipping the header row.
        | "Read CSV" >> beam.io.ReadFromText(
            "../../data/data.csv", skip_header_lines=1
        )
        | "Split CSV" >> beam.ParDo(SplitRow())
        | "Filter by Virginia" >> beam.ParDo(FilterByVirginia())
        | "Print rows" >> beam.Map(print)
    )


In [None]:
class SplitWords(beam.DoFn):
    """Break a line of text into cleaned, lowercased words."""

    def process(self, element):
        """Split ``element`` on single spaces and clean every piece.

        Args:
            element: A single line of text.

        Returns:
            A lazy iterable yielding one cleaned word per space-separated
            piece of the line.
        """
        pieces = element.split(" ")
        return (self.cleanData(piece) for piece in pieces)

    def cleanData(self, w):
        """Return ``w`` lowercased with ",", "." and "?" removed."""
        # One translate pass deletes all three punctuation characters.
        without_punctuation = w.translate(str.maketrans("", "", ",.?"))
        return without_punctuation.lower()


class FindWord(beam.DoFn):
    """Pass through only the words that belong to a fixed list."""

    def process(self, element):
        """Emit ``element`` when it is one of the expected words.

        Args:
            element: A single word.

        Returns:
            A one-item list containing ``element`` if it is an expected
            word; otherwise ``None``, which produces no output.
        """
        expected = ("peter", "piper", "picked", "peck", "pepper")
        if element not in expected:
            return None
        return [element]


class MapToTuple(beam.DoFn):
    """Pair every element with the count 1."""

    def process(self, element):
        """Emit ``(element, 1)`` as a single output.

        Args:
            element: The value to use as the first item of the pair.

        Returns:
            A one-item list containing the tuple ``(element, 1)``.
        """
        pair = (element, 1)
        return [pair]


class CalculateFrequency(beam.DoFn):
    """Collapse a key and its grouped values into the key and their sum."""

    def process(self, element):
        """Sum the values attached to a key.

        Args:
            element: A two-item ``(key, values)`` pair where ``values`` is
                an iterable of numbers.

        Returns:
            A one-item list containing ``(key, sum(values))``.
        """
        key, values = element
        total = sum(values)
        return [(key, total)]


class PrepareOutput(beam.DoFn):
    """Render a two-item pair as a ``first: second`` text line."""

    def process(self, element):
        """Format a pair for display.

        Args:
            element: A two-item pair.

        Returns:
            A one-item list containing the two items joined by ": ".
        """
        label, value = element
        line = "{0}: {1}".format(label, value)
        return [line]


# Build the pipeline inside a context manager: leaving the ``with`` block
# runs the pipeline and waits for it to finish, whereas a bare ``p.run()``
# only returns a result handle and does not wait for completion.
with beam.Pipeline() as p:
    (
        p
        | "Read data" >> beam.io.ReadFromText("../../data/peter_piper.txt")
        | "Split words by space" >> beam.ParDo(SplitWords())
        | "Filter expected words" >> beam.ParDo(FindWord())
        | "Map to a tuple" >> beam.ParDo(MapToTuple())
        # Gather all the 1s emitted for each word under that word's key.
        | "Group by key" >> beam.GroupByKey()
        | "Combine using word as key" >> beam.ParDo(CalculateFrequency())
        | "Prepare output" >> beam.ParDo(PrepareOutput())
        | "Print output" >> beam.Map(print)
    )
