In [3]:
import apache_beam as beam


class SplitRow(beam.DoFn):
    """DoFn that breaks one comma-separated text line into its fields."""

    def process(self, element):
        """Split ``element`` on every comma and emit the pieces as one list.

        The split is a plain ``str.split(",")``; quoting and escaped commas
        are not interpreted.

        Args:
            element: A single line of text.

        Returns:
            A one-item list whose only item is the list of fields, so each
            input line yields exactly one output element.
        """
        fields = element.split(",")
        return [fields]


class FilterByVirginia(beam.DoFn):
    """DoFn that keeps rows whose field at index 4 is "Virginia".

    Matching rows are emitted with that field dropped; every other row is
    discarded. (Index 4 is presumably the state column - confirm against the
    CSV header.)
    """

    def process(self, element):
        """Emit ``element`` without its index-4 field when that field matches.

        Args:
            element: A sequence of row fields (e.g. the list produced by
                ``SplitRow``).

        Returns:
            A one-item list holding a new row without the index-4 field when
            that field equals "Virginia"; otherwise ``None`` (no output).
        """
        # Blank or short rows have no index 4; treat them as non-matching
        # instead of failing the whole pipeline with an IndexError.
        if len(element) > 4 and element[4] == "Virginia":
            # Build a new row rather than calling element.remove(...):
            #  * Beam requires that a DoFn never mutates its input element.
            #  * remove() deletes the first "Virginia" anywhere in the row,
            #    which is not necessarily the field at index 4.
            return [element[:4] + element[5:]]


# Read the CSV (skipping its header line), split each line into fields, keep
# the rows accepted by FilterByVirginia, and print them.
# The context manager runs the pipeline and waits for it to finish when the
# block exits, so no explicit p.run() call is needed.
with beam.Pipeline() as p:
    (
        p
        | "Read CSV" >> beam.io.ReadFromText("../../data/data.csv", skip_header_lines=1)
        | "Split CSV" >> beam.ParDo(SplitRow())
        | "Filter by Virginia" >> beam.ParDo(FilterByVirginia())
        | "Print rows" >> beam.Map(print)
    )




['Sally Perez', '29', '366 Alyssa Landing', 'East Nicole', '81829', '21.695083', '-74.639022']
['Katherine Brown', '44', '54781 Lisa Ridge Apt. 024', 'Maryberg', '49378', '176.898896', '-78.0468395']
['Dakota Johns', '35', '4958 Ferguson Estate Apt. 028', 'Washingtonton', '43020', '-38.787624', '5.6548105']
['Chase Nguyen', '23', '42875 Steven Plain', 'Rebeccafurt', '57347', '1.338469', '88.8614485']
['Shawn Medina', '28', '3779 Mark Oval Suite 314', 'Port Wendyville', '13774', '11.908235', '-57.610376']
['Ashley Young', '65', '93845 Maxwell Ridges', 'Crossville', '14322', '-179.118863', '87.081446']
['Austin Bailey', '60', '49198 Judy Stravenue Suite 529', 'Gonzaleshaven', '80156', '-131.177666', '-41.618640']
['Eddie Anderson', '53', '29452 Brandy Estate Apt. 333', 'Woodsmouth', '07592', '115.144444', '23.936091']
['Kenneth Wallace', '65', '73898 Smith Prairie', 'New Edward', '34955', '-116.117139', '14.131123']
['Christine Medina', '64', '4335 Mary Forges Apt. 379', 'South Bryan', '

<apache_beam.runners.portability.fn_api_runner.fn_runner.RunnerResult at 0x120eb96d8>

In [13]:
class SplitWords(beam.DoFn):
    """DoFn that breaks a line of text into normalised words."""

    def process(self, element):
        """Split ``element`` on single spaces and normalise every piece.

        Args:
            element: A line of text.

        Returns:
            A lazy iterator of cleaned words, one per space-separated piece
            (consecutive spaces therefore produce empty strings).
        """
        pieces = element.split(" ")
        return map(self.cleanData, pieces)

    def cleanData(self, w):
        """Return ``w`` lower-cased with commas, periods and question marks removed."""
        unwanted = (",", ".", "?")
        kept = "".join(ch for ch in w if ch not in unwanted)
        return kept.lower()


class FindWord(beam.DoFn):
    """DoFn that lets through only a fixed set of words."""

    def process(self, element):
        """Emit ``element`` when it is one of the expected words.

        The comparison is an exact match, so a variant such as a plural form
        does not match its singular entry.

        Args:
            element: A single word.

        Returns:
            ``[element]`` when the word is expected, otherwise ``None``
            (no output).
        """
        expected = ("peter", "piper", "picked", "peck", "pepper")
        if element not in expected:
            return None
        return [element]


class MapToTuple(beam.DoFn):
    """DoFn that pairs each element with the count 1."""

    def process(self, element):
        """Return a one-item list containing ``(element, 1)``.

        In this notebook the pair serves as a key/value input for the
        following ``GroupByKey`` step.
        """
        keyed = (element, 1)
        return [keyed]


class CalculateFrequency(beam.DoFn):
    """DoFn that collapses a key's grouped values into their sum."""

    def process(self, element):
        """Sum the values attached to a key.

        Args:
            element: A ``(key, values)`` pair where ``values`` is an iterable
                of numbers (in this notebook, the output of ``GroupByKey``).

        Returns:
            A one-item list containing ``(key, total)``.
        """
        key, values = element
        total = sum(values)
        return [(key, total)]


class PrepareOutput(beam.DoFn):
    """DoFn that renders a key/value pair as a printable line."""

    def process(self, element):
        """Format a pair as ``"<key>: <value>"``.

        Args:
            element: A two-item ``(key, value)`` pair.

        Returns:
            A one-item list containing the formatted string.
        """
        key, value = element
        line = "{0}: {1}".format(key, value)
        return [line]


# Count how often each expected word occurs in the text file:
# read lines -> split into cleaned words -> keep expected words ->
# pair each word with 1 -> group by word -> sum per word -> format -> print.
# The context manager runs the pipeline and waits for it to finish when the
# block exits, so no explicit p.run() call is needed.
with beam.Pipeline() as p:
    (
        p
        | "Read data" >> beam.io.ReadFromText("../../data/peter_piper.txt")
        | "Split words by space" >> beam.ParDo(SplitWords())
        | "Filter expected words" >> beam.ParDo(FindWord())
        | "Map to a tuple" >> beam.ParDo(MapToTuple())
        | "Group by key" >> beam.GroupByKey()
        | "Combine using word as key" >> beam.ParDo(CalculateFrequency())
        | "Prepare output" >> beam.ParDo(PrepareOutput())
        | "Print output" >> beam.Map(print)
    )




peter: 4
piper: 4
picked: 4
peck: 4


<apache_beam.runners.portability.fn_api_runner.fn_runner.RunnerResult at 0x123ac8630>