In [17]:
import apache_beam as beam

p = beam.Pipeline()

side_input_retrieved_from_file = [
    "Arizona",
    "Carolina",
    "Colorado",
    "Delaware",
    "Georgia",
    "Indiana",
    "Kentucky",
    "Montana",
    "Massachusetts",
    "Michigan",
    "Oregon",
    "Pennsylvania",
    "South Carolina",
    "North Carolina",
    "Utah",
    "Texas",
    "Wyoming"
]


class SplitRow(beam.DoFn):
    def process(self, element, excluded_states):
        row = element.split(",")
        state = row[4]
        if state not in excluded_states:
            return [row]


class ProcessUsersByState(beam.DoFn):
    def process(self, element, state, start_char):
        if element[4] == state:
            yield element
        else:
            yield beam.pvalue.TaggedOutput("users_other", element)

        if element[0].startswith(start_char):
            yield beam.pvalue.TaggedOutput("names_m", element)


users = (
    p
    | "Readt data from CSV file" >> beam.io.ReadFromText("../../data/data.csv", skip_header_lines=1)
    | "Split row and remove excluded states" >> beam.ParDo(SplitRow(), side_input_retrieved_from_file)
    | "Prepare side outputs" >> beam.ParDo(ProcessUsersByState(), state="Minnesota", start_char="M").with_outputs("users_other", "names_m", main="users_minnesota")
)

users_minnesota = users.users_minnesota
users_other = users.users_other
names_m = users.names_m

users_minnesota | "Output with Minnesota users" >> beam.io.WriteToText(
    "../../data/users_minnesota", ".csv")
users_other | "Output with valid users from other states" >> beam.io.WriteToText(
    "../../data/users_other_states", ".csv")
names_m | "Output with valid users with names starting with M" >> beam.io.WriteToText(
    "../../data/user_names_starting_with_m", ".csv")

p.run()




<apache_beam.runners.portability.fn_api_runner.fn_runner.RunnerResult at 0x12a8d2cf8>

In [18]:
!cat ../data/user_names_starting_with_m-*.csv

['Mark Daniels', '48', '3082 Walker Roads', 'Webbburgh', 'Ohio', '18701', '80.283513', '27.1533405']
['Melissa Richardson', '69', '2000 Beard Groves', 'Ruizfort', 'West Virginia', '70878', '-54.933020', '-56.1475135']
['Maria Johnson', '41', '1564 Hall Forks', 'North Timothyhaven', 'Idaho', '97875', '36.093994', '86.658500']
['Maria Pham', '36', '385 Lisa Roads Suite 840', 'New Jonathan', 'Tennessee', '03741', '-70.544170', '-61.7980625']
['Michael Johnston MD', '56', '074 James Heights', 'East Melissa', 'New Mexico', '84620', '31.804817', '-64.5267475']
['Michelle Guzman', '74', '040 Ashley Plain', 'East Gabriel', 'Mississippi', '63323', '136.243095', '-58.3616125']
['Morgan Heath', '44', '035 Edward Views Apt. 016', 'Andersonview', 'New Hampshire', '27516', '-26.058075', '58.475900']
['Matthew Mendoza', '59', '44424 Thomas Trail', 'Johnport', 'Maryland', '76879', '-49.449148', '-30.5279105']
['Mary Woods', '66', '151 Cook Streets Apt. 292', 'South Cristian', 'Idaho', '45717', '174.16