In [None]:
import apache_beam as beam

# Pipeline object the transforms below are applied to; it is started by the
# p.run() call at the end of this cell.
p = beam.Pipeline()

# State names handed to SplitRow as its excluded-states argument; rows whose
# state appears here are filtered out. The values are hard-coded in this cell
# (the name suggests they stand in for data loaded from a file).
side_input_retrieved_from_file = [
    "Arizona", "Carolina", "Colorado", "Delaware",
    "Georgia", "Indiana", "Kentucky", "Montana",
    "Massachusetts", "Michigan", "Oregon", "Pennsylvania",
    "South Carolina", "North Carolina", "Utah", "Texas",
    "Wyoming",
]


class SplitRow(beam.DoFn):
    """Split a comma-separated line into fields and drop excluded states.

    The state name is read from the fifth field (index 4) of each line.
    """

    # Index of the field that holds the state name.
    _STATE_INDEX = 4

    def process(self, element, excluded_states):
        """Parse one line and emit its fields unless its state is excluded.

        Args:
            element: One line of text with comma-separated fields.
            excluded_states: Container of state names whose rows are dropped.

        Returns:
            A one-element list holding the list of fields, or ``None`` when
            the row is filtered out or is too short to contain a state field.
        """
        row = element.split(",")

        # Blank or truncated lines have no state field; skip them instead of
        # raising IndexError on the lookup below.
        if len(row) <= self._STATE_INDEX:
            return None

        if row[self._STATE_INDEX] in excluded_states:
            return None

        return [row]


class ProcessUsersByState(beam.DoFn):
    """Route parsed rows to the main output or to tagged outputs.

    Rows whose fifth field equals ``state`` are emitted untagged; every other
    row is emitted under the "users_other" tag. Independently of that, rows
    whose first field starts with ``start_char`` are also emitted under the
    "names_m" tag.
    """

    def process(self, element, state, start_char):
        """Yield ``element`` on each output it belongs to.

        Args:
            element: Indexable row; index 4 is compared with ``state`` and
                index 0 is checked against ``start_char``.
            state: Value that selects rows for the untagged output.
            start_char: Prefix that selects rows for the "names_m" output.

        Yields:
            ``element`` itself, or a ``TaggedOutput`` wrapping it.
        """
        tagged = beam.pvalue.TaggedOutput

        # Every row goes to exactly one of: untagged output / "users_other".
        if element[4] != state:
            yield tagged("users_other", element)
        else:
            yield element

        # The prefix check is independent of the state routing above.
        if not element[0].startswith(start_char):
            return
        yield tagged("names_m", element)


# Read the CSV, drop rows from excluded states, then split the remaining rows
# into one main output ("users_minnesota") and two tagged outputs.
users = (
    p
    | "Readt data from CSV file"
    >> beam.io.ReadFromText("../../data/data.csv", skip_header_lines=1)
    | "Split row and remove excluded states"
    >> beam.ParDo(SplitRow(), side_input_retrieved_from_file)
    | "Prepare side outputs"
    >> beam.ParDo(
        ProcessUsersByState(), state="Minnesota", start_char="M"
    ).with_outputs("users_other", "names_m", main="users_minnesota")
)

# Pull the individual outputs off the multi-output result by name.
users_minnesota = users.users_minnesota
users_other = users.users_other
names_m = users.names_m

# NOTE(review): the elements written here are the field lists produced by
# SplitRow, not re-joined CSV lines - confirm the resulting file format is
# what is wanted for a ".csv" suffix.
users_minnesota | "Output with Minnesota users" >> beam.io.WriteToText(
    "../../data/users_minnesota", file_name_suffix=".csv")
users_other | "Output with valid users from other states" >> beam.io.WriteToText(
    "../../data/users_other_states", file_name_suffix=".csv")
names_m | "Output with valid users with names starting with M" >> beam.io.WriteToText(
    "../../data/user_names_starting_with_m", file_name_suffix=".csv")

# Block until the pipeline has finished so the output files are complete
# before a later cell tries to read them.
p.run().wait_until_finish()


In [None]:
!cat ../data/user_names_starting_with_m-*.csv