In [None]:
# ! pip install --upgrade apache_beam
# ! pip install pyarrow

In [None]:
import apache_beam as beam
import avro
import json

from apache_beam.io.filesystem import CompressionTypes


In [None]:
! rm ../data/output*

In [None]:
# Sink settings shared by the text and Avro writers further down.
shard_name_template = "_SS-NN_"  # passed as shard_name_template to both writers
num_shards = 3  # pinned for the demo; normally left for Beam to decide
append_trailing_newlines = True  # only used by the text writer
file_path_prefix = "../../data/output"  # common prefix for every output shard

In [None]:
# CSV --> CSV
# Reads the input file (header row skipped), keeps rows whose column 1 equals
# 44 and whose column 7 is negative, then writes one "<col 0>: <COL 2>" line
# per surviving row (column 2 upper-cased) to sharded text files.
# NOTE(review): the plain split(",") assumes the data has no quoted fields
# containing commas -- confirm against data.csv.
# NOTE(review): the header suggests column 0 is a name and column 2 an
# address -- confirm against the input file.
p1 = beam.Pipeline()
(p1
 | "Read CSV" >> beam.io.ReadFromText("../../data/data.csv", skip_header_lines=1)
 | "Split fields" >> beam.Map(lambda r: r.split(","))
 | "Keep col1 == 44" >> beam.Filter(lambda r: int(r[1]) == 44)
 | "Keep col7 < 0" >> beam.Filter(lambda r: float(r[7]) < 0)
 | "Format line" >> beam.Map(lambda r: "{0}: {1}".format(r[0], r[2].upper()))
 | "Write text shards" >> beam.io.WriteToText(
     file_path_prefix=file_path_prefix,
     file_name_suffix="suffix.csv",
     append_trailing_newlines=append_trailing_newlines,
     num_shards=num_shards,
     shard_name_template=shard_name_template,
     compression_type=CompressionTypes.AUTO,  # default
     header="NAME: ADDRESS",
     footer="***************\n"
 )
 )
# run() hands back a PipelineResult; wait_until_finish() blocks until the job
# is done so the shards exist before the next cell tries to read them.
p1.run().wait_until_finish()


In [None]:
! cat ../data/output*.csv

In [None]:
# CSV --> AVRO
# Same read / split / filter / format steps as the CSV --> CSV pipeline (the
# two filters are combined into one predicate here), but each formatted
# string is written as an Avro record with a plain "string" schema.
# NOTE(review): `avro.schema` is reached through a bare `import avro`; this
# relies on the submodule already being loaded -- confirm, or import
# `avro.schema` explicitly in the imports cell.
# NOTE(review): use_fastavro=False together with an avro-parsed schema targets
# the avro-library sink; newer Beam releases may expect a plain dict schema
# instead -- confirm against the installed Beam version.
p2 = beam.Pipeline()
(p2
 | "Read CSV" >> beam.io.ReadFromText("../../data/data.csv", skip_header_lines=1)
 | "Split fields" >> beam.Map(lambda r: r.split(","))
 | "Keep col1 == 44 and col7 < 0" >> beam.Filter(
     lambda r: int(r[1]) == 44 and float(r[7]) < 0)
 | "Format line" >> beam.Map(lambda r: "{0}: {1}".format(r[0], r[2].upper()))
 | "Write Avro shards" >> beam.io.WriteToAvro(
     file_path_prefix=file_path_prefix,
     schema=avro.schema.parse(json.dumps({
         "namespace": "output.avro",
         "type": "string"
     })),
     file_name_suffix="suffix.avro",
     num_shards=num_shards,
     shard_name_template=shard_name_template,
     mime_type="application/x-avro",
     use_fastavro=False
 )
 )
# run() hands back a PipelineResult; wait_until_finish() blocks until the job
# is done so the Avro shards are complete before any later cell uses them.
p2.run().wait_until_finish()


In [None]:
def add(x):
    """Return a one-argument function that computes ``x + y`` for its argument ``y``."""
    def add_to(y):
        return x + y
    return add_to
def multiply(x):
    """Return a one-argument function that computes ``x * y`` for its argument ``y``."""
    def multiply_by(y):
        return x * y
    return multiply_by
def print_result(r):
    """Print ``r`` to stdout and return ``None`` (the value ``print`` returns)."""
    print(r)
    return None


# Minimal arithmetic pipeline: start from the single element 7, add 3,
# multiply by 5, and print the result.
p = beam.Pipeline()

(
    p
    | beam.Create([7])
    | "+ 3" >> beam.Map(add(3))
    | "* 5" >> beam.Map(multiply(5))
    | "Print result" >> beam.Map(print_result)
)

# run() hands back a PipelineResult; wait_until_finish() blocks until the job
# is done so the printed output is produced before the cell returns.
p.run().wait_until_finish()