In [1]:
from pyflink.table import EnvironmentSettings, TableEnvironment
import os
import json
from pyflink.table.expressions import *
from pyflink.table.window import *

# Local development only: flag this session as local through the IS_LOCAL
# environment variable, which the environment-setup cell reads.
os.environ.update({"IS_LOCAL": "True"})

# Echo the absolute working directory the notebook is running from.
print(os.path.abspath(os.curdir))

# Reference material used while building this notebook:
#https://docs.aws.amazon.com/pt_br/managed-flink/latest/java/gs-python-createapp.html
#https://github.com/aws-samples/amazon-kinesis-data-analytics-examples/blob/master/python/FirehoseSink/streaming-firehose-sink.py
#https://github.com/aws-samples/amazon-kinesis-data-analytics-examples/blob/master/python/S3Sink/streaming-file-sink.py



/Users/viniciusdeoliveiramartucci/Documents/GitHub/martucci-glue-streaming/auxiliar-scripts/apache-flink/tutorial-aws-flink/kinesis-stream-to-firehose


In [2]:
# 1. Creates a Table Environment (streaming mode) shared by the later cells.
env_settings = EnvironmentSettings.in_streaming_mode()
table_env = TableEnvironment.create(env_settings)

# Statement set bound to the table environment created above.
statement_set = table_env.create_statement_set()

# Default location of the runtime properties file; replaced below for local runs.
APPLICATION_PROPERTIES_FILE_PATH = "/etc/flink/application_properties.json"  # on Kinesis Data Analytics

# IS_LOCAL is a flag: set this env var in your local environment.
# Plain truthiness would treat "False" or "0" as enabled, so values that
# explicitly mean "off" are recognised; any other non-empty value enables
# local mode, exactly as before.
is_local = os.environ.get("IS_LOCAL", "").strip().lower() not in (
    "",
    "0",
    "false",
    "no",
    "off",
)

if is_local:
    # only for local, overwrite variable to properties and pass in your jars delimited by a semicolon (;)
    print("Running Flink locally...")
    APPLICATION_PROPERTIES_FILE_PATH = "application_properties.json"  # local

    # The connector jars are expected in ./lib relative to the working directory.
    CURRENT_DIR = os.path.abspath(os.getcwd())
    table_env.get_config().set(
        "pipeline.jars",
        f"file:///{CURRENT_DIR}/lib/flink-sql-connector-aws-kinesis-firehose-4.1.0-1.17.jar;"
        f"file:///{CURRENT_DIR}/lib/flink-sql-connector-kinesis-4.1.0-1.17.jar"
    )


Running Flink locally...


In [3]:
def get_application_properties(path=None):
    """Load the application properties JSON document.

    Args:
        path: Optional path of the properties file. When omitted, the
            module-level ``APPLICATION_PROPERTIES_FILE_PATH`` is used, which
            matches the previous zero-argument behaviour.

    Returns:
        The parsed JSON content, or ``None`` (after printing a notice) when
        no regular file exists at the path.
    """
    if path is None:
        path = APPLICATION_PROPERTIES_FILE_PATH

    if not os.path.isfile(path):
        print('A file at "{}" was not found'.format(path))
        return None

    # Decode explicitly as UTF-8 so parsing does not depend on the platform's
    # default text encoding.
    with open(path, "r", encoding="utf-8") as file:
        return json.load(file)


def property_map(props, property_group_id):
    """Return the ``"PropertyMap"`` of the first group with the given id.

    ``props`` is scanned in order; the ``"PropertyMap"`` entry of the first
    element whose ``"PropertyGroupId"`` equals ``property_group_id`` is
    returned. The result is ``None`` when no element matches.
    """
    matching_maps = (
        group["PropertyMap"]
        for group in props
        if group["PropertyGroupId"] == property_group_id
    )
    # Lazy lookup: stops at the first match, like the early return in a loop.
    return next(matching_maps, None)

def create_source_table(table_name, stream_name, region, stream_initpos):
    """Build the CREATE TABLE statement for the 'kinesis' connector source.

    Args:
        table_name: Name of the table to create.
        stream_name: Value placed in the 'stream' option.
        region: Value placed in the 'aws.region' option.
        stream_initpos: Value placed in the 'scan.stream.initpos' option.

    Returns:
        The DDL text; the statement is not executed here.
    """
    # Values are interpolated by name; the template text itself is unchanged.
    ddl = f""" CREATE TABLE IF NOT EXISTS {table_name} (
                ticker VARCHAR(6),
                price DOUBLE,
                event_time TIMESTAMP(3),
                WATERMARK FOR event_time AS event_time - INTERVAL '5' SECOND

              )
              PARTITIONED BY (ticker)
              WITH (
                'connector' = 'kinesis',
                'stream' = '{stream_name}',
                'aws.region' = '{region}',
                'scan.stream.initpos' = '{stream_initpos}',
                'format' = 'json',
                'json.timestamp-format.standard' = 'ISO-8601'
              ) """
    return ddl


def create_print_table(table_name, stream_name=None, region=None, stream_initpos=None):
    """Build the CREATE TABLE statement for a 'print' connector table.

    Args:
        table_name: Name of the table to create.
        stream_name: Unused; accepted only so existing four-argument calls
            keep working.
        region: Unused; kept for backward compatibility.
        stream_initpos: Unused; kept for backward compatibility.

    Returns:
        The DDL text; the statement is not executed here.
    """
    # The DDL only references the table name, so the stream-related
    # parameters are optional and ignored.
    return f""" CREATE TABLE IF NOT EXISTS {table_name} (
                ticker VARCHAR(6),
                price DOUBLE,
                event_time TIMESTAMP(3),
                WATERMARK FOR event_time AS event_time - INTERVAL '5' SECOND

              )
              WITH (
                'connector' = 'print'
              ) """

def create_output_table(table_name, deliver_stream_name, region):
    """Build the CREATE TABLE statement for the 'firehose' connector sink.

    Args:
        table_name: Name of the table to create.
        deliver_stream_name: Value placed in the 'delivery-stream' option.
        region: Value placed in the 'aws.region' option.

    Returns:
        The DDL text; the statement is not executed here.
    """
    ddl_template = """ CREATE TABLE IF NOT EXISTS {table} (
                ticker VARCHAR(6),
                price DOUBLE,
                event_time TIMESTAMP(3)

              )
              WITH (
                  'connector' = 'firehose',
                  'delivery-stream' = '{delivery_stream}',
                  'aws.region' = '{aws_region}',
                  'format' = 'json',
                  'json.timestamp-format.standard' = 'ISO-8601'
              ) """
    # Fill the named placeholders with the caller's values.
    return ddl_template.format(
        table=table_name,
        delivery_stream=deliver_stream_name,
        aws_region=region,
    )

def create_fake_source_table(table_name):
    """Build the CREATE TABLE statement for a 'datagen' connector source.

    Args:
        table_name: Name of the table to create.

    Returns:
        The DDL text, with 'number-of-rows' fixed at '10'; the statement is
        not executed here.
    """
    ddl = f""" CREATE TABLE IF NOT EXISTS {table_name} (
                ticker VARCHAR(6),
                price DOUBLE,
                event_time TIMESTAMP(3),
                WATERMARK FOR event_time AS event_time - INTERVAL '5' SECOND

              )
              PARTITIONED BY (ticker)
              WITH (
                'connector' = 'datagen',
                'number-of-rows' = '10'
              ) """
    return ddl

def create_sink_table_s3(table_name, bucket_name):
    """Build the CREATE TABLE statement for a 'filesystem' connector sink on S3.

    Args:
        table_name: Name of the table to create.
        bucket_name: Bucket name used in the 's3a://<bucket>/' path option.

    Returns:
        The DDL text; the statement is not executed here.
    """
    # IF NOT EXISTS matches the other table builders in this notebook and
    # keeps the statement safe to re-run after the table is registered.
    return f""" CREATE TABLE IF NOT EXISTS {table_name} (
                ticker VARCHAR(6),
                price DOUBLE,
                event_time TIMESTAMP(3)
              )
              PARTITIONED BY (ticker)
              WITH (
                  'connector'='filesystem',
                  'path'='s3a://{bucket_name}/',
                  'format'='json',
                  'sink.partition-commit.policy.kind'='success-file',
                  'sink.partition-commit.delay' = '1 min'
              ) """

In [4]:
# Pipeline wiring (top-level notebook code, formerly the body of main()):
# read the application properties, register the tables, start the INSERT job.

# Application Property Keys
input_property_group_key = "consumer.config.0"
producer_property_group_key = "producer.config.0"

input_stream_key = "input.stream.name"
input_region_key = "aws.region"
input_starting_position_key = "flink.stream.initpos"

output_stream_key = "output.stream.name"
output_region_key = "aws.region"

# tables
input_table_name = "input_table"
output_table_name = "output_table"

# get application properties
props = get_application_properties()
if props is None:
    # Fail here with a clear message instead of an opaque TypeError later.
    raise FileNotFoundError(
        f'Application properties file "{APPLICATION_PROPERTIES_FILE_PATH}" was not found'
    )

input_property_map = property_map(props, input_property_group_key)
if input_property_map is None:
    raise ValueError(
        f'Property group "{input_property_group_key}" is missing from the application properties'
    )

output_property_map = property_map(props, producer_property_group_key)
if output_property_map is None:
    raise ValueError(
        f'Property group "{producer_property_group_key}" is missing from the application properties'
    )

input_stream = input_property_map[input_stream_key]
input_region = input_property_map[input_region_key]
stream_initpos = input_property_map[input_starting_position_key]

output_stream = output_property_map[output_stream_key]
output_region = output_property_map[output_region_key]

# 0. Creates fake source table from datagen
# NOTE(review): this table is registered but the INSERT below does not read it.
input_table_fake_name = "fake_source_table"
table_env.execute_sql(
    create_fake_source_table(input_table_fake_name)
)

# 2. Creates a source table from a Kinesis Data Stream
print(f" Creating source table {input_table_name} from source stream {input_stream} in region {input_region} using {stream_initpos}")
table_env.execute_sql(
    create_source_table(input_table_name, input_stream, input_region, stream_initpos)
)


# 3. Creates a print table to check data
# NOTE(review): this table is registered but the INSERT below does not write to it.
print_output_table = output_table_name + "_print"
print(f" Creating print table {print_output_table}.")
table_env.execute_sql(
    create_print_table(print_output_table, output_stream, output_region, stream_initpos)
)

# 4. Creates a sink table writing to a Kinesis Firehose Delivery Stream
print(f" Creating destination table {output_table_name} to destination stream {output_stream} in region {output_region}.")
table_env.execute_sql(
    create_output_table(output_table_name, output_stream, output_region)
)


# 5. Insert from Source to Destination
table_result = table_env.execute_sql(
    f"INSERT INTO {output_table_name} SELECT * FROM {input_table_name}"
)

# Blocks this cell until the job ends; presumably that means until the cell is
# interrupted when the source stream is unbounded - TODO confirm.
table_result.wait()


 Creating source table input_table from source stream martucci-kinesis-stream-firehose in region us-east-1 using LATEST
 Creating print table output_table from source stream martucci-kinesis-stream-firehose in region us-east-1.
 Creating destination table output_table to destination stream martucci-kinesis-delivery-stream in region us-east-1.


ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/Users/viniciusdeoliveiramartucci/Library/Python/3.9/lib/python/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/Users/viniciusdeoliveiramartucci/Library/Python/3.9/lib/python/site-packages/py4j/java_gateway.py", line 1217, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 