In [1]:
from pyflink.common import WatermarkStrategy, Row
from pyflink.common.typeinfo import Types
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.datastream.connectors import NumberSequenceSource
from pyflink.datastream.functions import RuntimeContext, MapFunction
from pyflink.datastream.state import ValueStateDescriptor

In [2]:
# 1. Create the StreamExecutionEnvironment. The later cells use it to build
#    the source stream (env.from_source) and to run the job (env.execute).
env = StreamExecutionEnvironment.get_execution_environment()

SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/opt/flink-1.15.0/lib/log4j-slf4j-impl-2.17.1.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/opt/hadoop-2.7.7/share/hadoop/common/lib/slf4j-log4j12-1.7.10.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]


In [3]:
# Custom stateful logic: a MapFunction that keeps a running count in Flink value state
class MyMapFunction(MapFunction):
    """Map function that numbers the records it sees using Flink value state.

    The counter lives in a ValueState registered under the name 'cnt'. In this
    notebook the function is applied after key_by, and the printed results
    show an independent running count for each key.
    """

    def open(self, runtime_context: RuntimeContext):
        """Obtain the 'cnt' value state from the runtime context.

        :param runtime_context: context supplied when the function is opened;
            used here only to look up the state handle.
        """
        print("MyMapFunction open")
        # Register a LONG-typed value state called 'cnt' and keep its handle.
        self.cnt_state = runtime_context.get_state(
            ValueStateDescriptor('cnt', Types.LONG()))
        print("Count state is ", self.cnt_state)

    def map(self, value):
        """Increment the stored counter and emit it with the record's first field.

        :param value: incoming record; only ``value[0]`` is read.
        :return: tuple ``(value[0], updated counter)``.
        """
        seen_so_far = self.cnt_state.value()
        print("MyMapFunction map", seen_so_far)
        # value() yields None until the state has been written once, so the
        # first record counts as 1.
        updated_total = 1 if seen_so_far is None else seen_so_far + 1
        self.cnt_state.update(updated_total)
        return value[0], updated_total

In [4]:
# 2. Create the data source: a NumberSequenceSource over the range 1 to 10.
#    The job output further down contains ten records, so both ends of the
#    range are emitted.
seq_num_source = NumberSequenceSource(1, 10)

In [5]:
# ds is the DataStream read from the number-sequence source; each element is
# declared as a LONG. (Earlier material used the Table API in batch mode.)
monotonous_watermarks = WatermarkStrategy.for_monotonous_timestamps()
ds = env.from_source(
    source=seq_num_source,
    watermark_strategy=monotonous_watermarks,
    source_name='seq_num_source',
    type_info=Types.LONG(),
)

In [6]:
# 3. Define the execution logic.
#    Stage 1: turn each number n into Row(n % 4, 1), so keys fall in 0..3.
#    Stage 2: partition the stream by the first field of that row.
#    Stage 3: apply the stateful MyMapFunction, which emits (key, count).
# NOTE: this cell rebinds `ds` to the transformed stream, so re-running it
# without re-running the source cell first would stack the stages again.
ds = (
    ds.map(
        lambda num: Row(num % 4, 1),
        output_type=Types.ROW([Types.LONG(), Types.LONG()]),
    )
    .key_by(lambda row: row[0])
    .map(
        MyMapFunction(),
        output_type=Types.TUPLE([Types.LONG(), Types.LONG()]),
    )
)

In [7]:
# 4. Attach a print sink to the stream. The call itself only returns a
#    DataStreamSink; the records show up in the output of the env.execute()
#    cell, each prefixed with "<n>> ".
ds.print()

<pyflink.datastream.data_stream.DataStreamSink at 0x7f47783d10d0>

In [8]:
# 5. Run the job built in the cells above. The open()/map() debug prints and
#    the sink's records appear in this cell's output, and the call returns a
#    JobExecutionResult.
env.execute()

MyMapFunction open
Count state is  <pyflink.fn_execution.state_impl.SynchronousValueRuntimeState object at 0x7f47782a2e50>
MyMapFunction open
Count state is  <pyflink.fn_execution.state_impl.SynchronousValueRuntimeState object at 0x7f4778230a90>
MyMapFunction open
Count state is  <pyflink.fn_execution.state_impl.SynchronousValueRuntimeState object at 0x7f4778250040>


Exception in thread read_grpc_client_inputs:
Traceback (most recent call last):
  File "/home/rps/miniconda3/lib/python3.8/threading.py", line 932, in _bootstrap_inner
    self.run()
  File "/home/rps/miniconda3/lib/python3.8/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/home/rps/miniconda3/lib/python3.8/site-packages/apache_beam/runners/worker/data_plane.py", line 598, in <lambda>
    target=lambda: self._read_inputs(elements_iterator),
  File "/home/rps/miniconda3/lib/python3.8/site-packages/apache_beam/runners/worker/data_plane.py", line 581, in _read_inputs
    for elements in elements_iterator:
  File "/home/rps/miniconda3/lib/python3.8/site-packages/grpc/_channel.py", line 426, in __next__
    return self._next()
  File "/home/rps/miniconda3/lib/python3.8/site-packages/grpc/_channel.py", line 826, in _next
    raise self
grpc._channel._MultiThreadedRendezvous: <_MultiThreadedRendezvous of RPC that terminated with:
	status = StatusCode.CANC

MyMapFunction mapMyMapFunction map None
MyMapFunction map 1
 None
MyMapFunction map None
MyMapFunction map 1
MyMapFunction map 2
MyMapFunction map None
MyMapFunction map 1
MyMapFunction map 2
MyMapFunction map 1
3> (3,1)
3> (3,2)
2> (1,1)
2> (1,2)
2> (1,3)
1> (2,1)
1> (0,1)
1> (2,2)
1> (2,3)
1> (0,2)


<pyflink.common.job_execution_result.JobExecutionResult at 0x7f4778241e50>