In [1]:
# your python code here

# SHIFT + ENTER to run the code
# the last expression result shall be printed by default or use print statement

30

In [4]:
from pyflink.table import EnvironmentSettings, TableEnvironment
# Shift + Enter to run the cell

In [5]:
# Flink has two environments
# Local Environment - current setup; Flink Client, Job Manager and Task Manager all run in a single JRE
# Remote Environment - production setup; Flink Client, Job Manager and Task Manager run in different JREs (distributed)

# 3 flink components
# 1. Flink Client, 2. Job Manager, 3. Task Manager

In [7]:
# Create the environment settings for batch mode.
# Batch mode: works with historic or static data.
# Data is loaded once, so the job/tasks can be optimized heavily to get the highest performance compared to stream mode.
# Heavy optimization is applied.
# Produces a single output at the end of processing [NOT INCREMENTAL OUTPUT].
# In batch mode the function automatically creates either a Local or a Remote environment based on
# the flink run command [discussed later].
env = EnvironmentSettings.in_batch_mode()

# Create the table environment from the batch settings
table_env = TableEnvironment.create(env)

SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/opt/flink-1.15.3/lib/log4j-slf4j-impl-2.17.1.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/opt/hadoop-2.7.7/share/hadoop/common/lib/slf4j-log4j12-1.7.10.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]


In [8]:
# Hardcoded sample data.
# [] is a Python list (mutable, can change); () is a tuple (immutable, cannot change).
# Each tuple is one record.
data = [
    ("Joe", 28),
    ("Mary", 34),
    ("Venkat", 40),
]  # 3 records
columns = ["name", "age"]

# Create a virtual table (view) from the in-memory records and the column names
employee_table = table_env.from_elements(data, columns)

In [9]:
# Flink always works with structured data, i.e. data + metadata.
# Metadata = schema: columns, data types, etc.

# Flink can also infer the schema from the data.
# Print the inferred schema.
employee_table.print_schema()

(
  `name` STRING,
  `age` BIGINT
)


In [10]:
# pandas support is built in: table data can be converted into a pandas DataFrame

In [11]:
print("before")
# Flink has catalogs; a catalog is composed of many databases, and databases hold tables.
# By default we get a default catalog and a default database.
print(table_env.list_tables())

# employee_table is a Python object reference for the Table API, where we can use
# filter, sort, join, etc. To use Flink SQL instead, the table must be registered
# in the default catalog so that statements like SELECT * FROM employees2 resolve.

# Whether we use the Python/Java/Scala Table API or SQL, both yield the same performance:
# under the hood, the API/SQL is converted into a DATA FLOW GRAPH.

# Register the virtual table under the name employees2 for Flink SQL.
# create_temporary_view is used because register_table is deprecated since Flink 1.10.
# employees2 is created in the default catalog, default database.
table_env.create_temporary_view("employees2", employee_table)

print("after register")
print(table_env.list_tables())

before
[]
after register
['employees2']


In [16]:
# Now we can run queries in SQL syntax.
# table2 is a second virtual table.
# Lazy/delayed evaluation: this will NOT RUN the QUERY now.
# Flink parses the SQL and analyses the syntax only; it does not run the SQL.
# It derives the schema for the table2 object.
# Parsing SQL/API calls and creating/deriving SCHEMAs is all done in the Flink Client component
# before the Data Flow Graph is submitted to the Job Manager.
table2 = table_env.sql_query("SELECT name,  age from employees2")
table2.print_schema()

(
  `name` STRING,
  `age` BIGINT
)


In [21]:
# The SQL API works with temp views, temp tables and tables registered in the table environment.
# SQL is parsed, converted into an AST, and then a Data Flow Graph is generated.
table3 = table_env.sql_query("SELECT name,age FROM employees2 WHERE age <= 30")
table3.print_schema()
table3.execute().print()
# to_pandas is an action: it runs/executes the query on the Job/Task Manager
table3.to_pandas() # the last expression of the cell is displayed

(
  `name` STRING,
  `age` BIGINT
)
+--------------------------------+----------------------+
|                           name |                  age |
+--------------------------------+----------------------+
|                            Joe |                   28 |
+--------------------------------+----------------------+
1 row in set


Unnamed: 0,name,age
0,Joe,28


In [18]:
# The SQL API is also optimized.
# explain() runs on the Flink Client and uses only metadata and sources: no query is run, no job is submitted.
# The Job/Task Manager is not used.
print(table3.explain())

== Abstract Syntax Tree ==
LogicalProject(name=[$0], age=[$1])
+- LogicalFilter(condition=[<=($1, 30)])
   +- LogicalTableScan(table=[[default_catalog, default_database, Unregistered_TableSource_33397120, source: [PythonInputFormatTableSource(name, age)]]])

== Optimized Physical Plan ==
Calc(select=[name, age], where=[<=(age, 30)])
+- LegacyTableSourceScan(table=[[default_catalog, default_database, Unregistered_TableSource_33397120, source: [PythonInputFormatTableSource(name, age)]]], fields=[name, age])

== Optimized Execution Plan ==
Calc(select=[name, age], where=[(age <= 30)])
+- LegacyTableSourceScan(table=[[default_catalog, default_database, Unregistered_TableSource_33397120, source: [PythonInputFormatTableSource(name, age)]]], fields=[name, age])



In [19]:
# Flow Graph
# Source ==> Operator 1 ==> Operator 2 ==> Sink

In [20]:
# Table API from Python; typically good for unregistered tables / query results / temp views
from pyflink.table.expressions import col

# select() is the Table API counterpart of SQL SELECT and returns a temp view.
# execute() parses the API calls (select, filter, ...), generates the AST and the
# Data Flow Graph (DFG), then optimizes and runs it.
table4 = employee_table.select(col("name"), col("age"))

# Spark has action methods; execute() and to_pandas() are similar to those
table4.execute().print()
table4.to_pandas()

+--------------------------------+----------------------+
|                           name |                  age |
+--------------------------------+----------------------+
|                            Joe |                   28 |
|                           Mary |                   34 |
|                         Venkat |                   40 |
+--------------------------------+----------------------+
3 rows in set


Unnamed: 0,name,age
0,Joe,28
1,Mary,34
2,Venkat,40


In [22]:
# Table API: keep only the rows whose age is at most 30

result5 = employee_table.filter(col("age") <= 30)
result5.execute().print()

+--------------------------------+----------------------+
|                           name |                  age |
+--------------------------------+----------------------+
|                            Joe |                   28 |
+--------------------------------+----------------------+
1 row in set


In [23]:
# Fluent API: calling one API on top of the result of another.
# The chain is wrapped in parentheses so it can span several lines
# without backslash line continuations.
result_table = (
    employee_table
    .select(col("name"), col("age"))
    .filter(col("age") <= 30)
)

result_table.print_schema()
result_table.execute().print()

(
  `name` STRING,
  `age` BIGINT
)
+--------------------------------+----------------------+
|                           name |                  age |
+--------------------------------+----------------------+
|                            Joe |                   28 |
+--------------------------------+----------------------+
1 row in set


In [25]:
# Using the API from Java, Scala or Python, or a SQL statement directly, brings the same result.
# At runtime all API/SQL is converted into an AST, then into a Data Flow Graph (DFG), and then executed inside the Job/Task Manager.
print(result_table.explain()) # Table API, but the optimized plans are the SAME as for the SQL query

== Abstract Syntax Tree ==
LogicalFilter(condition=[<=($1, 30)])
+- LogicalProject(name=[$0], age=[$1])
   +- LogicalTableScan(table=[[default_catalog, default_database, Unregistered_TableSource_33397120, source: [PythonInputFormatTableSource(name, age)]]])

== Optimized Physical Plan ==
Calc(select=[name, age], where=[<=(age, 30)])
+- LegacyTableSourceScan(table=[[default_catalog, default_database, Unregistered_TableSource_33397120, source: [PythonInputFormatTableSource(name, age)]]], fields=[name, age])

== Optimized Execution Plan ==
Calc(select=[name, age], where=[(age <= 30)])
+- LegacyTableSourceScan(table=[[default_catalog, default_database, Unregistered_TableSource_33397120, source: [PythonInputFormatTableSource(name, age)]]], fields=[name, age])

