In [1]:
from pyflink.table import EnvironmentSettings, TableEnvironment
from pyflink.table.expressions import col
# Shift + Enter to run the cell

In [2]:
# Environment settings for batch (bounded) execution.
env = EnvironmentSettings.in_batch_mode()

# The table environment is the entry point for both the Table API and SQL.
table_env = TableEnvironment.create(environment_settings=env)

In [3]:
# Hard-coded sample data: 3 records.
# [...] is a Python list  - mutable, can be changed after creation.
# (...) is a Python tuple - immutable, cannot be changed after creation.
data = [
    ("Joe", 28),
    ("Mary", 34),
    ("Venkat", 40),
]
columns = ["name", "age"]

# Build a virtual table (view) from the in-memory records.
employee_table = table_env.from_elements(data, schema=columns)

In [4]:
# Flink always works with structured data, i.e. data plus metadata.
# Metadata = schema: column names, data types, etc.

# Flink can also infer the schema from the data (only column names were given above).
# Print the inferred schema.
employee_table.print_schema()

(
  `name` STRING,
  `age` BIGINT
)


In [5]:
# pandas support is built in: to_pandas() converts a table's data into a pandas DataFrame

In [6]:
print("before")
print(table_env.list_tables())

# Register the virtual table under the name "employees2" so Flink SQL can
# query it. It is created in the default catalog / default database.
# register_table() is deprecated; create_temporary_view() is its replacement.
# Drop any view left over from a previous run first so this cell can be
# re-executed without an "already exists" error (no-op on a fresh kernel).
table_env.drop_temporary_view("employees2")
table_env.create_temporary_view("employees2", employee_table)

print ("after register")
print(table_env.list_tables())

before
[]
after register
['employees2']


In [7]:
# With the view registered, queries can be written in SQL syntax.
# sql_query() returns another virtual table (table2).
projection_sql = "SELECT name,age from employees2"
table2 = table_env.sql_query(projection_sql)
table2.print_schema()

(
  `name` STRING,
  `age` BIGINT
)


In [8]:
# The SQL API can query temp views, temp tables, and tables registered in
# the table environment.
age_filter_sql = "SELECT name,age FROM employees2 WHERE age <= 30"
table3 = table_env.sql_query(age_filter_sql)
table3.print_schema()
table3.to_pandas()  # last expression in the cell, so the notebook displays it

(
  `name` STRING,
  `age` BIGINT
)


Unnamed: 0,name,age
0,Joe,28


In [9]:
# SQL queries are optimized as well; explain() returns the plans as text.
sql_filter_plan = table3.explain()
print(sql_filter_plan)

== Abstract Syntax Tree ==
LogicalProject(name=[$0], age=[$1])
+- LogicalFilter(condition=[<=($1, 30)])
   +- LogicalTableScan(table=[[default_catalog, default_database, Unregistered_TableSource_770887563, source: [PythonInputFormatTableSource(name, age)]]])

== Optimized Physical Plan ==
Calc(select=[name, age], where=[<=(age, 30)])
+- LegacyTableSourceScan(table=[[default_catalog, default_database, Unregistered_TableSource_770887563, source: [PythonInputFormatTableSource(name, age)]]], fields=[name, age])

== Optimized Execution Plan ==
Calc(select=[name, age], where=[(age <= 30)])
+- LegacyTableSourceScan(table=[[default_catalog, default_database, Unregistered_TableSource_770887563, source: [PythonInputFormatTableSource(name, age)]]], fields=[name, age])



In [10]:
# Plan of the base table on its own, for comparison with the filtered query.
base_table_plan = employee_table.explain()
print(base_table_plan)

== Abstract Syntax Tree ==
LogicalTableScan(table=[[default_catalog, default_database, Unregistered_TableSource_770887563, source: [PythonInputFormatTableSource(name, age)]]])

== Optimized Physical Plan ==
LegacyTableSourceScan(table=[[default_catalog, default_database, Unregistered_TableSource_770887563, source: [PythonInputFormatTableSource(name, age)]]], fields=[name, age])

== Optimized Execution Plan ==
LegacyTableSourceScan(table=[[default_catalog, default_database, Unregistered_TableSource_770887563, source: [PythonInputFormatTableSource(name, age)]]], fields=[name, age])



In [11]:
# The Table API (Python) is typically a good fit for unregistered tables,
# query results, and temp views.
# `col` is imported in the first cell together with the other imports, so the
# later cells that use it do not depend on this cell having been run.

# select() returns a temp view.
table4 = employee_table.select(col("name"), col("age"))

# Spark has "action" methods; execute() and to_pandas() play a similar role here.
table4.execute().print()
table4.to_pandas()

+--------------------------------+----------------------+
|                           name |                  age |
+--------------------------------+----------------------+
|                            Joe |                   28 |
|                           Mary |                   34 |
|                         Venkat |                   40 |
+--------------------------------+----------------------+
3 rows in set


Unnamed: 0,name,age
0,Joe,28
1,Mary,34
2,Venkat,40


In [12]:
# Table API: the same age filter, expressed as a Python column expression.
age_at_most_30 = col("age") <= 30
result5 = employee_table.filter(age_at_most_30)
result5.execute().print()

+--------------------------------+----------------------+
|                           name |                  age |
+--------------------------------+----------------------+
|                            Joe |                   28 |
+--------------------------------+----------------------+
1 row in set


In [13]:
# Table API and SQL API queries are both converted into Flink's core plan
# before the query executes; explain() shows it (similar to Spark's
# extended explain).
table_api_filter_plan = result5.explain()
print(table_api_filter_plan)

# == Abstract Syntax Tree ==
# AST is a compiler-design term. "Logical" in the AST means the plan as
# written, not yet optimized.
# Read the plan bottom-up: the bottom-most node is executed first.
# Operator notation:
#   a + b  = infix
#   +ab    = prefix
#   ab+    = postfix
# The logical filter is printed in prefix form: in <=($1, 30), <= is the
# operator, $1 is the column (age) and 30 is the filter value.

# == Optimized Physical Plan ==
# Flink optimizes the AST so it executes faster, based on its cost model.
# NOTE(review): the original note says the cost model is built without the
# cluster size - confirm.

# == Optimized Execution Plan ==
# The final execution plan, selected from the optimized plans.

== Abstract Syntax Tree ==
LogicalFilter(condition=[<=($1, 30)])
+- LogicalTableScan(table=[[default_catalog, default_database, Unregistered_TableSource_770887563, source: [PythonInputFormatTableSource(name, age)]]])

== Optimized Physical Plan ==
Calc(select=[name, age], where=[<=(age, 30)])
+- LegacyTableSourceScan(table=[[default_catalog, default_database, Unregistered_TableSource_770887563, source: [PythonInputFormatTableSource(name, age)]]], fields=[name, age])

== Optimized Execution Plan ==
Calc(select=[name, age], where=[(age <= 30)])
+- LegacyTableSourceScan(table=[[default_catalog, default_database, Unregistered_TableSource_770887563, source: [PythonInputFormatTableSource(name, age)]]], fields=[name, age])



In [14]:
# The same query can be written with the SQL API or the Python Table API.
# SQL API version: names starting with "V".
name_prefix_sql = "SELECT * FROM employees2 WHERE name LIKE 'V%'  "
result6 = table_env.sql_query(name_prefix_sql)
result6.execute().print()

+--------------------------------+----------------------+
|                           name |                  age |
+--------------------------------+----------------------+
|                         Venkat |                   40 |
+--------------------------------+----------------------+
1 row in set


In [15]:
# Python Table API version of the LIKE 'V%' filter.
starts_with_v = col("name").like("V%")
result7 = employee_table.where(starts_with_v)
result7.execute().print()

+--------------------------------+----------------------+
|                           name |                  age |
+--------------------------------+----------------------+
|                         Venkat |                   40 |
+--------------------------------+----------------------+
1 row in set


In [16]:
# Upper-case the name column. upper_case is used as an attribute here (no
# call parentheses). No alias is given, so the output column is auto-named.
upper_name = col("name").upper_case
result8 = employee_table.select(upper_name)
result8.execute().print()

+--------------------------------+
|                            _c0 |
+--------------------------------+
|                            JOE |
|                           MARY |
|                         VENKAT |
+--------------------------------+
3 rows in set
