In [1]:
from pyflink.table import EnvironmentSettings, TableEnvironment, DataTypes, CsvTableSource
import datetime
from pyflink.table.expressions import col
from pyflink.table.window import Over, GroupWindow
from pyflink.table.expressions import col, UNBOUNDED_RANGE, CURRENT_RANGE
from pyflink.table.udf import udf
# create a batch TableEnvironment
env_settings = EnvironmentSettings.in_streaming_mode()
table_env = TableEnvironment.create(env_settings)
#table_env.get_config().get_configuration().set_string("pipeline.jars", "file:///opt/flink-1.15.0/lib/*.jar")

SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/opt/flink-1.15.0/lib/log4j-slf4j-impl-2.17.1.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/opt/hadoop-2.7.7/share/hadoop/common/lib/slf4j-log4j12-1.7.10.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]


In [2]:
# InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
# 536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/01/2010 8:26,2.55,17850,United Kingdom

source_ddl = """
            CREATE TABLE invoices (
                   InvoiceNo STRING,
                   StockCode  STRING,
                   Description  STRING,
                   Quantity DOUBLE,
                   InvoiceDate TIMESTAMP(3),
                   UnitPrice DOUBLE,
                   CustomerID STRING,
                   Country STRING,
                   WATERMARK FOR InvoiceDate AS InvoiceDate - INTERVAL '1' SECOND

                    ) WITH (
                      'connector' = 'filesystem',          
                      'path' = 'file:///home/krish/flink/data/ecomm/ecommerce-clean.csv', 
                      'format' = 'csv'
                    )"""


table_env.execute_sql(source_ddl) 

invoices = table_env.from_path("invoices")
##############################
print('\nRegistered Tables List')
print(table_env.list_tables())

print('\nFinancial Trxs Schema')
invoices.print_schema()



Registered Tables List
['invoices']

Financial Trxs Schema
(
  `InvoiceNo` STRING,
  `StockCode` STRING,
  `Description` STRING,
  `Quantity` DOUBLE,
  `InvoiceDate` TIMESTAMP(3) *ROWTIME*,
  `UnitPrice` DOUBLE,
  `CustomerID` STRING,
  `Country` STRING,
  WATERMARK FOR `InvoiceDate`: TIMESTAMP(3) AS `InvoiceDate` - INTERVAL '1' SECOND
)


In [3]:
invoices.fetch(3).execute().print()

+----+--------------------------------+--------------------------------+--------------------------------+--------------------------------+-------------------------+--------------------------------+--------------------------------+--------------------------------+
| op |                      InvoiceNo |                      StockCode |                    Description |                       Quantity |             InvoiceDate |                      UnitPrice |                     CustomerID |                        Country |
+----+--------------------------------+--------------------------------+--------------------------------+--------------------------------+-------------------------+--------------------------------+--------------------------------+--------------------------------+
| +I |                         536365 |                         85123A | WHITE HANGING HEART T-LIGHT... |                            6.0 | 2010-12-01 08:26:00.000 |                           2.55 |           

In [4]:
from pyflink.table.expressions import col, lit
from pyflink.table.window import Tumble

result = invoices.filter(col("CustomerID").is_not_null & col("InvoiceNo").is_not_null & col("Country").is_not_null) \
               .select(col("Country").lower_case.alias('Country'), col("Quantity"),  col("InvoiceDate")) \
               .window(Tumble.over(lit(1).hour).on(col("InvoiceDate")).alias("hourly_window")) \
               .group_by(col('hourly_window'), col('Country')) \
               .select(col('Country'), col('hourly_window').end.alias('hour'), col("Quantity").avg.alias('Qty'))
result.print_schema()
result.fetch(5).execute().print()

(
  `Country` STRING,
  `hour` TIMESTAMP(3),
  `Qty` DOUBLE
)
+----+--------------------------------+-------------------------+--------------------------------+
| op |                        Country |                    hour |                            Qty |
+----+--------------------------------+-------------------------+--------------------------------+
| +I |                 united kingdom | 2010-12-01 09:00:00.000 |              5.884615384615385 |
| +I |                 united kingdom | 2010-12-01 10:00:00.000 |             21.582781456953644 |
| +I |                 united kingdom | 2010-12-01 11:00:00.000 |              25.11214953271028 |
| +I |                      australia | 2010-12-01 11:00:00.000 |              7.642857142857143 |
| +I |                         france | 2010-12-01 09:00:00.000 |                          22.45 |
+----+--------------------------------+-------------------------+--------------------------------+
5 rows in set


In [5]:
from pyflink.table.window import Tumble
from pyflink.table.expressions import lit, col


result = invoices.window(Tumble.over(lit(5).minutes).on(col('InvoiceDate')).alias("w")) \
               .group_by(col('Country'), col('w')) \
               .select(col('Country'), col('w').start, col('w').end, col('UnitPrice').count.alias('UniqItems'))

result.print_schema()
result.fetch(5).execute().print()

(
  `Country` STRING,
  `EXPR$0` TIMESTAMP(3),
  `EXPR$1` TIMESTAMP(3),
  `UniqItems` BIGINT NOT NULL
)
+----+--------------------------------+-------------------------+-------------------------+----------------------+
| op |                        Country |                  EXPR$0 |                  EXPR$1 |            UniqItems |
+----+--------------------------------+-------------------------+-------------------------+----------------------+
| +I |                        Germany | 2010-12-01 13:00:00.000 | 2010-12-01 13:05:00.000 |                   15 |
| +I |                        Germany | 2010-12-01 14:30:00.000 | 2010-12-01 14:35:00.000 |                   14 |
| +I |                        Germany | 2010-12-02 10:35:00.000 | 2010-12-02 10:40:00.000 |                    1 |
| +I |                      Australia | 2010-12-01 10:00:00.000 | 2010-12-01 10:05:00.000 |                   14 |
| +I |                    Netherlands | 2010-12-01 11:25:00.000 | 2010-12-01 11:30:00.000 |

In [6]:
from pyflink.table.window import Over
from pyflink.table.expressions import col, UNBOUNDED_RANGE, CURRENT_RANGE


result = invoices.over_window(Over.partition_by(col("Country")).order_by(col("InvoiceDate"))
                            .preceding(UNBOUNDED_RANGE).following(CURRENT_RANGE)
                            .alias("w")) \
               .select(col("Country"), col("Quantity").avg.over(col('w')), col("Quantity").max.over(col('w')), col("Quantity").min.over(col('w')))

result.print_schema()
result.fetch(5).execute().print()

(
  `Country` STRING,
  `_c1` DOUBLE,
  `_c2` DOUBLE,
  `_c3` DOUBLE
)
+----+--------------------------------+--------------------------------+--------------------------------+--------------------------------+
| op |                        Country |                            _c1 |                            _c2 |                            _c3 |
+----+--------------------------------+--------------------------------+--------------------------------+--------------------------------+
| +I |                 United Kingdom |              5.714285714285714 |                            8.0 |                            2.0 |
| +I |                 United Kingdom |              5.714285714285714 |                            8.0 |                            2.0 |
| +I |                 United Kingdom |              5.714285714285714 |                            8.0 |                            2.0 |
| +I |                 United Kingdom |              5.714285714285714 |                       

In [11]:
from pyflink.table.expressions import col, lit, UNBOUNDED_RANGE
from pyflink.table.window import Over, Tumble

# Distinct aggregation on group by
group_by_distinct_result = invoices.group_by(col("Country")) \
                                 .select(col("Country"), col("Quantity").sum.distinct.alias('Qty'))

# Distinct aggregation on time window group by
group_by_window_distinct_result = invoices.window(Tumble.over(lit(5).minutes).on(col("InvoiceDate")).alias("w")) \
    .group_by(col("Country"), col('w')) \
    .select(col("Country"), col("Quantity").sum.distinct.alias('Qty'))

# Distinct aggregation on over window
result = invoices.over_window(Over
                       .partition_by(col("Country"))
                       .order_by(col("InvoiceDate"))
                       .preceding(UNBOUNDED_RANGE)
                       .alias("w")) \
    .select(col("Country"), col("Quantity").avg.distinct.over(col('w')), col("Quantity").max.over(col('w')), col("Quantity").min.over(col('w')))

group_by_distinct_result.print_schema()
group_by_distinct_result.fetch(2).execute().print()


group_by_window_distinct_result.print_schema()
group_by_window_distinct_result.fetch(2).execute().print()


result.print_schema()
result.fetch(2).execute().print()


(
  `Country` STRING,
  `Qty` DOUBLE
)
+----+--------------------------------+--------------------------------+
| op |                        Country |                            Qty |
+----+--------------------------------+--------------------------------+
| +I |                 United Kingdom |                            6.0 |
| -D |                 United Kingdom |                            6.0 |
| +I |                 United Kingdom |                           14.0 |
| -D |                 United Kingdom |                           14.0 |
| +I |                 United Kingdom |                           16.0 |
| -D |                 United Kingdom |                           16.0 |
| +I |                 United Kingdom |                           48.0 |
| -D |                 United Kingdom |                           48.0 |
| +I |                 United Kingdom |                           51.0 |
| -D |                 United Kingdom |                           51.0 |
| +I |      

In [17]:

# Distinct aggregation on over window
stock_codes = invoices.select(col("StockCode"), col("Description"))\
                 .distinct()
table_env.register_table("stock_codes", stock_codes)

stock_codes.fetch(5).execute().print()

                   

+----+--------------------------------+--------------------------------+
| op |                      StockCode |                    Description |
+----+--------------------------------+--------------------------------+
| +I |                         84406B | CREAM CUPID HEARTS COAT HANGER |
| +I |                          22752 |   SET 7 BABUSHKA NESTING BOXES |
| +I |                          22749 | FELTCRAFT PRINCESS CHARLOTT... |
| +I |                          22622 | BOX OF VINTAGE ALPHABET BLOCKS |
| +I |                          21754 |       HOME BUILDING BLOCK WORD |
+----+--------------------------------+--------------------------------+
5 rows in set


In [25]:
# inner join

from pyflink.table.expressions import col

left = table_env.from_path("invoices").select(col('StockCode'), col('Quantity'), col('UnitPrice'))
right = table_env.from_path("stock_codes").select(col('StockCode').alias("SC"), col('Description'))

result = left.join(right).where(col("StockCode") == col('SC')).select(col("StockCode"),col('Quantity'), col('UnitPrice'))
result.print_schema()
result.fetch(5).execute().print()

(
  `StockCode` STRING,
  `Quantity` DOUBLE,
  `UnitPrice` DOUBLE
)
+----+--------------------------------+--------------------------------+--------------------------------+
| op |                      StockCode |                       Quantity |                      UnitPrice |
+----+--------------------------------+--------------------------------+--------------------------------+
| +I |                         85123A |                            6.0 |                           2.55 |
| +I |                         85123A |                            6.0 |                           2.55 |
| +I |                         85123A |                            6.0 |                           2.55 |
| +I |                         85123A |                            6.0 |                           2.55 |
| +I |                         85123A |                            6.0 |                           2.55 |
+----+--------------------------------+--------------------------------+------------

In [27]:
# Outer joins, Left, Right, Full

from pyflink.table.expressions import col

left = table_env.from_path("invoices").select(col('StockCode'), col('Quantity'), col('UnitPrice'))
right = table_env.from_path("stock_codes").select(col('StockCode').alias("SC"), col('Description'))

result = left.left_outer_join(right).where(col("StockCode") == col('SC')).select(col("StockCode"),col('Quantity'), col('UnitPrice'))
result.print_schema()
result.fetch(5).execute().print()


result = left.right_outer_join(right).where(col("StockCode") == col('SC')).select(col("StockCode"),col('Quantity'), col('UnitPrice'))
result.print_schema()
result.fetch(5).execute().print()


result = left.full_outer_join(right).where(col("StockCode") == col('SC')).select(col("StockCode"),col('Quantity'), col('UnitPrice'))
result.print_schema()
result.fetch(5).execute().print()

(
  `StockCode` STRING,
  `Quantity` DOUBLE,
  `UnitPrice` DOUBLE
)
+----+--------------------------------+--------------------------------+--------------------------------+
| op |                      StockCode |                       Quantity |                      UnitPrice |
+----+--------------------------------+--------------------------------+--------------------------------+
| +I |                          22633 |                           16.0 |                            2.1 |
| +I |                          22633 |                            5.0 |                            2.1 |
| +I |                          22633 |                            7.0 |                            2.1 |
| +I |                          22633 |                            2.0 |                            2.1 |
| +I |                          22633 |                            2.0 |                            2.1 |
+----+--------------------------------+--------------------------------+------------

TypeError: right_outer_join() missing 1 required positional argument: 'join_predicate'