In [1]:
from pyflink.table import EnvironmentSettings, TableEnvironment, DataTypes, CsvTableSource
import datetime
from pyflink.table.expressions import col
from pyflink.table.window import Over, GroupWindow
from pyflink.table.expressions import col, UNBOUNDED_RANGE, CURRENT_RANGE
from pyflink.table.udf import udf
# Create a TableEnvironment configured for batch execution. `table_env` is the
# entry point the later cells use to register the CSV source and to look up
# the resulting table.
env_settings = EnvironmentSettings.in_batch_mode()
table_env = TableEnvironment.create(env_settings)

SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/opt/flink-1.15.3/lib/log4j-slf4j-impl-2.17.1.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/opt/hadoop-2.7.7/share/hadoop/common/lib/slf4j-log4j12-1.7.10.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]


In [2]:
# Sample of the dataset layout (header line followed by one record):
#   InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
#   536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/01/2010 8:26,2.55,17850,United Kingdom

# Schema of the CSV as (column name, Flink type) pairs, in file column order.
invoice_schema = [
    ('InvoiceNo', DataTypes.STRING()),
    ('StockCode', DataTypes.STRING()),
    ('Description', DataTypes.STRING()),
    ('Quantity', DataTypes.DOUBLE()),
    ('InvoiceDate', DataTypes.TIMESTAMP(3)),
    ('UnitPrice', DataTypes.DOUBLE()),
    ('CustomerID', DataTypes.STRING()),
    ('Country', DataTypes.STRING()),
]
column_names = [name for name, _dtype in invoice_schema]
column_types = [dtype for _name, dtype in invoice_schema]

# Source: where the data originates (here, a CSV file on local disk).
# NOTE(review): the sample above shows a header line, yet the first line is
# not skipped (ignore_first_line=False); presumably lenient=True drops it as
# an unparsable record - confirm against the actual file.
source = CsvTableSource(
    '/home/training/flink-dev/data/ecommerce-clean.csv',
    column_names,
    column_types,
    ignore_first_line=False,
    quote_character='"',
    lenient=True,
)

# Expose the source to the Table API under the name 'invoices', then obtain a
# Table handle for it. (A sink, not used in this cell, would be the target
# that receives data after transformation.)
table_env.register_table_source('invoices', source)
invoices = table_env.from_path('invoices')

# ---- Sanity checks: registered tables, schema, and a 3-row preview ----
print('\nRegistered Tables List')
print(table_env.list_tables())

print('\nFinancial Trxs Schema')
invoices.print_schema()

invoices.fetch(3).execute().print()


Registered Tables List
['invoices']

Financial Trxs Schema
(
  `InvoiceNo` STRING,
  `StockCode` STRING,
  `Description` STRING,
  `Quantity` DOUBLE,
  `InvoiceDate` TIMESTAMP(3),
  `UnitPrice` DOUBLE,
  `CustomerID` STRING,
  `Country` STRING
)




+--------------------------------+--------------------------------+--------------------------------+--------------------------------+-------------------------+--------------------------------+--------------------------------+--------------------------------+
|                      InvoiceNo |                      StockCode |                    Description |                       Quantity |             InvoiceDate |                      UnitPrice |                     CustomerID |                        Country |
+--------------------------------+--------------------------------+--------------------------------+--------------------------------+-------------------------+--------------------------------+--------------------------------+--------------------------------+
|                        C554528 |                          23110 |         PARISIENNE KEY CABINET |                           -4.0 | 2011-05-24 17:26:00.000 |                           5.75 |                          14217

In [3]:
# Hourly sales volume: total Quantity sold in each one-hour bucket.
# A tumbling window is a fixed-size, non-overlapping window, so each row is
# counted in exactly one bucket.
from pyflink.table.expressions import col, lit
from pyflink.table.window import Tumble

# lit() wraps a constant as an expression; lit(1).hour is a one-hour interval.
hourly_tumble = (
    Tumble.over(lit(1).hour)
    .on(col("InvoiceDate"))
    .alias("hourly_window")
)

# Each output row is labelled with the END timestamp of its window ('hour').
result = (
    invoices
    .select(col("InvoiceDate"), col("Quantity"))
    .window(hourly_tumble)
    .group_by(col('hourly_window'))
    .select(
        col('hourly_window').end.alias('hour'),
        col("Quantity").sum.alias('Qty'),
    )
)

# Preview up to 30 windows. No order_by is applied, so which windows are shown
# (and in what order) is presumably not guaranteed to be stable between runs.
result.print_schema()
result.fetch(30).execute().print()

(
  `hour` TIMESTAMP(3),
  `Qty` DOUBLE
)
+-------------------------+--------------------------------+
|                    hour |                            Qty |
+-------------------------+--------------------------------+
| 2011-03-28 13:00:00.000 |                         1567.0 |
| 2011-03-28 14:00:00.000 |                         1498.0 |
| 2011-03-28 15:00:00.000 |                         1230.0 |
| 2011-03-28 16:00:00.000 |                         3455.0 |
| 2011-03-28 17:00:00.000 |                          769.0 |
| 2011-03-29 09:00:00.000 |                          275.0 |
| 2011-03-29 10:00:00.000 |                          782.0 |
| 2011-03-29 11:00:00.000 |                          602.0 |
| 2011-03-29 12:00:00.000 |                        12998.0 |
| 2011-03-29 13:00:00.000 |                          769.0 |
| 2011-03-29 14:00:00.000 |                         7248.0 |
| 2011-03-29 15:00:00.000 |                         1035.0 |
| 2011-03-29 16:00:00.000 |                

In [4]:
# Hourly sales volume and value: for each one-hour tumbling window (fixed-size,
# non-overlapping, each row counted once) sum the Quantity and the line
# Amount, where Amount = Quantity * UnitPrice.
from pyflink.table.expressions import col, lit
from pyflink.table.window import Tumble

# lit() wraps a constant as an expression; lit(1).hour is a one-hour interval.
hourly_tumble = (
    Tumble.over(lit(1).hour)
    .on(col("InvoiceDate"))
    .alias("hourly_window")
)

# Per-row amount is derived before windowing so it can be summed per window.
line_amount = (col('Quantity') * col('UnitPrice')).alias("Amount")

result = (
    invoices
    .select(col("InvoiceDate"), col("Quantity"), line_amount)
    .window(hourly_tumble)
    .group_by(col('hourly_window'))
    .select(
        col('hourly_window').start.alias('start_time'),
        col('hourly_window').end.alias('end_time'),
        col("Quantity").sum.alias('Qty'),
        col("Amount").sum.alias('TotalAmount'),
    )
)

# Preview up to 30 windows. No order_by is applied, so which windows are shown
# (and in what order) is presumably not guaranteed to be stable between runs.
result.print_schema()
result.fetch(30).execute().print()

(
  `start_time` TIMESTAMP(3),
  `end_time` TIMESTAMP(3),
  `Qty` DOUBLE,
  `TotalAmount` DOUBLE
)
+-------------------------+-------------------------+--------------------------------+--------------------------------+
|              start_time |                end_time |                            Qty |                    TotalAmount |
+-------------------------+-------------------------+--------------------------------+--------------------------------+
| 2011-01-23 13:00:00.000 | 2011-01-23 14:00:00.000 |                          727.0 |             1335.1299999999997 |
| 2011-01-23 14:00:00.000 | 2011-01-23 15:00:00.000 |                          449.0 |              890.8700000000003 |
| 2011-01-23 15:00:00.000 | 2011-01-23 16:00:00.000 |                          962.0 |             1473.1400000000003 |
| 2011-01-23 16:00:00.000 | 2011-01-23 17:00:00.000 |                         1203.0 |                        2561.82 |
| 2011-01-24 08:00:00.000 | 2011-01-24 09:00:00.000 |        