In [1]:
from pyflink.table import EnvironmentSettings, TableEnvironment, DataTypes, CsvTableSource
import datetime
from pyflink.table.expressions import col
from pyflink.table.window import Over, GroupWindow
from pyflink.table.expressions import col, UNBOUNDED_RANGE, CURRENT_RANGE
from pyflink.table.udf import udf
# create a batch TableEnvironment
env_settings = EnvironmentSettings.in_batch_mode()
table_env = TableEnvironment.create(env_settings)

In [2]:
# InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
# 536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/01/2010 8:26,2.55,17850,United Kingdom
column_names = ['InvoiceNo', 'StockCode', 'Description', 'Quantity', 
                'InvoiceDate', 'UnitPrice', 'CustomerID', 'Country']

column_types = [DataTypes.STRING(), DataTypes.STRING(), DataTypes.STRING(), DataTypes.DOUBLE(), 
                DataTypes.TIMESTAMP(3), DataTypes.DOUBLE(),DataTypes.STRING(), DataTypes.STRING()]

source = CsvTableSource(
   '../data/ecommerce-clean.csv',  
    column_names,
    column_types,
    ignore_first_line=False,
    quote_character='"',
    lenient=True
)

# source is data source -origin data is from
# table is flink api
# sink is target where data goes to after transformation
table_env.register_table_source('invoices', source)

# invoices is a table
invoices = table_env.from_path('invoices')


##############################
print('\nRegistered Tables List')
print(table_env.list_tables())

print('\nFinancial Trxs Schema')
invoices.print_schema()

invoices.fetch(3).execute().print()


Registered Tables List
['invoices']

Financial Trxs Schema
(
  `InvoiceNo` STRING,
  `StockCode` STRING,
  `Description` STRING,
  `Quantity` DOUBLE,
  `InvoiceDate` TIMESTAMP(3),
  `UnitPrice` DOUBLE,
  `CustomerID` STRING,
  `Country` STRING
)
+--------------------------------+--------------------------------+--------------------------------+--------------------------------+-------------------------+--------------------------------+--------------------------------+--------------------------------+
|                      InvoiceNo |                      StockCode |                    Description |                       Quantity |             InvoiceDate |                      UnitPrice |                     CustomerID |                        Country |
+--------------------------------+--------------------------------+--------------------------------+--------------------------------+-------------------------+--------------------------------+--------------------------------+----------

In [17]:
# what is 1 hour transactions 
# how many transactions per hour
# Tumble window - non overlapping, fixed size window
# data is processed only once in a window
from pyflink.table.expressions import col, lit
from pyflink.table.window import Tumble
# lit - literal, useful constant data
result = invoices\
                 .select(col("InvoiceDate"), col("InvoiceNo"), col("Quantity"))\
                 .window(Tumble.over(lit(1).hour).on(col("InvoiceDate")).alias("hourly_window")) \
                 .group_by(col('hourly_window'), col('InvoiceNo')) \
                   .select(col('InvoiceNo'), col('hourly_window').end.alias('hour'), col("Quantity").sum.alias('Qty'))
 
                         
result.print_schema()
result.fetch(30).execute().print()

(
  `InvoiceNo` STRING,
  `hour` TIMESTAMP(3),
  `Qty` DOUBLE
)
+--------------------------------+-------------------------+--------------------------------+
|                      InvoiceNo |                    hour |                            Qty |
+--------------------------------+-------------------------+--------------------------------+
|                         547910 | 2011-03-28 13:00:00.000 |                          402.0 |
|                         547919 | 2011-03-28 13:00:00.000 |                           96.0 |
|                         547926 | 2011-03-28 14:00:00.000 |                          268.0 |
|                         547928 | 2011-03-28 15:00:00.000 |                            1.0 |
|                         547931 | 2011-03-28 15:00:00.000 |                            1.0 |
|                         547933 | 2011-03-28 15:00:00.000 |                            1.0 |
|                         547934 | 2011-03-28 15:00:00.000 |                          157.

In [23]:
# what is 1 hour transactions 
# how many transactions per hour
# Tumble window - non overlapping, fixed size window
# data is processed only once in a window
from pyflink.table.expressions import col, lit
from pyflink.table.window import Tumble
# lit - literal, useful constant data
result = invoices\
                 .select(col("InvoiceDate"), col("Quantity"), (col('Quantity') * col('UnitPrice')).alias("Amount") )\
                 .window(Tumble.over(lit(1).hour).on(col("InvoiceDate")).alias("hourly_window")) \
                 .group_by(col('hourly_window')) \
                   .select(col('hourly_window').start.alias('start_time'), 
                           col('hourly_window').end.alias('end_time'), 
                           col("Quantity").sum.alias('Qty'),
                           col("Amount").sum.alias('TotalAmount'),
                          )
 
                         
result.print_schema()
result.fetch(30).execute().print()

(
  `start_time` TIMESTAMP(3),
  `end_time` TIMESTAMP(3),
  `Qty` DOUBLE,
  `TotalAmount` DOUBLE
)
+-------------------------+-------------------------+--------------------------------+--------------------------------+
|              start_time |                end_time |                            Qty |                    TotalAmount |
+-------------------------+-------------------------+--------------------------------+--------------------------------+
| 2010-12-01 08:00:00.000 | 2010-12-01 09:00:00.000 |                          602.0 |             1383.8099999999997 |
| 2010-12-01 09:00:00.000 | 2010-12-01 10:00:00.000 |                         3259.0 |              7324.239999999998 |
| 2010-12-01 10:00:00.000 | 2010-12-01 11:00:00.000 |                         2794.0 |              5094.329999999998 |
| 2010-12-01 11:00:00.000 | 2010-12-01 12:00:00.000 |                         2356.0 |             4234.1599999999935 |
| 2010-12-01 12:00:00.000 | 2010-12-01 13:00:00.000 |        