In [2]:
from pyflink.table import EnvironmentSettings, TableEnvironment, DataTypes, CsvTableSource
import datetime
from pyflink.table.expressions import col
from pyflink.table.window import Over, GroupWindow
from pyflink.table.expressions import col, UNBOUNDED_RANGE, CURRENT_RANGE
from pyflink.table.udf import udf
# create a batch TableEnvironment
env_settings = EnvironmentSettings.in_batch_mode()
table_env = TableEnvironment.create(env_settings)

In [3]:
# InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
# 536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/01/2010 8:26,2.55,17850,United Kingdom
column_names = ['InvoiceNo', 'StockCode', 'Description', 'Quantity', 
                'InvoiceDate', 'UnitPrice', 'CustomerID', 'Country']

column_types = [DataTypes.STRING(), DataTypes.STRING(), DataTypes.STRING(), DataTypes.DOUBLE(), 
                DataTypes.TIMESTAMP(3), DataTypes.DOUBLE(),DataTypes.STRING(), DataTypes.STRING()]

source = CsvTableSource(
   './data/ecommerce-clean.csv',  
    column_names,
    column_types,
    ignore_first_line=False,
    quote_character='"',
    lenient=True
)

# source is data source -origin data is from
# table is flink api
# sink is target where data goes to after transformation
table_env.register_table_source('invoices', source)

# invoices is a table
invoices = table_env.from_path('invoices')


##############################
print('\nRegistered Tables List')
print(table_env.list_tables())

print('\nFinancial Trxs Schema')
invoices.print_schema()

invoices.fetch(3).execute().print()


Registered Tables List
['invoices']

Financial Trxs Schema
(
  `InvoiceNo` STRING,
  `StockCode` STRING,
  `Description` STRING,
  `Quantity` DOUBLE,
  `InvoiceDate` TIMESTAMP(3),
  `UnitPrice` DOUBLE,
  `CustomerID` STRING,
  `Country` STRING
)
+--------------------------------+--------------------------------+--------------------------------+--------------------------------+-------------------------+--------------------------------+--------------------------------+--------------------------------+
|                      InvoiceNo |                      StockCode |                    Description |                       Quantity |             InvoiceDate |                      UnitPrice |                     CustomerID |                        Country |
+--------------------------------+--------------------------------+--------------------------------+--------------------------------+-------------------------+--------------------------------+--------------------------------+----------

In [4]:
# what is 1 hour transactions 
# how many transactions per hour
# Tumble window - non overlapping, fixed size window
# data is processed only once in a window
from pyflink.table.expressions import col, lit
from pyflink.table.window import Tumble
# lit - literal, useful constant data
result = invoices\
                 .select(col("InvoiceDate"), col("InvoiceNo"), col("Quantity"))\
                 .window(Tumble.over(lit(1).hour).on(col("InvoiceDate")).alias("hourly_window")) \
                 .group_by(col('hourly_window'), col('InvoiceNo')) \
                   .select(col('InvoiceNo'), col('hourly_window').end.alias('hour'), col("Quantity").sum.alias('Qty'))
 
                         
result.print_schema()
result.fetch(30).execute().print()

(
  `InvoiceNo` STRING,
  `hour` TIMESTAMP(3),
  `Qty` DOUBLE
)
+--------------------------------+-------------------------+--------------------------------+
|                      InvoiceNo |                    hour |                            Qty |
+--------------------------------+-------------------------+--------------------------------+
|                         560717 | 2011-07-20 14:00:00.000 |                           -3.0 |
|                         560718 | 2011-07-20 14:00:00.000 |                         1005.0 |
|                         560719 | 2011-07-20 14:00:00.000 |                          -21.0 |
|                         560722 | 2011-07-20 14:00:00.000 |                          -31.0 |
|                         560725 | 2011-07-20 14:00:00.000 |                           54.0 |
|                         560726 | 2011-07-20 15:00:00.000 |                          128.0 |
|                         560730 | 2011-07-20 15:00:00.000 |                           -2.

In [5]:
# what is 1 hour transactions 
# how many transactions per hour
# Tumble window - non overlapping, fixed size window
# data is processed only once in a window
from pyflink.table.expressions import col, lit
from pyflink.table.window import Tumble
# lit - literal, useful constant data
result = invoices\
                 .select(col("InvoiceDate"), col("Quantity"), (col('Quantity') * col('UnitPrice')).alias("Amount") )\
                 .window(Tumble.over(lit(1).hour).on(col("InvoiceDate")).alias("hourly_window")) \
                 .group_by(col('hourly_window')) \
                   .select(col('hourly_window').start.alias('start_time'), 
                           col('hourly_window').end.alias('end_time'), 
                           col("Quantity").sum.alias('Qty'),
                           col("Amount").sum.alias('TotalAmount'),
                          )
 
                         
result.print_schema()
result.fetch(30).execute().print()

(
  `start_time` TIMESTAMP(3),
  `end_time` TIMESTAMP(3),
  `Qty` DOUBLE,
  `TotalAmount` DOUBLE
)
+-------------------------+-------------------------+--------------------------------+--------------------------------+
|              start_time |                end_time |                            Qty |                    TotalAmount |
+-------------------------+-------------------------+--------------------------------+--------------------------------+
| 2010-12-01 08:00:00.000 | 2010-12-01 09:00:00.000 |                          602.0 |             1383.8099999999997 |
| 2010-12-01 09:00:00.000 | 2010-12-01 10:00:00.000 |                         3259.0 |              7324.239999999998 |
| 2010-12-01 10:00:00.000 | 2010-12-01 11:00:00.000 |                         2794.0 |              5094.329999999998 |
| 2010-12-01 11:00:00.000 | 2010-12-01 12:00:00.000 |                         2356.0 |             4234.1599999999935 |
| 2010-12-01 12:00:00.000 | 2010-12-01 13:00:00.000 |        

In [6]:
from pyflink.table.window import Over
from pyflink.table.expressions import col, UNBOUNDED_RANGE, CURRENT_RANGE

# over window is NOT about start and end time
# over is used in general partitionong of data, not only time
# Not working
from pyflink.table.expressions import col, lit


# lit - literal, useful constant data
result = invoices.over_window(Over.partition_by(col("StockCode")).order_by(col("InvoiceDate"))
                            .preceding(UNBOUNDED_RANGE).following(CURRENT_RANGE)
                            .alias("w")) \
               .select(col("StockCode"), col("Quantity").sum.over(col('w')), col("Quantity").max.over(col('w')), col("Quantity").min.over(col('w')))

result.fetch(10).execute().print()

Py4JJavaError: An error occurred while calling o190.select.
: org.apache.flink.table.api.ValidationException: Ordering must be defined on a time attribute.
	at org.apache.flink.table.planner.expressions.PlannerTypeInferenceUtilImpl.validateArguments(PlannerTypeInferenceUtilImpl.java:111)
	at org.apache.flink.table.planner.expressions.PlannerTypeInferenceUtilImpl.runTypeInference(PlannerTypeInferenceUtilImpl.java:69)
	at org.apache.flink.table.expressions.resolver.rules.ResolveCallByArgumentsRule$ResolvingCallVisitor.runLegacyTypeInference(ResolveCallByArgumentsRule.java:284)
	at org.apache.flink.table.expressions.resolver.rules.ResolveCallByArgumentsRule$ResolvingCallVisitor.lambda$visit$2(ResolveCallByArgumentsRule.java:164)
	at java.util.Optional.orElseGet(Optional.java:267)
	at org.apache.flink.table.expressions.resolver.rules.ResolveCallByArgumentsRule$ResolvingCallVisitor.visit(ResolveCallByArgumentsRule.java:164)
	at org.apache.flink.table.expressions.resolver.rules.ResolveCallByArgumentsRule$ResolvingCallVisitor.visit(ResolveCallByArgumentsRule.java:98)
	at org.apache.flink.table.expressions.ApiExpressionVisitor.visit(ApiExpressionVisitor.java:37)
	at org.apache.flink.table.expressions.UnresolvedCallExpression.accept(UnresolvedCallExpression.java:97)
	at org.apache.flink.table.expressions.resolver.rules.ResolveCallByArgumentsRule.lambda$apply$0(ResolveCallByArgumentsRule.java:92)
	at java.util.stream.ReferencePipeline$7$1.accept(ReferencePipeline.java:269)
	at java.util.ArrayList$ArrayListSpliterator.forEachRemaining(ArrayList.java:1384)
	at java.util.stream.AbstractPipeline.copyInto(AbstractPipeline.java:482)
	at java.util.stream.AbstractPipeline.wrapAndCopyInto(AbstractPipeline.java:472)
	at java.util.stream.ReduceOps$ReduceOp.evaluateSequential(ReduceOps.java:708)
	at java.util.stream.AbstractPipeline.evaluate(AbstractPipeline.java:234)
	at java.util.stream.ReferencePipeline.collect(ReferencePipeline.java:566)
	at org.apache.flink.table.expressions.resolver.rules.ResolveCallByArgumentsRule.apply(ResolveCallByArgumentsRule.java:93)
	at org.apache.flink.table.expressions.resolver.ExpressionResolver.lambda$null$2(ExpressionResolver.java:241)
	at java.util.function.Function.lambda$andThen$1(Function.java:88)
	at org.apache.flink.table.expressions.resolver.ExpressionResolver.resolve(ExpressionResolver.java:204)
	at org.apache.flink.table.operations.utils.OperationTreeBuilder.projectInternal(OperationTreeBuilder.java:194)
	at org.apache.flink.table.operations.utils.OperationTreeBuilder.project(OperationTreeBuilder.java:183)
	at org.apache.flink.table.api.internal.TableImpl$OverWindowedTableImpl.select(TableImpl.java:1002)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at org.apache.flink.api.python.shaded.py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at org.apache.flink.api.python.shaded.py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at org.apache.flink.api.python.shaded.py4j.Gateway.invoke(Gateway.java:282)
	at org.apache.flink.api.python.shaded.py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at org.apache.flink.api.python.shaded.py4j.commands.CallCommand.execute(CallCommand.java:79)
	at org.apache.flink.api.python.shaded.py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:750)
