# Linear Road Benchmark

In [1]:
import os
# Use the kernel's current working directory as the project root; the table
# definitions in later cells build their data file paths from `base_path`.
base_path = os.getcwd()
print(base_path)

/home/tim/Documents/work/apache_flink


In [None]:
from pyflink.table import TableEnvironment, EnvironmentSettings, DataTypes, TableDescriptor, Schema
from pyflink.common import Configuration

# Execution configuration: default operator parallelism is set to 1.
configuration = Configuration()
configuration.set_integer("table.exec.resource.default-parallelism", 1)

# Build streaming-mode environment settings that carry the configuration
# above, then create the TableEnvironment used by the following cells.
env_settings = (
    EnvironmentSettings.new_instance()
    .in_streaming_mode()
    .with_configuration(configuration)
    .build()
)
t_env = TableEnvironment.create(env_settings)

# Source table `lrb`: Linear Road records read from a CSV file.
# Besides the seven physical columns, the schema declares a computed
# `eventTime` column (TO_TIMESTAMP_LTZ over `creationTS` with precision 3)
# and uses that column itself as the watermark expression.
lrb_schema = (
    Schema.new_builder()
    .column('creationTS', DataTypes.BIGINT())
    .column('vehicle', DataTypes.SMALLINT())
    .column('speed', DataTypes.FLOAT())
    .column('highway', DataTypes.SMALLINT())
    .column('lane', DataTypes.SMALLINT())
    .column('direction', DataTypes.SMALLINT())
    .column('position', DataTypes.INT())
    .column_by_expression('eventTime', "TO_TIMESTAMP_LTZ(creationTS, 3)")
    .watermark('eventTime', "eventTime")
    .build()
)

lrb_source = (
    TableDescriptor.for_connector('filesystem')
    .schema(lrb_schema)
    .option('path', f'{base_path}/data/linear_road/lrb-data-small-ht.csv')
    .format('csv')  # default CSV delimiter is comma
    .build()
)

# Register the descriptor under the name the queries select from.
t_env.create_temporary_table('lrb', lrb_source)


# Sink table for Query 1: one CSV row per (highway, direction, pos_div)
# group and window, with the window bounds stored as BIGINT values.
sink_q1_schema = (
    Schema.new_builder()
    .column('highway', DataTypes.SMALLINT())
    .column('direction', DataTypes.SMALLINT())
    .column('pos_div', DataTypes.FLOAT())
    .column('avgSpeed', DataTypes.FLOAT())
    .column('window_start', DataTypes.BIGINT())
    .column('window_end', DataTypes.BIGINT())
    .build()
)

sink_q1_descriptor = (
    TableDescriptor.for_connector('filesystem')
    .schema(sink_q1_schema)
    .option('path', f'{base_path}/data/linear_road/outputs/sink_q1.csv')
    .format('csv')
    .build()
)

t_env.create_temporary_table('sink_q1', sink_q1_descriptor)

# Sink table for Query 2: one CSV row per (vehicle, highway, direction,
# pos_div) group and window, holding the record count for that group.
sink_q2_schema = (
    Schema.new_builder()
    .column('vehicle', DataTypes.SMALLINT())    # or INT if preferred
    .column('highway', DataTypes.SMALLINT())
    .column('direction', DataTypes.SMALLINT())
    .column('pos_div', DataTypes.FLOAT())
    .column('cntSpeed', DataTypes.BIGINT())     # COUNT(*) returns BIGINT
    .column('window_start', DataTypes.BIGINT())
    .column('window_end', DataTypes.BIGINT())
    .build()
)

sink_q2_descriptor = (
    TableDescriptor.for_connector('filesystem')
    .schema(sink_q2_schema)
    .option('path', f'{base_path}/data/linear_road/outputs/sink_q2.csv')
    .format('csv')
    .build()
)

t_env.create_temporary_table('sink_q2', sink_q2_descriptor)

## Query 1
```cpp
Query::from("lrb")
    .map(Attribute("position") = Attribute("position")/5280)
    .window(SlidingWindow::of(EventTime(RecordCreationTs()), Seconds(300), Seconds(1)))
    .byKey(Attribute("highway"), Attribute("direction"), Attribute("position"))
    .apply(Avg(Attribute("speed"))->as(Attribute("avgSpeed")))
    .filter(Attribute("avgSpeed") < 40)
    .sink(NullOutputSinkDescriptor::create());
```

In [3]:
# Query 1: average speed per (highway, direction, pos_div) over a hopping
# window of 5 minutes that advances every second; only groups whose average
# speed is below 40 are written to `sink_q1`.
# The window bounds are converted to integer millisecond values
# (UNIX_TIMESTAMP seconds * 1000 plus the extracted millisecond field).
# NOTE(review): the reference query divides `position` by the integer 5280,
# while this SQL divides by 5280.0 and groups on the fractional result --
# confirm that a fractional grouping key is intended.
q1_insert_sql = """
INSERT INTO sink_q1
WITH computed_lrb AS (
    SELECT 
        highway,
        direction,
        (`position` / 5280.0) AS pos_div,
        speed,
        eventTime
    FROM lrb
)
SELECT 
    highway,
    direction,
    pos_div,
    AVG(speed) AS avgSpeed,
    1000 * UNIX_TIMESTAMP(CAST(window_start AS STRING)) + EXTRACT(MILLISECOND FROM window_start) as `window_start`,
    1000 * UNIX_TIMESTAMP(CAST(window_end AS STRING)) + EXTRACT(MILLISECOND FROM window_end) as `window_end`
FROM TABLE(
    HOP(
         TABLE computed_lrb,
         DESCRIPTOR(eventTime),
         INTERVAL '1' SECOND,  -- slide interval
         INTERVAL '5' MINUTE   -- window size (equivalent to 300 seconds)
    )
)
GROUP BY highway, direction, pos_div, window_start, window_end
HAVING AVG(speed) < 40
"""

# Submit the INSERT job and block until it finishes.
q1_job = t_env.execute_sql(q1_insert_sql)
q1_job.wait()

## Query 2
```cpp
Query::from("lrb")
    .map(Attribute("position") = Attribute("position")/5280)
    .window(SlidingWindow::of(EventTime(RecordCreationTs()), Seconds(30), Seconds(1)))
    .byKey(Attribute("vehicle"), Attribute("highway"), Attribute("direction"), Attribute("position"))
    .apply(Count()->as(Attribute("cntSpeed")))
    .sink(NullOutputSinkDescriptor::create());
```

In [4]:
# Query 2: number of records per (vehicle, highway, direction, pos_div)
# over a hopping window of 30 seconds that advances every second; every
# group is written to `sink_q2`.
# The window bounds are converted to integer millisecond values
# (UNIX_TIMESTAMP seconds * 1000 plus the extracted millisecond field).
# NOTE(review): the reference query divides `position` by the integer 5280,
# while this SQL divides by 5280.0 and groups on the fractional result --
# confirm that a fractional grouping key is intended.
q2_insert_sql = """
INSERT INTO sink_q2
WITH computed_lrb AS (
    SELECT 
        vehicle,
        highway,
        direction,
        (`position` / 5280.0) AS pos_div,
        eventTime
    FROM lrb
)
SELECT 
    vehicle,
    highway,
    direction,
    pos_div,
    COUNT(*) AS cntSpeed,
    1000 * UNIX_TIMESTAMP(CAST(window_start AS STRING)) + EXTRACT(MILLISECOND FROM window_start) as `window_start`,
    1000 * UNIX_TIMESTAMP(CAST(window_end AS STRING)) + EXTRACT(MILLISECOND FROM window_end) as `window_end`
FROM TABLE(
    HOP(
         TABLE computed_lrb,
         DESCRIPTOR(eventTime),
         INTERVAL '1' SECOND,
         INTERVAL '30' SECOND
    )
)
GROUP BY 
    vehicle,
    highway,
    direction,
    pos_div,
    window_start,
    window_end
"""

# Submit the INSERT job and block until it finishes.
q2_job = t_env.execute_sql(q2_insert_sql)
q2_job.wait()

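#### Rewrite txt input file to csv

Run this cell once before the queries above: the `lrb` source table reads the CSV file generated here.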

In [1]:
import csv
import os

# Convert the whitespace-separated Linear Road text file into a
# comma-separated file next to it.
#
# The project root is taken from the kernel's working directory, the same
# way the first cell of this notebook derives `base_path`, so the CSV is
# written to the location the `lrb` table reads from instead of to a
# machine-specific absolute path.
base_path = os.getcwd()
input_file = os.path.join(base_path, 'data/linear_road/lrb-data-small-ht.txt')
output_file = os.path.join(base_path, 'data/linear_road/lrb-data-small-ht.csv')

# Number of fields a line must contain to be written out; this matches the
# seven physical columns declared for the `lrb` table.
expected_columns = 7

# newline='' lets the csv module control line endings in the output file.
with open(input_file, 'r') as infile, open(output_file, 'w', newline='') as outfile:
    writer = csv.writer(outfile)
    for line in infile:
        # Split on any whitespace (handles multiple spaces/tabs)
        row = line.strip().split()
        if len(row) == expected_columns:
            writer.writerow(row)
        else:
            # Malformed lines are reported and left out of the CSV.
            print("Skipping line with unexpected number of columns:", line)