# Cluster Monitoring 

In [1]:
import os
# Current working directory of the kernel; later cells build their data paths from it.
base_path = os.getcwd()
print(base_path)

/home/tim/Documents/work/apache_flink


In [4]:
from pyflink.common import Configuration
from pyflink.table import TableEnvironment, EnvironmentSettings, DataTypes, TableDescriptor, Schema

# Set up the execution configuration: default operator parallelism of 1.
configuration = Configuration()
configuration.set_integer("table.exec.resource.default-parallelism", 1)

# Create the TableEnvironment in streaming mode
t_env = TableEnvironment.create(
    EnvironmentSettings.new_instance().in_streaming_mode().with_configuration(configuration).build()
)

# Root directory of the cluster-monitoring input and output files.
data_dir = f'{base_path}/data/cluster_monitoring'


def _csv_table(relative_path, schema):
    """Build a filesystem/CSV TableDescriptor for a file below ``data_dir``.

    Args:
        relative_path: Path of the file relative to ``data_dir``.
        schema: The ``Schema`` describing the table's columns.

    Returns:
        The built ``TableDescriptor``.
    """
    return (
        TableDescriptor.for_connector('filesystem')
        .schema(schema)
        .option('path', f'{data_dir}/{relative_path}')
        .format('csv')
        .build()
    )


# Source schema: twelve physical CSV columns plus a computed event-time column.
monitoring_schema = (
    Schema.new_builder()
    .column('creationTS', DataTypes.BIGINT())
    .column('jobId', DataTypes.BIGINT())
    .column('taskId', DataTypes.BIGINT())
    .column('machineId', DataTypes.BIGINT())
    .column('eventType', DataTypes.SMALLINT())
    .column('userId', DataTypes.SMALLINT())
    .column('category', DataTypes.SMALLINT())
    .column('priority', DataTypes.SMALLINT())
    .column('cpu', DataTypes.FLOAT())
    .column('ram', DataTypes.FLOAT())
    .column('disk', DataTypes.FLOAT())
    .column('constraints', DataTypes.SMALLINT())
    # Precision 3 makes TO_TIMESTAMP_LTZ read creationTS as epoch milliseconds.
    .column_by_expression('eventTime', "TO_TIMESTAMP_LTZ(creationTS, 3)")
    # The watermark is the event time itself, i.e. no allowance for late records.
    .watermark('eventTime', "eventTime")
    .build()
)

# Create a temporary table for the monitoring data
t_env.create_temporary_table(
    'monitoring',
    _csv_table('google-cluster-data.csv', monitoring_schema)
)
# create_temporary_table() returns None, so look the registered table up
# explicitly to give `monitoring_table` a usable Table handle.
monitoring_table = t_env.from_path('monitoring')

# Sink for query 1: one CPU total per window; window bounds are stored as BIGINT.
sink_q1_schema = (
    Schema.new_builder()
    .column('totalCpu', DataTypes.FLOAT())
    .column('window_start', DataTypes.BIGINT())
    .column('window_end', DataTypes.BIGINT())
    .build()
)
t_env.create_temporary_table('sink_q1', _csv_table('outputs/sink_q1.csv', sink_q1_schema))

# Sink for query 2: same as sink_q1 with the grouping key jobId in front.
sink_q2_schema = (
    Schema.new_builder()
    .column('jobId', DataTypes.BIGINT())
    .column('totalCpu', DataTypes.FLOAT())
    .column('window_start', DataTypes.BIGINT())
    .column('window_end', DataTypes.BIGINT())
    .build()
)
t_env.create_temporary_table('sink_q2', _csv_table('outputs/sink_q2.csv', sink_q2_schema))

## Query 1 

```cpp
Query::from("cm")
    .window(SlidingWindow::of(EventTime(RecordCreationTs()), Seconds(60), Seconds(1)))
    .apply(Sum(Attribute("cpu"))->as(Attribute("totalCpu")))
    .sink(NullOutputSinkDescriptor::create());
```

In [3]:
# Query 1: sum of cpu per hopping window (60 s size, 1 s slide) over eventTime,
# written to sink_q1 together with the window bounds as epoch-style BIGINTs.
q1_insert_sql = """
INSERT INTO sink_q1
SELECT 
    SUM(cpu) AS totalCpu,
    1000 * UNIX_TIMESTAMP(CAST(window_start AS STRING)) + EXTRACT(MILLISECOND FROM window_start) as `window_start`,
    1000 * UNIX_TIMESTAMP(CAST(window_end AS STRING)) + EXTRACT(MILLISECOND FROM window_end) as `window_end`
FROM TABLE(
    HOP(
        TABLE monitoring,
        DESCRIPTOR(eventTime),
        INTERVAL '1' SECOND,  -- slide interval
        INTERVAL '60' SECOND  -- window size
    )
)
GROUP BY window_start, window_end
"""

# Submit the INSERT job, then block until it has finished.
q1_job = t_env.execute_sql(q1_insert_sql)
q1_job.wait()

## Query 2 
```cpp
Query::from("cm")
    .filter(Attribute("eventType") == 3)
    .window(SlidingWindow::of(EventTime(RecordCreationTs()), Seconds(60), Seconds(1)))
    .byKey(Attribute("jobId"))
    .apply(Sum(Attribute("cpu"))->as(Attribute("totalCpu")))
    .sink(NullOutputSinkDescriptor::create());
```

The query has to be split into two separate statements, since Flink SQL does not support writing a subquery directly inside the TABLE argument of the HOP call.

In [5]:
# Drop any earlier definition first so this cell can be re-run in the same
# kernel; a plain CREATE TEMPORARY VIEW fails if the view already exists.
t_env.execute_sql("DROP TEMPORARY VIEW IF EXISTS filtered_monitoring")

# Pre-filter the monitoring events to eventType 3; query 2 windows over this view.
t_env.execute_sql("""
CREATE TEMPORARY VIEW filtered_monitoring AS
SELECT *
FROM monitoring
WHERE eventType = 3
""")

<pyflink.table.table_result.TableResult at 0x7fdb8d92b490>

In [6]:
# Query 2: per-jobId sum of cpu over the same hopping window (60 s size,
# 1 s slide), reading from the pre-filtered view and writing to sink_q2.
q2_insert_sql = """
INSERT INTO sink_q2
SELECT 
    jobId,
    SUM(cpu) AS totalCpu,
    1000 * UNIX_TIMESTAMP(CAST(window_start AS STRING)) + EXTRACT(MILLISECOND FROM window_start) as `window_start`,
    1000 * UNIX_TIMESTAMP(CAST(window_end AS STRING)) + EXTRACT(MILLISECOND FROM window_end) as `window_end`
FROM TABLE(
    HOP(
        TABLE filtered_monitoring,
        DESCRIPTOR(eventTime),
        INTERVAL '1' SECOND,  -- slide interval
        INTERVAL '60' SECOND  -- window size
    )
)
GROUP BY jobId, window_start, window_end
"""

# Submit the INSERT job, then block until it has finished.
q2_job = t_env.execute_sql(q2_insert_sql)
q2_job.wait()

#### Reformat data input from txt to csv

In [14]:
import csv
import os

# NOTE: run this cell before the cells that define and query the 'monitoring'
# table -- that table reads the CSV file written here.

# Derive the base path from the working directory, as the first cell does,
# instead of hard-coding an absolute, machine-specific path.
base_path = os.getcwd()
input_file = os.path.join(base_path, 'data/cluster_monitoring/google-cluster-data.txt')
output_file = os.path.join(base_path, 'data/cluster_monitoring/google-cluster-data.csv')

# Number of physical columns declared in the 'monitoring' table schema.
EXPECTED_COLUMNS = 12

# newline='' lets the csv module control line endings in the output file.
with open(input_file, 'r') as infile, open(output_file, 'w', newline='') as outfile:
    writer = csv.writer(outfile)
    for line in infile:
        # split() without arguments splits on runs of whitespace and never
        # yields empty strings, so no separate strip() is needed.
        row = line.split()
        if len(row) == EXPECTED_COLUMNS:
            writer.writerow(row)
        else:
            # Rows with a different column count are reported and left out of the CSV.
            print("Skipping line due to incorrect number of columns:", line.rstrip('\n'))