# YSB

In [1]:
import os
# Every data path used below (input CSV, query sink, generated dataset) is built
# from this directory, i.e. the current working directory at execution time.
base_path = os.getcwd()
print(base_path)

/home/tim/Documents/work/apache_flink


In [2]:
from pyflink.common import Configuration
from pyflink.table import TableEnvironment, EnvironmentSettings, DataTypes, TableDescriptor, Schema

# --- Execution environment ---------------------------------------------------
# Run every operator with a parallelism of 1.
configuration = Configuration()
configuration.set_integer("table.exec.resource.default-parallelism", 1)

# Streaming-mode settings carrying the configuration above.
env_settings = (
    EnvironmentSettings.new_instance()
    .in_streaming_mode()
    .with_configuration(configuration)
    .build()
)
t_env = TableEnvironment.create(env_settings)


# --- Source table: YSB events read from a CSV file ---------------------------
# `eventTime` is derived from the epoch-millisecond column `current_ms` and is
# used as the event-time attribute; the watermark is the column value itself.
# NOTE(review): with a watermark equal to `eventTime` there is no allowance for
# out-of-order rows -- confirm the input file is ordered by `current_ms`.
ysb_input_schema = (
    Schema.new_builder()
    .column('user_id', DataTypes.BIGINT())
    .column('page_id', DataTypes.BIGINT())
    .column('campaign_id', DataTypes.BIGINT())
    .column('ad_type', DataTypes.BIGINT())
    .column('event_type', DataTypes.BIGINT())
    .column('current_ms', DataTypes.BIGINT())
    .column('ip', DataTypes.BIGINT())
    .column('d1', DataTypes.BIGINT())
    .column('d2', DataTypes.BIGINT())
    .column('d3', DataTypes.INT())
    .column('d4', DataTypes.SMALLINT())
    .column_by_expression('eventTime', "TO_TIMESTAMP_LTZ(current_ms, 3)")
    .watermark('eventTime', "eventTime")
    .build()
)
ysb_input_descriptor = (
    TableDescriptor.for_connector('filesystem')
    .schema(ysb_input_schema)
    .option('path', f'{base_path}/data/ysb/ysb_data.csv')
    .format('csv')
    .build()
)
t_env.create_temporary_table('ysb_input', ysb_input_descriptor)

# --- Sink table: per-campaign window sums written as CSV ---------------------
# The window bounds are stored as BIGINT values rather than timestamps.
sink_q1_schema = (
    Schema.new_builder()
    .column('campaign_id', DataTypes.BIGINT())
    .column('sum_user_id', DataTypes.BIGINT())
    .column('window_start', DataTypes.BIGINT())
    .column('window_end', DataTypes.BIGINT())
    .build()
)
sink_q1_descriptor = (
    TableDescriptor.for_connector('filesystem')
    .schema(sink_q1_schema)
    .option('path', f'{base_path}/data/ysb/outputs/sink_q1.csv')
    .format('csv')
    .build()
)
t_env.create_temporary_table('sink_q1', sink_q1_descriptor)


## YSB Query (Query1)
```cpp
Query::from("ysb")
    .filter(Attribute("event_type") < 1)
    .window(TumblingWindow::of(EventTime(Attribute("current_ms")), Seconds(30))).byKey(Attribute("campaign_id"))
    .apply(Sum(Attribute("user_id")))
    .sink(NullOutputSinkDescriptor::create());
```

In [3]:
# Query 1: keep rows with event_type < 1, group them into 30-second tumbling
# event-time windows per campaign_id, and sum user_id. The window bounds are
# converted to BIGINT values so that they match the sink_q1 column types.
q1_insert_sql = '''
INSERT INTO sink_q1
SELECT 
    campaign_id,
    SUM(user_id) AS sum_user_id,
    1000 * UNIX_TIMESTAMP(CAST(window_start AS STRING)) + EXTRACT(MILLISECOND FROM window_start) AS window_start,
    1000 * UNIX_TIMESTAMP(CAST(window_end AS STRING)) + EXTRACT(MILLISECOND FROM window_end) AS window_end
FROM TABLE(
    TUMBLE(
         TABLE ysb_input,
         DESCRIPTOR(eventTime),
         INTERVAL '30' SECOND
    )
)
WHERE event_type < 1
GROUP BY campaign_id, window_start, window_end;
'''

# Submit the INSERT job, then block until it has finished.
q1_job = t_env.execute_sql(q1_insert_sql)
q1_job.wait()

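#### Script to generate YSB test data

Run this cell before the table definitions and the query above are executed: it writes `data/ysb/ysb_data.csv`, the file that the `ysb_input` table reads.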

In [None]:
import csv
import random
from datetime import datetime, timedelta

def generate_timestamp(start_ts, end_ts):
    """Pick a random instant in ``[start_ts, end_ts]`` as epoch milliseconds.

    Args:
        start_ts: ``datetime`` marking the lower bound (inclusive).
        end_ts: ``datetime`` marking the upper bound (inclusive).

    Returns:
        int: the chosen instant in milliseconds since the Unix epoch, obtained
        by truncating ``datetime.timestamp() * 1000``.
    """
    # Width of the interval in whole milliseconds.
    span_ms = int((end_ts - start_ts).total_seconds() * 1000)
    # Uniform offset into the interval, both ends included.
    picked = start_ts + timedelta(milliseconds=random.randint(0, span_ms))
    return int(picked.timestamp() * 1000)

def generate_csv(output_file, num_rows, include_header=True, seed=None):
    """Write a synthetic YSB event dataset to ``output_file`` as CSV.

    Each row has the columns ``user_id, page_id, campaign_id, ad_type,
    event_type, current_ms, ip, d1, d2, d3, d4``. ``user_id`` counts up from 1;
    ``current_ms`` is a random epoch-millisecond timestamp within the day
    starting at ``datetime(2020, 1, 1)``; all other columns are random.

    The filler columns are kept inside the *signed* ranges of the SQL types the
    ``ysb_input`` table in this notebook declares for them (``d1``/``d2``
    BIGINT, ``d3`` INT, ``d4`` SMALLINT), so every generated value can be
    parsed with that schema.

    Args:
        output_file: Destination path. Missing parent directories are created.
        num_rows: Number of data rows to write.
        include_header: Write the column-name row first (default, as before).
            Pass ``False`` for a file that is read as data from the first line,
            e.g. by the ``csv`` format of the ``ysb_input`` table.
        seed: Optional seed for the ``random`` module; the same seed yields the
            same file. ``None`` (default) leaves the generator state untouched.

    Returns:
        None. Prints a confirmation line once the file has been written.
    """
    import os  # local import: the function then does not rely on another cell's imports

    # Upper bounds of the signed SQL integer types used by the input schema.
    bigint_max = 2**63 - 1
    int_max = 2**31 - 1
    smallint_max = 2**15 - 1

    if seed is not None:
        random.seed(seed)

    # Event timestamps cover one day. The datetimes are naive, so the resulting
    # epoch values depend on the local time zone of the machine.
    start_time = datetime(2020, 1, 1)
    end_time = start_time + timedelta(days=1)

    # Make sure the target directory exists before opening the file.
    parent_dir = os.path.dirname(output_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_file, mode='w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        if include_header:
            writer.writerow(["user_id", "page_id", "campaign_id", "ad_type", "event_type",
                             "current_ms", "ip", "d1", "d2", "d3", "d4"])
        for i in range(1, num_rows + 1):
            user_id = i  # Sequential user IDs
            page_id = random.randint(1, 10**6)
            campaign_id = random.randint(1, 500)
            ad_type = random.randint(0, 3)  # Example ad types: 0, 1, 2, or 3
            # event_type: 80% view (0), 15% click (1), 5% purchase (2)
            event_type = random.choices([0, 1, 2], weights=[80, 15, 5], k=1)[0]
            current_ms = generate_timestamp(start_time, end_time)
            ip = random.randint(0, 2**32 - 1)  # 32-bit value; fits the BIGINT column
            d1 = random.randint(0, bigint_max)
            d2 = random.randint(0, bigint_max)
            d3 = random.randint(0, int_max)
            d4 = random.randint(0, smallint_max)
            writer.writerow([user_id, page_id, campaign_id, ad_type, event_type,
                             current_ms, ip, d1, d2, d3, d4])

    print(f"CSV file '{output_file}' with {num_rows} rows generated successfully.")

# Generation settings: adjust the row count or the destination as needed.
num_rows = 10000
output_path = f"{base_path}/data/ysb/ysb_data.csv"

# Write the dataset. The file begins with a header row, which may have to be
# removed before Apache Flink can read it.
generate_csv(output_file=output_path, num_rows=num_rows)

CSV file '/home/tim/Documents/work/apache_flink/data/ysb/ysb_data.csv' with 10000 rows generated successfully.
