In [7]:
%%writefile lab7_streaming_minute_traffic_SQL_pipeline.py
import argparse
import time
import logging
import json
import typing
from datetime import datetime
import apache_beam as beam
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import StandardOptions
from apache_beam.transforms.sql import SqlTransform
from apache_beam.runners import DataflowRunner, DirectRunner

# ### functions and classes

class CommonLog(typing.NamedTuple):
    """Typed record for one log event flowing through the pipeline.

    Instances are created in ``GetEventTimestampFn`` with ``CommonLog(**row)``,
    so the keys of each parsed JSON message (plus the ``event_timestamp`` key
    added by that DoFn) must match these field names exactly.
    """
    ip: str  # presumably the client IP address -- TODO confirm against the event generator
    user_id: str
    lat: float  # presumably latitude -- TODO confirm
    lng: float  # presumably longitude -- TODO confirm
    timestamp: str  # timestamp string carried in the message payload; its format is not visible here
    # Filled in by GetEventTimestampFn from the Beam element timestamp,
    # rendered in UTC as "%Y-%m-%dT%H:%M:%S".
    event_timestamp: str
    http_request: str
    http_response: int  # presumably the HTTP status code -- TODO confirm
    num_bytes: int
    user_agent: str

# Register RowCoder as the coder for CommonLog elements.
# NOTE(review): presumably required so SqlTransform in run() can read the row schema -- confirm.
beam.coders.registry.register_coder(CommonLog, beam.coders.RowCoder)

def parse_json(element):
    """Decode a UTF-8 encoded JSON payload and return the parsed Python object."""
    return json.loads(element.decode('utf-8'))

class GetEventTimestampFn(beam.DoFn):
    """Attach the Beam element timestamp to a parsed record and emit a CommonLog."""

    def process(self, row, timestamp=beam.DoFn.TimestampParam):
        """Yield a ``CommonLog`` built from ``row`` plus an ``event_timestamp`` field.

        Args:
            row: dict of parsed message fields; its keys must be CommonLog field names.
            timestamp: element timestamp injected by Beam (``DoFn.TimestampParam``).

        Yields:
            CommonLog whose ``event_timestamp`` is the element timestamp in UTC,
            formatted as ``%Y-%m-%dT%H:%M:%S``.
        """
        event_ts = timestamp.to_utc_datetime().strftime("%Y-%m-%dT%H:%M:%S")
        # Beam requires that a DoFn never modifies its input element, so build
        # a new mapping instead of writing into `row`.
        stamped_row = dict(row, event_timestamp=event_ts)
        yield CommonLog(**stamped_row)

class ParseAndGetEventTimestamp(beam.PTransform):
    """Composite transform: JSON-decode each message, then stamp it with its event timestamp."""

    def expand(self, pcoll):
        # Step 1: bytes -> parsed JSON object.
        parsed = pcoll | 'ParseJson' >> beam.Map(parse_json)
        # Step 2: parsed object -> CommonLog carrying event_timestamp.
        return parsed | 'GetEventTimestamp' >> beam.ParDo(GetEventTimestampFn())

def to_dict(row):
    """Turn an aggregated row into the plain dict handed to the BigQuery sink."""
    return dict(page_views=row.page_views, start_time=row.start_time)

# ### main

def run():
    """Build and launch the streaming pipeline that counts page views per minute.

    Reads JSON messages from a Pub/Sub topic, converts them to ``CommonLog``
    rows, counts rows per one-minute tumbling window with Beam SQL (ZetaSQL
    dialect) and appends the counts to a BigQuery table.

    Required command-line flags: ``--project``, ``--region``,
    ``--staging_location``, ``--temp_location``, ``--runner``,
    ``--input_topic`` and ``--table_name``. Any other flags are forwarded
    unchanged to Beam's ``PipelineOptions``.

    Blocks in ``wait_until_finish()`` after submitting the pipeline.
    """
    # Configure logging first so INFO records emitted while the graph is being
    # constructed are not dropped.
    logging.getLogger().setLevel(logging.INFO)

    # Command line arguments
    parser = argparse.ArgumentParser(description='Load from Json from Pub/Sub into BigQuery')
    required_flags = (
        ('--project', 'Specify Google Cloud project'),
        ('--region', 'Specify Google Cloud region'),
        ('--staging_location', 'Specify Cloud Storage bucket for staging'),
        ('--temp_location', 'Specify Cloud Storage bucket for temp'),
        ('--runner', 'Specify Apache Beam Runner'),
        ('--input_topic', 'Input Pub/Sub Topic'),
        ('--table_name', 'BigQuery table name for aggregate results'),
    )
    for flag, help_text in required_flags:
        parser.add_argument(flag, required=True, help=help_text)

    # Flags not declared above are passed through to Beam.
    opts, pipeline_opts = parser.parse_known_args()

    # Setting up the Beam pipeline options
    options = PipelineOptions(pipeline_opts, save_main_session=True, streaming=True)
    gcp_options = options.view_as(GoogleCloudOptions)
    gcp_options.project = opts.project
    gcp_options.region = opts.region
    gcp_options.staging_location = opts.staging_location
    gcp_options.temp_location = opts.temp_location
    # Nanosecond suffix keeps the job name unique across launches.
    gcp_options.job_name = f'streaming-minute-traffic-sql-pipeline-{time.time_ns()}'
    options.view_as(StandardOptions).runner = opts.runner

    input_topic = opts.input_topic
    table_name = opts.table_name

    # Table schema for BigQuery
    table_schema = {
        "fields": [
            {"name": "page_views", "type": "INTEGER"},
            {"name": "start_time", "type": "STRING"},
        ]
    }

    # One-minute tumbling windows keyed on the parsed event timestamp.
    query = '''
        SELECT
            COUNT(*) AS page_views,
            STRING(window_start) AS start_time
        FROM
            TUMBLE(
                (SELECT TIMESTAMP(event_timestamp) AS ts FROM PCOLLECTION),
                DESCRIPTOR(ts),
                'INTERVAL 1 MINUTE')
        GROUP BY window_start
    '''

    logging.info("Building pipeline ...")

    # Create the pipeline
    p = beam.Pipeline(options=options)

    (p | 'ReadFromPubSub' >> beam.io.ReadFromPubSub(input_topic)
       | 'ParseAndGetEventTimestamp' >> ParseAndGetEventTimestamp().with_output_types(CommonLog)
       | "CountPerMinute" >> SqlTransform(query, dialect='zetasql')
       | "ConvertToDict" >> beam.Map(to_dict)
       | 'WriteAggToBQ' >> beam.io.WriteToBigQuery(
            table_name,
            schema=table_schema,
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND
            )
    )

    p.run().wait_until_finish()

if __name__ == '__main__':
    # Launch the pipeline only when this file is executed as a script.
    run()

Writing lab7_streaming_minute_traffic_SQL_pipeline.py


In [8]:
import os

In [9]:
# Directory containing the helper shell scripts used by the shell cells below
# ("/path" is a placeholder value).
os.environ["workdir"] = "/path"

In [11]:
%%bash
# Print the sink-provisioning script so it can be reviewed.
# Quoted so a workdir containing spaces is not split into several arguments.
cat "$workdir/create_streaming_sinks.sh"

#!/bin/#!/usr/bin/env bash
echo "Creating pipeline sinks"

PROJECT_ID=$(gcloud config get-value project)

# GCS buckets
#TODO: Add try/catch for the first bucket since qwiklabs
gsutil mb -l US gs://$PROJECT_ID
gsutil mb -l US -c "COLDLINE" gs://$PROJECT_ID-coldline

# BiqQuery Dataset
bq mk --location=US logs

# PubSub Topic
gcloud pubsub topics create my_topic

In [None]:
%%bash
# Run the sink-provisioning script (GCS buckets, BigQuery dataset, Pub/Sub topic).
# Quoted so a workdir containing spaces is not split into several arguments.
bash "$workdir/create_streaming_sinks.sh"

In [None]:
PROJECT_ID = !(gcloud config get-value core/project)
PROJECT_ID = PROJECT_ID[0]

In [17]:
# Export the lab configuration as environment variables so the %%bash cells
# can read it. os.environ only accepts str values (assigning an int raises
# TypeError), so numeric settings are written as text.
os.environ["PROJECT_ID"] = PROJECT_ID
os.environ["REGION"] = 'us-central1'
os.environ["BUCKET"] = "gs://" + PROJECT_ID
os.environ["PIPELINE_FOLDER"] = "gs://" + PROJECT_ID
os.environ["RUNNER"] = "DataflowRunner"
os.environ["PUBSUB_TOPIC"] = f"projects/{PROJECT_ID}/topics/my_topic"
os.environ["WINDOW_DURATION"] = "60"  # units not stated here -- presumably seconds, TODO confirm
os.environ["ALLOWED_LATENESS"] = "1"  # units not stated here -- TODO confirm
os.environ["OUTPUT_TABLE_NAME"] = f"{PROJECT_ID}:logs.minute_traffic"
os.environ["DEADLETTER_BUCKET"] = "gs://" + PROJECT_ID
os.environ["INPUT_PATH"] = "gs://" + PROJECT_ID + "/events.json"
os.environ["TABLE_NAME"] = PROJECT_ID + ":logs.minute_traffic"
os.environ["AGGREGATE_TABLE_NAME"] = f"{PROJECT_ID}:logs.minute_traffic"
os.environ["RAW_TABLE_NAME"] = f"{PROJECT_ID}:logs.raw"



In [None]:
%%bash
# Launch the streaming SQL pipeline.
# PROJECT_ID, REGION, PIPELINE_FOLDER, RUNNER, PUBSUB_TOPIC and TABLE_NAME are
# read from the environment. When running outside the notebook, export them
# first, for example:
#   export PROJECT_ID=$(gcloud config get-value project)
#   export REGION='us-central1'
#   export BUCKET=gs://${PROJECT_ID}
#   export PIPELINE_FOLDER=${BUCKET}
#   export RUNNER=DataflowRunner
#   export PUBSUB_TOPIC=projects/${PROJECT_ID}/topics/my_topic
#   export TABLE_NAME=${PROJECT_ID}:logs.minute_traffic
set -u  # abort if any of the variables above is unset instead of passing an empty flag
python3 lab7_streaming_minute_traffic_SQL_pipeline.py \
  --project="${PROJECT_ID}" \
  --region="${REGION}" \
  --staging_location="${PIPELINE_FOLDER}/staging" \
  --temp_location="${PIPELINE_FOLDER}/temp" \
  --runner="${RUNNER}" \
  --input_topic="${PUBSUB_TOPIC}" \
  --table_name="${TABLE_NAME}" \
  --experiments=use_runner_v2

In [12]:
cat $workdir/generate_streaming_events.sh

#!/bin/#!/usr/bin/env bash
echo "Installing packages"
# Install modules
sh ./install_packages.sh

echo "Generating synthetic users"
# Generate 10 fake web site users
python3 user_generator.py --n=10

echo "Generating synthetic events"
use_lag=$1

if [ "$use_lag" = true ] ; then
    echo "Using lag"
    python3 streaming_event_generator.py --project_id=$(gcloud config get-value project) -t=my_topic
else
    echo "Not using lag"
    python3 streaming_event_generator.py --project_id=$(gcloud config get-value project) -t=my_topic -off=1. -on=0. -l=0
fi

In [None]:
bash $workdir/generate_streaming_events.sh

In [None]:
%%bigquery
-- Show only the latest per-minute counts: the table keeps growing while the
-- streaming job runs, so an unbounded SELECT * would dump every row.
-- NOTE(review): start_time is stored as STRING; ordering assumes its format sorts chronologically.
SELECT
  page_views,
  start_time
FROM logs.minute_traffic
ORDER BY start_time DESC
LIMIT 20