In [11]:
%%writefile lab_9_weather_statistics_pipeline.py
import json
import typing
import logging
import apache_beam as beam

class WeatherRecord(typing.NamedTuple):
    """One weather observation for a single location and date.

    Instances are produced from CSV lines by ConvertCsvToWeatherRecord and
    rebuilt by ConvertTempUnits after the temperature fields are rescaled.
    """
    # Location identifier; ComputeStatistics groups records by this field.
    loc_id: str
    # Coordinates of the location (presumably decimal degrees -- confirm
    # against the data source).
    lat: float
    lng: float
    # Observation date, kept as the raw string read from the input
    # (the tests use values such as '2/2/2021').
    date: str
    # Daily low / high temperature. ConvertTempUnits applies `x * 1.8 + 32.0`
    # to both, so they are presumably Celsius on input and Fahrenheit after.
    low_temp: float
    high_temp: float
    # Precipitation amount; summed per location by ComputeStatistics.
    # Units are not visible in this file.
    precip: float

# Register RowCoder for WeatherRecord at import time. Presumably this is what
# lets Beam treat the records as schema'd rows, so that
# beam.GroupBy('loc_id') can refer to fields by name -- confirm in Beam docs.
beam.coders.registry.register_coder(WeatherRecord, beam.coders.RowCoder)

class ConvertCsvToWeatherRecord(beam.DoFn):
    """Parse one comma-separated text line into a WeatherRecord.

    Expected column order:
    loc_id, lat, lng, date, low_temp, high_temp, precip.
    """

    # Column layout, defined once instead of being rebuilt for every element.
    _FIELDS = ('loc_id', 'lat', 'lng', 'date', 'low_temp', 'high_temp', 'precip')
    # Columns that must be converted from text to float.
    _NUMERIC_FIELDS = ('lat', 'lng', 'low_temp', 'high_temp', 'precip')

    def process(self, line):
        """Yield a single WeatherRecord built from `line`.

        Args:
            line: one CSV row as a string, without a header.

        Yields:
            WeatherRecord with the numeric columns converted to float.
            Blank (empty or whitespace-only) lines yield nothing.

        Raises:
            ValueError: if the line has fewer columns than expected or a
                numeric column cannot be parsed as a float.
        """
        # A stray blank line in the input should not abort the whole pipeline.
        if not line.strip():
            return

        values = line.split(',')
        # Without this check a short row surfaces later as an opaque
        # KeyError/TypeError that does not mention the offending line.
        if len(values) < len(self._FIELDS):
            raise ValueError(
                f'Expected {len(self._FIELDS)} comma-separated fields but got '
                f'{len(values)} in line: {line!r}'
            )

        # zip() stops at the shorter sequence, so extra trailing columns are
        # ignored, as before.
        row = dict(zip(self._FIELDS, values))
        for num_field in self._NUMERIC_FIELDS:
            row[num_field] = float(row[num_field])
        yield WeatherRecord(**row)

class ConvertTempUnits(beam.DoFn):
    """Rescale the temperature fields of a record with `x * 1.8 + 32.0`."""

    def process(self, row):
        """Yield a new WeatherRecord whose low/high temperatures are rescaled.

        All other fields are copied from `row` unchanged.
        """
        fields = row._asdict()
        converted = {
            name: fields[name] * 1.8 + 32.0
            for name in ('low_temp', 'high_temp')
        }
        fields.update(converted)
        yield WeatherRecord(**fields)

class ConvertToJson(beam.DoFn):
    """Serialize a named-tuple-like row to a JSON object string."""

    def process(self, row):
        """Yield `row`'s fields (via `_asdict()`) encoded as one JSON string."""
        yield json.dumps(row._asdict())

class ComputeStatistics(beam.PTransform):
    """Aggregate weather records per location and emit JSON strings.

    For every `loc_id` the output holds the minimum `low_temp`
    (`record_low`), the maximum `high_temp` (`record_high`) and the summed
    `precip` (`total_precip`).
    """

    def expand(self, pcoll):
        """Apply the per-location aggregation followed by JSON encoding."""
        per_location = (
            beam.GroupBy('loc_id')
            .aggregate_field('low_temp', min, 'record_low')
            .aggregate_field('high_temp', max, 'record_high')
            .aggregate_field('precip', sum, 'total_precip')
        )

        aggregated = pcoll | 'ComputeStatistics' >> per_location
        return aggregated | 'ToJson' >> beam.ParDo(ConvertToJson())

class WeatherStats(beam.PTransform):
    """End-to-end transform: CSV lines in, per-location JSON statistics out."""

    def expand(self, pcoll):
        """Chain parsing, temperature conversion and statistics computation."""
        records = pcoll | "ParseCSV" >> beam.ParDo(ConvertCsvToWeatherRecord())
        converted = records | "ConvertToF" >> beam.ParDo(ConvertTempUnits())
        return converted | "ComputeStats" >> ComputeStatistics()

def run(input_path='./weather_data.csv', output_prefix='./weather_stats'):
    """Build and execute the weather statistics pipeline.

    Args:
        input_path: path or file pattern of the CSV input passed to
            ReadFromText. Defaults to the original hard-coded location.
        output_prefix: file path prefix passed to WriteToText; output files
            get a '.json' suffix. Defaults to the original hard-coded prefix.

    Returns:
        None. Leaving the `with` block runs the pipeline and waits for it to
        finish, so the call returns only after execution has completed.
    """
    # Configure logging first so the message below (and anything logged while
    # the pipeline is constructed) is actually emitted at INFO level.
    logging.getLogger().setLevel(logging.INFO)
    logging.info("Building pipeline ...")

    with beam.Pipeline() as p:
        (p | 'ReadCSV' >> beam.io.ReadFromText(input_path)
           | 'ComputeStatistics' >> WeatherStats()
           | 'WriteJson' >> beam.io.WriteToText(output_prefix, '.json')
        )

if __name__ == '__main__':
    # Execute the pipeline only when this file is run as a script.
    run()

Overwriting lab_9_weather_statistics_pipeline.py


In [29]:
%%writefile lab_9_weather_statistics_pipeline_test.py
import logging
import json
import unittest
import sys

from lab_9_weather_statistics_pipeline import *
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import BeamAssertException
from apache_beam.testing.util import assert_that, equal_to

def main(out = sys.stderr, verbosity = 2):
    """Run every test case defined in this module.

    Args:
        out: writable text stream that receives the unittest report.
        verbosity: verbosity level forwarded to unittest.TextTestRunner.
    """
    this_module = sys.modules[__name__]
    suite = unittest.TestLoader().loadTestsFromModule(this_module)

    runner = unittest.TextTestRunner(out, verbosity = verbosity)
    runner.run(suite)


class ConvertToWeatherRecordTest(unittest.TestCase):
    """Checks that ConvertCsvToWeatherRecord parses a CSV line correctly."""

    def test_convert_to_csv(self):
        # One well-formed CSV line and the typed record it should become.
        csv_lines = ['x,0.0,0.0,2/2/2021,1.0,2.0,0.1']
        expected_records = [WeatherRecord('x', 0.0, 0.0, '2/2/2021', 1.0, 2.0, 0.1)]

        with TestPipeline() as p:
            parsed = (
                p
                | beam.Create(csv_lines)
                | beam.ParDo(ConvertCsvToWeatherRecord())
            )

            assert_that(parsed, equal_to(expected_records))

class ConvertTempUnitsTest(unittest.TestCase):
    """Checks that ConvertTempUnits rescales only the temperature fields."""

    def test_convert_temp_units(self):
        # Inputs cover a positive and a negative temperature pair.
        source_records = [
            WeatherRecord('x', 0.0, 0.0, '2/2/2021', 1.0, 2.0, 0.1),
            WeatherRecord('y', 0.0, 0.0, '2/2/2021', -3.0, -1.0, 0.3),
        ]
        converted_records = [
            WeatherRecord('x', 0.0, 0.0, '2/2/2021', 33.8, 35.6, 0.1),
            WeatherRecord('y', 0.0, 0.0, '2/2/2021', 26.6, 30.2, 0.3),
        ]

        with TestPipeline() as p:
            result = (
                p
                | beam.Create(source_records)
                | beam.ParDo(ConvertTempUnits())
            )

            assert_that(result, equal_to(converted_records))

class ComputeStatsTest(unittest.TestCase):
    """Checks the per-location aggregation done by ComputeStatistics."""

    def test_compute_statistics(self):
        # Three observations for location 'x' and two for location 'y'.
        observations = [
            WeatherRecord('x', 0.0, 0.0, '2/2/2021', 33.8, 35.6, 0.1),
            WeatherRecord('x', 0.0, 0.0, '2/3/2021', 41.6, 65.3, 0.2),
            WeatherRecord('x', 0.0, 0.0, '2/4/2021', 45.3, 52.6, 0.2),
            WeatherRecord('y', 0.0, 0.0, '2/2/2021', 12.8, 23.6, 0.1),
            WeatherRecord('y', 0.0, 0.0, '2/3/2021', 26.6, 30.2, 0.3),
        ]

        # The transform emits JSON strings, so expectations are serialized
        # the same way.
        stats_x = {'loc_id': 'x', 'record_low': 33.8, 'record_high': 65.3, 'total_precip': 0.5}
        stats_y = {'loc_id': 'y', 'record_low': 12.8, 'record_high': 30.2, 'total_precip': 0.4}
        expected_json = [json.dumps(stats_x), json.dumps(stats_y)]

        with TestPipeline() as p:
            result = p | beam.Create(observations) | ComputeStatistics()

            assert_that(result, equal_to(expected_json))

class WeatherStatsTransformTest(unittest.TestCase):
    """End-to-end check of WeatherStats: CSV lines in, JSON statistics out."""

    def test_weather_stats_transform(self):
        # Two rows for location 'x' and one for location 'y'.
        csv_rows = [
            "x,31.4,-39.2,2/2/21,4.0,7.5,0.1",
            "x,31.4,-39.2,2/2/21,3.5,6.0,0.3",
            "y,33.4,-49.2,2/2/21,12.5,17.5,0.5",
        ]

        # Expected temperatures are the inputs after the `x * 1.8 + 32.0`
        # conversion applied inside the transform.
        stats_x = {'loc_id': 'x', 'record_low': 38.3, 'record_high': 45.5, 'total_precip': 0.4}
        stats_y = {'loc_id': 'y', 'record_low': 54.5, 'record_high': 63.5, 'total_precip': 0.5}
        expected_json = [json.dumps(stats_x), json.dumps(stats_y)]

        with TestPipeline() as p:
            result = p | beam.Create(csv_rows) | WeatherStats()

            assert_that(result, equal_to(expected_json))
            
if __name__ == '__main__':
    # Script entry point: write the unittest report to a text file so it can
    # be inspected afterwards.
    with open('lab_9_testingout.txt', 'w') as report_file:
        main(report_file)

Overwriting lab_9_weather_statistics_pipeline_test.py


```bash
# Look up the active project and its numeric project number.
PROJECT_ID=$(gcloud config get-value project)
# `describe` returns exactly one project; a bare-term `list --filter` can
# match several projects whose fields merely contain the ID.
export PROJECT_NUMBER=$(gcloud projects describe "$PROJECT_ID" --format="value(projectNumber)")
# Service account name built from the project number.
export serviceAccount="${PROJECT_NUMBER}-compute@developer.gserviceaccount.com"
# Grant that service account the Dataflow worker role on the project.
gcloud projects add-iam-policy-binding "$PROJECT_ID" --member="serviceAccount:${serviceAccount}" --role="roles/dataflow.worker"
```

Run the unit tests from a terminal:
```bash
conda activate beam
# /path is a placeholder for the directory where the report should be written.
cd /path
# $workdir is not defined in this snippet: fail with a clear message instead
# of silently running /lab_9_weather_statistics_pipeline_test.py when unset.
python "${workdir:?set workdir to the directory containing the lab files}/lab_9_weather_statistics_pipeline_test.py"
# The test script writes its report into the current directory.
cat lab_9_testingout.txt
```

In [2]:
!cat lab_9_testingout.txt

test_compute_statistics (__main__.ComputeStatsTest) ... ok
test_convert_temp_units (__main__.ConvertTempUnitsTest) ... ok
test_convert_to_csv (__main__.ConvertToWeatherRecordTest) ... ok
test_weather_stats_transform (__main__.WeatherStatsTransformTest) ... ok

----------------------------------------------------------------------
Ran 4 tests in 3.483s

OK
