# Transform Data for Analysis
This notebook aggregates data to help analyze the igra data. The main goal is to figure out the granularity of the data in terms of geopotential height. In particular, I am interested in the number of observations in the troposphere, where most of the weather happens. I arbitrarily decided the "top" of the troposphere is 10km.

For each observation, we filter out:
- Records older than 1/1/2000
- Non-pressure records
- Records where an observation is missing or invalid
- Observations where there isn't a valid surface record

For each observation, we find:
- Max Geopotential Height
- Minimum Pressure
- Total number of valid records in the observation
- Number of valid records below 10km in geopotential height

Update the following parameters in the first cell to accomodate your installation:

- BRONZE_DATA_POR_PATH - The location of the raw data-por zip files
- SILVER_QA_PATH - The location to save the transformed CSV files

In [1]:
# If you need to install olieigra, uncomment and execute this line. View the README in the project root for instructions
# on how to build or download this file.
#%pip install /lakehouse/default/Files/libs/olieigra-0.0.1-py3-none-any.whl

In [1]:
import math
import os
from datetime import datetime
import olieigra

BRONZE_DATA_POR_PATH = '/Users/olievortex/lakehouse/default/Files/bronze/igra2/data-por'
SILVER_QA_PATH = '/Users/olievortex/lakehouse/default/Files/silver/igra2/qa'

In [2]:
# Make sure the destination path exists
os.makedirs(SILVER_QA_PATH, exist_ok=True)

In [3]:
# The business logic is contained within this class

class QualityAnalysis(olieigra.Callbacks):
    def __init__(self, dst_path: str, min_effective_date: datetime):
        super().__init__()
        self.min_effective_date = min_effective_date
        self.dst_path = dst_path
        self.filename = ''
        self.filtered = 0
        self.writer = None
        self.hout = ''

    def start_file(self, filename: str) -> bool:
        """Decide if we want to process the file. If so, reset state and start writing to a
        temporary file."""

        # An IGRA2 file should end with -data.txt
        if not filename.endswith('-data.txt'):
            print(f'Skipping {filename}. Not sure what to do with it.')
            return False

        # Set the desired destination filename
        dst_filename = f'{self.dst_path}/{filename}'
        dst_filename = dst_filename.replace("-data.txt", "-data-qa.csv")

        # Skip this file if it has already been processed
        if os.path.exists(dst_filename):
            print(f'Skipping {filename}. Destination file already exists.')
            return False

        # If we got here, we are going to process the file
        print(f'Processing {filename}.')

        # Write to a temp file
        self.filename = dst_filename.replace('-data-qa.csv', '-data-qa.partial.csv')
        self.writer = open(self.filename, 'w', encoding='UTF-8')

        # Reset the filtered record count
        self.filtered = 0
        self.hout = ''

        # Write the header row
        self.writer.write('id,effective_date,hour,usable_10k,usable_all,max_gph,min_pa\n')

        # Tell olieigra to continue processing
        return True
    
    def finish_file(self, headers: int, rows: int):
        """File processing is complete. Clean up and provide user feedback."""

        # Close the temporary file
        self.writer.close()

        # Rename the temporary file
        dst_renamed = self.filename.replace('.partial.csv', '.csv')
        os.rename(self.filename, dst_renamed)

        # Calculate the number of records written
        loaded = headers - self.filtered

        # Provide feedback to the user
        print(f" Read {headers} headers, {rows} lines. Filtered {self.filtered}. " +
              f"Wrote {loaded} records.")

    def parse_header(self, header: olieigra.HeaderModel) -> bool:
        """Transform the header record and start writing a record"""

        # Combine the separate fields into a date
        effective_date = datetime(header.year, header.month, header.day)

        # Skip the record if it is too old
        if effective_date < self.min_effective_date:
            self.filtered += 1
            return False

        # We may not write the row, so save the header columns in a variable for now
        self.hout = f'{header.id},{effective_date:%Y-%m-%d},{header.hour}'

        # Tell olieigra to process the body associated with this header
        return True
    
    def parse_body(self, body: list[olieigra.BodyModel]):
        """Perform some analytics and finish writing a record"""

        # Initialize
        usable_10k = 0
        usable_all = 0
        surface_nan = 1
        max_gph = -1
        min_pa = 99999999

        # Iterate through each record in the body
        for item in body:
            # We don't care about non-pressure records
            if item.type[0] == '3':
                continue

            # We don't care about records that contain a NaN value
            if math.isnan(item.dpdp) | math.isnan(item.rh) | math.isnan(item.temp) | \
                    math.isnan(item.wdir) | math.isnan(item.wspd) | math.isnan(item.gph):
                continue

            # Find the highest gph
            if item.gph > max_gph:
                max_gph = item.gph

            # Find the lowest pa
            if item.pres < min_pa:
                min_pa = item.pres

            # This is a usable record
            usable_all += 1

            # We don't care about records higher than 10km
            if item.gph > 10000:
                continue

            # Flag that we found a valid surface record
            if item.type == '21':
                surface_nan = 0

            # If we got here, the record is usable and lower then 10k height
            usable_10k += 1

        # Write record if there was a valid surface record
        if surface_nan == 0:
            self.writer.write(f'{self.hout},{usable_10k},{usable_all},{max_gph},{min_pa}\n')


In [4]:
# Set up for processing
callbacks = QualityAnalysis(SILVER_QA_PATH, datetime(2000, 1, 1))
reader = olieigra.Reader(callbacks=callbacks)
crawler = olieigra.Crawler(reader=reader)

# Crawl and process files
crawler.crawl(BRONZE_DATA_POR_PATH)


Processing USM00072251-data.txt.
 Read 34081 headers, 4532303 lines. Filtered 16310. Wrote 17771 records.
Processing USM00072357-data.txt.
 Read 26409 headers, 4084416 lines. Filtered 8706. Wrote 17703 records.
Processing USM00072250-data.txt.
 Read 77417 headers, 5455633 lines. Filtered 59498. Wrote 17919 records.
Processing USM00072456-data.txt.
 Read 54326 headers, 5136424 lines. Filtered 36402. Wrote 17924 records.
Processing USM00072451-data.txt.
 Read 65711 headers, 4849890 lines. Filtered 47852. Wrote 17859 records.
Processing USM00072649-data.txt.
 Read 26612 headers, 3659546 lines. Filtered 9224. Wrote 17388 records.
Processing USM00072261-data.txt.
 Read 53682 headers, 5133602 lines. Filtered 35986. Wrote 17696 records.
Processing USM00074560-data.txt.
 Read 21214 headers, 3761759 lines. Filtered 3539. Wrote 17675 records.
Processing USM00072645-data.txt.
 Read 61708 headers, 4823579 lines. Filtered 44399. Wrote 17309 records.
Processing USM00072249-data.txt.
 Read 49737 head