# Transform Data gph20s10k
Transform the data-por observations into a format we can use for machine learning. The problem is that the samples in one observation do not fall at the same altitudes as the samples in another. The solution is to interpolate the data onto standardized levels. We split the column between the surface and 10km into 20 evenly spaced layers (21 sample points, including both ends). This makes all the observations for a given station consistent. Although the number of levels is the same for every station, the surface elevation differs between stations, so the actual altitude of each level will differ from station to station.

Observations are also filtered on quality. If there are fewer than 20 samples without data problems between the surface and 10km geopotential height, the entire observation is rejected. If the surface sample has any invalid data, the entire observation is also rejected. An observation that never reaches 10km geopotential height is rejected as well.

Update the following parameters in the first cell to accommodate your installation:

- BRONZE_DATA_POR_PATH - The location of the raw data-por zip files
- SILVER_GPH20S10K_PATH - The location to save the transformed CSV files

In [1]:
# If you need to install olieigra, uncomment and execute this line. View the README in the project root for instructions
# on how to build or download this file.
#%pip install /lakehouse/default/Files/libs/olieigra-0.0.1-py3-none-any.whl

In [2]:
import math
import os
from datetime import datetime
import numpy as np
import olieigra

# Input: the location of the raw data-por zip files. Adjust for your installation.
BRONZE_DATA_POR_PATH = '/lakehouse/default/Files/bronze/igra2/data-por'
# Output: the location where the transformed CSV files are saved. Adjust for your installation.
SILVER_GPH20S10K_PATH = '/lakehouse/default/Files/silver/igra2/gph20s10k'

In [3]:
# Make sure the destination path exists. exist_ok=True turns this into a no-op when
# the folder is already there, so the cell can be re-run safely.
os.makedirs(SILVER_GPH20S10K_PATH, exist_ok=True)

In [4]:
# The business logic is contained within this class

class Gph20S10K(olieigra.Callbacks):
    """Callbacks that turn each observation into one fixed-width CSV row.

    Every accepted observation is interpolated onto ``samples`` evenly spaced
    geopotential heights running from the station's first valid sample up to
    ``gph_top``. One CSV file is written per input file; it is written under a
    ``.partial.csv`` name and renamed once the input file has been fully read.
    """

    # Number of interpolation points (21 points bound 20 equal layers)
    samples = 21
    # Geopotential height of the top interpolation level
    gph_top = 10000
    # Minimum number of valid samples an observation needs to be accepted
    min_usable = 20

    def __init__(self, dst_path: str, min_effective_date: datetime):
        """Create the callbacks.

        Args:
            dst_path: Folder the CSV files are written to.
            min_effective_date: Observations dated before this are filtered out.
        """
        super().__init__()
        self.dst_path = dst_path
        self.filename = ''
        self.filtered = 0
        self.rejected = 0
        self.writer = None
        self.min_effective_date = min_effective_date
        self.hout = ''
        self.levels = []

    def start_file(self, filename: str) -> bool:
        """Decide if we want to process the file. If so, reset state and start writing to a
        temporary file.

        Returns:
            True when the file should be processed, False when it should be skipped.
        """

        # An IGRA2 file should end with -data.txt
        if not filename.endswith('-data.txt'):
            print(f'Skipping {filename}. Not sure what to do with it.')
            return False

        # Set the desired destination filename
        dst_filename = f'{self.dst_path}/{filename}'
        dst_filename = dst_filename.replace("-data.txt", "-data-gph20s10k.csv")

        # Skip this file if it has already been processed
        if os.path.exists(dst_filename):
            print(f'Skipping {filename}. Destination file already exists.')
            return False

        # If we got here, we are going to process the file
        print(f'Processing {filename}.')

        # Write to a temp file
        self.filename = dst_filename.replace('.csv', '.partial.csv')
        self.writer = open(self.filename, 'w', encoding='UTF-8')
        self.hout = ''

        # Reset the record counts
        self.filtered = 0
        self.rejected = 0

        # Forget the levels of the previous file. They start at the surface height of
        # the station, so they must be recalculated for every file; otherwise every
        # later station would be interpolated onto the first station's altitudes.
        self.levels = []

        # Write the header row
        attr = ['gph', 'pres', 'temp', 'dp', 'u', 'v']
        dynamic = ','.join([f'{level}_{x}'
                            for level in range(self.samples)
                            for x in attr])
        self.writer.write(f"id,effective_date,hour,day_num,{dynamic}\n")

        # Tell olieigra to continue processing
        return True

    def finish_file(self, headers: int, rows: int):
        """Callback for when processing is complete.

        Args:
            headers: Number of header records read from the file.
            rows: Number of lines read from the file.
        """

        # Flush and close the temp file
        self.writer.close()

        # Rename it to the final filename
        dst_renamed = self.filename.replace('.partial.csv', '.csv')
        os.rename(self.filename, dst_renamed)

        # Calculate the number of records written
        loaded = headers - self.filtered - self.rejected

        print(f" Read {headers} headers, {rows} lines. Filtered {self.filtered}. " +
              f"Rejected {self.rejected}. Wrote {loaded} records.")

    def parse_header(self, header: olieigra.HeaderModel) -> bool:
        """Transform the header record.

        Returns:
            False when the observation is older than ``min_effective_date``,
            True otherwise.
        """

        # Combine separate fields into a datetime
        effective_date = datetime(header.year, header.month, header.day)

        # Filter out the observations that are too old
        if effective_date < self.min_effective_date:
            self.filtered += 1
            return False

        # We need some number that is analogous to the amount of sunlight and the season.
        # NOTE(review): the day of the year (1-366) is used directly as an angle in
        # degrees, so one cycle spans 360 days rather than a full year - confirm this
        # approximation is intended.
        day_num = -math.cos(math.radians(effective_date.timetuple().tm_yday))

        # The observation may be rejected due to body data issues. Save the header values to
        # a variable for now. The parse_body will write it to the file, if appropriate.
        self.hout = f'{header.id},{effective_date:%Y-%m-%d},{header.hour},{day_num:.2f}'

        # Continue the processing
        return True

    def parse_body(self, body: list[olieigra.BodyModel]):
        """Validate, interpolate and write one observation.

        A rejected observation only increments ``self.rejected``; nothing is written.
        """

        # Remove non-pressure records and records with bad data
        filtered = self.filter_body(body)

        # If the obs failed validation checks, skip it
        if not filtered:
            self.rejected += 1
            return

        # Have we calculated the levels for this file yet? (len() is used because
        # levels becomes a numpy array, which has no unambiguous truth value.)
        if len(self.levels) == 0:
            # We can't calculate the levels until we've had a surface sample
            self.levels = np.linspace(filtered[0][0], self.gph_top, self.samples)

        # Convert rows to columns and interpolate to our standard levels
        pivoted = self.body_pivot(filtered)

        # Combine the results to a comma delimited list
        out = ','.join([f"{item:.1f}" for item in pivoted])

        # Write the record
        self.writer.write(f'{self.hout},{out}\n')

    def filter_body(self, body: list[olieigra.BodyModel]) -> list[list[float]]:
        """Filter out bad data.

        Returns:
            Six parallel columns (gph, pres, temp, dp, u, v) holding the valid
            samples, or an empty list when the observation must be rejected.
        """
        result = [[], [], [], [], [], []]
        usable_count = 0
        has_surface = False
        last_gph = -1

        # Iterate over every body record
        for item in body:
            # Once we have one record at or above the top level, we have enough data
            # to interpolate
            if last_gph >= self.gph_top:
                break

            # Skip non-pressure records
            if item.type[0] == '3':
                continue

            # Skip records with bad or missing data
            if any(math.isnan(value) for value in (item.dpdp, item.rh, item.temp,
                                                   item.wdir, item.wspd, item.gph)):
                continue

            # If we got here, the record passed validation. transform_body appends the
            # converted values to the columns in place (it returns nothing).
            self.transform_body(item, result)

            # Remember that we found a valid surface sample
            if item.type == '21':
                has_surface = True

            # Update tracking variables
            last_gph = item.gph
            usable_count += 1

        # Final validation: reject the entire obs if we don't have enough valid samples,
        # there is not a valid surface sample, or the balloon didn't make it to the top
        # level.
        if usable_count < self.min_usable or not has_surface or last_gph < self.gph_top:
            return []

        # Passed validation, return the results
        return result

    def transform_body(self, item: olieigra.BodyModel, agg: list[list[float]]) -> None:
        """Convert one body record and append its values to the columns of ``agg``.

        ``agg`` is modified in place; its columns are gph, pres, temp, dp, u, v.
        """

        # NOTE(review): the divisors presumably undo the scaled units of the raw
        # values - confirm against the olieigra BodyModel.
        agg[0].append(item.gph)
        agg[1].append(item.pres / 100.0)
        agg[2].append(item.temp / 10.0)
        # Dewpoint = temperature minus dewpoint depression
        agg[3].append((item.temp - item.dpdp) / 10.0)

        # Convert wind from direction (degrees) and speed to u,v components
        wrad = math.radians(item.wdir)
        agg[4].append(-item.wspd * math.sin(wrad) / 10.0)
        agg[5].append(-item.wspd * math.cos(wrad) / 10.0)

    def body_pivot(self, body: list[list[float]]) -> list[float]:
        """Pivot and interpolate the levels.

        Returns:
            A flat list ordered level by level, each level contributing its
            interpolated gph, pres, temp, dp, u and v values.
        """
        return [np.interp(level, body[0], column)
                for level in self.levels
                for column in body[:6]]

In [5]:
# Wire up the pipeline: our callbacks go into a reader, and the reader into a crawler.
# Observations dated before 2000-01-01 are filtered out by the callbacks.
callbacks = Gph20S10K(
    dst_path=SILVER_GPH20S10K_PATH,
    min_effective_date=datetime(year=2000, month=1, day=1),
)
reader = olieigra.Reader(callbacks=callbacks)
crawler = olieigra.Crawler(reader=reader)

# Start the crawl over the raw data-por location
crawler.crawl(BRONZE_DATA_POR_PATH)

Skipping USM00072249-data.txt. Destination file already exists.
Skipping USM00072250-data.txt. Destination file already exists.
Skipping USM00072251-data.txt. Destination file already exists.
Skipping USM00072261-data.txt. Destination file already exists.
Skipping USM00072265-data.txt. Destination file already exists.
Skipping USM00072357-data.txt. Destination file already exists.
Skipping USM00072363-data.txt. Destination file already exists.
Skipping USM00072364-data.txt. Destination file already exists.
Skipping USM00072440-data.txt. Destination file already exists.
Skipping USM00072451-data.txt. Destination file already exists.
Skipping USM00072456-data.txt. Destination file already exists.
Skipping USM00072476-data.txt. Destination file already exists.
Skipping USM00072558-data.txt. Destination file already exists.
Skipping USM00072562-data.txt. Destination file already exists.
Skipping USM00072645-data.txt. Destination file already exists.
Skipping USM00072649-data.txt. Destinati