# Calculate LI
This notebook calculates the Lifted Index values that we use to train models. See the README to learn what the Lifted Index is. It reads in the data we generated from the transform_data_gph20s10k notebook. It then unpivots the data, converting columns back into pressure levels. This data can then be used by MetPy to calculate the Lifted Index using the actual physics algorithms. A CSV file is generated for each input file.

Update the following parameters in the first cell to accommodate your installation:

- SILVER_GPH20S10K_PATH - The location to read the transformed CSV files
- SILVER_LI_PATH - Location to save the calculated Lifted Index values

In [93]:
import io
import os
import pandas as pd
import numpy as np

# Folder to read the transformed gph20s10k CSV files from
SILVER_GPH20S10K_PATH = '/lakehouse/default/Files/silver/igra2/gph20s10k'
# Folder to save the calculated Lifted Index CSV files to
SILVER_LI_PATH = '/lakehouse/default/Files/silver/igra2/li'

In [94]:
# Make sure the destination folder exists. Missing parent folders are created
# as well, and an already-existing folder is not treated as an error.
os.makedirs(SILVER_LI_PATH, exist_ok=True)

In [95]:
# from datetime import datetime
class LiftedIndex:
    """Crawl a folder of pivoted sounding CSV files and unpivot each row.

    Every data row holds ``feature_cols`` leading columns followed by one
    group of ``len(pivot_columns)`` values per level. ``read_row`` turns such
    a row into a DataFrame with one row per level.

    NOTE(review): the Lifted Index calculation itself is not implemented in
    this class yet. ``process`` creates the ``.partial.csv`` output file but
    nothing is written to it and it is never renamed to the final ``li.csv``
    name, so ``crawl`` will reprocess every source file on every run.
    """

    # Number of leading columns that are skipped before the per-level values
    # start (used by both read_header and read_row).
    feature_cols = 4

    # Column names of the DataFrame returned by read_row, in the order the
    # values appear within one level's group.
    row_columns = ('gph', 'p', 't', 'dp', 'u', 'v')

    # Filename suffixes used to recognise sources and derive output names.
    src_suffix = '-data-gph20s10k.csv'
    dst_suffix = '-data-li.csv'
    tmp_suffix = '-data-li.partial.csv'

    def __init__(self, dst_path: str):
        """Remember the output folder and the per-level column layout.

        Args:
            dst_path: Folder in which output files are created.
        """
        self.dst_path = dst_path
        self.pivot_columns = ['gph','pres','temp','dp','u','v']

    def crawl(self, path: str):
        """Scan a folder for data files and process them.

        Files whose name does not end with ``-data-gph20s10k.csv`` are
        skipped, as are files whose destination file already exists.

        Args:
            path: Folder containing the source CSV files.
        """
        # Sorted so the processing order (and the log) is deterministic
        for file in sorted(os.listdir(path)):
            # Skip if the filename doesn't match the pattern we expect
            if not file.endswith(self.src_suffix):
                print(f'Skipping {file}. Not sure what to do with it.')
                continue

            # Figure out the filenames. Only the trailing suffix is swapped,
            # so a '.csv' elsewhere in the folder or file name is left alone.
            stem = file[:-len(self.src_suffix)]
            src_filename = os.path.join(path, file)
            dst_filename = os.path.join(self.dst_path, stem + self.dst_suffix)
            tmp_filename = os.path.join(self.dst_path, stem + self.tmp_suffix)

            # Skip the file if it has already been processed
            if os.path.exists(dst_filename):
                print(f'Skipping {file}. Destination file already exists.')
                continue

            # Process the file
            print(f'Processing {file}.')
            self.process(src_filename, tmp_filename)

    def process(self, src_filename: str, tmp_filename: str):
        """Process a data file.

        Args:
            src_filename: Path of the pivoted source CSV file.
            tmp_filename: Path of the temporary output file. It is created
                (truncated if present) but currently left empty.
        """
        # Open the reader and writer streams
        with open(src_filename, 'r', encoding='utf-8') as reader, \
                open(tmp_filename, 'w', encoding='utf-8') as writer:
            # Figure out how many levels we are dealing with
            num_levels = self.read_header(reader)

            # Loop through the data rows
            self.process_loop(reader, num_levels)

    def process_loop(self, reader: io.TextIOWrapper, num_levels: int):
        """Process all the remaining data rows in a file.

        Args:
            reader: Text stream positioned just after the header row.
            num_levels: Number of levels per row, as returned by read_header.
        """
        for line in reader:
            # A blank line (e.g. a trailing newline at the end of the file)
            # carries no data and would otherwise fail in read_row.
            if not line.strip():
                continue

            # Unpivot the row.
            # NOTE(review): the resulting frame is currently discarded.
            self.read_row(line, num_levels)

    def read_header(self, reader: io.TextIOWrapper) -> int:
        """Read the file's header row and work out the number of levels.

        Args:
            reader: Text stream positioned at the start of the file.

        Returns:
            The number of complete per-level column groups in the header
            (0 for an empty header).
        """
        # Read the first line
        line = reader.readline()

        # Skip over the dimension columns and count the remaining columns
        value_columns = line.split(',')[self.feature_cols:]

        return len(value_columns) // len(self.pivot_columns)

    @staticmethod
    def _to_float(text: str) -> float:
        """Convert one CSV field to float, treating an empty field as NaN."""
        text = text.strip()
        return float(text) if text else float('nan')

    def read_row(self, line: str, num_levels: int) -> pd.DataFrame:
        """Read a line and unpivot the data.

        Args:
            line: One comma-separated data row.
            num_levels: Number of per-level groups to read from the row.

        Returns:
            A DataFrame with columns gph, p, t, dp, u, v and one row per
            level. Empty fields become NaN.

        Raises:
            IndexError: If the row has fewer fields than num_levels requires.
            ValueError: If a non-empty field is not a valid number.
        """
        parts = line.split(',')

        # Width of one level's group; matches what read_header divides by
        stride = len(self.pivot_columns)
        columns = {name: [] for name in self.row_columns}

        for level in range(num_levels):
            ptr = level * stride + self.feature_cols
            for offset, name in enumerate(self.row_columns):
                columns[name].append(self._to_float(parts[ptr + offset]))

        return pd.DataFrame(columns)

In [96]:
    # def calc_li_from_csv(self, in_filename: str, out_filename: str):
    #     """Make the LI prediction dataset"""
    #     ds_csv = DatasourceIgraCsv()
    #     start_time = datetime.now()
    #     iter_num = 0
    #     error_nan = 0
    #     good = 0

    #     with open(in_filename, 'r', encoding='UTF-8') as sr, open(out_filename, 'w', encoding='UTF-8') as sw:
    #         header = ds_csv.read_header(sr.readline())

    #         sw.write('lifted_index\n')

    #         while True:
    #             # region Check for EOF

    #             line = sr.readline()
    #             if line == "":
    #                 break

    #             # endregion

    #             # region Progress Indication

    #             if iter_num % 25 == 0:
    #                 print(
    #                     f"\rRow: {iter_num} {(datetime.now()-start_time).seconds}s", end="")
    #             iter_num += 1

    #             # endregion

    #             row = ds_csv.read_row(line, header['num_levels'])

    #             pres = np.array(row['p']) * units.Pa
    #             temp = np.array(row['t']) / 10.0 * units.degC
    #             dp = np.array(row['dp']) / 10.0 * units.degC

    #             if np.isnan(temp).any() or np.isnan(dp).any():
    #                 error_nan += 1
    #                 sw.write("nan\n")
    #                 continue

    #             # compute the parcel temperatures from surface parcel
    #             prof = parcel_profile(pres, temp[0], dp[0])

    #             # calculate the LI
    #             li = float(lifted_index(pres, temp, prof).magnitude[0])
    #             good += 1

    #             sw.write(f"{li}\n")

    #     print(
    #         f"\rTotal: {iter_num}, Good: {good}, NaNs: {error_nan}, Time taken: {(datetime.now()-start_time).seconds}s")


In [97]:
# Scan the transformed-data folder and process every matching CSV file,
# using SILVER_LI_PATH as the output folder.
li = LiftedIndex(SILVER_LI_PATH)
li.crawl(SILVER_GPH20S10K_PATH)

Processing USM00072249-data-gph20s10k.csv.
Processing USM00072250-data-gph20s10k.csv.
Processing USM00072251-data-gph20s10k.csv.
Processing USM00072261-data-gph20s10k.csv.
Processing USM00072265-data-gph20s10k.csv.
Processing USM00072357-data-gph20s10k.csv.
Processing USM00072363-data-gph20s10k.csv.
Processing USM00072364-data-gph20s10k.csv.
Processing USM00072440-data-gph20s10k.csv.
Processing USM00072451-data-gph20s10k.csv.
Processing USM00072456-data-gph20s10k.csv.
Processing USM00072476-data-gph20s10k.csv.
Processing USM00072558-data-gph20s10k.csv.
Processing USM00072562-data-gph20s10k.csv.
Processing USM00072645-data-gph20s10k.csv.
Processing USM00072649-data-gph20s10k.csv.
Processing USM00072659-data-gph20s10k.csv.
Processing USM00072662-data-gph20s10k.csv.
Processing USM00072747-data-gph20s10k.csv.
Processing USM00072764-data-gph20s10k.csv.
Processing USM00074455-data-gph20s10k.csv.
Processing USM00074560-data-gph20s10k.csv.
Processing USM00074646-data-gph20s10k.csv.
