In [1]:
import pandas as pd
import xarray as xr
import numpy as np
import matplotlib.pyplot as plt
import cfgrib
import os

In [2]:
# Root folder of the GRIB inputs; the variable-discovery cell reads "<input_path>/zone_0/..." from it
input_path = "4_all_data_grib"

In [None]:
# Reference GRIB file used to discover which variables are available
var_file = input_path + "/zone_0/c3_ABLE_1950_08.grib"

# cfgrib.open_datasets returns a list of datasets for the file
ds = cfgrib.open_datasets(var_file, cache=False)
paramID_list = []
shortName_list = []
Name_list = []

# Record (and print) the name and parameter ID of every variable that
# carries GRIB metadata; variables without a GRIB_paramId are ignored
for dataset in ds:
    for var_name in dataset.variables:
        attrs = dataset[var_name].attrs
        if 'GRIB_paramId' not in attrs:
            continue
        shortName_list.append(var_name)
        paramID_list.append(attrs['GRIB_paramId'])
        Name_list.append(attrs['GRIB_name'])
        print(f"{var_name}: {attrs['GRIB_name']}")
        print(f"{var_name}: {attrs['GRIB_paramId']}")

In [None]:
# GRIB paramId -> variable short name (the column name process_grib selects)
ID_to_name = {
    param_id: short_name
    for param_id, short_name in zip(paramID_list, shortName_list)
}
# NOTE: despite its name this lookup is keyed by short name, not by paramId
ID_to_long_name = {
    short_name: long_name
    for short_name, long_name in zip(shortName_list, Name_list)
}
for key in ID_to_long_name:
    value = ID_to_long_name[key]
    print(f"{key} : {value}")

In [5]:
def process_grib(filepath, param_ids=None, id_to_name=None):
    """Convert one GRIB file into a single table with one column per variable.

    Every requested parameter is read separately with cfgrib, restricted to
    the ``valid_time`` window shared by all parameters, and joined on
    ``(valid_time, latitude, longitude)``.

    Parameters
    ----------
    filepath : str
        Path of the GRIB file to read.
    param_ids : iterable, optional
        GRIB ``paramId`` values to extract. Defaults to the notebook-level
        ``paramID_list``.
    id_to_name : mapping, optional
        Maps each ``paramId`` to the column name of its variable. Defaults to
        the notebook-level ``ID_to_name``.

    Returns
    -------
    pandas.DataFrame
        Columns ``valid_time``, ``latitude``, ``longitude`` followed by one
        column per parameter, sorted by the three key columns.

    Raises
    ------
    ValueError
        If there are no parameter IDs to extract.
    pandas.errors.MergeError
        If a parameter holds more than one row for the same key, which would
        make the join ambiguous.
    """
    if param_ids is None:
        param_ids = paramID_list
    if id_to_name is None:
        id_to_name = ID_to_name

    key_cols = ['valid_time', 'latitude', 'longitude']

    # A paramId listed twice would only yield a duplicate column; keep the
    # first occurrence and preserve the original order.
    unique_ids = list(dict.fromkeys(param_ids))
    if not unique_ids:
        raise ValueError("No parameter IDs to extract")

    # Step 1: read each parameter once; the context manager closes the
    # dataset as soon as its values are in a DataFrame.
    frames = []
    for p in unique_ids:
        with cfgrib.open_dataset(filepath, filter_by_keys={'paramId': p}) as ds:
            frame = ds.to_dataframe().reset_index()[key_cols + [id_to_name[p]]]
        frames.append(frame)

    # Step 2: common valid_time window = latest start and earliest end.
    # Series.min()/max() skip NaT, unlike the built-in min()/max().
    valid_time_min = max(frame['valid_time'].min() for frame in frames)
    valid_time_max = min(frame['valid_time'].max() for frame in frames)

    # Step 3: join on the coordinate keys rather than on row position, so a
    # parameter with missing or differently ordered rows cannot be paired
    # with the wrong time/grid point.
    final_df = None
    for frame in frames:
        in_window = frame['valid_time'].between(valid_time_min, valid_time_max)
        frame = frame.loc[in_window]
        if final_df is None:
            final_df = frame
        else:
            final_df = final_df.merge(
                frame, on=key_cols, how='inner', validate='one_to_one'
            )

    return final_df.sort_values(key_cols).reset_index(drop=True)

In [None]:
# Convert every GRIB file of every zone into a CSV table.
replace = False  # True regenerates CSVs that already exist
# CSV
# Upper bound on the number of GRIB files handled in one run
file_count = 1e5
file_count_i = 0

# Input and output directories
outer_input_folder = "4_all_data_grib"
outer_output_folder = "6_all_data_table"

for zone in [0, 1, 2]:
    # Ensure output directory exists
    inner_input_folder = outer_input_folder + "/zone_" + f"{zone}"
    inner_output_folder = outer_output_folder + "/zone_" + f"{zone}"
    os.makedirs(inner_output_folder, exist_ok=True)

    # Sorted so that a capped run always handles the same files
    for filename in sorted(os.listdir(inner_input_folder)):
        # Ignore non-GRIB entries *before* counting, so that side files
        # (e.g. index files written next to the GRIB data) do not use up
        # the file_count budget.
        if not filename.endswith(".grib"):
            continue

        if file_count_i >= file_count:
            break
        file_count_i += 1

        grib_file_path = os.path.join(inner_input_folder, filename)
        # Define the output CSV file path
        csv_filename = os.path.splitext(filename)[0] + ".csv"
        csv_file_path = os.path.join(inner_output_folder, csv_filename)

        if (not replace) and os.path.exists(csv_file_path):
            print(f"Already exists: {csv_file_path}")
            continue

        # Write to a temporary file and move it into place only on success:
        # a failed or interrupted export must not leave a partial CSV that
        # the next run (replace=False) would mistake for a finished one.
        tmp_file_path = csv_file_path + ".tmp"
        try:
            print(f"Processing: {csv_file_path}")
            df = process_grib(grib_file_path)
            df.to_csv(tmp_file_path, index=False)
            os.replace(tmp_file_path, csv_file_path)
            print(f"Processed and saved: {csv_file_path}")
        except Exception as e:
            # Best effort: report the failure and carry on with the next file
            print(f"Error processing: {filename} {e}")
        finally:
            if os.path.exists(tmp_file_path):
                os.remove(tmp_file_path)

In [None]:
zone = 0
input_folder = f"6_all_data_table/zone_{zone}"

###
# Build the coordinate lookup tables from one reference CSV of the zone.
# The first CSV in sorted order is used so the choice is reproducible and
# never lands on a non-CSV entry of the folder.
csv_names = sorted(name for name in os.listdir(input_folder) if name.endswith(".csv"))
if not csv_names:
    raise FileNotFoundError(f"No CSV files found in {input_folder}")
filename = csv_names[0]
# Only the two coordinate columns are needed, so only those are parsed
data = pd.read_csv(input_folder+"/"+filename, usecols=['latitude', 'longitude'])[['latitude', 'longitude']]
data = data.drop_duplicates()

# Sorted so that rank 1 is the smallest coordinate value, independent of the
# row order of the reference file
lat = np.sort(data['latitude'].unique())
lon = np.sort(data['longitude'].unique())
lat_ind = np.arange(1, len(lat)+1)
lon_ind = np.arange(1, len(lon)+1)
lat_dict = {l:i for l,i in zip(lat,lat_ind)}
lon_dict = {l:i for l,i in zip(lon,lon_ind)}
data['latitude_rank'] = data['latitude'].map(lat_dict)
data['longitude_rank'] = data['longitude'].map(lon_dict)

# One row per grid point, ordered by longitude then latitude
data = data.sort_values(by=['longitude', 'latitude'], ascending=[True, True]).reset_index(drop=True)
data['index'] = data.index+1  # 1-based grid-point number shown in the table
latitude_rank = np.array(data['latitude_rank'])
longitude_rank = np.array(data['longitude_rank'])

lat_to_rank = pd.Series(data.latitude_rank.values, index = data.latitude).to_dict()
long_to_rank = pd.Series(data.longitude_rank.values, index = data.longitude).to_dict()
# NOTE(review): `data.index` is the 0-based row index of the frame, NOT the
# 1-based 'index' column created above, so coord_index values start at 0.
# Left unchanged because CSVs already written may rely on it - confirm intent.
coord_to_index = pd.Series(data.index, index = zip(data.latitude, data.longitude)).to_dict()

data

In [None]:
# Enrich every CSV table of the zone in place with grid ranks, a grid-point
# index, a time step and wind speeds, and fix the column order.
for filename in sorted(os.listdir(input_folder)):
    # Only the CSV tables are data files; skip sub-folders, temp files, etc.
    if not filename.endswith(".csv"):
        continue
    filename = input_folder + "/" + filename
    event_data = pd.read_csv(filename)

    # Derived columns are only added when absent, so files that were already
    # enriched keep their existing values
    current_columns = event_data.columns
    if 'latitude_rank' not in current_columns:
        event_data['latitude_rank'] = [lat_to_rank[lat] for lat in event_data['latitude']]

    if 'longitude_rank' not in current_columns:
        event_data['longitude_rank'] = [long_to_rank[long] for long in event_data['longitude']]

    if 'coord_index' not in current_columns:
        event_data['coord_index'] = [coord_to_index[c] for c in zip(event_data['latitude'], event_data['longitude'])]

    if 'time_step' not in current_columns:
        valid_time_tp = pd.to_datetime(event_data['valid_time'])
        # Find the earliest date (of this file; time_step is per-file)
        earliest_date = valid_time_tp.min()
        # Compute the number of timesteps from the earliest entry (hourly data)
        event_data['time_step'] = ((valid_time_tp - earliest_date).dt.total_seconds() / 3600).astype(int) + 1

    # Wind speed magnitude from the u/v components
    event_data['w10'] = np.sqrt(event_data['u10']**2 + event_data['v10']**2)
    event_data['w100'] = np.sqrt(event_data['u100']**2 + event_data['v100']**2)

    event_data = event_data[['valid_time', 'time_step', 'latitude', 'latitude_rank',
                                 'longitude', 'longitude_rank', 'coord_index', 'u10', 'u100', 'v10', 'v100',
                                 'w10', 'w100', 'tp', 't2m', 'lsm', 'sp', 'tcw', 'e']]

    # The file is overwritten in place: write to a temporary file and swap it
    # in only once complete, so a failed write cannot truncate the source CSV.
    tmp_filename = filename + ".tmp"
    try:
        event_data.to_csv(tmp_filename, index=False)
        os.replace(tmp_filename, filename)
    finally:
        if os.path.exists(tmp_filename):
            os.remove(tmp_filename)

print("Finished adding time_step, coordinate rank and index to data")

In [10]:
# Verify that every CSV table of the zone has exactly the expected columns,
# in the expected order. Only mismatching files are reported.
wanted = ['valid_time', 'time_step', 'latitude', 'latitude_rank',
          'longitude', 'longitude_rank', 'coord_index', 'u10', 'u100', 'v10', 'v100',
          'w10', 'w100', 'tp', 't2m', 'lsm', 'sp', 'tcw', 'e']

for filename in sorted(os.listdir(input_folder)):
    if not filename.endswith(".csv"):
        continue
    filename = input_folder + "/" + filename
    # nrows=0 reads the header only, so checking every file stays cheap
    columns = pd.read_csv(filename, index_col=None, nrows=0).columns.tolist()
    # missing = expected but absent; unexpected = present but not expected
    missing = [c for c in wanted if c not in columns]
    unexpected = [c for c in columns if c not in wanted]
    if columns != wanted:
        # Both lists empty here means the columns are merely out of order
        print(filename, f"missing={missing}", f"unexpected={unexpected}")