In [1]:
import pandas as pd
import os
import json
import zipfile
from io import StringIO, BytesIO
import numpy as np
from scipy.interpolate import griddata, NearestNDInterpolator

In [2]:
# Load one receiver time series. Lines starting with '#' are skipped and the
# columns are separated by arbitrary runs of whitespace.
df = pd.read_csv(
    "../../Downloads/loiccopykutschera_v2.0/kutschera_ttpv1_v2.0/el_sf_body-090st100.csv",
    comment='#',
    sep=r'\s+',  # raw string: '\s' in a plain literal is an invalid escape sequence
)
df

Unnamed: 0,t,x-disp,x-vel,y-disp,y-vel,z-disp,z-vel
0,0.000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.005,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.010,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.015,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.020,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...
47996,239.980,0.181724,-0.000013,-0.014787,0.000167,-0.040545,-0.000054
47997,239.985,0.181724,-0.000015,-0.014787,0.000168,-0.040546,-0.000054
47998,239.990,0.181724,-0.000016,-0.014786,0.000169,-0.040546,-0.000054
47999,239.995,0.181724,-0.000018,-0.014785,0.000169,-0.040546,-0.000055


In [4]:
template_path = "./resources/benchmark_templates/ttpv1.json"
with open(template_path, 'r') as f:
    template = json.load(f)

# Select the template entry whose prefix is "el_sf_body" (the last one wins if
# several entries share the prefix).
expected_structure = None
for file_info in template['files']:
    if file_info['prefix'] == "el_sf_body":
        expected_structure = file_info

# Fail loudly instead of leaving `expected_structure` undefined, or silently
# reusing a value left over from an earlier run of the kernel.
if expected_structure is None:
    raise KeyError(f"no 'el_sf_body' entry found in {template_path}")
expected_structure

{'name': 'body elastic seafloor',
 'content': 'receivers in the body elastic medium, seafloor, off-fault',
 'graph_type': 'timeseries',
 'list_of_receivers': ['el_sf_body-010st000',
  'el_sf_body-010st100',
  'el_sf_body-030st000',
  'el_sf_body-030st100',
  'el_sf_body-090st000',
  'el_sf_body-090st100',
  'el_sf_body-090st200',
  'el_sf_body-150st000',
  'el_sf_body-150st100',
  'el_sf_body-150st200',
  'el_sf_body-210st000',
  'el_sf_body-210st100',
  'el_sf_body-210st200',
  'el_sf_body010st000',
  'el_sf_body010st100',
  'el_sf_body030st000',
  'el_sf_body030st100',
  'el_sf_body090st000',
  'el_sf_body090st100',
  'el_sf_body090st200',
  'el_sf_body150st000',
  'el_sf_body150st100',
  'el_sf_body210st000',
  'el_sf_body210st100',
  'el_sf_body270st000',
  'el_sf_body270st100',
  'el_sf_body270st200',
  'el_sf_body330st000',
  'el_sf_body390st000',
  'el_sf_body450st000',
  'el_sf_body450st100',
  'el_sf_body450st200',
  'el_sf_body510st000',
  'el_sf_body570st000'],
 'var_list': 

In [5]:
# Column names the template entry expects, in the order listed in its 'var_list'.
var_list = expected_structure['var_list']
expected_columns = [var['name'] for var in var_list]
expected_columns

['t', 'x-disp', 'x-vel', 'y-disp', 'y-vel', 'z-disp', 'z-vel']

In [7]:
# Sanity check: print a marker when the loaded frame's columns differ (by name
# or by order) from the template's; nothing is printed when they match.
if list(df.columns) != expected_columns:
    print("WHYYYY")

In [16]:
def extract_header(content):
    """Extract and parse the common header from a .dat file.

    Only lines starting with '#' are considered; a line starting with
    "# File:" is ignored.

    Args:
        content (str): Full text of the file.

    Returns:
        dict: One ``{key: value}`` item per ``key = value`` header line (both
        sides stripped). Header lines without '=' are collected, in order, in a
        list stored under ``'comments'`` (the key is absent when there are none).
    """
    header_data = {}
    for line in content.splitlines():
        if not line.startswith('#') or line.startswith("# File:"):
            continue
        # Remove the comment marker itself rather than a fixed two characters,
        # so "#key=value" (no space after '#') keeps the first letter of the key.
        text = line.lstrip('#').strip()
        key, separator, value = text.partition('=')
        if separator:
            header_data[key.strip()] = value.strip()
        else:
            header_data.setdefault('comments', []).append(text)
    return header_data


def _fill_nan_nearest(values_grid, x_mesh, y_mesh):
    """Replace NaN cells of ``values_grid`` with the value of the nearest non-NaN cell."""
    nan_mask = np.isnan(values_grid)
    # Nothing to fill, or nothing to fill from: return the grid untouched.
    if not nan_mask.any() or nan_mask.all():
        return values_grid

    valid_mask = ~nan_mask
    nearest_interp = NearestNDInterpolator(
        np.column_stack((x_mesh[valid_mask], y_mesh[valid_mask])),
        values_grid[valid_mask],
    )
    filled = values_grid.copy()
    # Only the NaN cells are queried; valid cells keep their original value.
    filled[nan_mask] = nearest_interp(x_mesh[nan_mask], y_mesh[nan_mask])
    return filled


def interpolate_data(df, grid_params, value_columns=None, method='nearest'):
    """Apply interpolation for all variables in the dataframe.

    Scattered samples located by the ``'x'`` and ``'y'`` columns are resampled
    onto a regular grid; grid cells that come out as NaN are then filled from
    the nearest non-NaN grid cell.

    Args:
        df (pandas.DataFrame): Scattered data with ``'x'`` and ``'y'`` columns
            plus one or more value columns.
        grid_params (dict): ``{"x": {"min", "max", "n"}, "y": {"min", "max", "n"}}``
            describing the target grid (bounds inclusive, ``n`` points per axis).
        value_columns (iterable of str, optional): Columns to interpolate.
            Defaults to every numeric column other than ``'x'`` and ``'y'``.
        method (str): ``scipy.interpolate.griddata`` method
            (``'nearest'``, ``'linear'`` or ``'cubic'``).

    Returns:
        pandas.DataFrame: One row per grid point with columns ``'x'``, ``'y'``
        and one column per interpolated variable; rows run along x first, then y.

    Raises:
        KeyError: If ``'x'``/``'y'`` or a requested column is missing, or if
            there is no value column to interpolate.
    """
    print("Applying interpolation")

    x_spec, y_spec = grid_params["x"], grid_params["y"]
    x_min, x_max, x_n = x_spec["min"], x_spec["max"], x_spec["n"]
    y_min, y_max, y_n = y_spec["min"], y_spec["max"], y_spec["n"]

    print(grid_params)

    # Create a regular grid
    x_grid = np.linspace(x_min, x_max, x_n)
    y_grid = np.linspace(y_min, y_max, y_n)
    x_mesh, y_mesh = np.meshgrid(x_grid, y_grid)

    if value_columns is None:
        value_columns = [
            column for column in df.columns
            if column not in ('x', 'y') and pd.api.types.is_numeric_dtype(df[column])
        ]
    else:
        value_columns = list(value_columns)
    if not value_columns:
        raise KeyError("no value column to interpolate (expected e.g. 'ssha')")

    # The sample locations are shared by every variable, so build them once.
    points = np.column_stack((df['x'].to_numpy(), df['y'].to_numpy()))

    gridded = {'x': x_mesh.ravel(), 'y': y_mesh.ravel()}
    for column in value_columns:
        values_grid = griddata(points, df[column].to_numpy(), (x_mesh, y_mesh), method=method)
        # 'linear'/'cubic' leave NaN outside the convex hull of the samples, and
        # NaN input values propagate with any method: fill those cells.
        gridded[column] = _fill_nan_nearest(values_grid, x_mesh, y_mesh).ravel()

    return pd.DataFrame(gridded)
    

def process_zip(bucket_name, zip_key, benchmark_pb, code_name, version, user_metadata=None, **kwargs):
    """Validate the files of a benchmark submission archive against its template.

    The archive is read from ``./resources/<benchmark_pb>/<code_name>_<version>.zip``
    and the template from
    ``./resources/benchmark_templates/<benchmark_pb>_template.json``. For each
    template entry, archive members whose file name starts with the entry's
    ``prefix`` and ends with ``.<file_type>`` are parsed, their columns are
    compared with the entry's ``var_list``, and entries that declare a ``grid``
    are resampled with ``interpolate_data``. The key each file would be stored
    under is printed.

    Args:
        bucket_name, zip_key: Not used by this local version (the S3 download
            is still disabled).
        benchmark_pb (str): Benchmark name; selects the template and the
            resources sub-folder.
        code_name (str): Name of the submitting code.
        version (str): Version of the submitting code.
        user_metadata, **kwargs: Accepted but not used.

    Returns:
        None

    Raises:
        ValueError: If a matched file's columns differ from the template's.
    """
    # TODO: switch back to "/tmp/..." and to the S3 download
    # (s3.get_object(Bucket=bucket_name, Key=zip_key)) when run outside the notebook.
    output_folder = f"./tmp/{code_name}_{version}/"
    os.makedirs(output_folder, exist_ok=True)

    local_zip_path = f'./resources/{benchmark_pb}/{code_name}_{version}.zip'

    # Load the JSON template
    template_path = os.path.join("./resources/benchmark_templates/", f"{benchmark_pb}_template.json")
    with open(template_path, 'r') as f:
        template = json.load(f)

    with zipfile.ZipFile(local_zip_path, 'r') as zip_obj:
        # Directory entries end with '/'; only real files can match.
        member_names = [member for member in zip_obj.namelist() if not member.endswith('/')]

        for file_info in template['files']:
            prefix = file_info['prefix']
            suffix = f".{file_info['file_type']}"

            # Match on the file name itself so that archives zipped with an
            # enclosing folder ("run/body010.csv") are found as well.
            matching_files = [
                member for member in member_names
                if os.path.basename(member).startswith(prefix) and member.endswith(suffix)
            ]
            for file_name in matching_files:
                # Read and validate file
                with zip_obj.open(file_name) as file:
                    file_content = file.read().decode('utf-8')
                df = pd.read_csv(StringIO(file_content), comment='#', sep=r'\s+')

                # Validate columns (names and order)
                expected_columns = [var['name'] for var in file_info['var_list']]
                if list(df.columns) != expected_columns:
                    raise ValueError(f"File {file_name} does not match the expected structure.")

                if "grid" in file_info:
                    df = interpolate_data(df, file_info['grid'])

                # Save as Parquet (the write itself is still disabled)
                stem = os.path.splitext(os.path.basename(file_name))[0]
                output_path = os.path.join(output_folder, f"{stem}.parquet")
                # df.to_parquet(output_path, index=False)
                target_key = f"public_ds/{benchmark_pb}/{code_name}_{version}/{os.path.basename(output_path)}"
                print(target_key)

# test interpolation

In [17]:
# Load the scattered sea-surface snapshot (columns x, y, ssha).
# raw string for the separator: '\s' in a plain literal is an invalid escape sequence
df = pd.read_csv("./resources/ttpv1/tsunami060s.csv", comment='#', sep=r'\s+')
df

Unnamed: 0,x,y,ssha
0,29369.0,86554.0,-0.006155
1,27203.0,87808.0,-0.007250
2,30177.0,85466.0,-0.006661
3,30181.0,85662.0,-0.006407
4,34638.0,94046.0,0.000500
...,...,...,...
973000,5005.6,2187.5,0.555210
973001,4734.9,2343.7,0.536600
973002,2569.9,-3906.3,0.402420
973003,2840.5,-3750.0,0.408130


In [18]:
# Resample `df` onto the regular grid of every template entry that declares a
# "grid" section.
# NOTE(review): `df` is overwritten with the gridded result, so running this
# cell again (or having several "grid" entries) interpolates the already
# gridded frame rather than the raw data.
template_path = os.path.join("./resources/benchmark_templates/ttpv1.json")
with open(template_path, 'r') as f:
    template = json.load(f)
    for file_info in template['files']:
        expected_structure = file_info
        if "grid" in expected_structure:
            df = interpolate_data(df, expected_structure['grid'])
df

Applying interpolation
{'x': {'min': -100000.0, 'max': 100000.0, 'n': 1000}, 'y': {'min': -100000.0, 'max': 100000.0, 'n': 1000}}


Unnamed: 0,x,y,ssha
0,-100000.000000,-100000.0,0.002420
1,-99799.799800,-100000.0,0.000334
2,-99599.599600,-100000.0,0.000334
3,-99399.399399,-100000.0,0.000056
4,-99199.199199,-100000.0,-0.000949
...,...,...,...
999995,99199.199199,100000.0,0.017586
999996,99399.399399,100000.0,0.016860
999997,99599.599600,100000.0,0.016122
999998,99799.799800,100000.0,0.016122


In [19]:
# Persist the gridded frame without its index.
# NOTE(review): "rearest" in the file name looks like a typo for "nearest" —
# confirm nothing reads this path before renaming it.
df.to_parquet("./resources/ttpv1/test_surface_rearest.parquet", index=False)

In [50]:
# bucket_name and zip_key are empty placeholders: process_zip reads the archive
# from ./resources/<benchmark_pb>/<code_name>_<version>.zip and does not use them.
bucket_name = ''
zip_key = ''
benchmark_pb = 'ttpv1'
code_name = 'SeisSol' 
version = 'v1.2.0'
process_zip(bucket_name, zip_key, benchmark_pb, code_name, version)

public_ds/ttpv1/SeisSol_v1.2.0/body010st000dp000.parquet
public_ds/ttpv1/SeisSol_v1.2.0/body010st100dp000.parquet
public_ds/ttpv1/SeisSol_v1.2.0/body030st000dp000.parquet
public_ds/ttpv1/SeisSol_v1.2.0/body090st000dp000.parquet
public_ds/ttpv1/SeisSol_v1.2.0/body090st100dp000.parquet
public_ds/ttpv1/SeisSol_v1.2.0/body090st200dp000.parquet
public_ds/ttpv1/SeisSol_v1.2.0/body150st000dp000.parquet
public_ds/ttpv1/SeisSol_v1.2.0/body210st000dp000.parquet
public_ds/ttpv1/SeisSol_v1.2.0/body270st000dp000.parquet
public_ds/ttpv1/SeisSol_v1.2.0/body270st100dp000.parquet
public_ds/ttpv1/SeisSol_v1.2.0/body270st200dp000.parquet
public_ds/ttpv1/SeisSol_v1.2.0/body330st000dp000.parquet
public_ds/ttpv1/SeisSol_v1.2.0/body390st000dp000.parquet
public_ds/ttpv1/SeisSol_v1.2.0/body450st000dp000.parquet
public_ds/ttpv1/SeisSol_v1.2.0/body450st100dp000.parquet
public_ds/ttpv1/SeisSol_v1.2.0/body450st200dp000.parquet
public_ds/ttpv1/SeisSol_v1.2.0/faultst000dp000.parquet
public_ds/ttpv1/SeisSol_v1.2.0/fa

In [57]:
def get_plots_from_json(json_data, file_name):
    """
    List the variables of a template entry that can be plotted against time.

    Parameters:
        json_data (dict): Template with a 'files' list; each item carries a
            'name' and a 'var_list' of {'name', 'unit', ...} dictionaries.
        file_name (str): Value of the 'name' field of the entry/entries to use.

    Returns:
        list: One {"name": ..., "unit": ...} dictionary per variable of the
        matching entries, in template order, leaving out the time variable 't'.
        Empty when no entry matches.
    """
    return [
        {"name": variable['name'], "unit": variable['unit']}
        for entry in json_data['files']
        if entry['name'] == file_name
        for variable in entry['var_list']
        if variable['name'] != 't'  # time is the x-axis, not a curve
    ]

In [59]:
# Inline two-entry template ("body" and "fault") used to try out
# get_plots_from_json without reading a template file.
json_data = {
  "name": "ttpv1",
  "files": [
    {
      "name": "body",
      "content": "receivers in the body, off-fault",
      "var_list": [
        {
          "name": "t",
          "unit": "s",
          "description": "time"
        },
        {
          "name": "h-disp",
          "unit": "m",
          "description": "horizontal displacement"
        },
        {
          "name": "h-vel",
          "unit": "m/s",
          "description": "horizontal velocity"
        },
        {
          "name": "v-disp",
          "unit": "m",
          "description": "vertical displacement"
        },
        {
          "name": "v-vel",
          "unit": "m/s",
          "description": "vertical velocity"
        },
        {
          "name": "n-disp",
          "unit": "m",
          "description": "normal displacement"
        },
        {
          "name": "n-vel",
          "unit": "m/s",
          "description": "normal velocity"
        }
      ],
      "prefix": "body",
      "file_type": "csv"
    },
    {
      "name": "fault",
      "content": "receivers on-fault",
      "var_list": [
        {
          "name": "t",
          "unit": "s",
          "description": "time"
        },
        {
          "name": "h-slip",
          "unit": "m",
          "description": "horizontal (along-strike) slip"
        },
        {
          "name": "h-slip-rate",
          "unit": "m/s",
          "description": "horizontal (along-strike) slip rate"
        },
        {
          "name": "h-shear-stress",
          "unit": "MPa",
          "description": "horizontal (along-strike) shear stress"
        },
        {
          "name": "v-slip",
          "unit": "m",
          "description": "along-dip slip"
        },
        {
          "name": "v-slip-rate",
          "unit": "m/s",
          "description": "along-dip slip rate"
        },
        {
          "name": "v-shear-stress",
          "unit": "MPa",
          "description": "along-dip shear stress"
        },
        {
          "name": "n-stress",
          "unit": "MPa",
          "description": "normal stress"
        }
      ],
      "prefix": "fault",
      "file_type": "csv"
    }
  ]
}

# Variables of the "body" entry, with the time variable 't' left out.
file_name = "body"
plot_variables = get_plots_from_json(json_data, file_name)
print(plot_variables)

[{'name': 'h-disp', 'unit': 'm'}, {'name': 'h-vel', 'unit': 'm/s'}, {'name': 'v-disp', 'unit': 'm'}, {'name': 'v-vel', 'unit': 'm/s'}, {'name': 'n-disp', 'unit': 'm'}, {'name': 'n-vel', 'unit': 'm/s'}]


In [60]:
# Number of variables returned for plotting.
len(plot_variables)

6

In [45]:
# Split "<code>_<version>.zip" into its code name and version: drop the last
# extension, then cut at the first underscore.
archive_name = "SeisSol_v1.2.0.zip"
archive_stem = archive_name.rsplit('.', 1)[0]
code_name, version = archive_stem.split('_', 1)
print(f'Processing code {code_name}, version {version}')

Processing code SeisSol, version v1.2.0


In [41]:
# Example usage
input_folder = "./resources/bp1-qd/erickson/"
code_name = "ku"
version = "1"
process_files(input_folder, code_name, version)

Saved: ./parquet_examples/thrase_v1/thrase_v1_000.parquet
Saved: ./parquet_examples/thrase_v1/thrase_v1_025.parquet
Saved: ./parquet_examples/thrase_v1/thrase_v1_050.parquet
Saved: ./parquet_examples/thrase_v1/thrase_v1_075.parquet
Saved: ./parquet_examples/thrase_v1/thrase_v1_100.parquet
Saved: ./parquet_examples/thrase_v1/thrase_v1_125.parquet
Saved: ./parquet_examples/thrase_v1/thrase_v1_150.parquet
Saved: ./parquet_examples/thrase_v1/thrase_v1_175.parquet
Saved: ./parquet_examples/thrase_v1/thrase_v1_200.parquet
Saved: ./parquet_examples/thrase_v1/thrase_v1_250.parquet
Saved: ./parquet_examples/thrase_v1/thrase_v1_300.parquet
Saved: ./parquet_examples/thrase_v1/thrase_v1_350.parquet
Metadata saved: ./parquet_examples/thrase_v1/metadata.json


In [24]:
def convert_seconds_to_time(seconds):
    """Express a duration given in seconds in years, days, hours and seconds.

    Only division by a number is applied to ``seconds``, so scalars, NumPy
    arrays and pandas Series are all accepted.

    Args:
        seconds: Duration(s) in seconds.

    Returns:
        tuple: ``(years, days, hours, seconds)``, where a year counts 365.25
        days and the last item is the input itself.
    """
    seconds_per_hour = 3600
    seconds_per_day = 24 * 3600
    seconds_per_year = 365.25 * 24 * 3600
    return (
        seconds / seconds_per_year,
        seconds / seconds_per_day,
        seconds / seconds_per_hour,
        seconds,
    )

In [3]:
# Input and output folders
input_folder = "./resources/bp1-qd/erickson/"
output_folder = "./parquet_examples/"
code_name = "thrase"
version = "1"

# Ensure output folder exists
os.makedirs(output_folder, exist_ok=True)

# Loop through all files in the input folder and subdirectories
for root, _, files in os.walk(input_folder):
    for file in files:
        if file.endswith(".dat"):
            # Construct full input and output file paths
            input_path = os.path.join(root, file)
            output_file = file.replace(".dat", ".parquet")
            output_path = os.path.join(output_folder, output_file)

            try:
                # Read the .dat file and convert to Parquet
                print(f"Processing: {input_path}")
                df = pd.read_csv(input_path, comment='#', delim_whitespace=True)
                df['years'], df['days'], df['hours'], df['seconds'] = convert_seconds_to_time(df['t'])
                df.to_parquet(output_path, index=False)
                print(f"Saved: {output_path}")

            except Exception as e:
                print(f"Failed to process {input_path}: {e}")

Processing: ./resources/bp1-qd/erickson\erickson_bp1-qd_fltst_dp000.dat
Saved: ./parquet_examples/erickson_bp1-qd_fltst_dp000.parquet
Processing: ./resources/bp1-qd/erickson\erickson_bp1-qd_fltst_dp025.dat
Saved: ./parquet_examples/erickson_bp1-qd_fltst_dp025.parquet
Processing: ./resources/bp1-qd/erickson\erickson_bp1-qd_fltst_dp050.dat
Saved: ./parquet_examples/erickson_bp1-qd_fltst_dp050.parquet
Processing: ./resources/bp1-qd/erickson\erickson_bp1-qd_fltst_dp075.dat
Saved: ./parquet_examples/erickson_bp1-qd_fltst_dp075.parquet
Processing: ./resources/bp1-qd/erickson\erickson_bp1-qd_fltst_dp100.dat
Saved: ./parquet_examples/erickson_bp1-qd_fltst_dp100.parquet
Processing: ./resources/bp1-qd/erickson\erickson_bp1-qd_fltst_dp125.dat
Saved: ./parquet_examples/erickson_bp1-qd_fltst_dp125.parquet
Processing: ./resources/bp1-qd/erickson\erickson_bp1-qd_fltst_dp150.dat
Saved: ./parquet_examples/erickson_bp1-qd_fltst_dp150.parquet
Processing: ./resources/bp1-qd/erickson\erickson_bp1-qd_fltst_

In [5]:
%%time
# Load one converted series and tag every row with the label of the dataset it
# belongs to, so that rows can later be filtered by 'dataset_name'.
ds = pd.read_parquet("./parquet_examples/erickson_bp1-qd_fltst_dp000.parquet")
ds['dataset_name'] = f"erickson_dp000"
ds

CPU times: total: 0 ns
Wall time: 11.4 ms


Unnamed: 0,t,slip,slip_rate,shear_stress,state,years,days,hours,seconds,dataset_name
0,0.000000e+00,0.000000e+00,-9.000000,26.546122,0.591409,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,erickson_dp000
1,1.000000e-03,9.999999e-13,-9.000000,26.546122,3.903090,3.168809e-11,1.157407e-08,2.777778e-07,1.000000e-03,erickson_dp000
2,1.099213e+04,5.618357e-06,-9.562926,26.546126,4.278376,3.483197e-04,1.272238e-01,3.053370e+00,1.099213e+04,erickson_dp000
3,4.113129e+06,1.530747e-05,-13.065780,26.548487,6.614980,1.303372e-01,4.760566e+01,1.142536e+03,4.113129e+06,erickson_dp000
4,2.437484e+07,1.572737e-05,-14.214194,26.559678,7.387069,7.723920e-01,2.821162e+02,6.770788e+03,2.437484e+07,erickson_dp000
...,...,...,...,...,...,...,...,...,...,...
43938,9.466306e+10,8.945083e+01,-15.166594,28.542657,9.170264,2.999691e+03,1.095637e+06,2.629529e+07,9.466306e+10,erickson_dp000
43939,9.466306e+10,8.945083e+01,-15.166594,28.542657,9.170264,2.999691e+03,1.095637e+06,2.629529e+07,9.466306e+10,erickson_dp000
43940,9.466307e+10,8.945083e+01,-15.166594,28.542663,9.170268,2.999692e+03,1.095637e+06,2.629530e+07,9.466307e+10,erickson_dp000
43941,9.466954e+10,8.945083e+01,-15.166752,28.545752,9.172162,2.999897e+03,1.095712e+06,2.629709e+07,9.466954e+10,erickson_dp000


In [19]:
# Requested datasets and the depth code used to build their labels.
dataset_list = ['erickson', 'jiang']
depth = '025'

In [20]:
# Labels of the form "<dataset>_dp<depth>", matching the 'dataset_name' column.
dataset_names = [f"{name}_dp{depth}" for name in dataset_list]
dataset_names

['erickson_dp025', 'jiang_dp025']

In [21]:
# Rows of `ds` that belong to one of the requested datasets.
filtered_df = ds[ds['dataset_name'].isin(dataset_names)]
filtered_df

Unnamed: 0,t,slip,slip_rate,shear_stress,state,years,days,hours,seconds,dataset_name


In [22]:
# Base names (the part before the first '_') of the requested datasets whose
# label does not occur in ds['dataset_name'].
filtered_dataset_names = [name.split('_')[0] for name in dataset_names if name not in ds['dataset_name'].values]
filtered_dataset_names

['erickson', 'jiang']

In [66]:
from os import listdir
from os.path import isfile, join
# Names of the regular files in the jiang folder, cut at their first '.'.
jiang_dir = "./resources/bp1-qd/jiang/"
onlyfiles = [
    entry.split('.')[0]
    for entry in listdir(jiang_dir)
    if isfile(join(jiang_dir, entry))
]
onlyfiles

['jiang_bp1-qd_fltst_dp000',
 'jiang_bp1-qd_fltst_dp025',
 'jiang_bp1-qd_fltst_dp050',
 'jiang_bp1-qd_fltst_dp075',
 'jiang_bp1-qd_fltst_dp100',
 'jiang_bp1-qd_fltst_dp125',
 'jiang_bp1-qd_fltst_dp150',
 'jiang_bp1-qd_fltst_dp175',
 'jiang_bp1-qd_fltst_dp200',
 'jiang_bp1-qd_fltst_dp250',
 'jiang_bp1-qd_fltst_dp300',
 'jiang_bp1-qd_fltst_dp350']

In [84]:
# Reload the ttpv1 template and display it in full.
with open("./resources/benchmark_templates/ttpv1.json", 'r') as f:
    template = json.load(f) 
template

{'name': 'ttpv1',
 'files': [{'name': 'body',
   'content': 'receivers in the body, off-fault',
   'list_of_receivers': ['body010st000dp000',
    'body010st100dp000',
    'body030st000dp000',
    'body090st000dp000',
    'body090st100dp000',
    'body090st200dp000',
    'body150st000dp000',
    'body210st000dp000',
    'body270st000dp000',
    'body270st100dp000',
    'body270st200dp000',
    'body330st000dp000',
    'body390st000dp000',
    'body450st000dp000',
    'body450st100dp000',
    'body450st200dp000'],
   'var_list': [{'name': 't', 'unit': 's', 'description': 'time'},
    {'name': 'h-disp', 'unit': 'm', 'description': 'horizontal displacement'},
    {'name': 'h-vel', 'unit': 'm/s', 'description': 'horizontal velocity'},
    {'name': 'v-disp', 'unit': 'm', 'description': 'vertical displacement'},
    {'name': 'v-vel', 'unit': 'm/s', 'description': 'vertical velocity'},
    {'name': 'n-disp', 'unit': 'm', 'description': 'normal displacement'},
    {'name': 'n-vel', 'unit': 'm/s

In [86]:
# NOTE: despite its name, this value is compared with the 'name' field of the
# template entries in the next cell, not with their 'file_type' field.
file_type = 'body'

In [89]:
# Receiver list of the template entry whose 'name' equals `file_type` (the last
# match wins). `receivers` is not assigned here when no entry matches.
for file in template['files']:
    if file['name'] == file_type:
        receivers = file['list_of_receivers']
        
receivers

['body010st000dp000',
 'body010st100dp000',
 'body030st000dp000',
 'body090st000dp000',
 'body090st100dp000',
 'body090st200dp000',
 'body150st000dp000',
 'body210st000dp000',
 'body270st000dp000',
 'body270st100dp000',
 'body270st200dp000',
 'body330st000dp000',
 'body390st000dp000',
 'body450st000dp000',
 'body450st100dp000',
 'body450st200dp000']

In [94]:
# Scratch check: enumerate() numbers the items from 0, independent of their values.
for idx, value in enumerate([1, 5, 8, 6]):
    print(idx)

0
1
2
3
