In [32]:
import pandas as pd
import os
import json

In [39]:
def convert_seconds_to_time(seconds):
    """Express a duration given in seconds in years, days and hours as well.

    A year is taken to be 365.25 days. The argument only needs to support
    division by a number, so scalars work and so does the pandas column the
    notebook passes in.

    Returns:
        A 4-tuple ``(years, days, hours, seconds)``; the last item is the
        input handed back unchanged.
    """
    seconds_per_hour = 3600
    seconds_per_day = (24 * 3600)
    seconds_per_year = (365.25 * 24 * 3600)
    return (
        seconds / seconds_per_year,
        seconds / seconds_per_day,
        seconds / seconds_per_hour,
        seconds,
    )

def extract_header(input_path):
    """Extract and parse the '#'-prefixed header lines from a .dat file.

    Every line starting with ``#`` is inspected; all other lines are ignored.
    The leading ``#`` and surrounding whitespace are removed first, so both
    ``# key = value`` and ``#key=value`` are understood.

    * Lines whose text starts with ``File:`` are skipped (file-specific).
    * Lines containing ``=`` are split on the first ``=`` into a key/value
      pair; a repeated key keeps the last value seen.
    * Any other comment line is appended to the list stored under the
      ``'comments'`` key.

    Args:
        input_path: Path of the text file to read.

    Returns:
        dict mapping header keys to their string values, plus an optional
        ``'comments'`` list of free-text comment lines.
    """
    header_data = {}
    with open(input_path, 'r') as f:
        for line in f:
            if not line.startswith('#'):
                continue
            # Drop only the '#' marker and then trim; slicing off two
            # characters would eat the first letter of "#key=value".
            content = line[1:].strip()
            if content.startswith("File:"):  # Skip file-specific lines
                continue
            if '=' in content:
                key, value = content.split('=', 1)
                header_data[key.strip()] = value.strip()
            else:
                # Store plain comments
                header_data.setdefault('comments', []).append(content)
    return header_data

def process_files(input_folder, code_name, version):
    """Convert every .dat file under ``input_folder`` to Parquet and write a metadata summary.

    Output goes to ``./parquet_examples/{code_name}_v{version}/``. Each input
    file is read as a whitespace-delimited table (lines starting with ``#``
    are ignored), gains ``years``/``days``/``hours``/``seconds`` columns
    computed from its ``t`` column via ``convert_seconds_to_time``, and is
    saved as ``{code_name}_v{version}_{depth}.parquet``, where ``depth`` is
    the text between the last ``dp`` in the filename and the following ``.``.

    A ``metadata.json`` file is written alongside, holding the header parsed
    from the first file whose header could be read (``common_header``) and
    the names of the files that were converted successfully
    (``processed_files``).

    Files that fail to convert are reported with a printed message and do not
    stop the run.

    Args:
        input_folder: Directory searched recursively for ``.dat`` files.
        code_name: Label used in the output folder and file names.
        version: Version label used in the output folder and file names.
    """
    # Create the output folder based on the code name and version
    output_folder = f"./parquet_examples/{code_name}_v{version}/"
    os.makedirs(output_folder, exist_ok=True)

    file_list = []  # Names of successfully converted files, for the metadata summary
    common_header = None  # Store the shared header

    # Loop through all .dat files in the input folder and subdirectories
    for root, dirs, files in os.walk(input_folder):
        # Sort in place so traversal order (and therefore which file supplies
        # the common header) does not depend on the filesystem.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith(".dat"):
                continue

            input_path = os.path.join(root, file)
            depth = file.split('dp')[-1].split('.')[0]  # Extract depth from filename
            output_path = os.path.join(output_folder, f"{code_name}_v{version}_{depth}.parquet")

            try:
                # Extract the common header only from the first file
                if common_header is None:
                    common_header = extract_header(input_path)

                # Read the .dat file into a DataFrame; sep=r'\s+' is the
                # supported spelling of the deprecated delim_whitespace=True.
                df = pd.read_csv(input_path, comment='#', sep=r'\s+')
                df['years'], df['days'], df['hours'], df['seconds'] = convert_seconds_to_time(df['t'])

                # Save to Parquet
                df.to_parquet(output_path, index=False)
                print(f"Saved: {output_path}")

                # Record the file only once it has actually been written, so
                # the metadata never lists a file that failed to convert.
                file_list.append(file)

            except Exception as e:
                # Best-effort batch run: report the failure and keep going.
                print(f"Failed to process {input_path}: {e}")

    # Prepare the metadata as a dictionary
    metadata = {
        "common_header": common_header,
        "processed_files": file_list
    }

    # Write the metadata to a JSON file
    metadata_path = os.path.join(output_folder, "metadata.json")
    with open(metadata_path, 'w') as f:
        json.dump(metadata, f, indent=4)
    print(f"Metadata saved: {metadata_path}")

In [41]:
# Example usage: convert the .dat files in the folder below, labelling the
# outputs with code name "thrase" and version "1".
code_name = "thrase"
version = "1"
input_folder = "./resources/bp1-qd/erickson/"
process_files(input_folder=input_folder, code_name=code_name, version=version)

Saved: ./parquet_examples/thrase_v1/thrase_v1_000.parquet
Saved: ./parquet_examples/thrase_v1/thrase_v1_025.parquet
Saved: ./parquet_examples/thrase_v1/thrase_v1_050.parquet
Saved: ./parquet_examples/thrase_v1/thrase_v1_075.parquet
Saved: ./parquet_examples/thrase_v1/thrase_v1_100.parquet
Saved: ./parquet_examples/thrase_v1/thrase_v1_125.parquet
Saved: ./parquet_examples/thrase_v1/thrase_v1_150.parquet
Saved: ./parquet_examples/thrase_v1/thrase_v1_175.parquet
Saved: ./parquet_examples/thrase_v1/thrase_v1_200.parquet
Saved: ./parquet_examples/thrase_v1/thrase_v1_250.parquet
Saved: ./parquet_examples/thrase_v1/thrase_v1_300.parquet
Saved: ./parquet_examples/thrase_v1/thrase_v1_350.parquet
Metadata saved: ./parquet_examples/thrase_v1/metadata.json


In [24]:
def convert_seconds_to_time(seconds):
    """Return ``(years, days, hours, seconds)`` for a duration in seconds.

    Years use a 365.25-day year; the fourth element is the input itself.
    Note: this cell redefines a function of the same name and body that
    appears earlier in the notebook.
    """
    in_years = seconds / (365.25 * 24 * 3600)
    in_days = seconds / (24 * 3600)
    in_hours = seconds / 3600
    return in_years, in_days, in_hours, seconds

In [3]:
# Input and output folders
input_folder = "./resources/bp1-qd/erickson/"
output_folder = "./parquet_examples/"
code_name = "thrase"
version = "1"

# Ensure output folder exists
os.makedirs(output_folder, exist_ok=True)

# Loop through all files in the input folder and subdirectories
for root, _, files in os.walk(input_folder):
    for file in files:
        if file.endswith(".dat"):
            # Construct full input and output file paths.
            # splitext swaps only the trailing extension; str.replace would
            # also rewrite any ".dat" that occurs earlier in the filename.
            input_path = os.path.join(root, file)
            output_file = os.path.splitext(file)[0] + ".parquet"
            output_path = os.path.join(output_folder, output_file)

            try:
                # Read the .dat file and convert to Parquet
                print(f"Processing: {input_path}")
                # sep=r'\s+' replaces the deprecated delim_whitespace=True
                df = pd.read_csv(input_path, comment='#', sep=r'\s+')
                df['years'], df['days'], df['hours'], df['seconds'] = convert_seconds_to_time(df['t'])
                df.to_parquet(output_path, index=False)
                print(f"Saved: {output_path}")

            except Exception as e:
                # Best-effort batch run: report the failure and continue
                print(f"Failed to process {input_path}: {e}")

Processing: ./resources/bp1-qd/erickson\erickson_bp1-qd_fltst_dp000.dat
Saved: ./parquet_examples/erickson_bp1-qd_fltst_dp000.parquet
Processing: ./resources/bp1-qd/erickson\erickson_bp1-qd_fltst_dp025.dat
Saved: ./parquet_examples/erickson_bp1-qd_fltst_dp025.parquet
Processing: ./resources/bp1-qd/erickson\erickson_bp1-qd_fltst_dp050.dat
Saved: ./parquet_examples/erickson_bp1-qd_fltst_dp050.parquet
Processing: ./resources/bp1-qd/erickson\erickson_bp1-qd_fltst_dp075.dat
Saved: ./parquet_examples/erickson_bp1-qd_fltst_dp075.parquet
Processing: ./resources/bp1-qd/erickson\erickson_bp1-qd_fltst_dp100.dat
Saved: ./parquet_examples/erickson_bp1-qd_fltst_dp100.parquet
Processing: ./resources/bp1-qd/erickson\erickson_bp1-qd_fltst_dp125.dat
Saved: ./parquet_examples/erickson_bp1-qd_fltst_dp125.parquet
Processing: ./resources/bp1-qd/erickson\erickson_bp1-qd_fltst_dp150.dat
Saved: ./parquet_examples/erickson_bp1-qd_fltst_dp150.parquet
Processing: ./resources/bp1-qd/erickson\erickson_bp1-qd_fltst_

In [5]:
%%time
ds = pd.read_parquet("./parquet_examples/erickson_bp1-qd_fltst_dp000.parquet")
ds['dataset_name'] = f"erickson_dp000"
ds

CPU times: total: 0 ns
Wall time: 11.4 ms


Unnamed: 0,t,slip,slip_rate,shear_stress,state,years,days,hours,seconds,dataset_name
0,0.000000e+00,0.000000e+00,-9.000000,26.546122,0.591409,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,erickson_dp000
1,1.000000e-03,9.999999e-13,-9.000000,26.546122,3.903090,3.168809e-11,1.157407e-08,2.777778e-07,1.000000e-03,erickson_dp000
2,1.099213e+04,5.618357e-06,-9.562926,26.546126,4.278376,3.483197e-04,1.272238e-01,3.053370e+00,1.099213e+04,erickson_dp000
3,4.113129e+06,1.530747e-05,-13.065780,26.548487,6.614980,1.303372e-01,4.760566e+01,1.142536e+03,4.113129e+06,erickson_dp000
4,2.437484e+07,1.572737e-05,-14.214194,26.559678,7.387069,7.723920e-01,2.821162e+02,6.770788e+03,2.437484e+07,erickson_dp000
...,...,...,...,...,...,...,...,...,...,...
43938,9.466306e+10,8.945083e+01,-15.166594,28.542657,9.170264,2.999691e+03,1.095637e+06,2.629529e+07,9.466306e+10,erickson_dp000
43939,9.466306e+10,8.945083e+01,-15.166594,28.542657,9.170264,2.999691e+03,1.095637e+06,2.629529e+07,9.466306e+10,erickson_dp000
43940,9.466307e+10,8.945083e+01,-15.166594,28.542663,9.170268,2.999692e+03,1.095637e+06,2.629530e+07,9.466307e+10,erickson_dp000
43941,9.466954e+10,8.945083e+01,-15.166752,28.545752,9.172162,2.999897e+03,1.095712e+06,2.629709e+07,9.466954e+10,erickson_dp000


In [19]:
# Depth code and dataset name prefixes used to build the names to look up
depth = '025'
dataset_list = ['erickson', 'jiang']

In [20]:
# Build "<prefix>_dp<depth>" for every requested dataset prefix
dataset_names = [f"{name}_dp{depth}" for name in dataset_list]
dataset_names

['erickson_dp025', 'jiang_dp025']

In [21]:
# Keep only the rows whose dataset_name is one of the requested names
filtered_df = ds.loc[ds['dataset_name'].isin(dataset_names)]
filtered_df

Unnamed: 0,t,slip,slip_rate,shear_stress,state,years,days,hours,seconds,dataset_name


In [22]:
# Prefixes (text before the first underscore) of the requested names that do
# NOT occur in ds['dataset_name'], i.e. the datasets missing from ds.
filtered_dataset_names = [
    requested.partition('_')[0]
    for requested in dataset_names
    if requested not in ds['dataset_name'].values
]
filtered_dataset_names

['erickson', 'jiang']