In [1]:
# Prepare files to train surrogate
import os
import numpy as np
import pandas as pd
import json


def read_json(filename):
    """Read a JSON file and return its parsed content.

    Parameters
    ----------
    filename : str or path-like
        Path of the JSON file to read.

    Returns
    -------
    object
        The deserialized JSON document (a dictionary when the top-level
        JSON value is an object).

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(filename, 'r', encoding='utf-8') as f:
        return json.load(f)

In [None]:
# Current working directory
cwd = os.getcwd()

# Here, define the location for the files in quoFEM to train the surrogate
save_files_to = os.path.join(cwd, 'surrogate_training_data')

# Path to calibrated curves
path_to_calibrations = os.path.join(cwd, 'cals_dr_005')

# os.listdir returns the entries in arbitrary order; sort them so the row
# order of the dataframe (and of everything derived from it, such as the
# seeded shuffle used for the train/test split) is the same on every run.
all_cals = sorted(os.listdir(path_to_calibrations))
if not all_cals:
    raise FileNotFoundError(
        'No calibration files found in {}'.format(path_to_calibrations)
    )


def load_calibration(cal_file):
    """Load one calibration file as a single-row dataframe.

    Parameters
    ----------
    cal_file : str
        File name (not the full path) of a calibration JSON file located in
        ``path_to_calibrations``.

    Returns
    -------
    pandas.DataFrame
        One row holding the entries of the file's ``'best_fit'`` field plus
        the columns ``uniqueId``, ``name``, ``PeakDrift`` and ``FailureType``.
    """
    data_json = read_json(os.path.join(path_to_calibrations, cal_file))
    cal_data = pd.DataFrame(data_json['best_fit'], index=[0])

    # The id is the text between the last "_" of the file name and the
    # first "." that follows it (e.g. "..._001.json" -> "001")
    cal_data['uniqueId'] = cal_file.split('_')[-1].split('.')[0]

    # Name of the test as stored in the json file
    cal_data['name'] = data_json['Name']

    # Largest absolute displacement divided by 'L_Inflection'
    peak_disp = np.abs(np.array(data_json['data']['disp'])).max()
    cal_data['PeakDrift'] = peak_disp / data_json['L_Inflection']

    cal_data['FailureType'] = data_json['FailureType']
    return cal_data


# Build every single-row frame first and concatenate once, instead of
# growing the dataframe inside the loop. All failure types are kept here.
cal_data_df = pd.concat(
    [load_calibration(cal) for cal in all_cals],
    ignore_index=True,
)

# Explore the dataframe... So far it only has the calibrated parameters
cal_data_df.head()

Unnamed: 0,eta1,kappa_k,kappa,sig,lam,mup,sigp,rsmax,n,alpha,alpha1,alpha2,betam1,gamma,uniqueId,name,PeakDrift,FailureType
0,0.859518,1.036790,1.021637,0.539988,0.544674,2.829084,2.412544,0.852311,4.280550,0.000214,2.544506,1.255819,0.000514,0.447093,001,"Gill et al. 1979, No. 1",0.028223,Flexure
1,1.969386,0.963894,0.959355,0.704658,0.533797,5.835124,0.597596,0.878801,1.534143,0.006376,1.866523,1.722041,0.002459,0.468600,002,"Gill et al. 1979, No. 2",0.021550,Flexure
2,1.677084,1.001006,1.006076,0.586495,0.512518,2.810250,2.816572,0.964923,3.815321,0.000967,4.446139,0.773216,0.000788,0.620984,003,"Gill et al. 1979, No. 3",0.017558,Flexure
3,1.436969,1.048921,0.979729,0.618096,0.574347,2.007441,0.965376,0.887724,1.372616,0.006757,4.579712,0.309938,0.000871,0.844298,004,"Gill et al. 1979, No. 4",0.013032,Flexure
4,1.297831,0.983432,0.982595,0.564496,0.778450,3.525745,2.595482,0.780115,1.812361,0.011749,6.297019,0.670711,0.005149,0.539721,005,"Ang et al. 1981, No. 3",0.031832,Flexure
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
389,1.979941,0.226029,1.037947,0.255848,0.232318,1.149940,4.810724,0.879166,1.027699,0.003626,6.593853,0.767306,0.001456,0.177899,412,"Hamilton, 2002, UCI5",0.040909,Flexure-Shear
390,1.939927,0.165344,0.940284,0.110906,0.151807,2.866205,4.903878,0.653888,1.576330,0.007218,5.105349,1.933418,0.023722,0.619617,413,"Hamilton, 2002, UCI6",0.123564,Flexure
391,0.885572,0.535313,0.985639,0.235297,0.575382,5.424065,8.135427,0.833799,2.905724,0.002974,1.087417,1.327587,0.003306,0.199675,414,"McDaniel, 1997, S1",0.024006,Shear
392,1.214206,0.648313,1.044553,0.430096,0.344591,9.222367,6.522411,0.656878,4.920417,0.000291,4.214173,0.217359,0.007253,0.176048,415,"McDaniel, 1997, S1-2",0.010653,Shear


In [51]:
# Load the nondimensional parameters so they can be merged with the calibrated parameters

# Load the csv file with the nondimensional parameters.
# This includes all the data... we want just the last 6 columns
# (nondimensional parameters) together with the test name.
nondim_params_raw = pd.read_csv('data_spiral_wnd.csv')
names = nondim_params_raw['name']

# Take an explicit copy of the column slice: adding a column to a slice of
# another dataframe is a chained assignment and can trigger pandas'
# SettingWithCopyWarning.
nondim_params = nondim_params_raw.iloc[:, -6:].copy()
nondim_params['name'] = names
nondim_params

Unnamed: 0,ar,lrr,srr,alr,sdr,smr,name
0,,,,,,,"Davey 1975, No. 1"
1,0.181818,0.304296,0.001880,0.058293,0.600000,0.429439,"Davey 1975, No. 2"
2,0.285714,0.288749,0.001793,0.055613,0.600000,0.528389,"Davey 1975, No. 3"
3,0.153846,0.298894,0.002024,0.057258,0.600000,0.369010,"Munro et al. 1976, No. 1"
4,0.183150,0.206522,0.002947,0.003361,1.411765,0.115615,"Ng et al. 1978, No. 2"
...,...,...,...,...,...,...,...
159,0.387860,0.151781,0.002660,0.000000,0.425197,0.243059,"Hamilton, 2002, UCI6"
160,0.219178,0.150928,0.002776,0.000000,0.849057,0.109021,"McDaniel, 1997, S1"
161,0.500000,0.207288,0.000492,0.002162,0.289370,0.535311,"McDaniel, 1997, S1-2"
162,0.500000,0.230492,0.000547,0.002403,0.289370,0.561034,"McDaniel, 1997, S2"


In [52]:
# Join the calibrated parameters with the nondimensional parameters,
# matching rows through the shared 'name' column
merged_data = cal_data_df.merge(nondim_params, on='name')

# Write the merged table to a new CSV file (without the row index)
merged_data.to_csv('merged_data.csv', index=False)

merged_data

Unnamed: 0,eta1,kappa_k,kappa,sig,lam,mup,sigp,rsmax,n,alpha,...,gamma,name,PeakDrift,FailureType,ar,lrr,srr,alr,sdr,smr
0,1.148932,1.780301,1.020907,0.348714,0.518051,1.490134,2.723279,0.948992,2.622910,0.002713,...,1.729762,"Davey 1975, No. 1",0.043364,Flexure,,,,,,
1,0.630606,0.841674,0.987722,0.227504,0.507170,1.211439,2.126357,0.762523,2.015740,0.014184,...,0.921419,"Davey 1975, No. 2",0.057183,Flexure,0.181818,0.304296,0.001880,0.058293,0.600000,0.429439
2,1.478835,5.314510,0.976617,0.386420,0.748542,2.441889,0.314088,0.637786,1.198469,0.007345,...,1.376716,"Davey 1975, No. 3",0.035757,Flexure,0.285714,0.288749,0.001793,0.055613,0.600000,0.528389
3,1.617294,0.551926,1.017993,0.142420,0.649311,1.039210,0.494608,0.532812,3.299630,0.017042,...,1.478985,"Munro et al. 1976, No. 1",0.055538,Flexure,0.153846,0.298894,0.002024,0.057258,0.600000,0.369010
4,1.138895,0.615178,0.990485,0.127393,0.607689,0.419765,1.385741,0.966630,1.303237,0.005371,...,0.597973,"Ng et al. 1978, No. 2",0.078142,Flexure,0.183150,0.206522,0.002947,0.003361,1.411765,0.115615
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139,0.801189,1.986198,1.000156,0.275466,0.329429,2.668027,1.747025,0.714904,1.013788,0.002320,...,0.190341,"Hamilton, 2002, UCI5",0.040782,Flexure-Shear,0.387860,0.180650,0.002714,0.000000,0.157526,0.411109
140,1.697304,0.504133,0.985804,0.131875,0.192101,4.980451,4.926214,0.574142,1.005974,0.001677,...,0.894524,"Hamilton, 2002, UCI6",0.129975,Flexure,0.387860,0.151781,0.002660,0.000000,0.425197,0.243059
141,1.184817,0.908320,0.959475,0.396224,0.256509,0.531340,0.317291,0.830325,1.341692,0.022070,...,0.278082,"McDaniel, 1997, S1",0.024549,Shear,0.219178,0.150928,0.002776,0.000000,0.849057,0.109021
142,1.267335,2.678181,1.044735,0.389762,0.325598,0.386566,0.517464,0.993302,6.535933,0.002377,...,0.206365,"McDaniel, 1997, S1-2",0.010622,Shear,0.500000,0.207288,0.000492,0.002162,0.289370,0.535311


In [53]:
# Read spiral_data_use.csv, which flags the test data that we should use,
# and keep only the rows where use == 1
spiral_data_use = pd.read_csv('spiral_data_use.csv').loc[
    lambda table: table['use'] == 1
]
spiral_data_use

Unnamed: 0.1,Unnamed: 0,id,name,use
0,0,254,"Davey 1975, No. 1",1
1,1,255,"Davey 1975, No. 2",1
2,2,256,"Davey 1975, No. 3",1
3,3,257,"Munro et al. 1976, No. 1",1
4,4,258,"Ng et al. 1978, No. 2",1
...,...,...,...,...
135,135,408,"Hamilton, 2002, UCI1",1
139,139,412,"Hamilton, 2002, UCI5",1
141,141,414,"McDaniel, 1997, S1",1
142,142,415,"McDaniel, 1997, S1-2",1


In [54]:
# Configuration code: names the output folder used for this data set
config_code = 'flexure_pd_010_just_good_data'

# Upper bound on the peak drift of the tests that are kept
max_peak_drift = 0.1

# Fraction of the filtered data assigned to the training set
split = 0.75

# Column names of the calibrated parameters and of the nondimensional
# parameters. Selecting by name instead of by fixed position keeps the
# selection correct when metadata columns (e.g. 'uniqueId') are added to or
# removed from merged_data.
cal_param_columns = ['eta1', 'kappa_k', 'kappa', 'sig', 'lam', 'mup', 'sigp',
                     'rsmax', 'n', 'alpha', 'alpha1', 'alpha2', 'betam1', 'gamma']
nd_param_columns = ['ar', 'lrr', 'srr', 'alr', 'sdr', 'smr']

merged_columns = list(merged_data.columns)
missing_columns = [col for col in cal_param_columns + nd_param_columns
                   if col not in merged_columns]
if missing_columns:
    raise KeyError('merged_data is missing expected columns: {}'.format(missing_columns))

# Generate filters

# Filter 1: Peak drift < max_peak_drift
filter1 = merged_data['PeakDrift'] < max_peak_drift

# Filter 2: Failure type is flexure
filter2 = merged_data['FailureType'] == 'Flexure'

# Filter 3: Just use data that is in spiral_data_use
filter3 = merged_data['name'].isin(spiral_data_use['name'])

# Apply the filters
# NOTE(review): rows whose nondimensional parameters are missing (NaN) are
# not removed by these filters -- confirm whether they should be dropped
# before the surrogate is trained.
filtered_data = merged_data[filter1 & filter2 & filter3]

# Randomly shuffle the data (fixed seed so the split is reproducible)
merged_data_shuffle = filtered_data.sample(frac=1, random_state=1).reset_index(drop=True)

# Generate a split for training and testing
n_train = int(split * len(merged_data_shuffle))
train_data = merged_data_shuffle.iloc[:n_train, :]
test_data = merged_data_shuffle.iloc[n_train:, :]

# Positions of the calibrated parameters and the nondimensional parameters
# (columns), derived from the column names above
cal_params_index = np.array([merged_columns.index(col) for col in cal_param_columns])
nd_params_index = np.array([merged_columns.index(col) for col in nd_param_columns])

# Extract the calibrated and nondimensional parameters for both training and testing
cal_params_all = merged_data.iloc[:, cal_params_index]
nondim_params_all = merged_data.iloc[:, nd_params_index]

# Extract just the training data
cal_params_train = train_data.iloc[:, cal_params_index]
nondim_params_train = train_data.iloc[:, nd_params_index]

# Extract just the testing data
cal_params_test = test_data.iloc[:, cal_params_index]
nondim_params_test = test_data.iloc[:, nd_params_index]

print('We have {} training samples and {} testing samples'.format(len(train_data), len(test_data)))

We have 48 training samples and 16 testing samples


In [55]:
# Save to separate txt files with 5 decimal places

# Folder that receives all the files of this configuration
output_dir = os.path.join('quoFEM_Surrogate', config_code)

# exist_ok=True makes the call safe to re-run and avoids the
# check-then-create race of testing os.path.exists first
os.makedirs(output_dir, exist_ok=True)

# Tab-separated input (nondimensional parameters) and output (calibrated
# parameters) tables for training and testing, keyed by file name
surrogate_tables = {
    'input_train.txt': nondim_params_train,
    'output_train.txt': cal_params_train,
    'input_test.txt': nondim_params_test,
    'output_test.txt': cal_params_test,
}

for file_name, table in surrogate_tables.items():
    table.to_csv(os.path.join(output_dir, file_name),
                 sep='\t',
                 index=False,
                 float_format='%.5f'
                 )

# Save train and test data into CSV files
train_data.to_csv(os.path.join(output_dir, 'train_data.csv'), index=False)
test_data.to_csv(os.path.join(output_dir, 'test_data.csv'), index=False)
