# Training and cross validation of symbolic regression model

This workbook performs symbolic regression (SR) on bulk modulus data. 

The SR algorithm was first implemented by Flores et al. in the following work (DOI: [10.1039/D2DD00027J](https://doi.org/10.1039/D2DD00027J)).

In [None]:
# import packages
import os, sys, platform
import numpy as np
import pandas as pd
import workflows as wf
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
import sklearn.metrics as skmetrics 
from itertools import permutations, combinations
import runSR as sr

# Report the interpreter version and the operating system this run uses.
for label, value in (('Python version', sys.version),
                     ('Running on', platform.system())):
    print(label, value)

In [None]:
# LOADING THE TRAINING DATA
#=========================================
# Read the bulk modulus table; per the original note, everything in the CSV
# is in SI units.
training_file = 'Data_Summary/BulkModulus_Data.csv'
data_df = pd.read_csv(training_file)

# Rescale column 'd' from Pa to GPa so the fitted coefficients keep a
# manageable magnitude.
data_df['d'] = data_df['d'].div(1E9)

In [None]:
# GENERATE KFOLD INDICES
#=========================================
# Build the shuffled K-fold splitter (fixed seed) and report how many rows
# the first fold puts in the training and test partitions.

splits = 10
kf = KFold(n_splits=splits, shuffle=True, random_state=1)

# result is the (train indices, test indices) pair of the first fold.
result = next(kf.split(data_df), None)
Ntrain, Ntest = (len(data_df.iloc[fold_idx]) for fold_idx in (result[0], result[1]))

# sep='\n' puts the label and the number on separate lines.
print('Test size:', Ntest, sep='\n')
print('Training size:', Ntrain, sep='\n')

In [None]:
# GET EXPRESSION COMBINATION LIST
#=========================================
# Enumerate every non-empty subset of the candidate expressions, ordered by
# subset size (singles first, then pairs, then the full set). Each subset is
# stored as a list.

expressions = ["sqrt", "^2", "^3"]

expression_array = [
    list(subset)
    for size in range(1, len(expressions) + 1)
    for subset in combinations(expressions, size)
]

In [None]:
# FEATURE ARRAY LIST
#=========================================
# Values swept as FEATENG_STEPS in the K-fold loop and passed to sr.run_SR
# (presumably the number of feature-engineering steps - confirm in runSR).

feature_array = [
    1,
    2,
    3,
]

In [None]:
# RUNNING KFOLD LOOPING
#=========================================
# For each fold, each expression combination and each FEATENG_STEPS value:
# fit the SR model on the training rows, score it on the held-out rows and
# record one result row.

fold_frames = []  # single-row DataFrames, concatenated once after the loops

# kf was built with a fixed integer random_state, so a single pass over
# kf.split() yields the same folds as re-materialising the whole split list
# on every iteration, without the repeated work.
for kfold, (train_idx, test_idx) in enumerate(kf.split(data_df)):

    train = data_df.iloc[train_idx].reset_index(drop=True)
    test = data_df.iloc[test_idx].reset_index(drop=True)

    # A dedicated loop name keeps the global `expressions` list (used to
    # build expression_array) from being overwritten.
    for expr_set in expression_array:

        for FEATENG_STEPS in feature_array:

            # Values that are returned but not recorded are underscore-prefixed.
            (eqn, _eqn_str, _coeffs, rmse, r2, _mape,
             y_real, y_hat, trained_workflow) = sr.run_SR(train, FEATENG_STEPS, expr_set)

            # Check test data: column 'd' is the regression target
            # (rescaled from Pa to GPa when the data was loaded).
            ytest_real = test['d']
            ytest_hat = trained_workflow.predict(x = test[['T','r','m']])
            rmse_test = np.sqrt(skmetrics.mean_squared_error(ytest_real, ytest_hat))
            r2_test = skmetrics.r2_score(ytest_real, ytest_hat)

            # NOTE(review): the y* cells hold whole arrays/Series wrapped in a
            # list; when exported to CSV they are written as their string
            # representation - confirm downstream readers expect that.
            single_dict = {'kfold': [kfold],
                           'exp': [expr_set],
                           'feats': [FEATENG_STEPS],
                           'eqn': [eqn],
                           'rmse_test': [rmse_test],
                           'r2_test': [r2_test],
                           'rmse_train': [rmse],
                           'r2_train': [r2],
                           'ytrain_real': [[y_real]],
                           'ytrain_hat': [[y_hat]],
                           'ytest_real': [[ytest_real]],
                           'ytest_hat': [[ytest_hat]]}

            fold_frames.append(pd.DataFrame.from_dict(single_dict))

# One concat at the end avoids re-copying the growing frame on every iteration
# and avoids concatenating with an empty DataFrame. Every row keeps index 0,
# as before. Fall back to an empty frame if nothing was produced.
df = pd.concat(fold_frames, axis=0) if fold_frames else pd.DataFrame()

In [None]:
# SAVE AND SEE RESULTS
#=========================================
# Write the collected cross-validation rows to a CSV file in the working directory.
export_filename = 'SR_Kfold_BulkModulus_Results.csv'
df.to_csv(path_or_buf=export_filename)