In [5]:
import pandas as pd 
import numpy as np
from dk_model import DeepKrigingTrainer

In [6]:
# Load the deposit table. low_memory=False makes pandas parse the file in a
# single pass rather than in internal chunks, so column dtypes are inferred
# from the whole column.
deposit_data = pd.read_csv(
    "Data/filtered_deposit_data.csv",
    low_memory=False,
)
deposit_data

Unnamed: 0,Name,X,Y,Z,Density_gcm3,RQD_Pct,Cr_ppm,CP_Total,PO_Total,PY_Total
0,KV-NME001,0.437814,0.509816,0.461455,0.400922,0.8800,0.127305,0.250,0.066667,0.0
1,KV-NME001,0.438061,0.509789,0.460591,0.410138,0.8800,0.160479,0.250,0.066667,0.0
2,KV-NME001,0.448174,0.508800,0.426068,0.442396,0.9900,0.128743,0.250,0.066667,0.0
3,KV-NME001,0.448431,0.508777,0.425204,0.442396,0.9900,0.141317,0.375,0.133333,0.0
4,KV-NME001,0.448683,0.508755,0.424340,0.442396,0.9900,0.153293,0.500,0.200000,0.0
...,...,...,...,...,...,...,...,...,...,...
2613,KV365,0.629186,0.001755,0.432766,0.543779,0.9235,0.078443,0.050,0.200000,0.0
2614,KV365,0.629096,0.001314,0.431988,0.539171,0.9235,0.074850,0.050,0.200000,0.0
2615,KV365,0.629011,0.000877,0.431205,0.543779,0.8584,0.076647,0.050,0.200000,0.0
2616,KV365,0.628921,0.000436,0.430427,0.525346,0.8584,0.077246,0.050,0.200000,0.0


In [7]:
N = len(deposit_data)

# Sample coordinates as one (N, 3) float matrix, selected by column name so the
# result does not depend on column order. Built once here instead of being
# re-stacked and re-cast for every knot.
coords = deposit_data[['X', 'Y', 'Z']].to_numpy(dtype=float)
lon, lat, az = coords[:, 0], coords[:, 1], coords[:, 2]

# Number of basis functions per resolution level; each entry is a perfect cube
# (knots per axis ** 3) because the knots form a regular 3-D grid.
num_basis_3_lvl = [10**3, 19**3, 37**3]
num_basis_2_lvl = [10**3, 19**3]
num_basis_1_lvl = [10**3]

# NOTE: ordered from the 3-level configuration down to the 1-level one.
num_basis_list = [num_basis_3_lvl, num_basis_2_lvl, num_basis_1_lvl]


def wendland_kernel(scaled_dist):
    """Evaluate the Wendland kernel used for the basis functions.

    Parameters
    ----------
    scaled_dist : np.ndarray
        1-D array of non-negative distances already divided by the bandwidth.

    Returns
    -------
    np.ndarray
        ``(1 - d)**6 * (35 d**2 + 18 d + 3) / 3`` where ``d <= 1`` and 0
        elsewhere (NaN distances also map to 0, as they fail the comparison).
    """
    values = np.zeros_like(scaled_dist, dtype=float)
    inside = scaled_dist <= 1
    r = scaled_dist[inside]
    values[inside] = (1 - r)**6 * (35 * r**2 + 18 * r + 3) / 3
    return values


phi_arrays = []

# For each grid configuration
for grid in num_basis_list:
    # Knots per axis for every level. round() instead of int(...) + 1: the
    # float cube root of a perfect cube may land just below OR exactly on the
    # integer, and truncating-then-adding-one is only right in the first case.
    knots_per_axis = [round(n_basis ** (1 / 3)) for n_basis in grid]
    for n_basis, k in zip(grid, knots_per_axis):
        if k**3 != n_basis:
            raise ValueError(
                f"Basis size {n_basis} is not a perfect cube; "
                "cannot build a regular 3-D knot grid."
            )
    knots_1d = [np.linspace(0, 1, k) for k in knots_per_axis]

    basis_size = 0
    phis = np.zeros((N, sum(grid)))

    # For each level of resolution
    for res in range(len(grid)):
        # Kernel bandwidth: 2.5 divided by the number of knots per axis.
        theta = 1 / (grid[res]**(1/3)) * 2.5
        knots_x, knots_y, knots_z = np.meshgrid(knots_1d[res], knots_1d[res], knots_1d[res])
        knots = np.column_stack((knots_x.flatten(), knots_y.flatten(), knots_z.flatten()))

        # For each node in the grid: scaled distance from every sample to the
        # node, then the kernel for all samples at once.
        for i in range(grid[res]):
            d = np.linalg.norm(coords - knots[i, :], axis=1) / theta
            phis[:, i + basis_size] = wendland_kernel(d)

        basis_size += grid[res]

    phi_arrays.append(phis)  # Store the phi array for this grid configuration

# Unpack phi arrays into individual variables.
# NOTE(review): phi_arrays follows num_basis_list, i.e. [3-level, 2-level,
# 1-level], so phi_1_lvl actually holds the 3-level basis and phi_3_lvl the
# 1-level basis. The binding is left unchanged because later cells rely on
# this positional order for their labels; consider renaming everywhere at once.
phi_1_lvl, phi_2_lvl, phi_3_lvl = phi_arrays


In [8]:
phis = [phi_1_lvl, phi_2_lvl, phi_3_lvl]
phi_reduces = {}
dfs = []

# Columns past the first ten of deposit_data (basis columns, if any were
# already attached to the frame).
phi_columns = deposit_data.columns[10:].tolist()

# Display the list of column names
print(phi_columns[:10])

total_columns = ['CP_Total', 'PO_Total', 'PY_Total']

# All covariates
covariates = total_columns[:3] + ['RQD_Pct', 'Cr_ppm']

# Rows that have every required value. The same mask is applied to the basis
# arrays below: they were computed one row per sample, so filtering only the
# DataFrame would shift every sample after a dropped row onto the wrong basis
# row when the two are later concatenated side by side.
required_columns = ['Density_gcm3'] + covariates + phi_columns
rows_complete = deposit_data[required_columns].notna().all(axis=1).to_numpy()

# Fail loudly (before touching deposit_data) if the arrays and the frame no
# longer describe the same samples, e.g. this cell was re-run after rows were
# already dropped without recomputing the basis functions.
for phi in phis:
    if phi.shape[0] != rows_complete.size:
        raise ValueError(
            f"Basis array has {phi.shape[0]} rows but deposit_data has "
            f"{rows_complete.size}; recompute the basis functions first."
        )

# Drop rows with NaN values in the required columns
deposit_data = deposit_data.loc[rows_complete]

# NOTE(review): the phi_*_lvl names are bound in the order
# [3-level, 2-level, 1-level] upstream, so the key "phi_1_lvl_reduce" holds the
# reduced 3-level basis and "phi_3_lvl_reduce" the reduced 1-level basis. Keys
# are kept as they were for compatibility with any code that reads them.
for idx, phi in enumerate(phis, start=1):
    # Keep only the rows that survived the NaN filter (skip the copy when
    # nothing was dropped).
    if not rows_complete.all():
        phi = phi[rows_complete]

    # Basis functions that are zero for every sample carry no information.
    has_support = np.count_nonzero(phi, axis=0) > 0
    phi_reduce = phi[:, has_support]
    phi_reduces[f"phi_{idx}_lvl_reduce"] = phi_reduce

    # Convert phi to DataFrame
    len_phi_regular = phi.shape[1]
    df_phi_regular = pd.DataFrame(phi, columns=[f'phi_{i}' for i in range(len_phi_regular)])
    dfs.append(df_phi_regular)

    # Convert phi_reduce to DataFrame (columns are renumbered from 0)
    len_phi_reduce = phi_reduce.shape[1]
    df_phi_reduce = pd.DataFrame(phi_reduce, columns=[f'phi_{i}' for i in range(len_phi_reduce)])
    dfs.append(df_phi_reduce)
    


[]


In [9]:
deposit_data_list = []

total_columns = ['CP_Total', 'PO_Total', 'PY_Total']
covariates = total_columns[:3] + ['RQD_Pct', 'Cr_ppm']

# Positional (0..n-1) index so rows pair up with the basis frames by position;
# computed once because it does not depend on the loop variable.
deposit_data_reset = deposit_data.reset_index(drop=True)

for df in dfs:
    # A side-by-side concat only makes sense if both frames describe the same
    # samples; a length mismatch would silently pair samples with the wrong
    # basis rows, so stop instead.
    if len(df) != len(deposit_data_reset):
        raise ValueError(
            f"Basis frame has {len(df)} rows but deposit_data has "
            f"{len(deposit_data_reset)}; they must be row-aligned."
        )

    df_reset = df.reset_index(drop=True)

    # Concatenate along columns (using the re-indexed basis frame)
    deposit_data_basis = pd.concat([deposit_data_reset, df_reset], axis=1)

    # The basis columns are exactly the columns contributed by df_reset.
    phi_columns = df_reset.columns.tolist()
    deposit_data_basis = deposit_data_basis.dropna(subset=['Density_gcm3'] + covariates + phi_columns)

    deposit_data_list.append(deposit_data_basis)

## Comparison across the number of resolution levels in the basis-function grid

In [10]:
# Human-readable label for each entry of deposit_data_list, in the same order.
dfs_names = [
    '3 levels', '3 levels no 0s',
    '2 levels', '2 levels no 0s',
    '1 level', '1 level no 0s',
]

# Pair every dataset with its label, then skip the first pair (the full
# 3-level dataset is not trained here).
labelled_datasets = list(zip(deposit_data_list, dfs_names))[1:]

# NOTE(review): the recorded outputs report "Average Adjusted R2" values above
# 1; presumably the adjustment in dk_model breaks down when the number of
# feature columns approaches or exceeds the number of samples — confirm there.
for df, df_name in labelled_datasets:
    print(f"\nMetrics for df with {len(df.columns)} columns (grid with {df_name})")
    trainer = DeepKrigingTrainer(df, regular_nn=False, plot_errors=False)
    trainer.train_neural_network()



Metrics for df with 6413 columns (grid with 3 levels no 0s)

Average Metrics Across Folds:
  Average MSE: 0.0029
  Average MAE: 0.0390
  Average Adjusted R2: 1.0082

Metrics for df with 7869 columns (grid with 2 levels)

Average Metrics Across Folds:
  Average MSE: 0.0031
  Average MAE: 0.0405
  Average Adjusted R2: 1.0072

Metrics for df with 2070 columns (grid with 2 levels no 0s)

Average Metrics Across Folds:
  Average MSE: 0.0028
  Average MAE: 0.0394
  Average Adjusted R2: 1.0272

Metrics for df with 1010 columns (grid with 1 level)

Average Metrics Across Folds:
  Average MSE: 0.0030
  Average MAE: 0.0397
  Average Adjusted R2: 1.0703

Metrics for df with 498 columns (grid with 1 level no 0s)

Average Metrics Across Folds:
  Average MSE: 0.0029
  Average MAE: 0.0389
  Average Adjusted R2: 1.2157


In [14]:
# Keep the last dataset in deposit_data_list and write it to disk without the
# index column.
final_dataset = deposit_data_list[-1]
final_dataset.to_csv('Data/final_dataset_1_no_0.csv', index=False)