In [1]:
import pandas as pd 
import numpy as np
from dk_model import DeepKrigingTrainer

In [2]:
deposit_data = pd.read_csv("Data_Parker/filtered_deposit_data.csv", low_memory=False)
deposit_data

Unnamed: 0,X.1,HOLEID,SAMPFROM_1m,SAMPTO_1m,SAMPLETYPE,Au_ppm,Au_ppm.1,LITH_INTERP,LITH_LOGGED,Ag_ppm_BESTEL,...,V_ppm_BESTEL,W_ppm_BESTEL,Y_ppm_BESTEL,Yb_ppm_BESTEL,Zn_ppm_BESTEL,Zr_ppm_BESTEL,X,Y,Z,Log_Au_ppm
0,56662,BD-022,0,1,RC,0.000354,0.000354,,DSOup_Ov,0.77,...,74.0,1.9,7.5,,50.0,32.6,0.574296,0.636578,0.585957,-3.352407
1,56663,BD-022,1,2,RC,0.000354,0.000354,,DSOup_Ov,0.77,...,74.0,1.9,7.5,,50.0,32.6,0.574297,0.636575,0.585785,-3.352407
2,56664,BD-022,2,3,RC,0.000354,0.000354,Ovi,DSOup_Ov,0.77,...,74.0,1.9,7.5,,50.0,32.6,0.574298,0.636572,0.585614,-3.352407
3,56665,BD-022,3,4,RC,0.000354,0.000354,Ovi,DSOup_Ov,0.77,...,74.0,1.9,7.5,,50.0,32.6,0.574300,0.636568,0.585442,-3.352407
4,56666,BD-022,4,5,RC,0.000354,0.000354,Ovi,DSOup_Ov,0.77,...,74.0,1.9,7.5,,50.0,32.6,0.574301,0.636565,0.585271,-3.352407
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
184565,1059804,S4-280-2,97,98,CORE,0.009111,0.009111,Drc,Bx Generic,,...,,,,,,,0.357193,0.651190,0.398508,-0.160169
184566,1059805,S4-280-2,98,99,CORE,0.009111,0.009111,Drc,Bx Generic,,...,,,,,,,0.357025,0.651220,0.398490,-0.160169
184567,1059806,S4-280-2,99,100,CORE,0.009111,0.009111,Drc,Bx Generic,,...,,,,,,,0.356857,0.651250,0.398472,-0.160169
184568,1059807,S4-280-2,100,101,CORE,0.009111,0.009111,Drc,Bx Generic,,...,,,,,,,0.356689,0.651279,0.398454,-0.160169


In [3]:
deposit_data.values[:, 83]

array([0.5742957769134003, 0.574297069962726, 0.5742983447827412, ...,
       0.35685663002418, 0.3566886149461491, 0.3565205985593133],
      dtype=object)

In [None]:
N = len(deposit_data)

lon = deposit_data.values[:, 83]
lat = deposit_data.values[:, 84]
az = deposit_data.values[:, 85]
'''
num_basis_3_lvl = [10**3, 19**3, 37**3]
num_basis_2_lvl = [10**3, 19**3]
num_basis_1_lvl = [10**3]
'''

num_basis_3_lvl = [5**3, 10**3, 18**3]
num_basis_2_lvl = [5**3, 10**3]
num_basis_1_lvl = [5**3]

num_basis_list = [num_basis_3_lvl, num_basis_2_lvl, num_basis_1_lvl]

phi_arrays = []  

# For each grid
for grid in num_basis_list:
    knots_1dx = [np.linspace(0, 1, int(i**(1/3)) + 1) for i in grid]
    knots_1dy = [np.linspace(0, 1, int(i**(1/3)) + 1) for i in grid]
    knots_1dz = [np.linspace(0, 1, int(i**(1/3)) + 1) for i in grid]
    basis_size = 0
    phis = np.zeros((N, sum(grid)))
    
    # For each level of resolution
    for res in range(len(grid)):
        theta = 1 / (grid[res]**(1/3)) * 2.5
        knots_x, knots_y, knots_z = np.meshgrid(knots_1dx[res], knots_1dy[res], knots_1dz[res])
        knots = np.column_stack((knots_x.flatten(), knots_y.flatten(), knots_z.flatten()))
        
        # For each node in the grid
        for i in range(grid[res]):
            d = np.linalg.norm(np.vstack((lon, lat, az)).astype(float).T - knots[i, :], axis=1) / theta
            
            # For each distance of our data to the node i, calculate Wendland kernel
            for j in range(len(d)):
                if 0 <= d[j] <= 1:
                    phis[j, i + basis_size] = (1 - d[j])**6 * (35 * d[j]**2 + 18 * d[j] + 3) / 3
                else:
                    phis[j, i + basis_size] = 0
        
        basis_size += grid[res]
    
    phi_arrays.append(phis)  # Store the phi array for this grid level

# Unpack phi arrays into individual variables
phi_1_lvl, phi_2_lvl, phi_3_lvl = phi_arrays


KeyboardInterrupt: 

In [7]:
phis = [phi_1_lvl, phi_2_lvl, phi_3_lvl]
phi_reduces = {}
dfs = []


phi_columns = deposit_data.columns[10:].tolist()

# Display the list of column names
print(phi_columns[:10])

#total_columns = ['CP_Total', 'PO_Total', 'PY_Total']
total_columns = ['As_ppm_BESTEL', 'Hg_ppm_BESTEL']

# All covariates
#covariates = total_columns[:3] + ['RQD_Pct', 'Cr_ppm'] 
covariates = total_columns[-2:]


deposit_data = deposit_data.dropna(subset=['Au_ppm'] + covariates + phi_columns)

for idx, phi in enumerate(phis, start=1):
    idx_zero = np.array([], dtype=int)
    for i in range(phi.shape[1]):
        if np.sum(phi[:, i] != 0) == 0:
            idx_zero = np.append(idx_zero, int(i))

    phi_reduce = np.delete(phi, idx_zero, 1)
    phi_reduces[f"phi_{idx}_lvl_reduce"] = phi_reduce
    
    len_phi_regular = phi.shape[1]
    df_phi_regular = pd.DataFrame(phi, columns=[f'phi_{i}' for i in range(len_phi_regular)])
    dfs.append(df_phi_regular)
    
    len_phi_reduce = phi_reduce.shape[1]
    df_phi_reduce = pd.DataFrame(phi_reduce, columns=[f'phi_{i}' for i in range(len_phi_reduce)])
    dfs.append(df_phi_reduce)
    


['Al_pct_BESTEL', 'As_ppm_BESTEL', 'B_ppm_BESTEL', 'Ba_ppm_BESTEL', 'Be_ppm_BESTEL', 'Bi_ppm_BESTEL', 'Br_ppm_BESTEL', 'C_pct_BESTEL', 'Ca_pct_BESTEL', 'Cd_ppm_BESTEL']


In [None]:
deposit_data_list = []
for df in dfs:
    df_reset = df.reset_index(drop=True)
    deposit_data_reset = deposit_data.reset_index(drop=True)

    # Concatenate along columns
    deposit_data_basis = pd.concat([deposit_data_reset, df], axis=1)
    phi_columns = deposit_data_basis.columns[10:].tolist()
    #total_columns = ['CP_Total','PO_Total', 'PY_Total']
    total_columns = ['As_ppm_BESTEL', 'Hg_ppm_BESTEL']
    #covariates = total_columns[:3] + ['RQD_Pct', 'Cr_ppm'] 
    covariates = total_columns[-2:]
    deposit_data_basis = deposit_data_basis.dropna(subset=['Au_ppm'] + covariates + phi_columns)

    deposit_data_list.append(deposit_data_basis)

## Comparison varying the levels of the basis function generating grid

In [7]:
dfs_names = ['3 levels', '3 levels no 0s', '2 levels', '2 levels no 0s', '1 level', '1 level no 0s']
for df, df_name in zip(deposit_data_list[1:], dfs_names[1:]):
    print(f"\nMetrics for df with {len(df.columns)} columns (grid with {df_name})")
    trainer = DeepKrigingTrainer(df, regular_nn=False, plot_errors=False)
    trainer.train_neural_network()



Metrics for df with 6413 columns (grid with 3 levels no 0s)

Average Metrics Across Folds:
  Average MSE: 0.0020
  Average MAE: 0.0283
  Average Adjusted R2: 1.0058
  Average R2: 0.8643

Metrics for df with 7869 columns (grid with 2 levels)

Average Metrics Across Folds:
  Average MSE: 0.0024
  Average MAE: 0.0306
  Average Adjusted R2: 1.0054
  Average R2: 0.8428

Metrics for df with 2070 columns (grid with 2 levels no 0s)

Average Metrics Across Folds:
  Average MSE: 0.0040
  Average MAE: 0.0306
  Average Adjusted R2: 1.0409
  Average R2: 0.7181

Metrics for df with 1010 columns (grid with 1 level)

Average Metrics Across Folds:
  Average MSE: 0.0024
  Average MAE: 0.0335
  Average Adjusted R2: 1.0563
  Average R2: 0.8403

Metrics for df with 498 columns (grid with 1 level no 0s)

Average Metrics Across Folds:
  Average MSE: 0.0021
  Average MAE: 0.0324
  Average Adjusted R2: 1.1574
  Average R2: 0.8629


In [8]:
#Choose the last one
deposit_data_list[-1].to_csv('Data/final_dataset_1_no_0.csv', index=False)