In [None]:
'''
#UMAP_and_plotting_v6.ipynb

Script to use UMAP dimensionality reduction, save models, and plot merged Carrasco-Godoy et al. (2024) dataset.
The user can trial binary classification with supervised learning, and edit the color map of the plots using continuous or 
categorical variables (static and plotly) before saving in a trial folder.

#Created: 20-Aug-24, Marco Acevedo
#Updated: 12-Sep-24, 17-Sep-24, 3-Oct-24, 20-Oct-25

#If having installation issues, please, try installing:
openpyxl module
Mime type rendering requires nbformat>=4.2.0

Documentation:
https://saturncloud.io/blog/jupyter-notebook-reload-module-a-comprehensive-guide/
'''

# Dependencies

In [1]:
%load_ext autoreload
%autoreload 2

import os
import numpy as np
import pandas as pd

#relative path (in jupyter notebook, restart kernel after modifying functions)
from main_functions import make_dir, load_dataset_type0
from main_functions import geochemistry_calculation1, impute_REE, anenburg_lambdas, aitchison_CLR, pca_calculation
from main_functions import load_umap, umap_calculation


'''
#Created: 27-Oct-25, Marco Acevedo
#Updated: 

Run UMAP with different parameters and generate a database for an interactive UMAP plot.

'''

'\n#Created: 27-Oct-25, Marco Acevedo\n#Updated: \n\nRun UMAP with different parameters and generate a database for an interactive UMAP plot.\n\n'

# Loading and filtering data

In [None]:
#User input
#All values in this section are meant to be edited by the user before running the notebook.

#--- Input data ---
#Folder holding the raw dataset and name of the CSV file inside it
data_folder0 = r"E:\Feb-March_2024_zircon imaging\05_zircon geochemical data\carrasco-godoy data\Zircon_Fertility_Data-main"
file1 = "CG2024data_v5.csv" #input data
end_idx_categorical = 19 #last raw input categorical column
end_idx_numerical = 64 #last raw input numerical column
grid_name = 'Oct29' #suffix of the output folder created for this grid search

#--- UMAP input variables and pre-processing choices ---
#Column names passed to umap_calculation() as the variables to embed
umap_variables = ['P', 'Ce', 'Eu', 'Th', 'Hf', 'ppmCalc_La', 'ppmCalc_Pr', 'Imputed_Y', 'Imputed_Nd', 'Imputed_Gd', 'Imputed_Er', 'Imputed_Yb', 'Imputed_Sm', 'Imputed_Dy', 'Imputed_Lu']
#NOTE(review): the three choices below are not read by any cell visible in this notebook - confirm where they are consumed
clr_choice = 'yes' #fertility: no; geochemistry: yes
standardise_choice = 'no' #fertility: yes; geochemistry: no
outliers_remove = 'no' #allows meaningful plots

#--- UMAP hyperparameters ---
#NOTE: neighbors_input, min_dist_input and metric_input are single values here, but the
#grid-search cells further down reassign them to arrays/lists of values to sweep.
model_folder = r"" #input folder
neighbors_input = 20 #default=15, preservation of local (> singletons) vs global structure
min_dist_input = 0.3 #0.003, min. dist. of packing value (in low dimensional representation)
metric_input = 'correlation'
components_output = 2 #default=2, dimensionality
#An empty string is labelled "Non-parametric unsupervised" by the setup code below; any other value "Non-parametric supervised"
variable_legend = "" #default= "" non-parametric umap; else, any categorical: "ml_classes"

#--- Colour map options ---
type_cmap = "continuous" #categorical, continuous
variable_categorical = "ml_classes"  #"ml_classes"
variable_continuous = "P" 
percentile = 1 #capping data
n_bins = 10 #percentile bins
chosen_palette = "turbo" #"icefire", "viridis", "cubehelix", "rainbow"

#--- Binary classification / interactive plot options ---
add_wiggle = "yes" #or "no"; fertile=blue / barren= red background
variable_legend_ML = "ml_classes" #for binary classification: ml_classes
type_ml_model = 'svc_linear' #'rt', 'rf', 'knn', 'ada_boost', 'svc_linear', 'svc_poly', 'svc_rbf', 'neural_network'
variable_interactive = "Deposit_Batholith" #for interactive plot

#Custom UMAP plots
markerSize = 4
type_background = 'solid' #geochemistry: solid, fertility: gradient, natwani: contour



#Script begins (do not edit)
#Derives labels and file paths from the user input, creates the grid-search
#output folder and loads the raw table.

#Human-readable label of the UMAP flavour: a non-empty `variable_legend`
#selects the "supervised" label, an empty string the "unsupervised" one.
if variable_legend:
    umap_type = "Non-parametric supervised"
else:
    umap_type = "Non-parametric unsupervised"

data_start_idx = end_idx_categorical + 4 #for interactive plot

#Helper R script is expected next to this notebook
current_directory = os.getcwd() #main script location
r_script_path = os.path.join(current_directory, "script_Carrasco.R")

#Destination file names (each one defined exactly once)
file3 = "workable_table.xlsx" #for reproducibility
file4 = "standard_scaler.sav" #saved per trial folder
file5 = "umap_model.sav" #saved per trial folder
file_grid = 'grid_book.csv' #reference book: one row per parameter combination
file_embeddings = 'grid_embeddings.csv' #all embeddings side by side

#Output folder is created as a sibling of the input data folder.
#NOTE: the 'seach' spelling is kept on purpose so folders written by earlier runs are still found.
data_folder1b = os.path.join(os.path.dirname(data_folder0), f'UMAP_grid_seach_{grid_name}')
data_folder1 = data_folder1b #alias used by the helper functions below
make_dir(data_folder1b)

#Full paths
filepath1 = os.path.join(data_folder0, file1) #input CSV
file_name1 = os.path.join(data_folder1b, file_grid)
file_name2 = os.path.join(data_folder1b, file_embeddings)

#Load data
table2 = load_dataset_type0(filepath1, end_idx_categorical, end_idx_numerical)

Geochemical calculations

In [3]:
#Step 1: impute rare-earth elements. The helper receives the path of the R script
#(presumably executing it - the cell output shows R messages) and the output folder.
output_Carrasco = impute_REE(table2, r_script_path, data_folder1) #time= 1m 20sec

#Step 2: append the imputation output column-wise to the raw table and use the
#combined table as input for the lambda calculation.
table_input_lambdas = pd.concat([table2, output_Carrasco], axis=1) #plus Ce, Eu
output_Anenburg = anenburg_lambdas(table_input_lambdas, data_folder1) #time= 0.8 sec

#Step 3: further geochemical calculations on the same combined table
output_calculations1 = geochemistry_calculation1(table_input_lambdas)

R Script Output:

R Script Error (if any):

Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

In model_REE(., prefix = NULL, suffix = NULL, method = model_chosen,  :
  There are 5689 Samples with less than 3 or less elements to model, consider filtering that data, or including more elements



In [None]:
#Prototype of the parameter intervals that are going to be used (copy-paste below):

step1 = 2.5 #for n_samples ~18
step2 = 0.02

#n_neighbors: 5 to 45 in steps of 2.5, truncated to whole numbers
neighbors_input = np.arange(5, 45 + step1, step1).astype(int)

#min_dist: regular grid starting at 0.003, preceded by two extra small values (add for variation)
min_dist_input0 = np.arange(0.003, 0.3 + step2, step2)
min_dist_input = np.concatenate(([0.001, 0.002], min_dist_input0))

#distance metrics to sweep
metric_input = ['correlation', 'euclidean']

#Preview the grid axes and their sizes
print(f"n_neighbors to use {neighbors_input} that total {len(neighbors_input)} ")
print(f"min_dist to use {min_dist_input} that total {len(min_dist_input)} ")

n_neighbors to use [ 5  7 10 12 15 17 20 22 25 27 30 32 35 37 40 42 45] that total 17 
min_dist to use [0.001 0.002 0.003 0.023 0.043 0.063 0.083 0.103 0.123 0.143 0.163 0.183
 0.203 0.223 0.243 0.263 0.283 0.303] that total 18 


Generate UMAP parameter grid search

In [7]:
#Grid search: fit one UMAP model per (metric, n_neighbors, min_dist) combination,
#store each model in its own trial folder, and write two CSV files:
#  - file_name1: reference book (parameters + trial folder of every combination)
#  - file_name2: all embeddings side by side (columns prefixed with the trial name)
#The per-call comment below reports ~25 sec per fit; the previewed grid has
#2 metrics x 17 n_neighbors x 18 min_dist = 612 fits, so expect a multi-hour run.

#Parameter axes
step1 = 2.5 #for n_samples ~18
step2 = 0.02
neighbors_input = np.arange(5, 45 + step1, step=step1).astype(int)
min_dist_input0 = np.arange(0.003, 0.3 + step2, step=step2)
min_dist_input = np.insert(min_dist_input0, 0, np.array([0.001, 0.002])) #add for variation
metric_input = ['correlation', 'euclidean']

#Alternative:
# n_samples = 15 #default= 4
# neighbors_input, step1 = np.linspace(5, 55, n_samples, endpoint= True, retstep=True, dtype=int) 
# min_dist_input, step2 = np.linspace(0.02, 1, n_samples, endpoint= True, retstep=True, dtype=float) 

#Suffixes of the embedding columns: umap0, umap1, ...
components_str = [f"umap{x}" for x in range(components_output)]

#Define inputs
table_input_umap = pd.concat([table2, output_Carrasco, output_Anenburg, output_calculations1], axis=1) #UMAP_zirconfertility_testing_PCA.ipynb
table_input_pca = table_input_umap

#Flatten the grid, keeping the original iteration order
#(metric outermost, then n_neighbors, min_dist innermost) so folder numbers do not change.
param_grid = [
    (metric, n_neighbors, min_dist)
    for metric in metric_input
    for n_neighbors in neighbors_input
    for min_dist in min_dist_input
]

out = [] #reference book rows
embedding_tables = [] #one embedding table per trial
m = 0 #running trial number (stays 0 if the grid is empty)
for m, (metric, n_neighbors, min_dist) in enumerate(param_grid, start=1):
    trial_name = f"folder_{m:03d}"

    #Destination
    data_folder2 = os.path.join(data_folder1b, trial_name)
    make_dir(data_folder2)
    filepath4 = os.path.join(data_folder2, file4)
    filepath5 = os.path.join(data_folder2, file5)

    #UMAP
    output_UMAP = umap_calculation(table_input_umap, umap_variables, variable_legend,
                                   n_neighbors, min_dist, metric, components_output,
                                   filepath4, filepath5, data_folder1) #time= 25 sec

    #NOTE(review): the reloaded objects are not used in this cell; the call is kept in case
    #load_umap has side effects or serves as a reload check - confirm, as it runs once per trial.
    input_umap, sc, umap_model = load_umap(filepath4, filepath5, data_folder1)

    #writing reference book
    out.append([metric, n_neighbors, min_dist, data_folder2])

    #writing embeddings (prefix columns with the trial name so they stay unique after concat)
    output_UMAP.columns = [f"{trial_name}_{suffix}" for suffix in components_str] #manual
    embedding_tables.append(output_UMAP)

#Reference book
#Built directly from the row list: going through np.array() would coerce the mixed
#str/int/float rows to strings and require casting the numbers back afterwards.
file_table = pd.DataFrame(
    out, columns=['metric_input', 'neighbors_input', 'min_dist_input', 'folderpath']
)
#Both numeric columns are stored as float, as in previously written grid books
file_table = file_table.astype({'neighbors_input': float, 'min_dist_input': float})

file_table.to_csv(file_name1, sep=',', encoding='utf-8', index=False, header=True)

#Embeddings table
embedding_tables2 = pd.concat(embedding_tables, axis=1)
embedding_tables2.to_csv(file_name2, sep=',', encoding='utf-8', index=False, header=True)