In [1]:
%load_ext autoreload
%autoreload 2

import os
import collections
import numpy as np
import pandas as pd

#relative path (in jupyter notebook, restart kernel after modifying functions)
from main_functions import make_dir, load_dataset_type0
from main_functions import geochemistry_calculation1, impute_REE, anenburg_lambdas, aitchison_CLR, pca_calculation
from main_functions import load_umap, umap_calculation, umap_apply, train_binary_classifier, load_wiggle, find_outliers_iqr, plot_ROC
from main_functions import define_categorical_cmap_custom, define_continuous_cmap
from main_functions import plot_umap_variable, plot_umap_variable_wiggle, plot_umap_variable_interactive 
from main_functions import plot_correlation_circle, pca_plots, plot_pca_3d_rgb, plot_pca_biplots_rgb, plot_umap_biplot_rgb, pca_to_rgb
from main_functions_interactive import calculate_xy_limits, plot_umap_grid_interactive

import dash
from dash import dcc, html, Input, Output
import plotly.express as px 
import seaborn as sns

'''
#Created: 27-Oct-25, Marco Acevedo
#Updated: 

Open interactive UMAP plot in the web-browser playing with n_neighbors, min_dist, and metric. 
The variable for the legend can be customised.

Documentation
#https://dash.plotly.com/dash-core-components/slider

'''



'\n#Created: 27-Oct-25, Marco Acevedo\n#Updated: \n\nOpen interactive UMAP plot in the web-browser playing with n_neighbors, min_dist, and metric. \nThe variable for the legend can be customised.\n\nDocumentation\n#https://dash.plotly.com/dash-core-components/slider\n\n'

In [2]:
# =============================== User input ===============================
# Every value a reader may want to tune lives in this cell.

# --- Input data ---
data_folder0 = r"E:\Feb-March_2024_zircon imaging\05_zircon geochemical data\carrasco-godoy data\Zircon_Fertility_Data-main"
file1 = "CG2024data_v5.csv"  # input data
end_idx_categorical = 19  # last raw input categorical column
end_idx_numerical = 64  # last raw input numerical column
grid_name = 'Oct29'  # suffix of the grid-search output folder name

# --- Variables and pre-processing ---
umap_variables = [
    'P', 'Ce', 'Eu', 'Th', 'Hf',
    'ppmCalc_La', 'ppmCalc_Pr',
    'Imputed_Y', 'Imputed_Nd', 'Imputed_Gd', 'Imputed_Er', 'Imputed_Yb',
    'Imputed_Sm', 'Imputed_Dy', 'Imputed_Lu',
]
clr_choice = 'yes'  # fertility: no; geochemistry: yes
standardise_choice = 'no'  # fertility: yes; geochemistry: no
outliers_remove = 'no'  # allows meaningful plots

# --- UMAP settings ---
model_folder = r""  # input folder
neighbors_input = 20  # default=15, preservation of local (> singletons) vs global structure
min_dist_input = 0.3  # 0.003, min. dist. of packing value (in low dimensional representation)
metric_input = 'correlation'
components_output = 2  # default=2, dimensionality
# "" (empty) selects the unsupervised label further down; any categorical
# column name (e.g. "ml_classes") selects the supervised one.
variable_legend = ""

# --- Colour map ---
type_cmap = "categorical"  # categorical, continuous
variable_categorical = "Temporality"  # "ml_classes"
variable_continuous = "P"
percentile = 1  # capping data
n_bins = 10  # percentile bins
chosen_palette = "turbo"  # "icefire", "viridis", "cubehelix", "rainbow"

# --- Classifier background / interactive plot ---
add_wiggle = "yes"  # or "no"; fertile=blue / barren= red background
variable_legend_ML = "ml_classes"  # for binary classification: ml_classes
# options: 'rt', 'rf', 'knn', 'ada_boost', 'svc_linear', 'svc_poly', 'svc_rbf', 'neural_network'
type_ml_model = 'svc_linear'
variable_interactive = "Deposit_Batholith"  # for interactive plot

# --- Custom UMAP plots ---
markerSize = 4
type_background = 'solid'  # geochemistry: solid, fertility: gradient, natwani: contour



#Script begins (do not edit)

#Default file names, each defined exactly once.
#file4/file5 are not used by the cells visible in this notebook.
file4 = "standard_scaler.sav"
file5 = "umap_model.sav"
file_grid = 'grid_book.csv'
file_embeddings = 'grid_embeddings.csv'

#Label of the UMAP flavour: a non-empty variable_legend means supervised
if variable_legend:
    umap_type = "Non-parametric supervised"
else:
    umap_type = "Non-parametric unsupervised"

data_start_idx = end_idx_categorical + 4 #for interactive plot

#The R script is looked up in the current working directory
#(assumed to be the folder holding this notebook)
current_directory = os.getcwd()
r_script_path = os.path.join(current_directory, "script_Carrasco.R")

#Destination: sibling of the input data folder.
#NOTE(review): 'seach' is a misspelling of 'search'; it is kept as-is because
#renaming it would point away from any folder/CSV files already written
#under this name.
data_folder1b = os.path.join(os.path.dirname(data_folder0), f'UMAP_grid_seach_{grid_name}')
data_folder1 = data_folder1b #alias used by the calculation cells
make_dir(data_folder1b)

#Full paths of the input table and of the grid-search tables
filepath1 = os.path.join(data_folder0, file1)
file_name1 = os.path.join(data_folder1b, file_grid)
file_name2 = os.path.join(data_folder1b, file_embeddings)

#Load data
table2 = load_dataset_type0(filepath1, end_idx_categorical, end_idx_numerical)

Geochemical calculations

In [3]:
#Fail fast on a mistyped option: without this check an unknown value skipped
#both branches below and left output_PCA/loadings undefined, and only after
#the slow imputation step had already run.
if clr_choice not in ('yes', 'no'):
    raise ValueError(f"clr_choice must be 'yes' or 'no', got {clr_choice!r}")

#REE imputation through the external R script
output_Carrasco = impute_REE(table2, r_script_path, data_folder1) #time= 1m 20sec

table_input_lambdas = pd.concat([table2, output_Carrasco], axis=1) #plus Ce, Eu
output_Anenburg = anenburg_lambdas(table_input_lambdas, data_folder1) #time= 0.8 sec

output_calculations1 = geochemistry_calculation1(table_input_lambdas)

#Define inputs: raw table plus every derived table, side by side
table_input_umap = pd.concat([table2, output_Carrasco, output_Anenburg, output_calculations1], axis=1) #UMAP_zirconfertility_testing_PCA.ipynb
table_input_pca = table_input_umap #same object, not a copy

#Optional CLR followed by PCA, restricted to the chosen variables
table_input_pca2 = table_input_pca.loc[:, umap_variables]

if clr_choice == 'yes':
    output_CLR = aitchison_CLR(table_input_pca2, data_folder1)
    pca_input = output_CLR
else:
    #clr_choice == 'no': PCA runs on the untransformed variables
    pca_input = table_input_pca2

output_PCA, loadings, explained_variance_ratio = pca_calculation(pca_input, standardise_choice, data_folder1)

R Script Output:

R Script Error (if any):

Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

In model_REE(., prefix = NULL, suffix = NULL, method = model_chosen,  :
  There are 5689 Samples with less than 3 or less elements to model, consider filtering that data, or including more elements



Prepare interactive plot

In [None]:
table3 = table2 #alias: same DataFrame object as table2, not a copy
#variable_categorical is taken from the user-input cell; it is intentionally
#not re-assigned here so that editing the configuration actually takes effect.
markerSize_interactive = 3

# Load the grid-search outputs (embedding coordinates and parameter book)
table_coordinates = pd.read_csv(file_name2)
table_book = pd.read_csv(file_name1)

limits_array = calculate_xy_limits(table_book, table_coordinates)


# Get parameter ranges and category options
# NOTE: the three names below rebind the scalar settings of the user-input
# cell to lists of the distinct values found in the grid book; the app cell
# receives these lists.
var1 = 'neighbors_input'
var2 = 'min_dist_input'
var3 = 'metric_input'
neighbors_input = table_book[var1].unique().tolist()
min_dist_input = table_book[var2].unique().tolist()
metric_input = table_book[var3].unique().tolist()

#Option 1: Elise's selection
#NOTE(review): presumably these must be values present in the grid book - confirm
param1_sel = 20
param2_sel = 0.003
param3_sel = 'correlation'

# #Option 2: Automatic selection
# param1_sel = neighbors_input[len(neighbors_input) // 2]
# param2_sel = min_dist_input[len(min_dist_input) // 2]
# param3_sel = metric_input[len(metric_input) // 2]

#Plot colormap
if type_cmap == 'categorical':
    variable_legend1 = variable_categorical
    list_unique, classif, cmap_updated, classif_colours = define_categorical_cmap_custom(table3, variable_categorical)

elif type_cmap == 'continuous':
    variable_legend1 = variable_continuous
    list_unique, classif, cmap_updated, classif_colours = define_continuous_cmap(table3, variable_continuous, percentile, n_bins, chosen_palette)

else:
    #Previously an unknown value surfaced later as a NameError on cmap_updated
    raise ValueError(f"type_cmap must be 'categorical' or 'continuous', got {type_cmap!r}")

#converting triplets into string
palette = sns.color_palette(palette=cmap_updated) #built once instead of once per class
colours = [palette[x] for x in classif.unique()] #RGB triplets
colours0 = np.array(colours)*255 #scaled by 255
colours1 = colours0.round(0)
colours_str = colours1.astype(str) #one row of three strings per class
colour_map = [f'rgb({",".join(c)})' for c in colours_str]

#Optional: Interactive plot to find and understand points
table_coordinates['classif'] = classif
table_interactive = table3.iloc[:, 0:data_start_idx]
#Column-wise join; pandas aligns the rows on the index
table_interactive2 = pd.concat([table_coordinates, table_interactive], axis=1) #or: table3.iloc[:, 0:data_start_idx]

Interactive plot

In [34]:
#Build the app from the grid-search tables, the initial parameter selection,
#the available parameter values and the colour settings (arguments are
#positional, so their order must not change)
app = plot_umap_grid_interactive(
    table_book,
    table_interactive2,
    var1, var2, var3,
    param1_sel, param2_sel, param3_sel,
    neighbors_input, min_dist_input, metric_input,
    variable_legend1, list_unique, colour_map, classif_colours,
    limits_array,
    markerSize_interactive,
)

#Serve the app on local port 2225 with jupyter_mode="tab"
if __name__ == "__main__":
    app.run(debug=True, port=2225, jupyter_mode="tab")


Dash app running on http://127.0.0.1:2225/


<IPython.core.display.Javascript object>