In [195]:
%pip install matplotlib --quiet
%pip install Dash --quiet
%pip install dask --quiet
%pip install pyarrow

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: C:\Users\ntrip\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.13_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: C:\Users\ntrip\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.13_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: C:\Users\ntrip\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.13_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: C:\Users\ntrip\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.13_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [196]:
# Print the installed version of each package the notebook depends on, in
# requirements.txt format, for longevity of the notebook.
import importlib.metadata as md

pkgs = [
    "numpy", "pandas", "dask", "plotly", "dash",
    "pyarrow", "shapely", "matplotlib", "geopandas",
]

for p in pkgs:
    try:
        print(f"{p}=={md.version(p)}")
    except md.PackageNotFoundError:
        # Keep listing the remaining packages instead of aborting on the first gap
        print(f"# {p} is not installed")

numpy==2.3.4
pandas==2.3.3
dask==2025.12.0
plotly==6.4.0
dash==3.3.0
pyarrow==22.0.0
shapely==2.1.2
matplotlib==3.10.7


In [197]:
from urllib.parse import quote

import dask.dataframe as dd
import geopandas as gpd
import matplotlib.pyplot as plt
import matplotlib.transforms as transforms
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from matplotlib.patches import Ellipse
from plotly.colors import DEFAULT_PLOTLY_COLORS



## Loading tables from Google Drive
### The Google Sheet is expected to have three tabs:
* **srcs** tab: source sample geochemistry, with `Sample` and `Group` columns (follow the example provided)
* **srcs_locs** tab: `Name`, `Lat` (latitude) and `Long` (longitude) coordinates for each source
* **study** tab: geochemistry of the samples under study

In [205]:
def get_df(sheet_id, sheet_name):
    """Load one tab of a Google Sheet into a DataFrame.

    The tab is fetched through the ``gviz`` CSV export URL built from
    ``sheet_id`` and ``sheet_name`` and parsed with ``pd.read_csv``.

    Parameters
    ----------
    sheet_id : str
        Spreadsheet identifier (the part of the sheet URL after ``/d/``).
    sheet_name : str
        Name of the tab to load. It is percent-encoded before being placed in
        the query string, so names containing spaces or characters such as
        ``&`` or ``#`` still produce a valid URL.

    Returns
    -------
    pandas.DataFrame
        Whatever ``pd.read_csv`` parses from the exported CSV.
    """
    url = (
        f'https://docs.google.com/spreadsheets/d/{sheet_id}'
        f'/gviz/tq?tqx=out:csv&sheet={quote(str(sheet_name), safe="")}'
    )
    return pd.read_csv(url)

In [250]:
# Load data from the tabs of a Google Drive spreadsheet.
# SHEET_ID is the alphanumeric string that appears between "/d/" and the next
# slash in a standard Google Sheet URL. Change it (and the tab names below) to
# load a different spreadsheet.
SHEET_ID = '1R4PlMACBn0l8ZguwtYDlZLnbvORzH5CHhKGFBezjSvk'

srcs = get_df(SHEET_ID, 'KRA21_Sources')  # Source data from Rademaker et al 2021
srcs_locs = get_df(SHEET_ID, 'Source_Coords')
study = get_df(SHEET_ID, 'Samples')

### Dataset headers are cleaned up (if necessary) ###

In [252]:
# Remove Bruker column name artifacts ("Ka1", "La1") and spaces from the study headers
study.columns = study.columns.str.replace(r'(Ka1|La1| )', '', regex=True)

# Identifier columns kept as strings; every other column is treated as numeric
string_cols = ['Group', 'Sample']


def coerce_geochem_dtypes(df):
    """Return a copy of ``df`` with identifier columns as strings and the rest numeric.

    Columns listed in ``string_cols`` (when present) are cast to the pandas
    ``string`` dtype. All remaining columns go through ``pd.to_numeric`` with
    ``errors='coerce'``, so cells that cannot be parsed become NaN. Columns
    that end up entirely NaN (such as extra empty spreadsheet columns) are
    dropped. The input frame itself is not modified.
    """
    present_strings = [c for c in string_cols if c in df.columns]
    numeric_cols = df.columns.difference(present_strings)
    out = df.copy()
    out[present_strings] = out[present_strings].astype('string')
    out[numeric_cols] = out[numeric_cols].apply(pd.to_numeric, errors='coerce')
    return out.dropna(axis=1, how='all')


study = coerce_geochem_dtypes(study)
srcs = coerce_geochem_dtypes(srcs)

# How are the column headers?
print('Dataset Headers')
print('Study: ' + str(study.columns.tolist()))
print('Sources: ' + str(srcs.columns.tolist()))


Dataset Headers
Study: ['Sample', 'Group', 'Mn', 'Fe_percent', 'Zn', 'Ga', 'Rb', 'Sr', 'Y', 'Zr', 'Nb', 'Ba', 'Th']
Sources: ['Sample', 'Group', 'Fe', 'Th', 'Rb', 'Sr', 'Y', 'Zr', 'Nb', 'Northing', 'Easting', 'Elevation']


In [253]:
# Show the full cleaned study table (every row) so it can be checked by eye
print('Complete Study Sample Table')
display(study)

Complete Study Sample Table


Unnamed: 0,Sample,Group,Mn,Fe_percent,Zn,Ga,Rb,Sr,Y,Zr,Nb,Ba,Th
0,RGM-Solid1,Reference,196,1.3093,32,17,146,104,29,211,8,810,12
1,7150_1,Anillo,408,0.6317,44,22,159,142,17,118,16,905,19
2,7150_2,Anillo,405,0.6278,47,19,162,146,14,123,15,994,17
3,7150_4,Anillo,462,0.6681,49,17,159,138,18,111,12,1033,16
4,7151_3,Anillo,431,0.653,38,12,169,142,15,128,15,977,19
5,7151_1,Anillo,469,0.6506,41,12,168,147,20,119,15,831,17
6,7151_4,Anillo,575,0.7075,39,13,181,147,19,140,14,888,16
7,7151_2,Anillo,404,0.6645,42,20,179,142,18,138,13,940,25
8,7151_5,Anillo,393,0.5848,25,14,147,127,16,121,14,926,15
9,7152_1,Anillo,614,0.6484,37,18,161,130,16,119,18,792,22


## Use the Lasso tool to select obsidian sources from the region of interest

In [210]:
# Interactive map of the source locations. Lassoing points on the map stores the
# selected rows of `srcs_locs` in the module-level variable `selected_locs`.
from dash import Dash, dcc, html, Input, Output
from dash.dash_table import DataTable
import plotly.express as px

# One marker per row of srcs_locs, positioned by its Lat/Long columns
fig = px.scatter_map(
    srcs_locs,
    lat="Lat",
    lon="Long",
    hover_name="Name",
    zoom=3,
    map_style="open-street-map"
)
# Changes the size of points if they are selected
fig.update_traces(
    selected={"marker": {"size": 14}}
)
# Defaults your selection to lasso, and specifies margin
fig.update_layout(
    dragmode="lasso",
    clickmode="event+select",
    margin={"r": 0, "l": 0, "t" : 0, "b": 0})

# Create the Dash App
app = Dash(__name__)
app.layout = html.Div([
    # The map itself
    dcc.Graph(id="map", figure=fig),
    # Keeps track of selected variables
    dcc.Store(id="selection-store"),
    # A title for the text region below
    html.H3(
        id="count",
        children="Number of Selected Sources: 0",
        style={"textAlign": 'center', 'backgroundColor': 'white'}
    ),

    # A table for tracking selected variables
    DataTable(
        id="table",
        columns=[{"name": c, "id": c} for c in srcs_locs.columns],
        # Start with zero rows; the table is filled by the update_views callback
        data=srcs_locs.iloc[[]].to_dict("records"),
        style_header={
            "textAlign": "center",
            "fontWeight": "bold"
        },
        style_cell={
            "textAlign": "center"
        }
    )
])

# Use this to track selected sites outside of the cell
selected_locs = None

# This callback updates the selection store when data is selected on the map
@app.callback(
    Output("selection-store", "data"),
    Input("map", "selectedData")
)
def stash_selection(selected):
    """Copy the map selection into the store and into ``selected_locs``.

    ``selected`` is the graph's ``selectedData`` payload. When it is empty or
    has no points, ``selected_locs`` is reset to None and an empty selection is
    returned. Otherwise each point's ``pointIndex`` is used as a positional row
    index into ``srcs_locs`` (assumes the trace's point order matches the row
    order of ``srcs_locs`` -- TODO confirm), the selected rows are stored in the
    module-level ``selected_locs``, and a dict with the ``indices`` and the
    matching ``names`` is returned for the ``dcc.Store``.
    """
    global selected_locs
    if not selected or not selected.get("points"):
        selected_locs = None
        return {"indices": [], "names": []}
    inds = [p["pointIndex"] for p in selected["points"]]
    names = srcs_locs["Name"].iloc[inds].tolist()
    selected_locs = srcs_locs.iloc[inds]
    return {"indices": inds, "names": names}

# This callback outputs to the text region once the data store is updated
@app.callback(
    Output("count", "children"),
    Output("table", "data"),
    Input("selection-store", "data"),
)
def update_views(sel):
    """Refresh the count heading and the table from the stored selection.

    ``sel`` is the dict written by ``stash_selection`` (or a falsy value when
    the store is empty). Returns the heading text and the selected rows of
    ``srcs_locs`` as a list of records.
    """
    inds = sel.get("indices", []) if sel else []
    sub = srcs_locs.iloc[inds]
    return f"Number of Selected Sources: {len(inds)}", sub.to_dict("records")

# Render the app inline in the notebook output
app.run(jupyter_mode="inline")

In [211]:
# Inspect the rows captured by the map selection (None when nothing is selected)
print(selected_locs)

     Name      Lat     Long
0    Alca -15.1580 -72.7360
1  Anillo -15.0558 -72.9812
2  Chivay -15.6400 -71.5400


In [212]:
# Select the rows of the Source Chemistry table that belong to the sources
# picked on the map above.
# NOTE(review): the plotting cells visible below build their figures from `srcs`,
# not from `matched`, so this selection is currently informational only.

if selected_locs is None:
    # Define `matched` anyway so a stale result from an earlier run cannot linger
    matched = srcs.iloc[0:0]
    print('No sources selected. Please return to the map above and select obsidian sources in the region of interest.')
else:
    # selected_locs is usually a DataFrame set by the Dash callback above
    if isinstance(selected_locs, pd.DataFrame):
        names = selected_locs['Name'].tolist()
    elif hasattr(selected_locs, 'get'):
        # In some contexts selected_locs may be a dict shaped like the dcc.Store payload
        names = selected_locs.get('names', [])
    else:
        names = []

    # Source names from the map are matched against the `Group` column of `srcs`
    matched = srcs[srcs['Group'].isin(names)]
    print(f'Number of selected Obsidian Sources: {len(names)}; matched rows in Sources Chemistry table: {len(matched)}')
    display(matched.head(5))


Number of selected Obsidian Sources: 3; matched rows in Sources Chemistry table: 258


Unnamed: 0,Sample,Group,ANID,Fe,Th,Rb,Sr,Y,Zr,Nb,Northing,Easting,Elevation
0,Alca-1,Alca,,5453,13,132,72,17,99,12,8325947.0,741049.0,3150.0
1,Alca-1,Alca,,5365,13,132,74,15,103,11,8326797.0,740828.0,3050.0
2,Alca-1,Alca,,5503,13,135,74,14,108,13,8327982.0,744167.0,4419.0
3,Alca-1,Alca,,5675,14,138,74,15,101,12,8325622.0,746169.0,4146.0
4,Alca-1,Alca,,5514,13,140,75,14,98,12,8327309.0,747901.0,4830.0


In [213]:
# Ellipses require at least 2 data points per group.
# Split Sources into two tables: 'srcs' (groups with >= 2 samples) and
# 'onesample' (groups represented by a single sample).
# NOTE: 'srcs' is overwritten here, so re-running this cell without reloading
# the data leaves 'onesample' empty.

# Count how many times each group appears, then attach that count to every row
counts = srcs['Group'].value_counts()
rows_in_group = srcs['Group'].map(counts)

# Sources with < 2 samples are kept aside; the rest replace 'srcs'
onesample = srcs[rows_in_group < 2]
srcs = srcs[rows_in_group >= 2]

print('Ellipses will be created for the following Source Groups:')
print(srcs['Group'].value_counts())

print('Points will be shown for these Sources with <2 samples:')

if onesample is None or onesample.empty:
    print('(All sources have sufficient samples for ellipses)')
else:
    print(onesample)


Ellipses will be created for the following Source Groups:
Group
Alca            248
Quispisisa       20
Chivay            6
Jampatilla        6
Lisahuacho        5
Potreropampa      5
Anillo            4
Name: count, dtype: Int64
Points will be shown for these Sources with <2 samples:
(All sources have sufficient samples for ellipses)


In [219]:
#@title
#@
def confidence_ellipse(x, y, n_std=1.96, size=100):   # Ellipses in Plotly
    """
    Get the covariance confidence ellipse of *x* and *y* as an SVG path.
    from https://gist.github.com/dpfoose/38ca2f5aee2aea175ecc6e599ca6e973

    The ellipse is built from the Pearson correlation of the data: a unit
    ellipse with radii sqrt(1 + r) and sqrt(1 - r) is rotated by 45 degrees,
    scaled by ``n_std`` standard deviations along each axis and moved to the
    mean of the data. It is approximated by ``size`` points joined with
    straight segments.

    Parameters
    ----------
    x, y : array-like, shape (n, )
        Input data (lists, NumPy arrays and pandas Series are accepted).
    n_std : float
        The number of standard deviations to determine the ellipse's radiuses.
    size : int
        Number of points defining the ellipse
    Returns
    -------
    String containing an SVG path for the ellipse
    (``M x, yLx, y... Z``), usable as the ``path`` of a Plotly shape.

    Raises
    ------
    ValueError
        If *x* and *y* do not have the same number of elements.

    References (H/T)
    ----------------
    https://matplotlib.org/3.1.1/gallery/statistics/confidence_ellipse.html
    https://community.plotly.com/t/arc-shape-with-path/7205/5
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    if x.size != y.size:
        raise ValueError("x and y must be the same size")

    cov = np.cov(x, y)
    spread = np.sqrt(cov[0, 0] * cov[1, 1])
    # With zero variance on either axis the correlation is undefined; treat it
    # as 0 so the result is a flat (degenerate) ellipse instead of NaN.
    pearson = cov[0, 1] / spread if spread > 0 else 0.0
    # Round-off can push |r| marginally above 1, which would make a radius NaN
    pearson = float(np.clip(pearson, -1.0, 1.0))

    # Using a special case to obtain the eigenvalues of this
    # two-dimensional dataset.
    ell_radius_x = np.sqrt(1 + pearson)
    ell_radius_y = np.sqrt(1 - pearson)
    theta = np.linspace(0, 2 * np.pi, size)
    ellipse_coords = np.column_stack([ell_radius_x * np.cos(theta), ell_radius_y * np.sin(theta)])

    # Standard deviation of each axis (square root of its variance)
    # multiplied by the requested number of standard deviations.
    x_scale = np.sqrt(cov[0, 0]) * n_std
    y_scale = np.sqrt(cov[1, 1]) * n_std
    center = np.array([np.mean(x), np.mean(y)])

    # Row vectors times this matrix are rotated by 45 degrees counter-clockwise
    rotation_matrix = np.array([[np.cos(np.pi / 4), np.sin(np.pi / 4)],
                                [-np.sin(np.pi / 4), np.cos(np.pi / 4)]])
    scale_matrix = np.array([[x_scale, 0],
                            [0, y_scale]])
    ellipse_coords = ellipse_coords.dot(rotation_matrix).dot(scale_matrix) + center

    # "M" moves to the first point, each "L" draws a segment, "Z" closes the path
    segments = 'L'.join(f'{px_}, {py_}' for px_, py_ in ellipse_coords)
    return f'M {segments} Z'



In [220]:
# Assign a color to each Source for consistency
unique_groups = srcs['Group'].dropna().unique()  # drop NA just in case
colors = DEFAULT_PLOTLY_COLORS

# Colors repeat from the start of the palette if there are more groups than colors
name_to_color = {
    group_name: colors[idx % len(colors)]
    for idx, group_name in enumerate(unique_groups)
}

# Simple mapping of each group name to its original name (optional)
unique_name_to_name = dict(zip(unique_groups, unique_groups))

In [232]:
# To change the variables modify x_col and y_col here
# and Re-run the cell
print('Available elements to plot.')
print(srcs.columns)
print('To change elements modify x_col and y_col variables and re-run this cell:')

# Variables to plot
x_col = 'Sr'
y_col = 'Zr'
group = 'Group'


def color_for_group(g):
    """Return the color mapped to group ``g``, or gray when ``g`` has no assigned color."""
    return name_to_color.get(unique_name_to_name.get(g, ""), "gray")


figsrcs = go.Figure()

# Add one confidence ellipse per source group
for g in srcs[group].unique():
    src_rows = srcs[srcs[group] == g]  # filter the group's rows once
    color = color_for_group(g)

    # Optional: add source points (uncomment to show)
    # figsrcs.add_trace(
    #     go.Scatter(
    #         x=src_rows[x_col],
    #         y=src_rows[y_col],
    #         name=g,
    #         mode='markers',
    #         marker=dict(symbol='x', size=4, color=color)
    #     )
    # )

    figsrcs.add_shape(
        type='path',
        path=confidence_ellipse(src_rows[x_col], src_rows[y_col]),
        line_color=color,
        name=g,
        showlegend=True
    )

# Add the single-sample sources as points; skipped when there are none so the
# legend does not show an entry without any markers
if not onesample.empty:
    figsrcs.add_trace(
        go.Scatter(
            x=onesample[x_col],
            y=onesample[y_col],
            name="Source Sample",
            mode='markers',
            marker=dict(symbol='x', size=4, color='black'),
            text=onesample['Sample'],
            hovertemplate="Source: %{text}<br><extra></extra>",
            showlegend=True
        )
    )

# Map Study sample groups to colors (one color per study row)
study_colors = [color_for_group(g) for g in study[group]]

# Add study samples colored by Group
figsrcs.add_trace(
    go.Scatter(
        x=study[x_col],
        y=study[y_col],
        name="Study Samples",
        mode='markers',
        text=study['Sample'],
        hovertemplate="Sample: %{text}<br><extra></extra>",
        showlegend=True,
        marker=dict(
            size=8,
            symbol='circle',
            color=study_colors
        )
    )
)

# ------------------------------
# Add confidence ellipses for study groups
# ------------------------------
for g in study[group].unique():
    mask = study[group] == g
    data_x = study.loc[mask, x_col]
    data_y = study.loc[mask, y_col]

    # An ellipse needs at least two samples
    if len(data_x) < 2:
        continue

    figsrcs.add_shape(
        type='path',
        path=confidence_ellipse(data_x, data_y),
        line_color=color_for_group(g),
        line_width=2,
        name=f"{g} Study Ellipse",
        legendgroup=g,
        showlegend=True,
        opacity=0.5
    )

# Set layout with centered title
figsrcs.update_layout(
    title={
        'text': "Connecting Study Samples with known obsidian sources",
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top'
    },
    xaxis_title=x_col,
    yaxis_title=y_col
)

figsrcs.show()

Available elements to plot.
Index(['Sample', 'Group', 'ANID', 'Fe', 'Th', 'Rb', 'Sr', 'Y', 'Zr', 'Nb',
       'Northing', 'Easting', 'Elevation'],
      dtype='object')
To change elements modify x_col and y_col variables and re-run this cell:


In [229]:
# TERNARY PLOT

# --- Function to compute ellipse in 2D (fractions space) ---
def confidence_ellipse_points(x, y, n_std=1.96, size=100):
    """
    Return Nx2 array of (x,y) points for the confidence ellipse in 2D space.

    The ellipse axes are the eigenvectors of the covariance matrix of *x* and
    *y*; each semi-axis is ``n_std`` times the square root of the matching
    eigenvalue, and the ellipse is centered on the mean of the data.

    Parameters
    ----------
    x, y : array-like, shape (n, )
        Input data (lists, NumPy arrays and pandas Series are accepted).
    n_std : float
        Number of standard deviations spanned by each semi-axis.
    size : int
        Number of points returned (N).

    Returns
    -------
    numpy.ndarray of shape (size, 2)

    Raises
    ------
    ValueError
        If *x* and *y* do not have the same number of elements.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    if x.size != y.size:
        raise ValueError("x and y must be the same size")

    center = np.array([np.mean(x), np.mean(y)])

    # eigh returns eigenvalues in ascending order; put the major axis first
    eigvals, eigvecs = np.linalg.eigh(np.cov(x, y))
    order = eigvals.argsort()[::-1]
    eigvals, eigvecs = eigvals[order], eigvecs[:, order]

    # Round-off can make a (near-)zero eigenvalue slightly negative; clamp it
    # so the square root stays real instead of producing NaN.
    radii = n_std * np.sqrt(np.clip(eigvals, 0.0, None))

    theta = np.linspace(0, 2 * np.pi, size)
    unit_circle = np.column_stack([np.cos(theta), np.sin(theta)])

    # Stretch the unit circle along the axes, rotate into the eigenvector basis, recenter
    return unit_circle @ np.diag(radii) @ eigvecs.T + center

# --- Normalize compositional data ---
def normalize_composition(df, cols):
    """
    Scale every row of ``df[cols]`` so that the selected columns sum to 1.

    Returns a NumPy array with one row per row of ``df`` and one column per
    entry of ``cols``, in the order given.
    """
    values = df[cols].to_numpy()
    totals = values.sum(axis=1, keepdims=True)  # column vector, one total per row
    return values / totals

# Columns used for the three ternary axes (a, b, c)
cols = ['Rb','Sr','Zr']

# Normalize srcs and study so each row's three components sum to 1
srcs_frac = normalize_composition(srcs, cols)
study_frac = normalize_composition(study, cols)

# --- Ternary Plot ---
fig = go.Figure()

# --- PLOT ELLIPSES ---
for g in srcs[group].unique():
    color = name_to_color.get(unique_name_to_name.get(g, ""), "gray")

    # The fraction arrays are plain NumPy arrays indexed by position, so use a
    # plain boolean array: a pandas nullable-boolean Series (Group is a 'string'
    # column) may not convert to a boolean NumPy indexer on every pandas version.
    mask = (srcs[group] == g).to_numpy(dtype=bool, na_value=False)
    rb, sr = srcs_frac[mask][:, :2].T

    # The ellipse is computed in the (Rb, Sr) fraction plane; Zr is the
    # remainder so every ellipse point still sums to 1.
    ell = confidence_ellipse_points(rb, sr)

    eRb = ell[:,0]
    eSr = ell[:,1]
    eZr = 1 - eRb - eSr

    fig.add_trace(
        go.Scatterternary(
            a=eRb, b=eSr, c=eZr,
            mode='lines',
            line=dict(color=color),
            name=f"{g} Source",
            showlegend=True
        )
    )

# --- PLOT ONESAMPLE POINTS IF NON-EMPTY ---
if onesample is not None and not onesample.empty:
    onesample_frac = normalize_composition(onesample, cols)
    for g in onesample[group].unique():
        mask = (onesample[group] == g).to_numpy(dtype=bool, na_value=False)
        data = onesample_frac[mask]
        color = name_to_color.get(unique_name_to_name.get(g, ""), "black")

        fig.add_trace(
            go.Scatterternary(
                a=data[:,0],
                b=data[:,1],
                c=data[:,2],
                mode='markers',
                name=f"Source Sample: {g}",
                marker=dict(symbol='x', size=6, color=color),
                text=onesample[mask]['Sample'],
                hovertemplate="Source: %{text}<extra></extra>"
            )
        )

# --- PLOT STUDY SAMPLES WITH DIFFERENT COLORS PER GROUP ---
for g in study[group].unique():
    mask = (study[group] == g).to_numpy(dtype=bool, na_value=False)
    data = study_frac[mask]
    color = name_to_color.get(unique_name_to_name.get(g, ""), "gray")

    fig.add_trace(
        go.Scatterternary(
            a=data[:,0],
            b=data[:,1],
            c=data[:,2],
            mode='markers',
            name=f"Study: {g}",
            marker=dict(symbol='circle', size=8, color=color),
            text=study[mask]['Sample'],
            hovertemplate="Sample: %{text}<extra></extra>"
        )
    )

# --- Layout ---
fig.update_layout(
    ternary=dict(
        sum=1,
        aaxis=dict(title='Rb'),
        baxis=dict(title='Sr'),
        caxis=dict(title='Zr')
    ),
    title=dict(
        text="Ternary Plot with Obsidian Source Ellipses and Study Samples",
        x=0.5,
        xanchor='center',
        yanchor='top'
    )
)

fig.show()
