<a href="https://colab.research.google.com/github/AvantiShri/oceanography_colab_notebooks/blob/master/for_rian/ArchetypeAnalysis_GP15_WaterMasses.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install py_pcha
!pip install gsw

Collecting py_pcha
  Downloading https://files.pythonhosted.org/packages/fb/02/65048696734e504414b7a8ea171a0c012fcb72daebb69826024478d3a3d8/py_pcha-0.1.3-py3-none-any.whl
Installing collected packages: py-pcha
Successfully installed py-pcha-0.1.3
Collecting gsw
[?25l  Downloading https://files.pythonhosted.org/packages/31/88/bfb0b1df0ea0a147bde8020af1ffc089ff711a9c9ca630ebb6212a8bc8ff/gsw-3.3.1.tar.gz (2.4MB)
[K     |████████████████████████████████| 2.4MB 6.8MB/s 
Building wheels for collected packages: gsw
  Building wheel for gsw (setup.py) ... [?25l[?25hdone
  Created wheel for gsw: filename=gsw-3.3.1-cp36-cp36m-linux_x86_64.whl size=2008419 sha256=17a4918e5225bca4da58e972ee93ee6f2580167a10612ef5d6f569fe38e71ff6
  Stored in directory: /root/.cache/pip/wheels/34/a5/7d/a8398b76644ed482744a2c4af2f0869c20a15a3682c8d37ab6
Successfully built gsw
Installing collected packages: gsw
Successfully installed gsw-3.3.1


Grab the data

In [2]:
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1O869bUhoTrPCW4hDaSVswLiQ7vzhzIlP' -O names_added_GP15OMPA_33RR20180918_only_gs_rosette_clean1_hy1.csv

--2020-08-14 20:10:13--  https://docs.google.com/uc?export=download&id=1O869bUhoTrPCW4hDaSVswLiQ7vzhzIlP
Resolving docs.google.com (docs.google.com)... 108.177.126.113, 108.177.126.102, 108.177.126.139, ...
Connecting to docs.google.com (docs.google.com)|108.177.126.113|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://doc-0s-a4-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/675sl0i82umj8cet0v400gt40mjt2nub/1597435800000/06203730782251856755/*/1O869bUhoTrPCW4hDaSVswLiQ7vzhzIlP?e=download [following]
--2020-08-14 20:10:13--  https://doc-0s-a4-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/675sl0i82umj8cet0v400gt40mjt2nub/1597435800000/06203730782251856755/*/1O869bUhoTrPCW4hDaSVswLiQ7vzhzIlP?e=download
Resolving doc-0s-a4-docs.googleusercontent.com (doc-0s-a4-docs.googleusercontent.com)... 108.177.119.132, 2a00:1450:4013:c00::84
Connecting to doc-0s-a4-docs.googleusercontent.com (doc-0s

In [3]:
from matplotlib import pyplot as plt
import numpy as np
import pandas
import gsw

Read in the data frame and rename the columns

In [4]:
#Easy remapping of the column names
colnames_map = {'Station number':"stnnbr",
            'GEOTRACES ID':"geotrc_ID",
            'latitude (degrees)':"lat",
            'longitude (degrees)':"lon",
            'depth (m)':"depth",
            'pressure (dbar)':"pres",
            'temperature(degrees C)':"t",
            'salinity (psu)':"SP",
            'oxygen (umol/kg)':"O2",
            'silicate (umol/kg)':"Si",
            'nitrate (umol/kg)':"NO3",
            'phosphate (umol/kg)':"PO4",
            'potential density':"sig0",
            'PO (umol/kg)':"PO",
            }


#For some reason, altair chokes when provided data frames with some
# of the original column names. So I am remapping the column names.
def remap_colnames(df, colnames_map):
  foraltair_df = pandas.DataFrame(dict([
      (new_col, np.array(df[orig_col]))
      for new_col,orig_col in colnames_map.items()]))
  return foraltair_df

df = pandas.read_csv("names_added_GP15OMPA_33RR20180918_only_gs_rosette_clean1_hy1.csv", na_values = -999)
df.columns
#, sep='delimiter',header=None
foraltair_df = remap_colnames(df=df, colnames_map=colnames_map)
#create a column for calculated variables
foraltair_df['NO'] = foraltair_df['oxygen (umol/kg)'] + (foraltair_df['nitrate (umol/kg)']*9.68)
foraltair_df['pt'] = gsw.pt_from_t(foraltair_df['salinity (psu)'],foraltair_df['temperature(degrees C)'],foraltair_df['pressure (dbar)'],foraltair_df['potential density'])
#foraltair_df['PO'] = foraltair_df['O2']+ foraltair_df['PO4']*155
#foraltair_df['SiO'] = foraltair_df['O2']+ foraltair_df['Si']*15


Prepare the features that define the convex hull

In [5]:
import sklearn.impute

#the columns to use for defining the convex hull
columns_to_compare = [
            'pt',
            'salinity (psu)',
            'silicate (umol/kg)',
            'potential density',
            'PO (umol/kg)',
            'NO']

#Let's standardize each column by subtracting mean and
# dividing by standard deviation. Call it a 'features' dataframe
features_df = pandas.DataFrame()
#keep track of mean and std in order to do inverse transform
colname_to_mean = {}
colname_to_std = {} 
for colname in columns_to_compare:
  vals = np.array(foraltair_df[colname])
  #use nanmean and nanstd to ignore nan values for now
  mean = np.nanmean(vals)
  std = np.nanstd(vals)
  colname_to_mean[colname] = mean
  colname_to_std[colname] = std
  features_df['zscore_'+colname] = (vals-mean)/std

#we impute nan values using KNNImputer
features_df = pandas.DataFrame(data=sklearn.impute.KNNImputer(
    missing_values=np.nan, n_neighbors=5,
    weights='distance').fit_transform(features_df),
    columns=features_df.columns)
TRANSFORM_MEANS = np.array([colname_to_mean[colname]
                           for colname in columns_to_compare])
TRANSFORM_STDS = np.array([colname_to_std[colname]
                           for colname in columns_to_compare])

#prepare a 'features' matrix for each point
features = np.array([np.array(features_df["zscore_"+col])
                     for col in columns_to_compare]).transpose((1,0))

In [6]:
print(features.shape)

(761, 6)


In [53]:
#Let's identify archetypes
from py_pcha import PCHA
import matplotlib

NUM_ENDMEMBERS = 3
XC, S, C, SSE, varexpl = PCHA(X=features.T, noc=NUM_ENDMEMBERS, verbose=False)
print("variance explained:",varexpl)

arrS = np.array(S)
for archetype_num in range(S.shape[0]):
  foraltair_df['archetype_'+str(archetype_num)] = arrS[archetype_num, :]

#compute a color based on the archetypes
foraltair_df['archetypecolors'] = [
  matplotlib.colors.rgb2hex((a1,a2,a3))
  for (a1,a2,a3) in zip(arrS[0, :], arrS[1, :], arrS[2, :])
]

explanation = XC*S #this is what the archetypes explain
squared_errors = np.sum(np.square(np.array(explanation-features.T)), axis=0)
foraltair_df['squared_errors'] = squared_errors

variance explained: 0.9435972586646876


Run clustering + compute lower-dimensional t-sne visualization

View altair interactive visualizations


In [56]:
import altair as alt

DF_TO_USE = foraltair_df
INTERVAL_SELECTION = alt.selection_interval()
COMPOSED_SELECTION = INTERVAL_SELECTION
TOTAL_WIDTH=1200
TOTAL_HEIGHT=680
TSNE_HEIGHTFRAC=0.4
TSNE_WIDTHFRAC=0.2
FONTSIZE=10
PADDING_GUESS=45 #additional padding to subtract off


#convenience functions to turn off default altair behaviour of including
# zero in the axis even if no points are at 0
def nozero_xaxis(field_name):
  return alt.X(field_name, scale=alt.Scale(zero=False))
def nozero_yaxis(field_name, domain=None):
  if (domain is None):
    return alt.Y(field_name, scale=alt.Scale(zero=False))
  else:
    return alt.Y(field_name, scale=alt.Scale(zero=False, domain=domain))

def get_interactive_histogram(colname):
  yaxis = alt.Y('count():Q', title="Count")
  xaxis = alt.X(colname+':Q', bin=alt.Bin(maxbins=100))
  #apparently height/width doesn't include the space for the
  # axes labels, so these need to be adjusted a bit.
  bg_histogram = alt.Chart(DF_TO_USE).mark_bar().encode(
                    y=yaxis,
                    x=xaxis,
                    color=alt.value('lightgrey')).properties(
                      width=TOTAL_WIDTH*(1-TSNE_WIDTHFRAC)/4
                            - (FONTSIZE+PADDING_GUESS),
                      height=TOTAL_HEIGHT*TSNE_HEIGHTFRAC/3
                            - (FONTSIZE+PADDING_GUESS),
                      selection=INTERVAL_SELECTION)
  fg_histogram = alt.Chart(DF_TO_USE).mark_bar().encode(
                      y=yaxis,
                      color=alt.value('steelblue'),
                      x=xaxis).transform_filter(COMPOSED_SELECTION)
  return (bg_histogram+fg_histogram)

#define the color property that will be shared for the scatterplots/legend
color = alt.condition(COMPOSED_SELECTION, 'archetypecolors', alt.value('lightgray'),
                      scale=None,
                      )
#color = alt.condition(COMPOSED_SELECTION, 'squared_errors', alt.value('lightgray'))

#base chart for all other scatterplots
base = alt.Chart(DF_TO_USE).mark_point(opacity=0.3).encode(
  color=color,
  tooltip=['archetype_0', 'archetype_1', 'archetype_2']
).properties(width=TOTAL_WIDTH/4 - (FONTSIZE+PADDING_GUESS),
             height=(TOTAL_HEIGHT*(1-TSNE_HEIGHTFRAC))/2 
                     - (FONTSIZE+PADDING_GUESS)).add_selection(
                         INTERVAL_SELECTION)

#compose the whole layout
alt.vconcat(

(base.encode(nozero_xaxis('pt'), nozero_yaxis('salinity (psu)'))
| base.encode(nozero_xaxis('salinity (psu)'),
              nozero_yaxis('silicate (umol/kg)') )
| base.encode(nozero_xaxis('pt'), 
              nozero_yaxis('silicate (umol/kg)'))
),

(base.encode(nozero_xaxis('pt'),
             nozero_yaxis('NO'))
| base.encode(nozero_xaxis('salinity (psu)'),
              nozero_yaxis('NO'))
| base.encode(nozero_xaxis('silicate (umol/kg)'),
              nozero_yaxis('NO'))
),
 
 (base.encode(nozero_xaxis('pt'), nozero_yaxis('PO (umol/kg)'))
| base.encode(nozero_xaxis('salinity (psu)'), nozero_yaxis('PO (umol/kg)'))
|  base.encode(nozero_xaxis('salinity (psu)'), nozero_yaxis('NO'))
| base.encode(nozero_xaxis('NO'), nozero_yaxis('PO (umol/kg)'))
#| base.encode(x='salinity (psu)', y='PO (umol/kg)')
#| base.encode(x='NO', y='PO (umol/kg)')
),
#

(base.encode(nozero_xaxis('latitude (degrees)'),
             nozero_yaxis('depth (m)', domain=(6000, 0))))

).configure_axis(labelFontSize=FONTSIZE,
                 titleFontSize=FONTSIZE).properties(padding=0, spacing=0)
# the padding/spacing doesn't propagate to subcharts propertly
