<a href="https://colab.research.google.com/github/AvantiShri/oceanography_colab_notebooks/blob/master/for_rian/ArchetypeAnalysis_GP15_WaterMasses.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install py_pcha
!pip install gsw

Collecting py_pcha
  Downloading https://files.pythonhosted.org/packages/fb/02/65048696734e504414b7a8ea171a0c012fcb72daebb69826024478d3a3d8/py_pcha-0.1.3-py3-none-any.whl
Installing collected packages: py-pcha
Successfully installed py-pcha-0.1.3
Collecting gsw
[?25l  Downloading https://files.pythonhosted.org/packages/72/46/42d3b297108f88a1e152515323af6dea379c4e0d31c4b9c9adf722111a3a/gsw-3.3.1.post1-cp36-cp36m-manylinux2010_x86_64.whl (2.4MB)
[K     |████████████████████████████████| 2.4MB 4.3MB/s 
Installing collected packages: gsw
Successfully installed gsw-3.3.1.post1


Grab the data

In [2]:
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1O869bUhoTrPCW4hDaSVswLiQ7vzhzIlP' -O names_added_GP15OMPA_33RR20180918_only_gs_rosette_clean1_hy1.csv

--2020-08-28 19:45:46--  https://docs.google.com/uc?export=download&id=1O869bUhoTrPCW4hDaSVswLiQ7vzhzIlP
Resolving docs.google.com (docs.google.com)... 74.125.142.101, 74.125.142.138, 74.125.142.102, ...
Connecting to docs.google.com (docs.google.com)|74.125.142.101|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://doc-0s-a4-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/5fcdjbmjbo10fdf70kmlaa7nkfhuh254/1598643900000/06203730782251856755/*/1O869bUhoTrPCW4hDaSVswLiQ7vzhzIlP?e=download [following]
--2020-08-28 19:45:47--  https://doc-0s-a4-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/5fcdjbmjbo10fdf70kmlaa7nkfhuh254/1598643900000/06203730782251856755/*/1O869bUhoTrPCW4hDaSVswLiQ7vzhzIlP?e=download
Resolving doc-0s-a4-docs.googleusercontent.com (doc-0s-a4-docs.googleusercontent.com)... 74.125.142.132, 2607:f8b0:400e:c08::84
Connecting to doc-0s-a4-docs.googleusercontent.com (doc-0s-a4-d

In [3]:
from matplotlib import pyplot as plt
import numpy as np
import pandas
import gsw

Read in the data frame and rename the columns

In [4]:
#Easy remapping of the column names
colnames_map = {'Station number':"stnnbr",
            'GEOTRACES ID':"geotrc_ID",
            'latitude (degrees)':"lat",
            'longitude (degrees)':"lon",
            'depth (m)':"depth",
            'pressure (dbar)':"pres",
            'temperature(degrees C)':"t",
            'salinity (psu)':"SP",
            'oxygen (umol/kg)':"O2",
            'silicate (umol/kg)':"Si",
            'nitrate (umol/kg)':"NO3",
            'phosphate (umol/kg)':"PO4",
            'potential density':"sig0",
            'PO (umol/kg)':"PO",
            }

ROUNDING_PRECISION = 4


#For some reason, altair chokes when provided data frames with some
# of the original column names. So I am remapping the column names.
def remap_colnames(df, colnames_map):
  remapnames_df = pandas.DataFrame(dict([
      (new_col, np.array(df[orig_col]))
      for new_col,orig_col in colnames_map.items()]))
  return remapnames_df

df = pandas.read_csv("names_added_GP15OMPA_33RR20180918_only_gs_rosette_clean1_hy1.csv", na_values = -999)
df.columns
#, sep='delimiter',header=None
remapnames_df = remap_colnames(df=df, colnames_map=colnames_map)
#create a column for calculated variables
remapnames_df['NO'] = remapnames_df['oxygen (umol/kg)'] + (remapnames_df['nitrate (umol/kg)']*9.68)
remapnames_df['pt'] = np.round(
    gsw.pt_from_t(remapnames_df['salinity (psu)'],
                  remapnames_df['temperature(degrees C)'],
                  remapnames_df['pressure (dbar)'],
                  remapnames_df['potential density']),
   decimals=ROUNDING_PRECISION)

Prepare the features that define the convex hull

In [5]:
import sklearn.impute

#the columns to use for defining the convex hull
COLUMNS_TO_COMPARE = [
            'pt',
            'salinity (psu)',
            'silicate (umol/kg)',
            'potential density',
            'PO (umol/kg)',
            'NO']

#Let's standardize each column by subtracting mean and
# dividing by standard deviation. Call it a 'features' dataframe
features_df = pandas.DataFrame()
#keep track of mean and std in order to do inverse transform
colname_to_mean = {}
colname_to_std = {} 
for colname in COLUMNS_TO_COMPARE:
  vals = np.array(remapnames_df[colname])
  #use nanmean and nanstd to ignore nan values for now
  mean = np.nanmean(vals)
  std = np.nanstd(vals)
  colname_to_mean[colname] = mean
  colname_to_std[colname] = std
  features_df['zscore_'+colname] = (vals-mean)/std

#we impute nan values using KNNImputer
features_df = pandas.DataFrame(data=sklearn.impute.KNNImputer(
    missing_values=np.nan, n_neighbors=5,
    weights='distance').fit_transform(features_df),
    columns=features_df.columns)

#prepare a 'features' matrix for each point
features = np.array([np.array(features_df["zscore_"+col])
                     for col in COLUMNS_TO_COMPARE]).transpose((1,0))

TRANSFORM_MEANS = np.array([colname_to_mean[colname]
                           for colname in COLUMNS_TO_COMPARE])
TRANSFORM_STDS = np.array([colname_to_std[colname]
                           for colname in COLUMNS_TO_COMPARE])
#map features back to original space
def map_features_back(features):
  return features*TRANSFORM_STDS[None,:] + TRANSFORM_MEANS[None,:]

In [6]:
#Defining some helper functions for storing the archetype info and making
# altair-friendly dataframes

from collections import OrderedDict

def getarchetypename(i):
  return str(i)

#get the archetypes in the original feature space
def get_archetypes_df(archetype_features):
  #map features back to the original space
  archetype_orig_features = map_features_back(archetype_features)
  archetype_df = pandas.DataFrame()
  for feature_idx, colname in enumerate(COLUMNS_TO_COMPARE):
    archetype_df[colname] = archetype_orig_features[:,feature_idx]
  archetype_df["archetype"] = [getarchetypename(i) for i in
                                   range(archetype_features.shape[0])]
  return archetype_df


#save the proportions of each archetype
def save_archetype_compositions(archetype_compositions, df):
  for archetype_num in range(archetype_compositions.shape[1]):
    df[getarchetypename(archetype_num)] = np.round(
      archetype_compositions[:, archetype_num], decimals=ROUNDING_PRECISION)


#save the reconstructed features in the original feature space
def save_reconstructed_features(reconstructed_features, df):
  #map features back to the original space
  reconstructed_orig_features = map_features_back(reconstructed_features)
  for feature_idx, colname in enumerate(COLUMNS_TO_COMPARE):
    df["reconstructed_"+colname] = np.round(
        reconstructed_orig_features[:,feature_idx], decimals=ROUNDING_PRECISION)


#Create a df where there is a feature called 'composition' and a feature
# called 'archetype', and 'composition' indicates the proportion of the
# archetype
def get_archetype_composition_df(orig_df):
  feature_names = list(remapnames_df.columns)
  df_dict = OrderedDict()

  #initialize feature_dict
  for feature_name in feature_names:
    df_dict[feature_name] = []
  df_dict["archetype"] = []
  df_dict["composition"] = []

  for archetype_num in range(NUM_ENDMEMBERS):
    archetype_str = getarchetypename(archetype_num)
    archetype_fraction = orig_df[archetype_str]
    for feature_name in feature_names:
      df_dict[feature_name].extend(list(orig_df[feature_name]))
    df_dict["composition"].extend(list(orig_df[archetype_str]))
    df_dict["archetype"].extend([archetype_str for i in range(len(orig_df))])

  return pandas.DataFrame(df_dict)

In [7]:
#Let's identify archetypes
from py_pcha import PCHA
import matplotlib


NUM_ENDMEMBERS = 3
XC, S, C, SSE, varexpl = PCHA(X=features.T, noc=NUM_ENDMEMBERS, verbose=False)
print("variance explained:",varexpl)

#get the archetype_dataframe containing original archetype features
archetypes_df = get_archetypes_df(archetype_features=np.array(XC).transpose())

archetype_compositions = np.array(S).transpose()
save_archetype_compositions(
    archetype_compositions=np.array(S).transpose(),
    df=remapnames_df)

save_reconstructed_features(
    reconstructed_features=np.array(XC*S).transpose(),
    df=remapnames_df)

archetype_composition_df = get_archetype_composition_df(orig_df=remapnames_df)

##Old code for computing a color when there are only 3 archetypes
#remapnames_df['archetypecolors'] = [
#  matplotlib.colors.rgb2hex((a1,a2,a3))
#  for (a1,a2,a3) in zip(archetype_compositions[:,0],
#                        archetype_compositions[:,1],
#                        archetype_compositions[:,2])
#]

variance explained: 0.9435972711679422


In [8]:
print("The archetypes are:")
archetypes_df

The archetypes are:


Unnamed: 0,pt,salinity (psu),silicate (umol/kg),potential density,PO (umol/kg),NO,archetype
0,13.004188,34.867394,6.292591,26.293122,332.43585,297.142046,0
1,1.146612,34.682076,149.237562,27.795831,538.503652,493.338769,1
2,4.681207,32.840404,30.105147,26.003203,556.681024,491.916259,2


View altair interactive visualizations


In [9]:
import altair as alt

OBS_DF = archetype_composition_df #data frame for observations
ARCHETYPES_DF = archetypes_df #data frame for archetypes
CHART_WIDTH=400
CHART_HEIGHT=200
FONTSIZE=10


#convenience functions to turn off default altair behaviour of including
# zero in the axis even if no points are at 0
def nozero_xaxis(field_name):
  return alt.X(field_name, scale=alt.Scale(zero=False))

def nozero_yaxis(field_name, domain=None):
  if (domain is None):
    return alt.Y(field_name, scale=alt.Scale(zero=False))
  else:
    return alt.Y(field_name, scale=alt.Scale(zero=False, domain=domain))

interval_selection = alt.selection_interval()
archetypes_dropdown = alt.binding_select(options=[getarchetypename(i) for i in range(NUM_ENDMEMBERS)])
archetype_select = alt.selection_single(
    fields=['archetype'], bind=archetypes_dropdown, name="Archetype", init={'archetype':getarchetypename(0)})

composed_selection = interval_selection & archetype_select

#define the color property that will be shared for the scatterplots/legend
color = alt.condition(interval_selection, 'composition', alt.value('lightgray'))
#color = alt.condition(COMPOSED_SELECTION, 'squared_errors', alt.value('lightgray'))

tooltip_columns = (
    ['latitude (degrees)', 'longitude (degrees)', 'depth (m)']
   +[getarchetypename(i) for i in range(NUM_ENDMEMBERS)]
   +[prefix+x for x in COLUMNS_TO_COMPARE for prefix in ['', 'reconstructed_']])

#base chart for displaying the observed points

obs_basechart = alt.Chart(OBS_DF).mark_point(opacity=0.3).encode(
  color=color,
  tooltip=tooltip_columns
).properties(width=CHART_WIDTH,
             height=CHART_HEIGHT
).add_selection(interval_selection).add_selection(archetype_select).transform_filter(archetype_select)

#base chart for displaying the archetypes
archetype_basechart = alt.Chart(ARCHETYPES_DF).mark_point(opacity=1, shape='diamond', size=50).encode(
    color=alt.Color("archetype", scale=alt.Scale(scheme="category10")) ).properties(
        width=CHART_WIDTH, height=CHART_HEIGHT)

def create_scatter_obsonly(xaxis, yaxis):
  diagonal_df = pandas.DataFrame({xaxis: OBS_DF[xaxis], yaxis: OBS_DF[xaxis]})
  return (obs_basechart.encode(nozero_xaxis(xaxis), nozero_yaxis(yaxis))
          + alt.Chart(diagonal_df).mark_line().encode(x=xaxis, y=yaxis, color=alt.value('black') )
          )

def create_scatter(xaxis, yaxis):
  return obs_basechart.encode(nozero_xaxis(xaxis), nozero_yaxis(yaxis)) + archetype_basechart.encode(nozero_xaxis(xaxis), nozero_yaxis(yaxis))
  #return archetype_basechart.encode(nozero_xaxis(xaxis), nozero_yaxis(yaxis))


#compose the whole layout
alt.vconcat(

(create_scatter_obsonly('pt', 'reconstructed_pt')
| create_scatter_obsonly('salinity (psu)', 'reconstructed_salinity (psu)')
| create_scatter_obsonly('silicate (umol/kg)', 'reconstructed_silicate (umol/kg)')),

(create_scatter_obsonly('potential density', 'reconstructed_potential density')
| create_scatter_obsonly('PO (umol/kg)', 'reconstructed_PO (umol/kg)')
| create_scatter_obsonly('NO', 'reconstructed_NO')),

(create_scatter('pt','salinity (psu)')
| create_scatter('salinity (psu)', 'silicate (umol/kg)')
| create_scatter('pt','silicate (umol/kg)')),

(create_scatter('pt','NO')
| create_scatter('salinity (psu)', 'NO')
| create_scatter('silicate (umol/kg)','NO')),
 
(create_scatter('pt','PO (umol/kg)')
| create_scatter('salinity (psu)', 'PO (umol/kg)')
| create_scatter('NO', 'PO (umol/kg)')),

(obs_basechart.encode(nozero_xaxis('latitude (degrees)'),
             nozero_yaxis('depth (m)', domain=(6000, 0))))

).configure_axis(labelFontSize=FONTSIZE,
                 titleFontSize=FONTSIZE)#.properties(padding=0, spacing=0)
# the padding/spacing doesn't propagate to subcharts propertly
