<a href="https://colab.research.google.com/github/AvantiShri/oceanography_colab_notebooks/blob/master/for_rian/V1_Pacific_Ocean_GLODAP_ArchetypeAnalysis_WaterMasses.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install py_pcha
!pip install gsw

Collecting py_pcha
  Downloading https://files.pythonhosted.org/packages/fb/02/65048696734e504414b7a8ea171a0c012fcb72daebb69826024478d3a3d8/py_pcha-0.1.3-py3-none-any.whl
Installing collected packages: py-pcha
Successfully installed py-pcha-0.1.3
Collecting gsw
[?25l  Downloading https://files.pythonhosted.org/packages/72/46/42d3b297108f88a1e152515323af6dea379c4e0d31c4b9c9adf722111a3a/gsw-3.3.1.post1-cp36-cp36m-manylinux2010_x86_64.whl (2.4MB)
[K     |████████████████████████████████| 2.4MB 2.8MB/s 
Installing collected packages: gsw
Successfully installed gsw-3.3.1.post1


Grab the data

In [2]:
#!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1E9XGag2_uC2TM_5DcOcmSz86I1xj6hHr' -O GLODAPv2.2019_Pacific_Ocean.csv
#For large files, this is the command:
!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1E9XGag2_uC2TM_5DcOcmSz86I1xj6hHr' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1E9XGag2_uC2TM_5DcOcmSz86I1xj6hHr" -O GLODAPv2.2019_Pacific_Ocean.csv && rm -rf /tmp/cookies.txt

--2020-09-01 23:17:10--  https://docs.google.com/uc?export=download&confirm=xrBn&id=1E9XGag2_uC2TM_5DcOcmSz86I1xj6hHr
Resolving docs.google.com (docs.google.com)... 108.177.125.101, 108.177.125.139, 108.177.125.102, ...
Connecting to docs.google.com (docs.google.com)|108.177.125.101|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://doc-04-bg-docs.googleusercontent.com/docs/securesc/eo295a74b7rvji702rtp8f3pfth0dtd1/sl9t7kp8p9ffva44cpnri0h91eis90ak/1599002175000/06203730782251856755/04250999775610707777Z/1E9XGag2_uC2TM_5DcOcmSz86I1xj6hHr?e=download [following]
--2020-09-01 23:17:10--  https://doc-04-bg-docs.googleusercontent.com/docs/securesc/eo295a74b7rvji702rtp8f3pfth0dtd1/sl9t7kp8p9ffva44cpnri0h91eis90ak/1599002175000/06203730782251856755/04250999775610707777Z/1E9XGag2_uC2TM_5DcOcmSz86I1xj6hHr?e=download
Resolving doc-04-bg-docs.googleusercontent.com (doc-04-bg-docs.googleusercontent.com)... 64.233.189.132, 2404:6800:4008:c07::84
Connec

In [3]:
from matplotlib import pyplot as plt
import numpy as np
import pandas
import gsw

Read in the data frame and rename the columns

In [55]:
#Easy remapping of the column names
colnames_map = {'latitude':"latitude",
            'longitude':"longitude",
            'year':"year",
            'depth (m)':"depth",
            'pressure (dbar)':"pressure",
            'temperature(degrees C)':"temperature",
            'salinity (psu)':"salinity",
            'oxygen (umol/kg)':"oxygen",
            'silicate (umol/kg)':"silicate",
            'nitrate (umol/kg)':"nitrate",
            'phosphate (umol/kg)':"phosphate",
            'potential density':"sigma0",
            }

ROUNDING_PRECISION = 4


#For some reason, altair chokes when provided data frames with some
# of the original column names. So I am remapping the column names.
def remap_colnames(df, colnames_map):
  remapnames_df = pandas.DataFrame(dict([
      (new_col, np.array(df[orig_col]))
      for new_col,orig_col in colnames_map.items()]))
  return remapnames_df

df = pandas.read_csv("GLODAPv2.2019_Pacific_Ocean.csv", na_values = -9999)
df.columns
#, sep='delimiter',header=None
remapnames_df = remap_colnames(df=df, colnames_map=colnames_map)
print("# examples:", len(remapnames_df))

SALINITY_MINIMUM = 30
print("Examples below the salinity minimum:",
      np.sum(remapnames_df['salinity (psu)'] < SALINITY_MINIMUM))
#Drop rows that have outlier values for the salinity. Note, this will also
# drop rows that have nan values for the salinity.
remapnames_df = pandas.DataFrame(
    remapnames_df[remapnames_df['salinity (psu)'] >= SALINITY_MINIMUM])

#create a column for calculated variables
remapnames_df['NO'] = remapnames_df['oxygen (umol/kg)'] + (remapnames_df['nitrate (umol/kg)']*9.68)
remapnames_df['PO'] = remapnames_df['oxygen (umol/kg)'] + (remapnames_df['phosphate (umol/kg)']*155)
remapnames_df['pt'] = np.round(
    gsw.pt_from_t(remapnames_df['salinity (psu)'],
                  remapnames_df['temperature(degrees C)'],
                  remapnames_df['pressure (dbar)'],
                  remapnames_df['potential density']),
   decimals=ROUNDING_PRECISION)


# examples: 452744
Examples below the salinity minimum: 104


Prepare the features that define the convex hull

In [56]:
import sklearn.impute
import time

#the columns to use for defining the convex hull
COLUMNS_TO_COMPARE = [
            'pt',
            'salinity (psu)',
            'silicate (umol/kg)',
            'potential density',
            'PO',
            'NO']

print("Standardizing feature values")
#Let's standardize each column by subtracting mean and
# dividing by standard deviation. Call it a 'features' dataframe
features_df = pandas.DataFrame()
#keep track of mean and std in order to do inverse transform
colname_to_mean = {}
colname_to_std = {} 
for colname in COLUMNS_TO_COMPARE:
  vals = np.array(remapnames_df[colname])
  #use nanmean and nanstd to ignore nan values for now
  mean = np.nanmean(vals)
  std = np.nanstd(vals)
  colname_to_mean[colname] = mean
  colname_to_std[colname] = std
  features_df['zscore_'+colname] = (vals-mean)/std

#Since imputation takes a while on such a large dataset, we
# will just drop rows that have missing values
print("Proportions of missing values:")
print(np.sum(np.isnan(features_df)) / len(features_df))
print("Original number of rows:", len(features_df))
features_df = features_df.dropna()
print("Remaining rows after dropping missing vals:",len(features_df))
dropna_remapnames_df = remapnames_df.dropna()
assert len(dropna_remapnames_df) == len(features_df)

#print("Running KNN imputation")
#start = time.time()
##we impute nan values using KNNImputer
#features_df = pandas.DataFrame(data=sklearn.impute.KNNImputer(
#    missing_values=np.nan, n_neighbors=5,
#    weights='distance').fit_transform(features_df),
#    columns=features_df.columns)
#print("KNN imputation took", time.time()-start)

#prepare a 'features' matrix for each point
features = np.array([np.array(features_df["zscore_"+col])
                     for col in COLUMNS_TO_COMPARE]).transpose((1,0))

TRANSFORM_MEANS = np.array([colname_to_mean[colname]
                           for colname in COLUMNS_TO_COMPARE])
TRANSFORM_STDS = np.array([colname_to_std[colname]
                           for colname in COLUMNS_TO_COMPARE])
#map features back to original space
def map_features_back(features):
  return features*TRANSFORM_STDS[None,:] + TRANSFORM_MEANS[None,:]

Standardizing feature values
Proportions of missing values:
zscore_pt                    0.002627
zscore_salinity (psu)        0.000000
zscore_silicate (umol/kg)    0.194340
zscore_potential density     0.002627
zscore_PO                    0.191677
zscore_NO                    0.179024
dtype: float64
Original number of rows: 447627
Remaining rows after dropping missing vals: 339851


In [57]:
#Defining some helper functions for storing the archetype info and making
# altair-friendly dataframes
from collections import OrderedDict

#save the reconstructed features in the original feature space
def save_reconstructed_features(reconstructed_features, df):
  #map features back to the original space
  reconstructed_orig_features = map_features_back(reconstructed_features)
  for feature_idx, colname in enumerate(COLUMNS_TO_COMPARE):
    df["reconstructed_"+colname] = np.round(
        reconstructed_orig_features[:,feature_idx], decimals=ROUNDING_PRECISION)


#Create a df where there is a feature called 'composition' and a feature
# called 'archetype', and 'composition' indicates the proportion of the
# archetype
def get_archetype_composition_df(orig_df):
  feature_names = list(remapnames_df.columns)
  df_dict = OrderedDict()

  #initialize feature_dict
  for feature_name in feature_names:
    df_dict[feature_name] = []
  df_dict["archetype"] = []
  df_dict["composition"] = []

  for archetype_num in range(NUM_ENDMEMBERS):
    archetype_str = getarchetypename(archetype_num)
    archetype_fraction = orig_df[archetype_str]
    for feature_name in feature_names:
      df_dict[feature_name].extend(list(orig_df[feature_name]))
    df_dict["composition"].extend(list(orig_df[archetype_str]))
    df_dict["archetype"].extend([archetype_str for i in range(len(orig_df))])

  return pandas.DataFrame(df_dict)

In [60]:
#mount a google drive in which to save the results
from google.colab import drive
drive.mount('/content/gdrive')

#create location to save the data within the google drive
!mkdir -p /content/gdrive/'My Drive'/colab_notebook_data/for_rian/glodap_archetype_analysis
#create a link to the folder with a shorter name
!ln -s /content/gdrive/'My Drive'/colab_notebook_data/for_rian/glodap_archetype_analysis glodap_archetype_analysis

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [67]:
#Let's identify archetypes
from py_pcha import PCHA
import matplotlib

numendmembers_to_archetyperesult = {}

for num_endmembers in [4,5,6,7,8]:
  print("Running archetype analysis with",num_endmembers,"archetypes")
  XC, S, C, SSE, varexpl = PCHA(X=features.T, noc=num_endmembers, verbose=True)
  print(str(num_endmembers)+" archetypes; variance explained:", varexpl)
  numendmembers_to_archetyperesult[num_endmembers] = (XC, S, C, SSE, varexpl)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
        62 |     0.9558 | 8.6487e+04 | 1.1313e-03 | 4.1328e-03 | 1.0000e+00 | 4.9282e-02 |     1.0000 

        63 |     0.9559 | 8.6347e+04 | 1.6163e-03 | 1.5993e-03 | 1.0000e+00 | 7.6285e-02 |     1.0000 

        64 |     0.9559 | 8.6246e+04 | 1.1749e-03 | 9.9026e-03 | 1.0000e+00 | 5.9042e-02 |     1.0000 

        65 |     0.9560 | 8.6165e+04 | 9.3690e-04 | 3.8321e-03 | 1.0000e+00 | 4.5696e-02 |     1.0000 

        66 |     0.9560 | 8.6077e+04 | 1.0261e-03 | 5.9319e-03 | 1.0000e+00 | 7.0735e-02 |     1.0000 

        67 |     0.9560 | 8.6026e+04 | 5.9033e-04 | 4.5911e-03 | 1.0000e+00 | 5.4747e-02 |     1.0000 

        68 |     0.9561 | 8.5918e+04 | 1.2559e-03 | 2.8427e-02 | 1.0000e+00 | 4.2372e-02 |     1.0000 

        69 |     0.9561 | 8.5856e+04 | 7.2528e-04 | 1.1001e-02 | 1.0000e+00 | 6.5589e-02 |     1.0000 

        70 |     0.9561 | 8.5807e+04 | 5.7641e-04 | 8.5142e-03 | 1.0000e+00 | 5.0764e-02 |     1.0000 


In [68]:
import h5py
#save the results of the archetype analysis at different numbers for total archetypes
!rm archetype_analysis_results.h5
archetype_analysis_results = h5py.File("glodap_archetype_analysis/archetype_analysis_results.h5", "w")
for num_endmembers in numendmembers_to_archetyperesult:
  grp = archetype_analysis_results.create_group(str(num_endmembers))
  (XC, S, C, SSE, varexpl) = numendmembers_to_archetyperesult[num_endmembers]
  grp.create_dataset("XC", data=np.array(XC))
  grp.create_dataset("S", data=np.array(S))
  grp.create_dataset("C", data=np.array(C))
  grp.attrs["SSE"] = SSE
  grp.attrs["varexpl"] = varexpl
archetype_analysis_results.close()


In [69]:
!du -hs glodap_archetype_analysis/archetype_analysis_results.h5 #file size

156M	glodap_archetype_analysis/archetype_analysis_results.h5


In [70]:
from collections import OrderedDict

def getarchetypename(archetype_num,num_endmembers):
  return str(archetype_num)+"_"+str(num_endmembers)


#save the proportions of each archetype
def save_archetype_compositions(archetype_compositions, num_endmembers, df):
  for archetype_num in range(archetype_compositions.shape[1]):
    df[getarchetypename(archetype_num, num_endmembers)] = np.round(
      archetype_compositions[:, archetype_num],
      decimals=ROUNDING_PRECISION)


#get the archetypes in the original feature space
def get_archetypes_df(numendmembers_to_archetyperesult):

  colname_to_values = OrderedDict()
  for colname in COLUMNS_TO_COMPARE:
    colname_to_values[colname] = []
  colname_to_values["archetype"] = []
  colname_to_values["num_endmembers"] = []

  for num_endmembers in numendmembers_to_archetyperesult:
    archetype_features = np.array(numendmembers_to_archetyperesult[num_endmembers][0]).transpose()
    save_archetype_compositions(
        archetype_compositions=np.array(numendmembers_to_archetyperesult[num_endmembers][1]).transpose(),
        num_endmembers=num_endmembers,
        df=dropna_remapnames_df)

    #map features back to the original space
    archetype_orig_features = map_features_back(archetype_features)
    for feature_idx, colname in enumerate(COLUMNS_TO_COMPARE):
      colname_to_values[colname].extend(archetype_orig_features[:,feature_idx])
    colname_to_values["archetype"].extend([getarchetypename(i,num_endmembers)
                                   for i in range(archetype_features.shape[0])])
    colname_to_values["num_endmembers"].extend(
        [str(num_endmembers) for i in range(archetype_features.shape[0])])
  return pandas.DataFrame(colname_to_values)

#get the archetype_dataframe containing original archetype features
archetypes_df = get_archetypes_df(numendmembers_to_archetyperesult)

archetypes_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


Unnamed: 0,pt,salinity (psu),silicate (umol/kg),potential density,PO,NO,archetype,num_endmembers
0,24.864712,36.05433,0.522479,24.166568,183.777353,167.352287,0_4,4
1,18.553022,30.496375,0.918378,21.489668,264.930155,248.398096,1_4,4
2,0.877457,34.716019,157.515095,27.817573,546.049035,491.972529,2_4,4
3,-0.004944,34.017667,20.138296,27.301571,578.102446,539.040327,3_4,4
4,0.736904,34.70903,164.622947,27.818998,545.527642,492.051386,0_5,5
5,16.044905,35.477164,1.5135,26.101918,255.610881,225.822144,1_5,5
6,-0.091344,34.10707,31.815124,27.384613,644.041599,600.956507,2_5,5
7,29.938258,35.745688,1.268763,22.309574,217.623563,194.311515,3_5,5
8,14.3488,30.064,0.6693,22.306,284.24,265.9304,4_5,5
9,14.150916,35.211772,2.514496,26.31843,260.939751,237.638008,0_6,6


In [71]:
#save_reconstructed_features(
#    reconstructed_features=np.array(XC*S).transpose(),
#    df=remapnames_df)

#archetype_composition_df = get_archetype_composition_df(orig_df=remapnames_df)

##Old code for computing a color when there are only 3 archetypes
#remapnames_df['archetypecolors'] = [
#  matplotlib.colors.rgb2hex((a1,a2,a3))
#  for (a1,a2,a3) in zip(archetype_compositions[:,0],
#                        archetype_compositions[:,1],
#                        archetype_compositions[:,2])
#]

View altair interactive visualizations


In [72]:
import altair as alt

OBS_DF = alt.sample(dropna_remapnames_df, n=5000) #data frame for observations
ARCHETYPES_DF = archetypes_df #data frame for archetypes
CHART_WIDTH=400
CHART_HEIGHT=200
FONTSIZE=10

#convenience functions to turn off default altair behaviour of including
# zero in the axis even if no points are at 0
def nozero_xaxis(field_name):
  return alt.X(field_name, scale=alt.Scale(zero=False))

def nozero_yaxis(field_name, domain=None):
  if (domain is None):
    return alt.Y(field_name, scale=alt.Scale(zero=False))
  else:
    return alt.Y(field_name, scale=alt.Scale(zero=False, domain=domain))

interval_selection = alt.selection_interval()

#define the color property that will be shared for the scatterplots/legend
color = alt.condition(interval_selection, alt.value('lightblue'), alt.value('lightgray'))
#color = alt.condition(COMPOSED_SELECTION, 'squared_errors', alt.value('lightgray'))

tooltip_columns = (
    ['latitude', 'longitude', 'depth (m)']
   +[getarchetypename(i,num_endmembers) for num_endmembers in numendmembers_to_archetyperesult for i in range(num_endmembers)]
   +[x for x in COLUMNS_TO_COMPARE])

#base chart for displaying the observed points

obs_basechart = alt.Chart(OBS_DF).mark_point(opacity=0.2).encode(
  color=color,
  tooltip=tooltip_columns
).properties(width=CHART_WIDTH,
             height=CHART_HEIGHT
).add_selection(interval_selection)

#base chart for displaying the archetypes
numendmembers_select = alt.selection_multi(fields=['num_endmembers'])
numendmembers_color = alt.condition(numendmembers_select,
                        alt.Color('num_endmembers:N', legend=None),
                        alt.value('lightgray'))
archetype_basechart = alt.Chart(ARCHETYPES_DF).mark_point(opacity=1, shape='diamond', size=50).encode(
    color=alt.Color("num_endmembers", scale=alt.Scale(scheme="category10"), legend=None) ).properties(
        width=CHART_WIDTH, height=CHART_HEIGHT).transform_filter(numendmembers_select)
numendmembers_legend = alt.Chart(ARCHETYPES_DF).mark_point().encode(
    y=alt.Y('num_endmembers', axis=alt.Axis(orient='right')),
    color=numendmembers_color).add_selection(numendmembers_select)

def create_scatter(xaxis, yaxis):
  return obs_basechart.encode(nozero_xaxis(xaxis), nozero_yaxis(yaxis)) + archetype_basechart.encode(nozero_xaxis(xaxis), nozero_yaxis(yaxis))
  #return archetype_basechart.encode(nozero_xaxis(xaxis), nozero_yaxis(yaxis))


#compose the whole layout
alt.vconcat(

(create_scatter('pt','salinity (psu)')
| create_scatter('pt', 'potential density')
| create_scatter('potential density', 'salinity (psu)')
| numendmembers_legend),

(create_scatter('salinity (psu)', 'silicate (umol/kg)')
| create_scatter('pt','silicate (umol/kg)')
| create_scatter('potential density','silicate (umol/kg)')),

(create_scatter('pt','NO')
| create_scatter('salinity (psu)', 'NO')
| create_scatter('silicate (umol/kg)','NO')),
 
(create_scatter('pt','PO')
| create_scatter('salinity (psu)', 'PO')
| create_scatter('NO', 'PO')),


).configure_axis(labelFontSize=FONTSIZE,
                 titleFontSize=FONTSIZE)#.properties(padding=0, spacing=0)
# the padding/spacing doesn't propagate to subcharts propertly
