<a href="https://colab.research.google.com/github/AvantiShri/oceanography_colab_notebooks/blob/master/for_rian/Clustering_Of_GP15_WaterMasses.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#leidenalg provides Leiden community detection (it pulls in python-igraph)
!pip install leidenalg
#gsw provides the seawater routine (gsw.pt_from_t) used when deriving 'pt'
!pip install gsw

Collecting leidenalg
[?25l  Downloading https://files.pythonhosted.org/packages/7e/68/01da5910be71e4fd6f96af7c3c0f31f531c96300bbe50b418c0b5a3eaeb6/leidenalg-0.8.1-cp36-cp36m-manylinux2010_x86_64.whl (2.4MB)
[K     |████████████████████████████████| 2.4MB 3.3MB/s 
[?25hCollecting python-igraph>=0.8.0
[?25l  Downloading https://files.pythonhosted.org/packages/8b/74/24a1afbf3abaf1d5f393b668192888d04091d1a6d106319661cd4af05406/python_igraph-0.8.2-cp36-cp36m-manylinux2010_x86_64.whl (3.2MB)
[K     |████████████████████████████████| 3.2MB 30.7MB/s 
[?25hCollecting texttable>=1.6.2
  Downloading https://files.pythonhosted.org/packages/ec/b1/8a1c659ce288bf771d5b1c7cae318ada466f73bd0e16df8d86f27a2a3ee7/texttable-1.6.2-py2.py3-none-any.whl
Installing collected packages: texttable, python-igraph, leidenalg
Successfully installed leidenalg-0.8.1 python-igraph-0.8.2 texttable-1.6.2
Collecting gsw
[?25l  Downloading https://files.pythonhosted.org/packages/31/88/bfb0b1df0ea0a147bde8020af1ffc08

In [2]:
from matplotlib import pyplot as plt
import numpy as np
import pandas
import gsw

Grab the data

In [3]:
#Download the csv from docs.google.com and save it under the file name that
# pandas.read_csv reads further down.
#NOTE(review): --no-check-certificate turns off TLS certificate verification
# for this download -- confirm it is really needed.
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1O869bUhoTrPCW4hDaSVswLiQ7vzhzIlP' -O names_added_GP15OMPA_33RR20180918_only_gs_rosette_clean1_hy1.csv

--2020-08-09 00:10:32--  https://docs.google.com/uc?export=download&id=1O869bUhoTrPCW4hDaSVswLiQ7vzhzIlP
Resolving docs.google.com (docs.google.com)... 172.217.214.102, 172.217.214.139, 172.217.214.113, ...
Connecting to docs.google.com (docs.google.com)|172.217.214.102|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://doc-0s-a4-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/h4b7qkhajjsob42bkrebdqdrv14ij1t5/1596931800000/06203730782251856755/*/1O869bUhoTrPCW4hDaSVswLiQ7vzhzIlP?e=download [following]
--2020-08-09 00:10:32--  https://doc-0s-a4-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/h4b7qkhajjsob42bkrebdqdrv14ij1t5/1596931800000/06203730782251856755/*/1O869bUhoTrPCW4hDaSVswLiQ7vzhzIlP?e=download
Resolving doc-0s-a4-docs.googleusercontent.com (doc-0s-a4-docs.googleusercontent.com)... 209.85.145.132, 2607:f8b0:4001:c1e::84
Connecting to doc-0s-a4-docs.googleusercontent.com (doc-0s-

Read in the data frame and rename the columns

In [4]:
#Column remapping consumed by remap_colnames: each KEY is the descriptive
# column name used by the rest of the notebook, and each VALUE is the column
# name that is looked up in the data frame read from the csv.
colnames_map = {'Station number':"stnnbr",
            'GEOTRACES ID':"geotrc_ID",
            'latitude (degrees)':"lat",
            'longitude (degrees)':"lon",
            'depth (m)':"depth",
            'pressure (dbar)':"pres",
            'temperature(degrees C)':"t",
            'salinity (psu)':"SP",
            'oxygen (umol/kg)':"O2",
            'silicate (umol/kg)':"Si",
            'nitrate (umol/kg)':"NO3",
            'phosphate (umol/kg)':"PO4",
            'potential density':"sig0",
            'PO (umol/kg)':"PO",
            }


#For some reason, altair chokes when provided data frames with some
# of the original column names. So I am remapping the column names.
def remap_colnames(df, colnames_map):
  """Build a new data frame holding the mapped columns of `df` under new names.

  Args:
    df: source data frame.
    colnames_map: dict whose keys are the column names of the returned
      frame and whose values are the corresponding column names in `df`.

  Returns:
    A new pandas.DataFrame with one column per entry of `colnames_map`
    (in the dict's iteration order). Values are copied out through
    np.array, so the index of `df` is not carried over.
  """
  renamed_columns = {}
  for target_name, source_name in colnames_map.items():
    renamed_columns[target_name] = np.array(df[source_name])
  return pandas.DataFrame(renamed_columns)

#Read the csv; entries equal to -999 are read in as NaN
df = pandas.read_csv("names_added_GP15OMPA_33RR20180918_only_gs_rosette_clean1_hy1.csv", na_values = -999)
foraltair_df = remap_colnames(df=df, colnames_map=colnames_map)

#create columns for calculated variables
#'NO' tracer: oxygen + 9.68*nitrate
foraltair_df['NO'] = foraltair_df['oxygen (umol/kg)'] + (foraltair_df['nitrate (umol/kg)']*9.68)

#gsw.pt_from_t takes (SA, t, p, p_ref): Absolute Salinity, in-situ
# temperature, sea pressure and the REFERENCE PRESSURE in dbar.
# The reference pressure must not be the potential density column; 0 dbar
# (the surface) is used here, in line with the 'sig0' short name of the
# potential density column.
# SA is derived from the practical salinity column with gsw.SA_from_SP,
# which needs the pressure and position of each sample.
absolute_salinity = gsw.SA_from_SP(foraltair_df['salinity (psu)'],
                                   foraltair_df['pressure (dbar)'],
                                   foraltair_df['longitude (degrees)'],
                                   foraltair_df['latitude (degrees)'])
foraltair_df['pt'] = gsw.pt_from_t(absolute_salinity,
                                   foraltair_df['temperature(degrees C)'],
                                   foraltair_df['pressure (dbar)'],
                                   0)
#foraltair_df['PO'] = foraltair_df['O2']+ foraltair_df['PO4']*155
#foraltair_df['SiO'] = foraltair_df['O2']+ foraltair_df['Si']*15


Prepare the features for clustering (standardize + impute missing values)

In [5]:
import sklearn.impute

#the columns to use for clustering
columns_to_compare = [
    'pt',
    'salinity (psu)',
    #'oxygen (umol/kg)',
    'silicate (umol/kg)',
    'potential density',
    'PO (umol/kg)',
    'NO',
]

#for clustering purposes, each column is standardized: subtract the mean and
# divide by the standard deviation. nanmean/nanstd are used so that missing
# values are ignored at this stage (they stay NaN in the z-scores).
zscore_columns = {}
for colname in columns_to_compare:
  raw_values = np.array(foraltair_df[colname])
  centered = raw_values - np.nanmean(raw_values)
  zscore_columns['zscore_'+colname] = centered/np.nanstd(raw_values)
forclustering_df = pandas.DataFrame(zscore_columns)

#the remaining NaN values are filled in with KNNImputer
#(5 neighbours, weighted by distance)
knn_imputer = sklearn.impute.KNNImputer(
    missing_values=np.nan, n_neighbors=5, weights='distance')
forclustering_df = pandas.DataFrame(
    data=knn_imputer.fit_transform(forclustering_df),
    columns=forclustering_df.columns)

#'features' matrix: one row per point, one column per entry of
# columns_to_compare (same order)
features = np.array([np.array(forclustering_df["zscore_"+col])
                     for col in columns_to_compare]).T

Run clustering + compute lower-dimensional t-sne visualization

In [6]:
import leidenalg
import scipy
import sklearn.manifold


#From: https://github.com/theislab/scanpy/blob/8131b05b7a8729eae3d3a5e146292f377dd736f7/scanpy/_utils.py#L159
def get_igraph_from_adjacency(adjacency, directed=None):
    """Get igraph graph from adjacency matrix.

    One vertex is created per row of `adjacency` and one edge per nonzero
    entry; the nonzero values are stored as the 'weight' edge attribute.

    Args:
        adjacency: 2D matrix supporting `.nonzero()`, `.shape` and indexing
            with a pair of index arrays.
        directed: forwarded unchanged to `igraph.Graph(directed=...)`.

    Returns:
        The constructed igraph Graph. If assigning the edge weights fails,
        a warning is printed and the graph is returned without the 'weight'
        attribute (best effort, as before -- but no longer silent).
    """
    import igraph as ig
    sources, targets = adjacency.nonzero()
    weights = adjacency[sources, targets]
    if isinstance(weights, np.matrix):
        # flatten the np.matrix result to a 1D array
        # (presumably what indexing a scipy sparse matrix returns)
        weights = weights.A1
    g = ig.Graph(directed=directed)
    g.add_vertices(adjacency.shape[0])  # this adds adjacency.shape[0] vertices
    g.add_edges(list(zip(sources, targets)))
    try:
        g.es['weight'] = weights
    except Exception as err:
        # Stay best-effort, but do not hide the failure: callers that read
        # g.es['weight'] afterwards would otherwise fail with no hint why.
        # `Exception` (rather than a bare except) lets KeyboardInterrupt
        # and SystemExit propagate.
        print('WARNING: could not set the edge weights on the graph: '
              +repr(err))
    if g.vcount() != adjacency.shape[0]:
        print('WARNING: The constructed graph has only '
              +str(g.vcount())+' nodes. '
             'Your adjacency matrix contained redundant nodes.')
    return g


def run_leiden_community_detection(affinity_matrix, seed):
  """Run one Leiden modularity optimization on the graph of `affinity_matrix`.

  Args:
    affinity_matrix: matrix handed to get_igraph_from_adjacency; its nonzero
      entries become the weighted edges of the graph.
    seed: random seed forwarded to leidenalg.find_partition.

  Returns:
    The partition object returned by leidenalg.find_partition.
  """
  graph = get_igraph_from_adjacency(affinity_matrix)
  edge_weights = np.array(graph.es['weight']).astype(np.float64)
  #n_iterations=-1: per the leidenalg documentation, a negative value keeps
  # iterating until an iteration brings no further improvement
  return leidenalg.find_partition(
      graph,
      leidenalg.ModularityVertexPartition,
      weights=edge_weights,
      n_iterations=-1,
      seed=seed)


def run_leiden_with_multiple_seeds_and_take_best(affinity_matrix, num_seeds):
  """Run Leiden with several seeds and keep the highest-quality clustering.

  The seeds used are 0, 100, 200, ... (seed index times 100). On ties the
  clustering from the earliest seed is kept.

  Args:
    affinity_matrix: matrix passed to run_leiden_community_detection.
    num_seeds: number of seeds to try; must be at least 1.

  Returns:
    numpy array with the cluster membership of every vertex, taken from the
    partition with the largest `quality()`.

  Raises:
    ValueError: if `num_seeds` is smaller than 1 (there would be no
      clustering to return).
  """
  if num_seeds < 1:
    raise ValueError(
        "num_seeds must be at least 1, got "+str(num_seeds))
  best_quality = None
  best_clustering = None
  for seedidx in range(num_seeds):
    partition = run_leiden_community_detection(affinity_matrix, seedidx*100)
    quality = partition.quality()
    if ((best_quality is None) or (quality > best_quality)):
      best_quality = quality
      best_clustering = np.array(partition.membership)
  return best_clustering


def run_leiden_using_tsneadapted_distances(features, perplexity, num_seeds=3):
  """Cluster the rows of `features` with Leiden on t-sne style affinities.

  Pairwise distances between the rows are turned into an affinity matrix
  with sklearn's t-sne perplexity search, the matrix is symmetrized, and
  Leiden community detection is run on it.

  Args:
    features: 2D array, one row per point.
    perplexity: target perplexity handed to the perplexity search.
    num_seeds: how many Leiden seeds to try; the best-quality result is
      kept. Defaults to 3.

  Returns:
    Array of cluster memberships, one entry per row of `features`.
  """
  #`import scipy` alone does not guarantee that the scipy.spatial.distance
  # submodule has been loaded, so import it explicitly here
  import scipy.spatial.distance
  pairwise_distances = scipy.spatial.distance.squareform(
      scipy.spatial.distance.pdist(X=features))
  #NOTE: _binary_search_perplexity is a private sklearn helper (leading
  # underscores), so it may change between sklearn versions
  affmat = sklearn.manifold._utils._binary_search_perplexity(
                pairwise_distances.astype("float32"), perplexity, False)
  #symmetrize affinity matrix by addition
  affmat = affmat + affmat.T
  #run leiden with num_seeds random seeds and take the best one
  leiden_clusters = run_leiden_with_multiple_seeds_and_take_best(
      affinity_matrix=affmat, num_seeds=num_seeds)
  return leiden_clusters


#Get Leiden communities using t-sne derived distances
PERPLEXITY = 20
leiden_clusters = run_leiden_using_tsneadapted_distances(
    features=features,
    perplexity=PERPLEXITY)

#derive the t-sne embedding from the same features, with the same perplexity
tsne_model = sklearn.manifold.TSNE(perplexity=PERPLEXITY, random_state=123)
embedding = tsne_model.fit_transform(features)

#Store the results of the clustering and the embedding in the data frame
foraltair_df['tsne_axis1'] = embedding[:,0]
foraltair_df['tsne_axis2'] = embedding[:,1]
#The clusters are stored as strings so that they automatically get
# interpreted as categorical
foraltair_df['clusters'] = list(map(str, leiden_clusters))

View altair interactive visualizations


In [12]:
import altair as alt

#data frame that every chart in this cell draws from
DF_TO_USE = foraltair_df
#interval (brush) selection, attached to the scatterplots and histograms
INTERVAL_SELECTION = alt.selection_interval()
#selection on the 'clusters' field, attached to the legend chart
LEGEND_SELECTION = alt.selection_multi(fields=['clusters'])
#combination (|) of the two selections; used for the conditional colouring
# and for filtering the foreground histograms
COMPOSED_SELECTION = (INTERVAL_SELECTION | LEGEND_SELECTION)
#overall size from which the individual subchart sizes are derived
TOTAL_WIDTH=1200
TOTAL_HEIGHT=680
#fractions of the total height/width given to the t-sne scatterplot
TSNE_HEIGHTFRAC=0.4
TSNE_WIDTHFRAC=0.2
#font size for axis labels and titles (see configure_axis at the end);
# also subtracted from each subchart's width/height
FONTSIZE=10
PADDING_GUESS=45 #additional padding to subtract off


#convenience functions to turn off default altair behaviour of including
# zero in the axis even if no points are at 0
def nozero_xaxis(field_name):
  """Return an alt.X encoding for `field_name` whose scale has zero=False."""
  scale_without_zero = alt.Scale(zero=False)
  return alt.X(field_name, scale=scale_without_zero)
def nozero_yaxis(field_name, domain=None):
  """Return an alt.Y encoding for `field_name` whose scale has zero=False.

  Args:
    field_name: the field to encode on the y axis.
    domain: optional scale domain; when None (the default) no domain is
      passed to alt.Scale at all.
  """
  scale_kwargs = {'zero': False}
  if domain is not None:
    scale_kwargs['domain'] = domain
  return alt.Y(field_name, scale=alt.Scale(**scale_kwargs))

def get_interactive_histogram(colname):
  """Layered histogram of `colname` that reacts to the shared selections.

  A light grey histogram of all rows of DF_TO_USE is drawn in the
  background and carries the interval selection; a steel blue histogram of
  only the rows passing COMPOSED_SELECTION is layered on top of it.

  Args:
    colname: name of the (quantitative) column of DF_TO_USE to bin.

  Returns:
    The layered chart (background + foreground).
  """
  yaxis = alt.Y('count():Q', title="Count")
  xaxis = alt.X(colname+':Q', bin=alt.Bin(maxbins=100))
  #apparently height/width doesn't include the space for the
  # axes labels, so these need to be adjusted a bit.
  axis_allowance = FONTSIZE+PADDING_GUESS
  #The selection is attached with add_selection, like on every other chart
  # in this cell, rather than through the legacy properties(selection=...)
  bg_histogram = alt.Chart(DF_TO_USE).mark_bar().encode(
                    y=yaxis,
                    x=xaxis,
                    color=alt.value('lightgrey')).properties(
                      width=TOTAL_WIDTH*(1-TSNE_WIDTHFRAC)/4
                            - axis_allowance,
                      height=TOTAL_HEIGHT*TSNE_HEIGHTFRAC/3
                            - axis_allowance
                    ).add_selection(INTERVAL_SELECTION)
  fg_histogram = alt.Chart(DF_TO_USE).mark_bar().encode(
                      y=yaxis,
                      color=alt.value('steelblue'),
                      x=xaxis).transform_filter(COMPOSED_SELECTION)
  return (bg_histogram+fg_histogram)

#colour shared by the scatterplots and the legend: points inside the
# composed selection are coloured by cluster, all others are light gray
color = alt.condition(
    COMPOSED_SELECTION,
    'clusters',
    alt.value('lightgray'),
    scale=alt.Scale(scheme='category20'),
    legend=None)

#every scatterplot starts from the same semi-transparent, cluster-coloured
# point chart; only the size differs
colored_points = alt.Chart(DF_TO_USE).mark_point(opacity=0.3).encode(
    color=color)

#base chart for t-sne scatterplot
tsne_base = colored_points.properties(
    width=TOTAL_WIDTH*TSNE_WIDTHFRAC - (FONTSIZE+PADDING_GUESS),
    height=TOTAL_HEIGHT*TSNE_HEIGHTFRAC - (FONTSIZE+PADDING_GUESS),
).add_selection(INTERVAL_SELECTION)

#base chart for all other scatterplots
base = colored_points.properties(
    width=TOTAL_WIDTH/4 - (FONTSIZE+PADDING_GUESS),
    height=(TOTAL_HEIGHT*(1-TSNE_HEIGHTFRAC))/2 - (FONTSIZE+PADDING_GUESS),
).add_selection(INTERVAL_SELECTION)

#selectable legend: one point per cluster, labelled on the right-hand side
legend_axis = alt.Y('clusters:N', axis=alt.Axis(orient='right'))
legend = alt.Chart(DF_TO_USE).mark_point().encode(
    y=legend_axis,
    color=color,
).add_selection(LEGEND_SELECTION)

#compose the whole layout

def _scatter(x_field, y_field, y_domain=None):
  """Scatterplot of y_field against x_field, built on the shared `base`."""
  return base.encode(nozero_xaxis(x_field),
                     nozero_yaxis(y_field, domain=y_domain))

def _side_by_side(charts):
  """Join the charts left to right with `|`, in the order given."""
  row = charts[0]
  for chart in charts[1:]:
    row = row | chart
  return row

#each inner list is one column of three stacked histograms
#NOTE(review): the last two columns show the oxygen histogram six times in
# total -- presumably placeholders; confirm which fields were intended.
HISTOGRAM_COLUMNS = [
    ['potential density', 'pt', 'salinity (psu)'],
    ['silicate (umol/kg)', 'NO', 'PO (umol/kg)'],
    ['oxygen (umol/kg)', 'oxygen (umol/kg)', 'oxygen (umol/kg)'],
    ['oxygen (umol/kg)', 'oxygen (umol/kg)', 'oxygen (umol/kg)'],
]

#top row: t-sne scatterplot, the histogram columns, then the legend
histogram_row = _side_by_side(
    [tsne_base.encode(x='tsne_axis1', y='tsne_axis2')]
    + [alt.vconcat(*[get_interactive_histogram(field) for field in fields])
       for fields in HISTOGRAM_COLUMNS]
    + [legend])

#property-property scatterplots against silicate / salinity
silicate_row = _side_by_side([
    _scatter('pt', 'salinity (psu)'),
    _scatter('salinity (psu)', 'silicate (umol/kg)'),
    _scatter('pt', 'silicate (umol/kg)'),
])

#scatterplots with NO on the y axis
no_row = _side_by_side([
    _scatter('pt', 'NO'),
    _scatter('salinity (psu)', 'NO'),
    _scatter('silicate (umol/kg)', 'NO'),
])

#scatterplots with PO on the y axis
#NOTE(review): the salinity-vs-NO plot here repeats the one in the NO row.
po_row = _side_by_side([
    _scatter('pt', 'PO (umol/kg)'),
    _scatter('salinity (psu)', 'PO (umol/kg)'),
    _scatter('salinity (psu)', 'NO'),
    _scatter('NO', 'PO (umol/kg)'),
])

#latitude/depth section; the y domain runs from 6000 to 0
depth_section = _scatter('latitude (degrees)', 'depth (m)',
                         y_domain=(6000, 0))

alt.vconcat(
    histogram_row,
    silicate_row,
    no_row,
    po_row,
    depth_section,
).configure_axis(
    labelFontSize=FONTSIZE,
    titleFontSize=FONTSIZE,
).properties(padding=0, spacing=0)
# the padding/spacing doesn't propagate to subcharts properly
