## Unsupervised learning and EDA

### GRN Learn

In [None]:
import grn_learn as g
from grn_learn import viz
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import random 
import matplotlib as mpl
from scipy.stats import pearsonr
from sklearn import mixture
from sklearn.metrics import mutual_info_score as mi
from umap import UMAP


import bebi103 #jbois' library 
import hvplot
import hvplot.pandas
import holoviews as hv
from holoviews import dim, opts
import bokeh_catplot
import bokeh 
import bokeh.io
from bokeh.themes import Theme
from bokeh.io import output_file, save, output_notebook
from holoviews.operation.datashader import datashade
# Render Bokeh output inline and select Bokeh as the HoloViews backend.
output_notebook()
hv.extension('bokeh')

#ss.set_plotting_style_2()
# Seed NumPy's global RNG so steps that draw from it are repeatable.
np.random.seed(42)

# Show the value of every top-level expression in a cell, not only the last.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Plot styling from grn_learn (presumably matplotlib defaults — confirm),
# inline SVG figures, and automatic reloading of edited modules.
g.set_plotting_style()
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
%load_ext autoreload
%autoreload 2

In [None]:
# Build a Bokeh theme from the style returned by viz.bokeh_style() and apply
# it to the current Bokeh document and to the HoloViews Bokeh renderer.
theme = Theme(json=viz.bokeh_style())
bokeh.io.curdoc().theme = theme
hv.renderer('bokeh').theme = theme

### Unsupervised learning : manifold learning and clustering. 

This is a to-do list for this notebook.
1. check if TFs in a module are correlated : 

*  e.g. genes in an operon
*  genes in a module SoxR and SoxS
* SoxR and OxyR 

As a first pass, let's try to import the data from *E. coli* we had already denoised using PCA to quickly make sure that we have correlation between genes in a given module. 

Let's try with a couple of genes from the lac operon : LacZ and LacA

In [None]:
# Directory containing the input CSV used below.
# NOTE(review): hard-coded, user-specific location; adjust when running elsewhere.
path = '~/jupyter-notebooks/rpgroup/data/ml_dfs/'

In [None]:
# Load the E. coli table; the first CSV column is used as the index.
regulons_with_noise = pd.read_csv(path + 'ecoli_ml.csv', index_col = 0)

In [None]:
# Feature matrix: every column from the fourth onward, as a NumPy array.
# NOTE(review): assumes the first three columns are non-feature metadata — confirm against the CSV.
data = regulons_with_noise.iloc[:, 3:].values

In [None]:
# Embed the feature matrix with UMAP using the library defaults.
# NOTE(review): no random_state is passed; reproducibility presumably relies on the global NumPy seed — confirm.
latent_space = UMAP().fit_transform(data)

In [None]:
# Sanity check: (number of rows, number of embedding dimensions).
latent_space.shape

In [None]:
# Store each embedding coordinate as its own column so it can be plotted.
regulons_with_noise['UMAP 1'] = latent_space[:, 0]
regulons_with_noise['UMAP 2'] = latent_space[:, 1]

## EDA

In [None]:
# Styling applied to the raw point plot of the embedding.
dots_kws = dict(
    padding=0.2,
    alpha=0.3,
    tools=['hover'],
    color='orange',
    show_grid=True,
    width=420,
    height=300,
)

# Options kept for the datashaded rendering.
shader_opts = dict(padding=0.2, tools=['hover'])

In [None]:
# Points in the UMAP plane; the gene name is carried along as a value
# dimension so the hover tool can show it.
dots = hv.Points(
    data=regulons_with_noise,
    kdims=['UMAP 1', 'UMAP 2'],
    vdims=['gene name'],
).opts(
    **dots_kws,
    xlabel='UMAP 1',
    ylabel='UMAP 2',
)

In [None]:
# Render the point cloud through datashader, with the hover tool enabled.
datashade(dots).opts(tools = ['hover'])

### Choosing the optimal number of components

In [None]:
# Model selection: fit full-covariance Gaussian mixtures with 1..20 components
# on the UMAP embedding and compare them by BIC and AIC (lower is better).
n_components = np.arange(1, 21)

models = [
    mixture.GaussianMixture(n, covariance_type='full', random_state=0).fit(latent_space)
    for n in n_components
]

plt.plot(n_components, [m.bic(latent_space) for m in models],
         label='Bayesian information criterion', color='dodgerblue')
# Legend label spelling corrected to "Akaike".
plt.plot(n_components, [m.aic(latent_space) for m in models],
         label='Akaike information criterion', color='salmon')

plt.legend(loc='best')
plt.xlabel('n_components')
# Label the y axis so the figure can be read on its own.
plt.ylabel('information criterion');

We can see that a good number of clusters would be 5, 7, or 10. We could even go further and choose 20 clusters, because the information criteria keep decreasing.

Let's try 10 clusters. 

In [None]:
# Fit a Dirichlet process Gaussian mixture with at most ten components
# (n_components=10) and full covariance matrices on the UMAP embedding.
dpgmm = mixture.BayesianGaussianMixture(n_components=10,
                                        covariance_type='full').fit(latent_space)

Because we have a probabilistic generative model, we can calculate the probability that each gene is a member of each cluster. This lets us assign genes to clusters softly, rather than forcing a single hard label.

In [None]:
# Membership probabilities: one row per embedded point, one column per mixture component.
probs = dpgmm.predict_proba(latent_space)

In [None]:
# Show the membership probabilities for the first ten rows.
pd.DataFrame(probs[:10, :])

However, we can also get the cluster label with the highest probability for each gene.

**Q: How much do the labels change if we cluster in the original high-dimensional space?** Not much!

In [None]:
# Hard assignment: the predicted component for each point of the embedding.
labels = dpgmm.predict(latent_space)

In [None]:
# Fit a Dirichlet process Gaussian mixture with at most ten components
# (n_components=10) on the original high-dimensional features.
# NOTE(review): this rebinds `dpgmm`; the model fitted on `latent_space` is
# no longer reachable under that name once this cell has run.
dpgmm = mixture.BayesianGaussianMixture(n_components=10,
                                        covariance_type='full').fit(data)

In [None]:
# Hard cluster assignment in the original feature space.
hi_dim_labels = dpgmm.predict(data)

In [None]:
# Mutual information of the embedding-space labelling with itself (reference
# value), then between the embedding-space and feature-space labellings.
mi(labels, labels)
mi(labels, hi_dim_labels)

In [None]:
# Number of points assigned to each embedding-space cluster.
pd.Series(labels).value_counts()

In [None]:
# Attach the embedding-space cluster labels to the table.
regulons_with_noise['cluster_labels'] = labels

In [None]:
# Attach the labels obtained by clustering the original high-dimensional
# features. Assigning `labels` here would only duplicate 'cluster_labels',
# making any plot coloured by 'hi_dim_clusters' identical to the
# embedding-space one.
regulons_with_noise['hi_dim_clusters'] = hi_dim_labels

In [None]:
# UMAP embedding coloured by the cluster labels fitted on the embedding.
regulons_with_noise.hvplot(
    kind='scatter',
    x='UMAP 1',
    y='UMAP 2',
    c='cluster_labels',
    hover_cols=['gene name'],
    s=80,
    alpha=0.4,
).opts(
    cmap='Set3',
    padding=0.5,
    height=350,
    width=500,
    colorbar_opts={'title': 'clusters'},
)

In [None]:
# UMAP embedding coloured by the 'hi_dim_clusters' column.
regulons_with_noise.hvplot(
    kind='scatter',
    x='UMAP 1',
    y='UMAP 2',
    c='hi_dim_clusters',
    hover_cols=['gene name'],
    s=80,
    alpha=0.4,
).opts(
    cmap='Set3',
    padding=0.5,
    height=350,
    width=500,
    colorbar_opts={'title': 'clusters'},
)

## Integrating TF annotation

In [None]:
# Directory containing the TF annotation files read below.
# NOTE(review): hard-coded, user-specific location; adjust when running elsewhere.
path_tf = '~/Documents/uni/bioinfo/data/coli/'

In [None]:
# TF list (presumably the experimentally supported TFs, per the file name — confirm);
# lines starting with '#' are treated as comments and skipped.
tfs = pd.read_csv( path_tf + 'exp_tf_list_gene_name.csv', comment = '#')

In [None]:
# Second TF list (presumably hypothetical/predicted TFs, per the file name — confirm).
hyp_tfs = pd.read_csv(path_tf + 'hypTF_list_genes.csv')

In [None]:
# Extract the gene-name columns as NumPy arrays for the membership tests below.
hyp_tfs_genes = hyp_tfs['hyptfs'].values
tfs_genes = tfs['TF'].values

In [None]:
# Combined list of gene names from both TF lists.
# NOTE(review): this rebinds `tfs` from a DataFrame to a list, so re-running
# the cell that reads `tfs.TF` after this one will fail.
tfs = list(hyp_tfs_genes) + list(tfs_genes)

In [None]:
# Label every gene: 'hyp_tf' if it appears in hyp_tfs_genes, otherwise 'tf' if
# it appears in tfs_genes, otherwise 'regular_gene'. The hyp_tfs_genes test is
# tried first, so it wins when a gene is present in both arrays.
tf_annot = [
    'hyp_tf' if gene_name in hyp_tfs_genes
    else 'tf' if gene_name in tfs_genes
    else 'regular_gene'
    for gene_name in regulons_with_noise['gene name']
]
        

In [None]:
# Number of genes in each annotation category.
pd.Series(tf_annot).value_counts()

In [None]:
# Store the annotation as a column of the table.
regulons_with_noise['tf_annot'] = tf_annot

In [None]:
# UMAP embedding coloured by TF annotation category.
regulons_with_noise.hvplot(
    kind='scatter',
    x='UMAP 1',
    y='UMAP 2',
    c='tf_annot',
    hover_cols=['gene name', 'cluster_labels'],
    s=80,
    alpha=0.4,
).opts(
    cmap='Set2',
    padding=0.5,
    height=400,
    width=650,
    colorbar_opts={'title': 'clusters'},
)

TFs seem to co-localize with their target genes in the UMAP latent space. 

### Integrating TF clusters from the TF-TF network

In [None]:
#Loading the TF-TF TRN, available at RegulonDB
# Tab-separated file; lines starting with '#' are skipped; index_col=False
# stops pandas from using the first column as the index.

tf_trn = pd.read_csv(path_tf + "tf-tf-l.txt", delimiter= '\t', comment= '#', index_col= False)
tf_trn.head()

In [None]:
# Build a graph with one edge per row ('TF' -> 'TG'), keeping 'regType' as an
# edge attribute. No create_using is given, so networkx uses its default graph type.
net = nx.from_pandas_edgelist(df= tf_trn, source= 'TF', target='TG',
                             edge_attr='regType')

Let's compute the largest connected component (LCC).

In [None]:
# Keep only the largest connected component (LCC) of the network.
# nx.connected_component_subgraphs was removed in networkx 2.4, so pick the
# largest node set and take an independent copy of the induced subgraph.
net = net.subgraph(max(nx.connected_components(net), key=len)).copy()

Now we can run the Louvain clustering algorithm. 

In [None]:
import community

In [None]:
##Cluster the TF-TF network LCC

# Louvain partition: maps each node to an integer community id.
communities = community.best_partition(net)

# Largest community id found.
# NOTE(review): if ids start at 0, the number of communities is this value + 1;
# confirm which of the two g.get_network_clusters expects.
n_clusters_tf = max(communities.values())

n_clusters_tf

In [None]:
# Record each node's community id under the 'modularity' node attribute.
nx.set_node_attributes(net, values= communities, name='modularity')

In [None]:
# Group the nodes by community with the grn_learn helper.
# NOTE(review): presumably returns one collection of node names per cluster — confirm.
tf_clusters = g.get_network_clusters(net, n_clusters_tf)

In [None]:
# Inspect the first cluster.
tf_clusters[0]

In [None]:
# Unpack the clusters into individual names.
# NOTE(review): requires tf_clusters to hold exactly 11 items; any other
# count raises ValueError here.
cluster1, cluster2, cluster3, cluster4,\
cluster5, cluster6, cluster7, cluster8, \
cluster9, cluster10, cluster11 = tf_clusters

Now we can assign the annotation to the genes in the dataframe. 

In [None]:
# Map 1-based cluster ids to the members of each cluster. The ids are derived
# from tf_clusters itself rather than a hard-coded range, so no cluster is
# silently dropped if the partition yields a different number of clusters.
annot_dict = dict(enumerate(tf_clusters, start=1))

In [None]:
# Print the members of cluster6 for manual inspection.
print(cluster6, end = ' ')

In [None]:
#6
# Hand-written names for the TF-TF cluster ids (0 stands for "no cluster").
# NOTE(review): keys 3, 5 and 10 each appear twice in this literal. Python
# keeps only the LAST value for a repeated key, so 'DNA repair',
# 'fur-purr-oxyr' and 'galactose' are silently discarded. The intended ids
# for those three labels need to be confirmed and corrected.
clusters_dict = {0: 'None',
                 10: 'galactose', 
                3: 'DNA repair',
                4: 'stress response + acid', 
                1: 'oxidative stress + drug resistance (mar-rob-sox box)', 
                5: 'fur-purr-oxyr', 
                2: 'flagella + curli + biofilm', 
                3: 'carbon metabolism (laci, rhar, melr, malt)', 
                11: 'toxin-antitoxin', 
                5: 'sugar (puur, xylr, arac, beti)', 
                8 : 'carbon metabolism (laci, cytr, glpr, rhas, crp) ',
                10: 'purine metabolism', 
                 9: 'globar reg / NAPs (fis, hup, cra, hns)',
                 6: 'anaerobic metabolism',
                7: 'sulfur and nitrogen metabolism'}

In [None]:
# NOTE(review): leftover interactive help lookup (IPython `?` syntax); it has no effect on the analysis.
pd.DataFrame.from_dict?

In [None]:
# Lay the clusters out as one column per cluster id, then melt to long format:
# one row per (cluster id, gene name) pair.
# NOTE(review): clusters shorter than the longest one are presumably padded with missing values — confirm.
tf_df = pd.melt(pd.DataFrame.from_dict(annot_dict, orient='index').T,
        var_name = 'tf_tf_cluster', value_name= 'gene name')

In [None]:
# Turn 'None' strings into NaN, drop rows with missing values, and renumber the index.
tf_df = tf_df.replace('None', np.nan).dropna().reset_index(drop = True)

In [None]:
# Preview the long-format cluster table.
tf_df.head()

In [None]:
# Look up the descriptive name for each row's cluster id.
cluster_names = [
    clusters_dict[cluster_id]
    for cluster_id in tf_df['tf_tf_cluster'].values
]

In [None]:
# Store the descriptive cluster names next to the ids.
tf_df['cluster name'] = cluster_names

In [None]:
# Left-join the cluster annotation onto the gene table by 'gene name'; genes
# without a match keep missing values in the cluster columns.
tf_annot_regulons = pd.merge(regulons_with_noise, tf_df, how = 'left', on = 'gene name')

In [None]:
# Genes with no TF-TF cluster get id 0. The result is assigned back to the
# column explicitly: an in-place replace on an attribute-accessed column is a
# chained assignment, which warns and may leave the frame unchanged on newer
# pandas (Copy-on-Write).
tf_annot_regulons['tf_tf_cluster'] = tf_annot_regulons['tf_tf_cluster'].replace(np.nan, 0)

In [None]:
# Preview the merged table.
tf_annot_regulons.head()

In [None]:
# NOTE(review): interactive listing of the parent directory; not part of the analysis.
ls ..

In [None]:
# Disabled: writes the merged table to disk. Uncomment to (re)generate the CSV.
#tf_annot_regulons.to_csv('../data/regulons_w_noise_post_clustering.csv', index = False)

In [None]:
# Keep only the genes that were assigned a TF-TF cluster (id other than 0).
has_tf_cluster = tf_annot_regulons['tf_tf_cluster'] != 0
tf_only = tf_annot_regulons[has_tf_cluster]

Let's get all of the experimental TFs that are missing because of mismatched gene names.

In [None]:
# Names in tfs_genes that do not appear among the cluster-annotated genes.
print(list(set(tfs_genes)- set(tf_only['gene name'].values)), end = ' ')

In [None]:
# Number of cluster-annotated genes (rows) and columns.
tf_only.shape

In [None]:
# Reload the table from disk, replacing the in-memory tf_annot_regulons.
# NOTE(review): the only visible cell that writes this file is commented out,
# so this read needs a previously saved CSV; `tf_only` was derived from the
# in-memory frame, not from this reloaded one.
tf_annot_regulons = pd.read_csv('../data/regulons_w_noise_post_clustering.csv')

In [None]:
# All genes in the UMAP plane, coloured by TF-TF cluster id.
tf_annot_regulons.hvplot(
    kind='scatter',
    x='UMAP 1',
    y='UMAP 2',
    c='tf_tf_cluster',
    hover_cols=['gene name', 'cluster_labels', 'tf_tf_cluster'],
    s=80,
    alpha=0.3,
).opts(
    cmap='Set3_r',
    padding=0.5,
    height=400,
    width=650,
    colorbar_opts={'title': 'clusters'},
)

In [None]:
# Only the cluster-annotated genes, coloured by TF-TF cluster id.
tf_only.hvplot(
    kind='scatter',
    x='UMAP 1',
    y='UMAP 2',
    c='tf_tf_cluster',
    hover_cols=['gene name', 'cluster_labels', 'tf_tf_cluster', 'cluster name'],
    s=80,
    alpha=0.6,
).opts(
    cmap='Set3_r',
    padding=0.5,
    height=300,
    width=500,
    colorbar_opts={'title': 'clusters'},
)

In [None]:
#datashade(dots).opts(**shader_opts)