Skip to content
Permalink
master
Switch branches/tags
Go to file
 
 
Cannot retrieve contributors at this time
### clustering.py
#Copyright 2005-2008 J. David Gladstone Institutes, San Francisco California
#Author Nathan Salomonis - nsalomonis@gmail.com
#Permission is hereby granted, free of charge, to any person obtaining a copy
#of this software and associated documentation files (the "Software"), to deal
#in the Software without restriction, including without limitation the rights
#to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
#copies of the Software, and to permit persons to whom the Software is furnished
#to do so, subject to the following conditions:
#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
#INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
#PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
#HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
#OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
#SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
import platform
useDefaultBackend=False
if platform.system()=='Darwin':
if platform.mac_ver()[0] == '10.14.6':
useDefaultBackend=True
import sys,string,os,copy
sys.path.insert(1, os.path.join(sys.path[0], '..')) ### import parent dir dependencies
command_args = string.join(sys.argv,' ')
if len(sys.argv[1:])>0 and '--' in command_args: commandLine=True
else: commandLine=False
display_label_names = True
benchmark = False
cluster_colors = 'Paired' #Paired #gist_ncar
import traceback
try:
import math
import warnings
with warnings.catch_warnings():
warnings.filterwarnings("ignore",category=UserWarning) ### hides import warnings
warnings.filterwarnings("ignore",category=RuntimeWarning) ### hides import warnings
import matplotlib
if commandLine and 'linux' in sys.platform:
### TkAgg doesn't work when AltAnalyze is run remotely (ssh or sh script)
try: matplotlib.use('Agg');
except Exception: pass
try:
matplotlib.rcParams['backend'] = 'Agg'
except Exception: pass
else:
try:
if useDefaultBackend == False:
import matplotlib.backends.backend_tkagg
if platform.system()=='Darwin':
matplotlib.use('macosx')
else:
matplotlib.use('TkAgg')
except Exception: pass
if useDefaultBackend == False:
if platform.system()=='Darwin':
try: matplotlib.rcParams['backend'] = 'macosx'
except Exception: pass
else:
try: matplotlib.rcParams['backend'] = 'TkAgg'
except Exception: pass
try:
import matplotlib.pyplot as pylab
import matplotlib.colors as mc
import matplotlib.mlab as mlab
import matplotlib.ticker as tic
from matplotlib.patches import Circle
import mpl_toolkits
from mpl_toolkits import mplot3d
from mpl_toolkits.mplot3d import Axes3D
try: from matplotlib.cbook import _string_to_bool
except: pass
matplotlib.rcParams['axes.linewidth'] = 0.5
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['font.family'] = 'sans-serif'
matplotlib.rcParams['font.sans-serif'] = 'Arial'
matplotlib.rcParams['figure.facecolor'] = 'white' ### Added in 2.1.2
#matplotlib.rcParams['figure.dpi'] = 200 ### Control the image resolution for pylab.show()
except Exception:
print traceback.format_exc()
print 'Matplotlib support not enabled'
import scipy
try: from scipy.sparse.csgraph import _validation
except Exception: pass
try: from scipy import stats
except: pass
try:
from scipy.linalg import svd
import scipy.special._ufuncs_cxx
from scipy.spatial import _voronoi
from scipy.spatial import _spherical_voronoi
from scipy.spatial import qhull
import scipy._lib.messagestream
except Exception:
pass
import scipy.cluster.hierarchy as sch
import scipy.spatial.distance as dist
#import scipy.interpolate.interpnd
#from scipy import optimize
try: import numpy; np = numpy
except:
print 'Numpy import error...'
print traceback.format_exc()
### The below is used for binary freeze dependency identification
if 'darwin' in sys.platform:
### The below is used for binary freeze dependency identification
try: import umap
except: pass
try:
from cairo import ImageSurface
except: pass
try:
import igraph.vendor.texttable
except ImportError: pass
try:
from sklearn.decomposition import PCA, FastICA
except Exception: pass
try: from sklearn.neighbors import quad_tree ### supported in sklearn>18.2
except: pass
try: import sklearn.utils.sparsetools._graph_validation
except: pass
try: import sklearn.utils.weight_vector
except: pass
from sklearn.neighbors import *
from sklearn.manifold.t_sne import *
from sklearn.tree import *; from sklearn.tree import _utils
from sklearn.manifold.t_sne import _utils
from sklearn.manifold import TSNE
from sklearn.neighbors import NearestNeighbors
import sklearn.linear_model.sgd_fast
import sklearn.utils.lgamma
try: import scipy.special.cython_special
except: pass
import sklearn.neighbors.typedefs
import sklearn.neighbors.ball_tree
try:
import numba
import numba.config
import llvmlite; from llvmlite import binding; from llvmlite.binding import *
from llvmlite.binding import ffi; from llvmlite.binding import dylib
except:
pass
#pylab.ion() # closes Tk window after show - could be nice to include
except Exception:
print traceback.format_exc()
pass
try: import numpy
except: pass
import time
import unique
from stats_scripts import statistics
import os
import export
import webbrowser
import warnings
import UI
use_default_colors = False
try:
warnings.simplefilter("ignore", numpy.ComplexWarning)
warnings.simplefilter("ignore", DeprecationWarning) ### Annoying depreciation warnings (occurs in sch somewhere)
#This shouldn't be needed in python 2.7 which suppresses DeprecationWarning - Larsson
except Exception: None
from visualization_scripts import WikiPathways_webservice
try:
import fastcluster as fc
#print 'Using fastcluster instead of scipy hierarchical cluster'
#fc = sch
except Exception:
#print 'Using scipy insteady of fastcluster (not installed)'
try: fc = sch ### fastcluster uses the same convention names for linkage as sch
except Exception: print 'Scipy support not present...'
def getColorRange(x):
""" Determines the range of colors, centered at zero, for normalizing cmap """
vmax=x.max()
vmin=x.min()
if vmax<0 and vmin<0: direction = 'negative'
elif vmax>0 and vmin>0: direction = 'positive'
else: direction = 'both'
if direction == 'both':
vmax = max([vmax,abs(vmin)])
vmin = -1*vmax
return vmax,vmin
else:
return vmax,vmin
def heatmap(x, row_header, column_header, row_method, column_method, row_metric, column_metric, color_gradient,
dataset_name, display=False, contrast=None, allowAxisCompression=True,Normalize=True,
PriorColumnClusters=None, PriorRowClusters=None):
print "Performing hieararchical clustering using %s for columns and %s for rows" % (column_metric,row_metric)
show_color_bars = True ### Currently, the color bars don't exactly reflect the dendrogram colors
try: ExportCorreleationMatrix = exportCorreleationMatrix
except Exception: ExportCorreleationMatrix = False
try: os.mkdir(root_dir) ### May need to create this directory
except Exception: None
if display == False:
pylab.figure() ### Add this to avoid a Tkinter bug after running MarkerFinder (not sure why it is needed) - creates a second empty window when display == True
if row_method == 'hopach' or column_method == 'hopach':
### Test R and hopach
"""
try:
import R_test
except Exception,e:
#print traceback.format_exc()
print 'Failed to install hopach or R not installed (install R before using hopach)'
row_method = 'average'; column_method = 'average'
if len(column_header)==2: column_method = 'average'
if len(row_header)==2: row_method = 'average'
"""
pass
"""
Prototype methods:
http://old.nabble.com/How-to-plot-heatmap-with-matplotlib--td32534593.html
http://stackoverflow.com/questions/7664826/how-to-get-flat-clustering-corresponding-to-color-clusters-in-the-dendrogram-cre
Scaling the color gradient so that zero is white:
http://stackoverflow.com/questions/2369492/generate-a-heatmap-in-matplotlib-using-a-scatter-data-set
Other cluster methods:
http://stackoverflow.com/questions/9362304/how-to-get-centroids-from-scipys-hierarchical-agglomerative-clustering
x is a m by n ndarray, m observations, n genes
"""
### Perform the associated clustering by HOPACH via PYPE or Rpy to R
if row_method == 'hopach' or column_method == 'hopach':
try:
""" HOPACH is a clustering method implemented in R that builds a hierarchical tree of clusters by recursively
partitioning a data set, while ordering and possibly collapsing clusters at each level:
http://www.bioconductor.org/packages/release/bioc/html/hopach.html
"""
import R_interface
#reload(R_interface)
if row_method == 'hopach' and column_method == 'hopach': cluster_method = 'both'
elif row_method == 'hopach': cluster_method = 'gene'
else: cluster_method = 'array'
if row_metric == 'cosine': metric_gene = "euclid"
elif row_metric == 'euclidean': metric_gene = "cosangle"
elif row_metric == 'correlation': metric_gene = "cor"
else: metric_gene = "cosangle"
if column_metric == 'cosine': metric_array = "euclid"
elif column_metric == 'euclidean': metric_array = "cosangle"
elif column_metric == 'correlation': metric_array = "cor"
else: metric_array = "euclid"
### Returned are the row_order and column_order in the Scipy clustering output format
newFilename, Z1, Z2 = R_interface.remoteHopach(inputFilename,cluster_method,metric_gene,metric_array)
if newFilename != inputFilename:
### If there were duplicates, re-import the matrix data for the cleaned up filename
try:
matrix, column_header, row_header, dataset_name, group_db = importData(newFilename,Normalize=normalize,reverseOrder=False)
except Exception:
matrix, column_header, row_header, dataset_name, group_db = importData(newFilename)
x = numpy.array(matrix)
except Exception:
row_method = 'average'; column_method = 'average'
print traceback.format_exc()
print 'hopach failed... continue with an alternative method'
skipClustering = False
try:
if len(PriorColumnClusters)>0 and len(PriorRowClusters)>0 and row_method==None and column_method == None:
print 'Prior generated clusters being used rather re-clustering'
"""
try:
if len(targetGeneIDs)>0:
PriorColumnClusters=[] ### If orderded genes input, we want to retain this order rather than change
except Exception: pass
"""
if len(PriorColumnClusters)>0: ### this corresponds to the above line
Z1={}; Z2={}
Z1['level'] = PriorRowClusters; Z1['level'].reverse()
Z2['level'] = PriorColumnClusters; #Z2['level'].reverse()
Z1['leaves'] = range(0,len(row_header)); #Z1['leaves'].reverse()
Z2['leaves'] = range(0,len(column_header)); #Z2['leaves'].reverse()
skipClustering = True
### When clusters are imported, you need something other than None, otherwise, you need None (need to fix here)
row_method = None
column_method = None
row_method = 'hopach'
column_method = 'hopach'
except Exception,e:
#print traceback.format_exc()
pass
n = len(x[0]); m = len(x)
if color_gradient == 'red_white_blue':
cmap=pylab.cm.bwr
if color_gradient == 'red_black_sky':
cmap=RedBlackSkyBlue()
if color_gradient == 'red_black_blue':
cmap=RedBlackBlue()
if color_gradient == 'red_black_green':
cmap=RedBlackGreen()
if color_gradient == 'yellow_black_blue':
cmap=YellowBlackBlue()
if color_gradient == 'yellow_black':
cmap=YellowBlack()
if color_gradient == 'black_yellow_blue':
cmap=BlackYellowBlue()
if color_gradient == 'seismic':
cmap=pylab.cm.seismic
if color_gradient == 'green_white_purple':
cmap=pylab.cm.PiYG_r
if color_gradient == 'coolwarm':
cmap=pylab.cm.coolwarm
if color_gradient == 'Greys':
cmap=pylab.cm.Greys
if color_gradient == 'yellow_orange_red':
cmap=pylab.cm.YlOrRd
if color_gradient == 'Spectral':
cmap = pylab.cm.Spectral_r
vmin=x.min()
vmax=x.max()
vmax = max([vmax,abs(vmin)])
if Normalize != False:
vmin = vmax*-1
elif 'Clustering-Zscores-' in dataset_name:
vmin = vmax*-1
elif vmin<0 and vmax>0 and Normalize==False:
vmin = vmax*-1
#vmin = vmax*-1
#print vmax, vmin
default_window_hight = 8.5
default_window_width = 12
if len(column_header)>80:
default_window_width = 14
if len(column_header)>100:
default_window_width = 16
if contrast == None:
scaling_factor = 2.5 #2.5
else:
try: scaling_factor = float(contrast)
except Exception: scaling_factor = 2.5
#print vmin/scaling_factor
norm = matplotlib.colors.Normalize(vmin/scaling_factor, vmax/scaling_factor) ### adjust the max and min to scale these colors by 2.5 (1 scales to the highest change)
#fig = pylab.figure(figsize=(default_window_width,default_window_hight)) ### could use m,n to scale here
fig = pylab.figure() ### could use m,n to scale here - figsize=(12,10)
fig.set_figwidth(12)
fig.set_figheight(7)
fig.patch.set_facecolor('white')
pylab.rcParams['font.size'] = 7.5
#pylab.rcParams['axes.facecolor'] = 'white' ### Added in 2.1.2
if show_color_bars == False:
color_bar_w = 0.000001 ### Invisible but not gone (otherwise an error persists)
else:
color_bar_w = 0.0125 ### Sufficient size to show
bigSampleDendrogram = True
if bigSampleDendrogram == True and row_method==None and column_method != None and allowAxisCompression == True:
dg2 = 0.30
dg1 = 0.43
else: dg2 = 0.1; dg1 = 0.63
try:
if EliteGeneSets != [''] and EliteGeneSets !=[]:
matrix_horiz_pos = 0.27
elif skipClustering:
if len(row_header)<100:
matrix_horiz_pos = 0.20
else:
matrix_horiz_pos = 0.27
else:
matrix_horiz_pos = 0.14
except Exception:
matrix_horiz_pos = 0.14
""" Adjust the position of the heatmap based on the number of columns """
if len(column_header)<50:
matrix_horiz_pos+=0.1
## calculate positions for all elements
# ax1, placement of dendrogram 1, on the left of the heatmap
[ax1_x, ax1_y, ax1_w, ax1_h] = [0.05,0.235,matrix_horiz_pos,dg1] ### The last controls matrix hight, second value controls the position of the matrix relative to the bottom of the view [0.05,0.22,0.2,0.6]
width_between_ax1_axr = 0.004
height_between_ax1_axc = 0.004 ### distance between the top color bar axis and the matrix
# axr, placement of row side colorbar
[axr_x, axr_y, axr_w, axr_h] = [0.31,0.1,color_bar_w-0.002,0.6] ### second to last controls the width of the side color bar - 0.015 when showing [0.31,0.1,color_bar_w,0.6]
axr_x = ax1_x + ax1_w + width_between_ax1_axr
axr_y = ax1_y; axr_h = ax1_h
width_between_axr_axm = 0.004
# axc, placement of column side colorbar (3rd value controls the width of the matrix!)
[axc_x, axc_y, axc_w, axc_h] = [0.5,0.63,0.5,color_bar_w] ### last one controls the hight of the top color bar - 0.015 when showing [0.4,0.63,0.5,color_bar_w]
""" Adjust the width of the heatmap based on the number of columns """
if len(column_header)<50:
axc_w = 0.3
if len(column_header)<20:
axc_w = 0.2
axc_x = axr_x + axr_w + width_between_axr_axm
axc_y = ax1_y + ax1_h + height_between_ax1_axc
height_between_axc_ax2 = 0.004
# axm, placement of heatmap for the data matrix
[axm_x, axm_y, axm_w, axm_h] = [0.4,0.9,2.5,0.5] #[0.4,0.9,2.5,0.5]
axm_x = axr_x + axr_w + width_between_axr_axm
axm_y = ax1_y; axm_h = ax1_h
axm_w = axc_w
# ax2, placement of dendrogram 2, on the top of the heatmap
[ax2_x, ax2_y, ax2_w, ax2_h] = [0.3,0.72,0.6,dg2] ### last one controls hight of the dendrogram [0.3,0.72,0.6,0.135]
ax2_x = axr_x + axr_w + width_between_axr_axm
ax2_y = ax1_y + ax1_h + height_between_ax1_axc + axc_h + height_between_axc_ax2
ax2_w = axc_w
# axcb - placement of the color legend
[axcb_x, axcb_y, axcb_w, axcb_h] = [0.02,0.938,0.17,0.025] ### Last one controls the hight [0.07,0.88,0.18,0.076]
# axcc - placement of the colum colormap legend colormap (distinct map)
[axcc_x, axcc_y, axcc_w, axcc_h] = [0.02,0.12,0.17,0.025] ### Last one controls the hight [0.07,0.88,0.18,0.076]
# Compute and plot top dendrogram
if column_method == 'hopach':
ind2 = numpy.array(Z2['level']) ### from R_interface - hopach root cluster level
elif column_method != None:
start_time = time.time()
#print x;sys.exit()
d2 = dist.pdist(x.T)
#print d2
#import mdistance2
#d2 = mdistance2.mpdist(x.T)
#print d2;sys.exit()
D2 = dist.squareform(d2)
ax2 = fig.add_axes([ax2_x, ax2_y, ax2_w, ax2_h], frame_on=False)
if ExportCorreleationMatrix:
new_matrix=[]
for i in D2:
#string.join(map(inverseDist,i),'\t')
log2_data = map(inverseDist,i)
avg = statistics.avg(log2_data)
log2_norm = map(lambda x: x-avg,log2_data)
new_matrix.append(log2_norm)
x = numpy.array(new_matrix)
row_header = column_header
#sys.exit()
Y2 = fc.linkage(D2, method=column_method, metric=column_metric) ### array-clustering metric - 'average', 'single', 'centroid', 'complete'
#Y2 = sch.fcluster(Y2, 10, criterion = "maxclust")
try: Z2 = sch.dendrogram(Y2)
except Exception:
if column_method == 'average':
column_metric = 'euclidean'
else: column_method = 'average'
Y2 = fc.linkage(D2, method=column_method, metric=column_metric)
Z2 = sch.dendrogram(Y2)
#ind2 = sch.fcluster(Y2,0.6*D2.max(), 'distance') ### get the correlations
#ind2 = sch.fcluster(Y2,0.2*D2.max(), 'maxclust') ### alternative method biased based on number of clusters to obtain (like K-means)
ind2 = sch.fcluster(Y2,0.7*max(Y2[:,2]),'distance') ### This is the default behavior of dendrogram
ax2.set_xticks([]) ### Hides ticks
ax2.set_yticks([])
time_diff = str(round(time.time()-start_time,1))
print 'Column clustering completed in %s seconds' % time_diff
else:
ind2 = ['NA']*len(column_header) ### Used for exporting the flat cluster data
# Compute and plot left dendrogram
if row_method == 'hopach':
ind1 = numpy.array(Z1['level']) ### from R_interface - hopach root cluster level
elif row_method != None:
start_time = time.time()
d1 = dist.pdist(x)
D1 = dist.squareform(d1) # full matrix
# postion = [left(x), bottom(y), width, height]
#print D1;sys.exit()
Y1 = fc.linkage(D1, method=row_method, metric=row_metric) ### gene-clustering metric - 'average', 'single', 'centroid', 'complete'
no_plot=False ### Indicates that we want to show the dendrogram
try:
if runGOElite: no_plot = True
elif len(PriorColumnClusters)>0 and len(PriorRowClusters)>0 and row_method==None and column_method == None:
no_plot = True ### If trying to instantly view prior results, no dendrogram will be display, but prior GO-Elite can
else:
ax1 = fig.add_axes([ax1_x, ax1_y, ax1_w, ax1_h], frame_on=False) # frame_on may be False - this window conflicts with GO-Elite labels
except Exception:
ax1 = fig.add_axes([ax1_x, ax1_y, ax1_w, ax1_h], frame_on=False) # frame_on may be False
try: Z1 = sch.dendrogram(Y1, orientation='left',no_plot=no_plot) ### This is where plotting occurs - orientation 'right' in old matplotlib
except Exception:
row_method = 'average'
try:
Y1 = fc.linkage(D1, method=row_method, metric=row_metric)
Z1 = sch.dendrogram(Y1, orientation='right',no_plot=no_plot)
except Exception:
row_method = 'ward'
Y1 = fc.linkage(D1, method=row_method, metric=row_metric)
Z1 = sch.dendrogram(Y1, orientation='right',no_plot=no_plot)
#ind1 = sch.fcluster(Y1,0.6*D1.max(),'distance') ### get the correlations
#ind1 = sch.fcluster(Y1,0.2*D1.max(),'maxclust')
ind1 = sch.fcluster(Y1,0.7*max(Y1[:,2]),'distance') ### This is the default behavior of dendrogram
if ExportCorreleationMatrix:
Z1 = sch.dendrogram(Y2, orientation='right')
Y1 = Y2
d1 = d2
D1 = D2
ind1 = ind2
try: ax1.set_xticks([]); ax1.set_yticks([]) ### Hides ticks
except Exception: pass
time_diff = str(round(time.time()-start_time,1))
print 'Row clustering completed in %s seconds' % time_diff
else:
ind1 = ['NA']*len(row_header) ### Used for exporting the flat cluster data
# Plot distance matrix.
axm = fig.add_axes([axm_x, axm_y, axm_w, axm_h]) # axes for the data matrix
xt = x
if column_method != None:
idx2 = Z2['leaves'] ### apply the clustering for the array-dendrograms to the actual matrix data
xt = xt[:,idx2]
#ind2 = ind2[:,idx2] ### reorder the flat cluster to match the order of the leaves the dendrogram
""" Error can occur here if hopach was selected in a prior run but now running NONE """
try: ind2 = [ind2[i] for i in idx2] ### replaces the above due to numpy specific windows version issue
except Exception:
column_method=None
xt = x
ind2 = ['NA']*len(column_header) ### Used for exporting the flat cluster data
ind1 = ['NA']*len(row_header) ### Used for exporting the flat cluster data
if row_method != None:
idx1 = Z1['leaves'] ### apply the clustering for the gene-dendrograms to the actual matrix data
prior_xt = xt
xt = xt[idx1,:] # xt is transformed x
#ind1 = ind1[idx1,:] ### reorder the flat cluster to match the order of the leaves the dendrogram
try: ind1 = [ind1[i] for i in idx1] ### replaces the above due to numpy specific windows version issue
except Exception:
if 'MarkerGenes' in dataset_name:
ind1 = ['NA']*len(row_header) ### Used for exporting the flat cluster data
row_method = None
### taken from http://stackoverflow.com/questions/2982929/plotting-results-of-hierarchical-clustering-ontop-of-a-matrix-of-data-in-python/3011894#3011894
im = axm.matshow(xt, aspect='auto', origin='lower', cmap=cmap, norm=norm) ### norm=norm added to scale coloring of expression with zero = white or black
axm.set_xticks([]) ### Hides x-ticks
axm.set_yticks([])
#axm.set_axis_off() ### Hide border
#fix_verts(ax1,1)
#fix_verts(ax2,0)
### Adjust the size of the fonts for genes and arrays based on size and character length
row_fontsize = 4
column_fontsize = 5
column_text_max_len = max(map(lambda x: len(x), column_header)) ### Get the maximum length of a column annotation
if len(row_header)<75:
row_fontsize = 4.5
if len(row_header)<50:
row_fontsize = 5.5
if len(row_header)<25:
row_fontsize = 7
if len(column_header)<75:
column_fontsize = 6.5
if len(column_header)<50:
column_fontsize = 8
if len(column_header)<25:
column_fontsize = 11
if column_text_max_len < 15:
column_fontsize = 15
elif column_text_max_len > 30:
column_fontsize = 6.5
else:
column_fontsize = 10
try:
if len(justShowTheseIDs)>50:
column_fontsize = 7
elif len(justShowTheseIDs)>0:
column_fontsize = 10
if len(justShowTheseIDs)>0:
additional_symbols=[]
import gene_associations
try:
gene_to_symbol = gene_associations.getGeneToUid(species,('hide','Ensembl-Symbol'))
except Exception: gene_to_symbol={}; symbol_to_gene={}
JustShowTheseIDs = copy.deepcopy(justShowTheseIDs)
except Exception:
JustShowTheseIDs=[]
# Add text
new_row_header=[]
new_column_header=[]
for i in range(x.shape[0]):
if row_method != None:
new_row_header.append(row_header[idx1[i]])
else:
new_row_header.append(row_header[i])
for i in range(x.shape[1]):
if column_method != None:
new_column_header.append(column_header[idx2[i]])
else: ### When not clustering columns
new_column_header.append(column_header[i])
dataset_name = string.replace(dataset_name,'Clustering-','')### clean up the name if already a clustered file
if '-hierarchical' in dataset_name:
dataset_name = string.split(dataset_name,'-hierarchical')[0]
filename = 'Clustering-%s-hierarchical_%s_%s.pdf' % (dataset_name,column_metric,row_metric)
if 'MarkerGenes' in dataset_name:
time_stamp = timestamp() ### Don't overwrite the previous version
filename = string.replace(filename,'hierarchical',time_stamp)
elite_dir, cdt_file, markers, SystemCode = exportFlatClusterData(root_dir + filename, root_dir, dataset_name, new_row_header,new_column_header,xt,ind1,ind2,vmax,display)
def ViewPNG(png_file_dir):
if os.name == 'nt':
try: os.startfile('"'+png_file_dir+'"')
except Exception: os.system('open "'+png_file_dir+'"')
elif 'darwin' in sys.platform: os.system('open "'+png_file_dir+'"')
elif 'linux' in sys.platform: os.system('xdg-open "'+png_file_dir+'"')
try:
try:
temp1=len(justShowTheseIDs)
if 'monocle' in justShowTheseIDs and ('guide' not in justShowTheseIDs):
import R_interface
print 'Running Monocle through R (be patient, this can take 20 minutes+)'
R_interface.performMonocleAnalysisFromHeatmap(species,cdt_file[:-3]+'txt',cdt_file[:-3]+'txt')
png_file_dir = root_dir+'/Monocle/monoclePseudotime.png'
#print png_file_dir
ViewPNG(png_file_dir)
except Exception: pass # no justShowTheseIDs
except Exception:
print '...Monocle error:'
print traceback.format_exc()
pass
cluster_elite_terms={}; ge_fontsize=11.5; top_genes=[]; proceed=True
try:
try:
if 'guide' in justShowTheseIDs: proceed = False
except Exception: pass
if proceed:
try:
cluster_elite_terms,top_genes = remoteGOElite(elite_dir,SystemCode=SystemCode)
if cluster_elite_terms['label-size']>40: ge_fontsize = 9.5
except Exception:
pass
except Exception: pass #print traceback.format_exc()
if len(cluster_elite_terms)<1:
try:
elite_dirs = string.split(elite_dir,'GO-Elite')
old_elite_dir = elite_dirs[0]+'GO-Elite'+elite_dirs[-1] ### There are actually GO-Elite/GO-Elite directories for the already clustered
old_elite_dir = string.replace(old_elite_dir,'ICGS/','')
#if skipClustering and len(PriorColumnClusters)>0 and len(PriorRowClusters)>0
cluster_elite_terms,top_genes = importGOEliteResults(old_elite_dir)
except Exception,e:
#print traceback.format_exc()
pass
try:
if len(justShowTheseIDs)<1 and len(markers) > 0 and column_fontsize < 9: ### substituted top_genes with markers
column_fontsize = 10
if len(justShowTheseIDs)<1:
additional_symbols=[]
import gene_associations; from import_scripts import OBO_import
try:
gene_to_symbol = gene_associations.getGeneToUid(species,('hide','Ensembl-Symbol'))
#symbol_to_gene = OBO_import.swapKeyValues(gene_to_symbol)
except Exception: gene_to_symbol={}; symbol_to_gene={}
except Exception: pass
def formatpval(p):
if '-' in p: p1=p[:1]+p[-4:]
else:
p1 = '{number:.{digits}f}'.format(number=float(p), digits=3)
p1=str(p1)
#print traceback.format_exc();sys.exit()
return p1
# Add text
new_row_header=[]
new_column_header=[]
ci=0 ### index of entries in the cluster
last_cluster=1
""" The below interval variable determines the spacing of GO-Elite labels """
interval = int(float(string.split(str(len(row_header)/35.0),'.')[0]))+1 ### for enrichment term labels with over 100 genes
increment=interval-2
if len(row_header)<100: increment = interval-1
cluster_num={}
for i in cluster_elite_terms: cluster_num[i[0]]=[]
cluster_num = len(cluster_num)
if cluster_num>15:
interval = int(float(string.split(str(len(row_header)/40.0),'.')[0]))+1 ### for enrichment term labels with over 100 genes
increment=interval-2
ge_fontsize = 6
column_fontsize = 6
if cluster_num>25:
interval = int(float(string.split(str(len(row_header)/50.0),'.')[0]))+1 ### for enrichment term labels with over 100 genes
increment=interval-2
ge_fontsize = 5
column_fontsize = 5
if cluster_num>40:
ge_fontsize = 4
column_fontsize = 4
label_pos=-0.03*len(column_header)-.8
alternate=1
#increment =1 # 7
#interval = 1 #9 controls number of terms you see per cluster (smaller = more terms)
#ge_fontsize = 1
#print ge_fontsize, cluster_num
#print label_pos
try:
if 'top' in justShowTheseIDs: justShowTheseIDs.remove('top')
if 'positive' in justShowTheseIDs: justShowTheseIDs.remove('positive')
if 'amplify' in justShowTheseIDs: justShowTheseIDs.remove('amplify')
if 'IntraCorrelatedOnly' in justShowTheseIDs: justShowTheseIDs.remove('IntraCorrelatedOnly')
if 'GuideOnlyCorrelation' in justShowTheseIDs: justShowTheseIDs.remove('GuideOnlyCorrelation')
except Exception:
pass
for i in range(x.shape[0]):
if len(row_header)<40:
radj = len(row_header)*0.009 ### row offset value to center the vertical position of the row label
elif len(row_header)<70:
radj = len(row_header)*0.007 ### row offset value to center the vertical position of the row label
else:
radj = len(row_header)*0.005
try: cluster = str(ind1[i])
except Exception: cluster = 'NA'
if cluster == 'NA':
new_index = i
try: cluster = 'cluster-'+string.split(row_header[new_index],':')[0]
except Exception: pass
if cluster != last_cluster:
ci=0
increment=0
#print cluster,i,row_header[idx1[i]]
color = 'black'
if row_method != None:
try:
if row_header[idx1[i]] in JustShowTheseIDs:
if len(row_header)>len(justShowTheseIDs):
color = 'red'
else: color = 'black'
except Exception: pass
if len(row_header)<106: ### Don't visualize gene associations when more than 100 rows
if display_label_names == False or 'ticks' in JustShowTheseIDs:
if color=='red':
axm.text(x.shape[1]-0.5, i-radj, ' '+'-',fontsize=row_fontsize, color=color, picker=True)
else:
axm.text(x.shape[1]-0.5, i-radj, ' '+'',fontsize=row_fontsize, color=color, picker=True)
else:
axm.text(x.shape[1]-0.5, i-radj, ' '+row_header[idx1[i]],fontsize=row_fontsize, color=color, picker=True)
new_row_header.append(row_header[idx1[i]])
new_index = idx1[i]
else:
try:
feature_id = row_header[i]
if ':' in feature_id:
feature_id = string.split(feature_id,':')[1]
try:
if feature_id[-1]==' ': feature_id = feature_id[:-1]
except:
pass
if feature_id in JustShowTheseIDs:
color = 'red'
else: color = 'black'
except Exception: pass
if len(row_header)<106: ### Don't visualize gene associations when more than 100 rows
if display_label_names == False or 'ticks' in JustShowTheseIDs:
if color=='red':
axm.text(x.shape[1]-0.5, i-radj, ' '+'-',fontsize=row_fontsize, color=color, picker=True)
else:
axm.text(x.shape[1]-0.5, i-radj, ' '+'',fontsize=row_fontsize, color=color, picker=True)
else:
axm.text(x.shape[1]-0.5, i-radj, ' '+row_header[i],fontsize=row_fontsize, color=color, picker=True) ### When not clustering rows
new_row_header.append(row_header[i])
new_index = i ### This is different when clustering rows versus not
if len(row_header)<106:
"""
if cluster in cluster_elite_terms:
try:
term = cluster_elite_terms[cluster][ci][1]
axm.text(-1.5, i-radj, term,horizontalalignment='right',fontsize=row_fontsize)
except Exception: pass
ci+=1
"""
pass
else:
feature_id = row_header[new_index]
original_feature_id = feature_id
if ':' in feature_id:
if 'ENS' != feature_id[:3] or 'G0000' in feature_id:
feature_id = string.split(feature_id,':')[1]
try:
if feature_id[-1]==' ': feature_id = feature_id[:-1]
except:
pass
else:
feature_id = string.split(feature_id,':')[0]
try: feature_id = gene_to_symbol[feature_id][0]
except Exception: pass
if (' ' in feature_id and ('ENS' in feature_id or 'G0000' in feature_id)):
feature_id = string.split(feature_id,' ')[1]
try:
if feature_id in JustShowTheseIDs or original_feature_id in JustShowTheseIDs: color = 'red'
else: color = 'black'
except Exception: pass
try:
if feature_id in justShowTheseIDs or (len(justShowTheseIDs)<1 and feature_id in markers) or original_feature_id in justShowTheseIDs: ### substitutes top_genes with markers
if 'ENS' in feature_id or 'G0000' in feature_id:
if feature_id in gene_to_symbol:
feature_id = gene_to_symbol[feature_id][0]
if original_feature_id in justShowTheseIDs:
feature_id = original_feature_id
if len(justShowTheseIDs)<40:
axm.text(x.shape[1]-0.4, i-radj, feature_id,fontsize=column_fontsize, color=color,picker=True) ### When not clustering rows
elif display_label_names and 'ticks' not in justShowTheseIDs:
if alternate==1: buffer=1.2; alternate=2
elif alternate==2: buffer=2.4; alternate=3
elif alternate==3: buffer=3.6; alternate=4
elif alternate==4: buffer=0; alternate=1
axm.text(x.shape[1]-0.4+buffer, i-radj, feature_id,fontsize=column_fontsize, color=color,picker=True) ### When not clustering rows
else:
axm.text(x.shape[1]-0.5, i-radj, ' '+"-",fontsize=column_fontsize, color=color,picker=True) ### When not clustering rows
elif ' ' in row_header[new_index]:
symbol = string.split(row_header[new_index], ' ')[-1]
if len(symbol)>0:
if symbol in justShowTheseIDs:
if display_label_names and 'ticks' not in justShowTheseIDs:
axm.text(x.shape[1]-0.5, i-radj, ' '+row_header[new_index],fontsize=column_fontsize, color=color,picker=True)
else:
axm.text(x.shape[1]-0.5, i-radj, ' '+"-",fontsize=column_fontsize, color=color,picker=True)
except Exception: pass
if cluster in cluster_elite_terms or 'cluster-'+cluster in cluster_elite_terms:
if 'cluster-'+cluster in cluster_elite_terms:
new_cluster_id = 'cluster-'+cluster
else:
new_cluster_id = cluster
if cluster != last_cluster:
cluster_intialized = False
try:
increment+=1
#print [increment,interval,cluster],cluster_elite_terms[cluster][ci][1];sys.exit()
#if increment == interval or (
#print increment,interval,len(row_header),cluster_intialized
if (increment == interval) or (len(row_header)>200 and increment == (interval-9) and cluster_intialized==False): ### second argument brings the label down
cluster_intialized=True
atypical_cluster = False
if ind1[i+9] == 'NA': ### This occurs for custom cluster, such MarkerFinder (not cluster numbers)
atypical_cluster = True
cluster9 = 'cluster-'+string.split(row_header[new_index+9],':')[0]
if (len(row_header)>200 and str(cluster9)!=cluster): continue
elif (len(row_header)>200 and str(ind1[i+9])!=cluster): continue ### prevents the last label in a cluster from overlapping with the first in the next cluster
pvalue,original_term = cluster_elite_terms[new_cluster_id][ci]
term = original_term
if 'GO:' in term:
term = string.split(term, '(')[0]
if ':WP' in term:
term = string.split(term, ':WP')[0]
pvalue = formatpval(str(pvalue))
term += ' p='+pvalue
if atypical_cluster == False:
term += ' (c'+str(cluster)+')'
try: cluster_elite_terms[term] = cluster_elite_terms[cluster,original_term] ### store the new term name with the associated genes
except Exception: pass
axm.text(label_pos-0.1, i-radj, term,horizontalalignment='right',fontsize=ge_fontsize, picker=True, color = 'blue')
increment=0
ci+=1
except Exception,e:
#print traceback.format_exc();sys.exit()
increment=0
last_cluster = cluster
def onpick1(event):
text = event.artist
print('onpick1 text:', text.get_text())
if 'TreeView' in text.get_text():
try: openTreeView(cdt_file)
except Exception: print 'Failed to open TreeView'
elif 'p=' not in text.get_text():
webbrowser.open('http://www.genecards.org/cgi-bin/carddisp.pl?gene='+string.replace(text.get_text(),' ',''))
else:
#"""
from visualization_scripts import TableViewer
header = ['Associated Genes']
tuple_list = []
for gene in cluster_elite_terms[text.get_text()]:
tuple_list.append([(gene)])
if matplotlib.rcParams['backend'] != 'MacOSX': ### Throws an error when macosx is the backend for matplotlib
try: TableViewer.viewTable(text.get_text(),header,tuple_list)
except: pass ### Due to an an issue using a nonTkAgg backend
#"""
cluster_prefix = 'c'+string.split(text.get_text(),'(c')[-1][:-1]+'-'
for geneSet in EliteGeneSets:
if geneSet == 'GeneOntology':
png_file_dir = elite_dir+'/GO-Elite_results/networks/'+cluster_prefix+'GO'+'.png'
elif geneSet == 'WikiPathways':
png_file_dir = elite_dir+'/GO-Elite_results/networks/'+cluster_prefix+'local'+'.png'
elif len(geneSet)>1:
png_file_dir = elite_dir+'/GO-Elite_results/networks/'+cluster_prefix+geneSet+'.png'
#try: UI.GUI(root_dir,'ViewPNG',[],png_file_dir)
#except Exception: print traceback.format_exc()
try:
alt_png_file_dir = elite_dir+'/GO-Elite_results/networks/'+cluster_prefix+eliteGeneSet+'.png'
png_file_dirs = string.split(alt_png_file_dir,'GO-Elite/')
alt_png_file_dir = png_file_dirs[0]+'GO-Elite/'+png_file_dirs[-1]
except Exception: pass
if os.name == 'nt':
try: os.startfile('"'+png_file_dir+'"')
except Exception:
try: os.system('open "'+png_file_dir+'"')
except Exception: os.startfile('"'+alt_png_file_dir+'"')
elif 'darwin' in sys.platform:
try: os.system('open "'+png_file_dir+'"')
except Exception: os.system('open "'+alt_png_file_dir+'"')
elif 'linux' in sys.platform:
try: os.system('xdg-open "'+png_file_dir+'"')
except Exception: os.system('xdg-open "'+alt_png_file_dir+'"')
#print cluster_elite_terms[text.get_text()]
fig.canvas.mpl_connect('pick_event', onpick1)
""" Write x-axis labels """
for i in range(x.shape[1]):
adji = i
### Controls the vertical position of the column (array) labels
if len(row_header)<3:
cadj = len(row_header)*-0.26 ### column offset value
elif len(row_header)<4:
cadj = len(row_header)*-0.23 ### column offset value
elif len(row_header)<6:
cadj = len(row_header)*-0.18 ### column offset value
elif len(row_header)<10:
cadj = len(row_header)*-0.08 ### column offset value
elif len(row_header)<15:
cadj = len(row_header)*-0.04 ### column offset value
elif len(row_header)<20:
cadj = len(row_header)*-0.05 ### column offset value
elif len(row_header)<22:
cadj = len(row_header)*-0.06 ### column offset value
elif len(row_header)<23:
cadj = len(row_header)*-0.08 ### column offset value
elif len(row_header)>200:
cadj = -2
else:
cadj = -0.9
#cadj = -1
if len(column_header)>15:
adji = i-0.1 ### adjust the relative position of the column label horizontally
if len(column_header)>20:
adji = i-0.2 ### adjust the relative position of the column label horizontally
if len(column_header)>25:
adji = i-0.2 ### adjust the relative position of the column label horizontally
if len(column_header)>30:
adji = i-0.25 ### adjust the relative position of the column label horizontally
if len(column_header)>35:
adji = i-0.3 ### adjust the relative position of the column label horizontally
if len(column_header)>200:
column_fontsize = 2
if column_method != None:
if len(column_header)<150: ### Don't show the headers when too many values exist
axm.text(adji, cadj, ''+column_header[idx2[i]], rotation=270, verticalalignment="top",fontsize=column_fontsize) # rotation could also be degrees
new_column_header.append(column_header[idx2[i]])
else: ### When not clustering columns
if len(column_header)<300: ### Don't show the headers when too many values exist
axm.text(adji, cadj, ''+column_header[i], rotation=270, verticalalignment="top",fontsize=column_fontsize)
new_column_header.append(column_header[i])
# Plot colside colors
# axc --> axes for column side colorbar
group_name_list=[]
ind1_clust,ind2_clust = ind1,ind2
ind1,ind2,group_name_list,cb_status = updateColorBarData(ind1,ind2,new_column_header,new_row_header,row_method)
if (column_method != None or 'column' in cb_status) and show_color_bars == True:
axc = fig.add_axes([axc_x, axc_y, axc_w, axc_h]) # axes for column side colorbar
cmap_c = matplotlib.colors.ListedColormap(['#00FF00', '#1E90FF', '#CCCCE0','#000066','#FFFF00', '#FF1493'])
if use_default_colors:
cmap_c = PairedColorMap()
#cmap_c = pylab.cm.gist_ncar
else:
#cmap_c = matplotlib.colors.ListedColormap(['#00FF00', '#1E90FF','#FFFF00', '#FF1493'])
if len(unique.unique(ind2))==2: ### cmap_c is too few colors
cmap_c = matplotlib.colors.ListedColormap(['#00FF00', '#1E90FF'])
#cmap_c = matplotlib.colors.ListedColormap(['#7CFC00','k'])
cmap_c = matplotlib.colors.ListedColormap(['w', 'k'])
elif len(unique.unique(ind2))==3: ### cmap_c is too few colors
cmap_c = matplotlib.colors.ListedColormap(['#88BF47', '#3D3181', '#EE2C3C'])
cmap_c = matplotlib.colors.ListedColormap(['b', 'y', 'r'])
elif len(unique.unique(ind2))>0: ### cmap_c is too few colors
#cmap_c = pylab.cm.Paired
cmap_c = PairedColorMap()
"""
elif len(unique.unique(ind2))==4: ### cmap_c is too few colors
cmap_c = matplotlib.colors.ListedColormap(['#88BF47', '#3D3181', '#EE2C3C','#FEBC18']) #['#FEBC18','#EE2C3C','#3D3181','#88BF47']
#cmap_c = matplotlib.colors.ListedColormap(['k', 'w', 'w', 'w'])
elif len(unique.unique(ind2))==5: ### cmap_c is too few colors
cmap_c = matplotlib.colors.ListedColormap(['#88BF47', '#63C6BB', '#3D3181', '#FEBC18', '#EE2C3C'])
elif len(unique.unique(ind2))==6: ### cmap_c is too few colors
cmap_c = matplotlib.colors.ListedColormap(['#88BF47', '#29C3EC', '#3D3181', '#7B4976','#FEBC18', '#EE2C3C'])
#cmap_c = matplotlib.colors.ListedColormap(['black', '#1DA532', '#88BF47','b', 'grey','r'])
#cmap_c = matplotlib.colors.ListedColormap(['w', '#0B9B48', 'w', '#5D82C1','#4CB1E4','#71C065'])
#cmap_c = matplotlib.colors.ListedColormap(['w', 'w', 'k', 'w','w','w'])
elif len(unique.unique(ind2))==7: ### cmap_c is too few colors
cmap_c = matplotlib.colors.ListedColormap(['#88BF47', '#63C6BB', '#29C3EC', '#3D3181', '#7B4976','#FEBC18', '#EE2C3C'])
#cmap_c = matplotlib.colors.ListedColormap(['w', 'w', 'w', 'k', 'w','w','w'])
#cmap_c = matplotlib.colors.ListedColormap(['w','w', '#0B9B48', 'w', '#5D82C1','#4CB1E4','#71C065'])
#elif len(unique.unique(ind2))==9: cmap_c = matplotlib.colors.ListedColormap(['k', 'w', 'w', 'w', 'w', 'w', 'w', 'w', 'w'])
#elif len(unique.unique(ind2))==11:
#cmap_c = matplotlib.colors.ListedColormap(['w', '#DC2342', '#0B9B48', '#FDDF5E', '#E0B724', 'k', '#5D82C1', '#F79020', '#4CB1E4', '#983894', '#71C065'])
"""
try: dc = numpy.array(ind2, dtype=int)
except:
### occurs with the cluster numbers are cluster annotation names (cell types)
ind2 = convertClusterNameToInt(ind2)
dc = numpy.array(ind2, dtype=int)
dc.shape = (1,len(ind2))
im_c = axc.matshow(dc, aspect='auto', origin='lower', cmap=cmap_c)
axc.set_xticks([]) ### Hides ticks
if 'hopach' == column_method and len(group_name_list)>0:
axc.set_yticklabels(['','Groups'],fontsize=10)
else:
axc.set_yticks([])
#axc.set_frame_on(False) ### Hide border
if len(group_name_list)>0: ### Add a group color legend key
if 'hopach' == column_method: ### allows us to add the second color bar
axcd = fig.add_axes([ax2_x, ax2_y, ax2_w, color_bar_w]) # dendrogram coordinates with color_bar_w substituted - can use because dendrogram is not used
cmap_c = matplotlib.colors.ListedColormap(['#00FF00', '#1E90FF', '#CCCCE0','#000066','#FFFF00', '#FF1493'])
#cmap_c = matplotlib.colors.ListedColormap(['#00FF00', '#1E90FF','#FFFF00', '#FF1493'])
if use_default_colors:
#cmap_c = pylab.cm.Paired
cmap_c = PairedColorMap()
else:
if len(unique.unique(ind2_clust))==2: ### cmap_c is too few colors
#cmap_c = matplotlib.colors.ListedColormap(['#00FF00', '#1E90FF'])
cmap_c = matplotlib.colors.ListedColormap(['w', 'k'])
elif len(unique.unique(ind2_clust))==3: ### cmap_c is too few colors
cmap_c = matplotlib.colors.ListedColormap(['#88BF47', '#3D3181', '#EE2C3C'])
cmap_c = matplotlib.colors.ListedColormap(['b', 'y', 'r'])
elif len(unique.unique(ind2_clust))>0: ### cmap_c is too few colors
#cmap_c = pylab.cm.Paired
cmap_c = PairedColorMap()
"""
elif len(unique.unique(ind2_clust))==4: ### cmap_c is too few colors
cmap_c = matplotlib.colors.ListedColormap(['#88BF47', '#3D3181', '#EE2C3C', '#FEBC18'])
#cmap_c = matplotlib.colors.ListedColormap(['black', '#1DA532', 'b','r'])
elif len(unique.unique(ind2_clust))==5: ### cmap_c is too few colors
cmap_c = matplotlib.colors.ListedColormap(['#88BF47', '#63C6BB', '#3D3181', '#FEBC18', '#EE2C3C'])
elif len(unique.unique(ind2_clust))==6: ### cmap_c is too few colors
cmap_c = matplotlib.colors.ListedColormap(['#88BF47', '#29C3EC', '#3D3181', '#7B4976','#FEBC18', '#EE2C3C'])
elif len(unique.unique(ind2_clust))==7: ### cmap_c is too few colors
cmap_c = matplotlib.colors.ListedColormap(['#88BF47', '#63C6BB', '#29C3EC', '#3D3181', '#7B4976','#FEBC18', '#EE2C3C'])
"""
try: dc = numpy.array(ind2_clust, dtype=int)
except:
### occurs with the cluster numbers are cluster annotation names (cell types)
ind2_clust = convertClusterNameToInt(ind2_clust)
dc = numpy.array(ind2_clust, dtype=int)
dc.shape = (1,len(ind2_clust))
im_cd = axcd.matshow(dc, aspect='auto', origin='lower', cmap=cmap_c)
#axcd.text(-1,-1,'clusters')
axcd.set_yticklabels(['','Clusters'],fontsize=10)
#pylab.yticks(range(1),['HOPACH clusters'])
axcd.set_xticks([]) ### Hides ticks
#axcd.set_yticks([])
axd = fig.add_axes([axcc_x, axcc_y, axcc_w, axcc_h])
group_name_list.sort()
group_colors = map(lambda x: x[0],group_name_list)
group_names = map(lambda x: x[1],group_name_list)
cmap_d = matplotlib.colors.ListedColormap(['#00FF00', '#1E90FF', '#CCCCE0','#000066','#FFFF00', '#FF1493'])
#cmap_d = matplotlib.colors.ListedColormap(['#00FF00', '#1E90FF','#FFFF00', '#FF1493'])
if len(unique.unique(ind2))==2: ### cmap_c is too few colors
#cmap_d = matplotlib.colors.ListedColormap(['#00FF00', '#1E90FF'])
cmap_d = matplotlib.colors.ListedColormap(['w', 'k'])
elif len(unique.unique(ind2))==3: ### cmap_c is too few colors
cmap_d = matplotlib.colors.ListedColormap(['#88BF47', '#3D3181', '#EE2C3C'])
cmap_d = matplotlib.colors.ListedColormap(['b', 'y', 'r'])
#cmap_d = matplotlib.colors.ListedColormap(['b', 'y', 'r'])
elif len(unique.unique(ind2))>0: ### cmap_c is too few colors
#cmap_d = pylab.cm.Paired
cmap_d = PairedColorMap()
"""
elif len(unique.unique(ind2))==4: ### cmap_c is too few colors
cmap_d = matplotlib.colors.ListedColormap(['#88BF47', '#3D3181', '#EE2C3C', '#FEBC18'])
elif len(unique.unique(ind2))==5: ### cmap_c is too few colors
cmap_d = matplotlib.colors.ListedColormap(['#88BF47', '#63C6BB', '#3D3181', '#FEBC18', '#EE2C3C'])
elif len(unique.unique(ind2))==6: ### cmap_c is too few colors
cmap_d = matplotlib.colors.ListedColormap(['#88BF47', '#29C3EC', '#3D3181', '#7B4976','#FEBC18', '#EE2C3C'])
#cmap_d = matplotlib.colors.ListedColormap(['black', '#1DA532', '#88BF47','b', 'grey','r'])
#cmap_d = matplotlib.colors.ListedColormap(['w', '#0B9B48', 'w', '#5D82C1','#4CB1E4','#71C065'])
#cmap_d = matplotlib.colors.ListedColormap(['w', 'w', 'k', 'w', 'w','w','w'])
elif len(unique.unique(ind2))==7: ### cmap_c is too few colors
cmap_d = matplotlib.colors.ListedColormap(['#88BF47', '#63C6BB', '#29C3EC', '#3D3181', '#7B4976','#FEBC18', '#EE2C3C'])
#cmap_d = matplotlib.colors.ListedColormap(['w', 'w', 'w', 'k', 'w','w','w'])
#cmap_d = matplotlib.colors.ListedColormap(['w','w', '#0B9B48', 'w', '#5D82C1','#4CB1E4','#71C065'])
#elif len(unique.unique(ind2))==10: cmap_d = matplotlib.colors.ListedColormap(['w', 'w', 'w', 'w', 'w', 'w', 'w', 'w', 'w', 'k'])
"""
dc = numpy.array(group_colors, dtype=int)
dc.shape = (1,len(group_colors))
im_c = axd.matshow(dc, aspect='auto', origin='lower', cmap=cmap_d)
axd.set_yticks([])
#axd.set_xticklabels(group_names, rotation=45, ha='left')
#if len(group_names)<200:
pylab.xticks(range(len(group_names)),group_names,rotation=90,ha='left') #rotation = 45
#cmap_c = matplotlib.colors.ListedColormap(map(lambda x: GroupDB[x][-1], new_column_header))
if show_color_bars == False:
axc = fig.add_axes([axc_x, axc_y, axc_w, axc_h]) # axes for column side colorbar
axc.set_frame_on(False)
# Plot rowside colors
# axr --> axes for row side colorbar
if (row_method != None or 'row' in cb_status) and show_color_bars == True:
axr = fig.add_axes([axr_x, axr_y, axr_w, axr_h]) # axes for column side colorbar
try:
dr = numpy.array(ind1, dtype=int)
dr.shape = (len(ind1),1)
cmap_r = matplotlib.colors.ListedColormap(['#00FF00', '#1E90FF', '#FFFF00', '#FF1493'])
if len(unique.unique(ind1))>4: ### cmap_r is too few colors
#cmap_r = pylab.cm.Paired
cmap_r = PairedColorMap()
if 'MarkerGenes' in dataset_name: ### reverse the order of the colors to match the top color bar
#cmap_r = pylab.cm.Paired_r
cmap_r = PairedColorMap().reversed()
if len(unique.unique(ind1))==2:
cmap_r = matplotlib.colors.ListedColormap(['w', 'k'])
im_r = axr.matshow(dr, aspect='auto', origin='lower', cmap=cmap_r)
axr.set_xticks([]) ### Hides ticks
axr.set_yticks([])
#axr.set_frame_on(False) ### Hide border
except Exception:
row_method = None
pass ### likely occurs for some reason when row_method should be None
if show_color_bars == False:
axr = fig.add_axes([axr_x, axr_y, axr_w, axr_h]) # axes for column side colorbar
axr.set_frame_on(False)
""" write x-axis group labels """
groupNames_to_cell={}
cluster_to_cell={}
try: ### Group names (from groups file or embeded groups annotations)
for i in range(x.shape[1]):
cluster = str(ind2[i])
try: groupNames_to_cell[cluster].append(i)
except: groupNames_to_cell[cluster]=[i]
except: pass
try: ### Cluster names from clustering
for i in range(x.shape[1]):
cluster = str(ind2_clust[i])
try: cluster_to_cell[cluster].append(i)
except: cluster_to_cell[cluster]=[i]
except: pass
### Use the groups rather than clusters if not clustered
cluster_group_matching = False
group_length=[]
cluster_length=[]
try:
index=0
for c in cluster_to_cell:
cluster_length.append(len(cluster_to_cell[c]))
for c in groupNames_to_cell:
group_length.append(len(groupNames_to_cell[c]))
### if the clusters and groups are the same size
if max(cluster_length) == max(group_length) and (len(cluster_to_cell) == len(groupNames_to_cell)):
cluster_group_matching = True
except: pass
clusterType = 'Numbers'
if (len(cluster_to_cell) < 2) or cluster_group_matching:
cluster_to_cell = groupNames_to_cell
ind2_clust = ind2
clusterType = 'Groups'
try:
fontsize = 6
if len(cluster_to_cell)<10:
fontsize = 10
last_cluster = None
group_index=0
cluster_count = 0
cluster_borders=[]
if len(column_header)>1:
for i in range(x.shape[1]):
adji = i
cadj = 0.6
try: cluster = str(ind2_clust[i])
except Exception: cluster = 'NA'
middle_cluster_index = len(cluster_to_cell[cluster])/3
if cluster != last_cluster:
cluster_count=0
#ax.plot([70, 70], [100, 250], 'w-', lw=0.5)
if i>0: ### Don't need to draw a white line at 0
cluster_borders.append(i-0.5)
if cluster_count == middle_cluster_index:
if clusterType == 'Numbers':
try:
axcd.text(adji, cadj, ''+cluster, rotation=45, verticalalignment="bottom",fontsize=fontsize) # rotation could also be degrees
except:
axc.text(adji, cadj, ''+cluster, rotation=45, verticalalignment="bottom",fontsize=fontsize) # rotation could also be degrees
else:
try:
axcd.text(adji, cadj, ''+group_name_list[group_index][1], rotation=45, verticalalignment="bottom",fontsize=fontsize) # rotation could also be degrees
except:
try:
axc.text(adji, cadj, ''+group_name_list[group_index][1], rotation=45, verticalalignment="bottom",fontsize=fontsize) # rotation could also be degrees
except:
try:
axcd.text(adji, cadj, ''+cluster, rotation=45, verticalalignment="bottom",fontsize=fontsize) # rotation could also be degrees
except:
axc.text(adji, cadj, ''+cluster, rotation=45, verticalalignment="bottom",fontsize=fontsize) # rotation could also be degrees
group_index+=1
last_cluster = cluster
cluster_count+=1
except:
#print group_name_list
#print len(group_name_list), group_index
#print traceback.format_exc()
pass
try:
#print cluster_borders
axm.vlines(cluster_borders, color='w',lw=0.3, *axm.get_ylim())
except:
pass
# Plot color legend
axcb = fig.add_axes([axcb_x, axcb_y, axcb_w, axcb_h], frame_on=False) # axes for colorbar
cb = matplotlib.colorbar.ColorbarBase(axcb, cmap=cmap, norm=norm, orientation='horizontal')
#axcb.set_title("colorkey",fontsize=14)
if 'LineageCorrelations' in dataset_name:
cb.set_label("Lineage Correlation Z Scores",fontsize=11)
elif 'Heatmap' in root_dir:
if 'psi' in filename or 'PSI' in filename or 'MarkerHeatmaps' in root_dir:
cb.set_label("Percent Spliced In (PSI) normalized",fontsize=11)
else:
cb.set_label("GO-Elite Z Scores",fontsize=11)
else:
cb.set_label("Differential Expression (log2)",fontsize=10)
### Add filename label to the heatmap
if len(dataset_name)>30:fontsize = 10
else: fontsize = 12.5
fig.text(0.015, 0.970, dataset_name, fontsize = fontsize)
### Render and save the graphic
try: pylab.savefig(root_dir + filename,dpi=1000)
except:
print 'Warning... pdf image export error. '
#print 'Exporting:',filename
filename = filename[:-3]+'png'
try: pylab.savefig(root_dir + filename, dpi=150) #,dpi=200
except:
print 'Warning... png image export error. '
includeBackground=False
try:
if 'TkAgg' != matplotlib.rcParams['backend']:
includeBackground = False
except Exception: pass
if includeBackground:
fig.text(0.020, 0.070, 'Open heatmap in TreeView (click here)', fontsize = 11.5, picker=True,color = 'red', backgroundcolor='white')
else:
fig.text(0.020, 0.070, 'Open heatmap in TreeView (click here)', fontsize = 11.5, picker=True,color = 'red')
if 'Outlier' in dataset_name and 'Removed' not in dataset_name:
graphic_link.append(['Hierarchical Clustering - Outlier Genes Genes',root_dir+filename])
elif 'Relative' in dataset_name:
graphic_link.append(['Hierarchical Clustering - Significant Genes (Relative comparisons)',root_dir+filename])
elif 'LineageCorrelations' in filename:
graphic_link.append(['Hierarchical Clustering - Lineage Correlations',root_dir+filename])
elif 'MarkerGenes' in filename:
graphic_link.append(['Hierarchical Clustering - MarkerFinder',root_dir+filename])
elif 'AltExonConfirmed' in filename:
graphic_link.append(['Hierarchical Clustering - AltExonConfirmed',root_dir+filename])
elif 'AltExon' in filename:
graphic_link.append(['Hierarchical Clustering - AltExon',root_dir+filename])
elif 'alt_junction' in filename:
graphic_link.append(['Hierarchical Clustering - Variable Splice-Events',root_dir+filename])
else:
graphic_link.append(['Hierarchical Clustering - Significant Genes',root_dir+filename])
if display:
proceed=True
try:
if 'guide' in justShowTheseIDs:
proceed = False
except Exception: pass
if proceed:
print 'Exporting:',filename
try: pylab.show()
except Exception: None ### when run in headless mode
fig.clf()
#fig.close() causes segfault
#pylab.close() causes segfault
def openTreeView(filename):
import subprocess
fn = filepath("AltDatabase/TreeView/TreeView.jar")
print 'java', "-Xmx4000m", '-jar', fn, "-r", filename
retcode = subprocess.Popen(['java', "-Xmx4000m", '-jar', fn, "-r", filename])
def remoteGOElite(elite_dir,SystemCode = None):
mod = 'Ensembl'
if SystemCode == 'Ae':
mod = 'AltExon'
pathway_permutations = 'FisherExactTest'
filter_method = 'z-score'
z_threshold = 1.96
p_val_threshold = 0.005
change_threshold = 2
if runGOElite:
resources_to_analyze = EliteGeneSets
if 'all' in resources_to_analyze:
resources_to_analyze = 'all'
returnPathways = 'no'
root = None
import GO_Elite
reload(GO_Elite)
input_files = dir_list = unique.read_directory(elite_dir) ### Are there any files to analyze?
if len(input_files)>0 and resources_to_analyze !=['']:
print '\nBeginning to run GO-Elite analysis on all results'
file_dirs = elite_dir,None,elite_dir
enrichmentAnalysisType = 'ORA'
#enrichmentAnalysisType = 'URA'
variables = species,mod,pathway_permutations,filter_method,z_threshold,p_val_threshold,change_threshold,resources_to_analyze,returnPathways,file_dirs,enrichmentAnalysisType,root
try:
GO_Elite.remoteAnalysis(variables, 'non-UI Heatmap')
except Exception:
print 'GO-Elite failed for:', elite_dir
print traceback.format_exc()
if commandLine==False and 'RelativeSampleLogFolds' not in elite_dir and 'OutlierLogFolds' not in elite_dir:
try: UI.openDirectory(elite_dir+'/GO-Elite_results')
except Exception: None
cluster_elite_terms,top_genes = importGOEliteResults(elite_dir)
return cluster_elite_terms,top_genes
else:
return {},[]
else:
return {},[]
def importGOEliteResults(elite_dir):
global eliteGeneSet
pruned_results = elite_dir+'/GO-Elite_results/CompleteResults/ORA_pruned/pruned-results_z-score_elite.txt' ### This is the exception (not moved)
if os.path.isfile(pruned_results) == False:
pruned_results = elite_dir+'/GO-Elite_results/pruned-results_z-score_elite.txt'
firstLine=True
cluster_elite_terms={}
all_term_length=[0]
for line in open(pruned_results,'rU').xreadlines():
data = line.rstrip()
values = string.split(data,'\t')
if firstLine:
firstLine=False
try: symbol_index = values.index('gene symbols')
except Exception: symbol_index = None
else:
try: symbol_index = values.index('gene symbols')
except Exception: pass
try:
eliteGeneSet = string.split(values[0][1:],'-')[1][:-4]
try: cluster = str(int(float(string.split(values[0][1:],'-')[0])))
except Exception:
cluster = string.join(string.split(values[0],'-')[:-1],'-')
term = values[2]
num_genes_changed = int(values[3])
all_term_length.append(len(term))
pval = float(values[9]) ### adjusted is values[10]
#pval = float(values[10]) ### adjusted is values[10]
if num_genes_changed>2:
try: cluster_elite_terms[cluster].append([pval,term])
except Exception: cluster_elite_terms[cluster] = [[pval,term]]
if symbol_index!=None:
symbols = string.split(values[symbol_index],'|')
cluster_elite_terms[cluster,term] = symbols
except Exception,e: pass
for cluster in cluster_elite_terms:
cluster_elite_terms[cluster].sort()
cluster_elite_terms['label-size'] = max(all_term_length)
top_genes = []; count=0
ranked_genes = elite_dir+'/GO-Elite_results/CompleteResults/ORA_pruned/gene_associations/pruned-gene-ranking.txt'
for line in open(ranked_genes,'rU').xreadlines():
data = line.rstrip()
values = string.split(data,'\t')
count+=1
if len(values)>2:
if values[2]!='Symbol':
try: top_genes.append((int(values[4]),values[2]))
except Exception: pass
top_genes.sort(); top_genes.reverse()
top_genes = map(lambda x: x[1],top_genes[:21])
return cluster_elite_terms,top_genes
def mergeRotateAroundPointPage(page, page2, rotation, tx, ty):
from pyPdf import PdfFileWriter, PdfFileReader
translation = [[1, 0, 0],
[0, 1, 0],
[-tx,-ty,1]]
rotation = math.radians(rotation)
rotating = [[math.cos(rotation), math.sin(rotation),0],
[-math.sin(rotation),math.cos(rotation), 0],
[0, 0, 1]]
rtranslation = [[1, 0, 0],
[0, 1, 0],
[tx,ty,1]]
ctm = numpy.dot(translation, rotating)
ctm = numpy.dot(ctm, rtranslation)
return page.mergeTransformedPage(page2, [ctm[0][0], ctm[0][1],
ctm[1][0], ctm[1][1],
ctm[2][0], ctm[2][1]])
def mergePDFs2(pdf1,pdf2,outPdf):
from pyPdf import PdfFileWriter, PdfFileReader
input1 = PdfFileReader(file(pdf1, "rb"))
page1 = input1.getPage(0)
input2 = PdfFileReader(file(pdf2, "rb"))
page2 = input2.getPage(0)
page3 = mergeRotateAroundPointPage(page1, page2,
page1.get('/Rotate') or 0,
page2.mediaBox.getWidth()/2, page2.mediaBox.getWidth()/2)
output = PdfFileWriter()
output.addPage(page3)
outputStream = file(outPdf, "wb")
output.write(outputStream)
outputStream.close()
def mergePDFs(pdf1,pdf2,outPdf):
# http://stackoverflow.com/questions/6041244/how-to-merge-two-landscape-pdf-pages-using-pypdf
from pyPdf import PdfFileWriter, PdfFileReader
input1 = PdfFileReader(file(pdf1, "rb"))
page1 = input1.getPage(0)
page1.mediaBox.upperRight = (page1.mediaBox.getUpperRight_x(), page1.mediaBox.getUpperRight_y())
input2 = PdfFileReader(file(pdf2, "rb"))
page2 = input2.getPage(0)
page2.mediaBox.getLowerLeft_x = (page2.mediaBox.getLowerLeft_x(), page2.mediaBox.getLowerLeft_y())
# Merge
page2.mergePage(page1)
# Output
output = PdfFileWriter()
output.addPage(page1)
outputStream = file(outPdf, "wb")
output.write(outputStream)
outputStream.close()
"""
def merge_horizontal(out_filename, left_filename, right_filename):
#Merge the first page of two PDFs side-to-side
import pyPdf
# open the PDF files to be merged
with open(left_filename) as left_file, open(right_filename) as right_file, open(out_filename, 'w') as output_file:
left_pdf = pyPdf.PdfFileReader(left_file)
right_pdf = pyPdf.PdfFileReader(right_file)
output = pyPdf.PdfFileWriter()
# get the first page from each pdf
left_page = left_pdf.pages[0]
right_page = right_pdf.pages[0]
# start a new blank page with a size that can fit the merged pages side by side
page = output.addBlankPage(
width=left_page.mediaBox.getWidth() + right_page.mediaBox.getWidth(),
height=max(left_page.mediaBox.getHeight(), right_page.mediaBox.getHeight()),
)
# draw the pages on that new page
page.mergeTranslatedPage(left_page, 0, 0)
page.mergeTranslatedPage(right_page, left_page.mediaBox.getWidth(), 0)
# write to file
output.write(output_file)
"""
def inverseDist(value):
if value == 0: value = 1
return math.log(value,2)
def getGOEliteExportDir(root_dir,dataset_name):
if 'AltResults' in root_dir:
root_dir = string.split(root_dir,'AltResults')[0]
if 'ExpressionInput' in root_dir:
root_dir = string.split(root_dir,'ExpressionInput')[0]
if 'ExpressionOutput' in root_dir:
root_dir = string.split(root_dir,'ExpressionOutput')[0]
if 'DataPlots' in root_dir:
root_dir = string.replace(root_dir,'DataPlots','GO-Elite')
elite_dir = root_dir
else:
elite_dir = root_dir+'/GO-Elite'
try: os.mkdir(elite_dir)
except Exception: pass
return elite_dir+'/clustering/'+dataset_name
def systemCodeCheck(IDs):
import gene_associations
id_type_db={}
for id in IDs:
if ':' in id:
id = string.split(id,':')[1]
id_type = gene_associations.predictIDSourceSimple(id)
try: id_type_db[id_type]+=1
except Exception: id_type_db[id_type]=1
id_type_count=[]
for i in id_type_db:
id_type_count.append((id_type_db[i],i))
id_type_count.sort()
id_type = id_type_count[-1][-1]
return id_type
def exportFlatClusterData(filename, root_dir, dataset_name, new_row_header,new_column_header,xt,ind1,ind2,vmax,display):
""" Export the clustered results as a text file, only indicating the flat-clusters rather than the tree """
filename = string.replace(filename,'.pdf','.txt')
export_text = export.ExportFile(filename)
column_header = string.join(['UID','row_clusters-flat']+new_column_header,'\t')+'\n' ### format column-names for export
export_text.write(column_header)
column_clusters = string.join(['column_clusters-flat','']+ map(str, ind2),'\t')+'\n' ### format column-flat-clusters for export
export_text.write(column_clusters)
### The clusters, dendrogram and flat clusters are drawn bottom-up, so we need to reverse the order to match
#new_row_header = new_row_header[::-1]
#xt = xt[::-1]
try: elite_dir = getGOEliteExportDir(root_dir,dataset_name)
except Exception: elite_dir = None
elite_columns = string.join(['InputID','SystemCode'])
try: sy = systemCodeCheck(new_row_header)
except Exception: sy = None
### Export each row in the clustered data matrix xt
i=0
cluster_db={}
export_lines = []
last_cluster=None
cluster_top_marker={}
for row in xt:
try:
id = new_row_header[i]
new_id = id
original_id = str(id)
if sy == 'Ae' and '--' in id:
cluster = 'cluster-' + string.split(id, ':')[0]
elif sy == '$En:Sy':
cluster = 'cluster-'+string.split(id,':')[0]
elif sy == 'S' and ':' in id:
cluster = 'cluster-'+string.split(id,':')[0]
elif sy == 'Sy' and ':' in id:
cluster = 'cluster-'+string.split(id,':')[0]
elif sy == 'En' and ':' in id: ### Added in 2.1.5 - 6/9/21
cluster = 'cluster-'+string.split(id,':')[0]
else:
cluster = 'c'+str(ind1[i])
if ':' in id:
new_id = string.split(id,':')[1]
if ' ' in new_id:
new_id = string.split(new_id,' ')[0]
#if cluster not in cluster_top_marker: ### Display the top marker gene
cluster_top_marker[cluster] = new_id
last_cluster = cluster
except Exception:
pass
try:
if 'MarkerGenes' in originalFilename:
cluster = 'cluster-' + string.split(id, ':')[0]
id = string.split(id, ':')[1]
if ' ' in id:
id = string.split(id, ' ')[0]
if 'G000' in id:
sy = 'En'
else:
sy = 'Sy'
except Exception: pass
try: cluster_db[cluster].append(id)
except Exception: cluster_db[cluster] = [id]
try: export_lines.append(string.join([original_id,str(ind1[i])]+map(str, row),'\t')+'\n')
except Exception:
export_lines.append(string.join([original_id,'NA']+map(str, row),'\t')+'\n')
i+=1
### Reverse the order of the file
export_lines.reverse()
for line in export_lines:
export_text.write(line)
export_text.close()
### Export GO-Elite input files
allGenes={}
sc=sy
for cluster in cluster_db:
export_elite = export.ExportFile(elite_dir + '/' + cluster + '.txt')
if sy == None:
export_elite.write('ID\n')
else:
export_elite.write('ID\tSystemCode\n')
for id in cluster_db[cluster]:
if ' ' in id and ':' not in id:
ids = string.split(id, ' ')
if ids[0] == ids[1]:
id = ids[0]
elif ' ' in id and ':' in id:
id = string.split(id, ':')[-1]
id = string.split(id, ' ')[0]
if sy == '$En:Sy':
try: id = string.split(id, ':')[1]
except:
if 'ENS' in id:
sy = 'En'
continue
ids = string.split(id, ' ')
if 'ENS' in ids[0] or 'G0000' in ids[0]:
id = ids[0]
else:
id = ids[(-1)]
sc = 'En'
elif sy == 'Sy' and ':' in id:
id = string.split(id, ':')[1]
ids = string.split(id, ' ')
sc = 'Sy'
elif sy == 'En:Sy':
id = string.split(id, ' ')[0]
sc = 'En'
elif sy == 'En' and ':' in id:
ids = string.split(id,':')
if len(ids) == 2:
id = ids[1]
else:
id = ids[1]
elif sy == 'Ae':
if '--' in id:
sc = 'En'
id = string.split(id, ':')[(-1)]
else:
l = string.split(id, ':')
if len(l) == 2:
id = string.split(id, ':')[0]
if len(l) == 3:
id = string.split(id, ':')[1]
sc = 'En'
if ' ' in id:
ids = string.split(id, ' ')
if 'ENS' in ids[(-1)] or 'G0000' in ids[(-1)]:
id = ids[(-1)]
else: