9/13/2020
Critical bug fixes
1) Initial outlier filtering of ICGS was commented out
2) Added a SVM probability cutoff > 0 in ICGS-NMF
3) Fixed a UI bug for 10x Genomics mtx.gz format files
4) Added a partial match option for sampleIndexSelection
nsalomonis committed Sep 13, 2020
1 parent 1de7729 commit 6cf103d
Showing 8 changed files with 123 additions and 91 deletions.
32 changes: 16 additions & 16 deletions Config/arrays.txt
@@ -1,16 +1,16 @@
array_type array_name manufacturer constitutive_source compatible_species
3'array Codelink expression array Codelink Codelink Hs|Mm|Rn
junction Junction Glue array Affymetrix Ensembl Hs
AltMouse Junction AltMouse array Affymetrix Ensembl Mm
RNASeq 10X Genomics aligned RNASeq Ensembl Mm|Hs|Rn|Dm|Sc|Ce|Gg|Dr|Zm|Ma|Nc|Ee|Cf|Go|Mi|Am|Og|Md|Ml|Op|Dn|Pc|Pp|La|Ec|Ts|Bt|Vp|Pv|Me|Tb|Pt|Fc|Tt|Ch|Oc|Oa|Cj|Sa|Et|Pb|Am|Cp|St|Pf|Aa|Ce|Cs|Ag|Ci|Pa|Xl|Ta|Vv|Os|Mg|Tg|Tn|Ol
junction Junction JAY array Affymetrix Ensembl Mm|Hs
exon Exon 1.0 ST array Affymetrix Ensembl Mm|Hs|Rn
RNASeq Raw sequence or processed RNASeq Ensembl Mm|Hs|Rn|Dm|Sc|Ce|Gg|Dr|Zm|Ma|Nc|Ee|Cf|Go|Mi|Am|Og|Md|Ml|Op|Dn|Pc|Pp|La|Ec|Ts|Bt|Vp|Pv|Me|Tb|Pt|Fc|Tt|Ch|Oc|Oa|Cj|Sa|Et|Pb|Am|Cp|St|Pf|Aa|Ce|Cs|Ag|Ci|Pa|Xl|Ta|Vv|Os|Mg|Tg|Tn|Ol
3'array Affymetrix expression array Affymetrix Affymetrix Mm|Hs|Rn|Dm|Sc|Ce|Gg|Dr|Zm|Ma|Nc|Ee|Cf|Go|Mi|Am|Og|Md|Ml|Op|Dn|Pc|Pp|La|Ec|Ts|Bt|Vp|Pv|Me|Tb|Pt|Fc|Tt|Ch|Oc|Oa|Cj|Sa|Et|Pb|Am|Cp|St|Pf|Aa|Ce|Cs|Ag|Ci|Pa|Xl|Ta|Vv|Os|Mg|Tg|Tn|Ol
junction Junction MTA 1.0 array Affymetrix Ensembl Mm
gene Gene 1.0 ST array Affymetrix Ensembl Mm|Hs|Rn
3'array Illumina expression array Illumina Illumina Hs|Mm
3'array Ensembl annotated data Other ID Ensembl Mm|Hs|Rn|Dm|Sc|Ce|Gg|Dr|Zm|Ma|Nc|Ee|Cf|Go|Mi|Am|Og|Md|Ml|Op|Dn|Pc|Pp|La|Ec|Ts|Bt|Vp|Pv|Me|Tb|Pt|Fc|Tt|Ch|Oc|Oa|Cj|Sa|Et|Pb|Am|Cp|St|Pf|Aa|Ce|Cs|Ag|Ci|Pa|Xl|Ta|Vv|Os|Mg|Tg|Tn|Ol
junction Junction HTA 2.0 array Affymetrix Ensembl Hs
3'array Misc. expression array Misc Misc
3'array Agilent expression array Agilent Agilent Hs|Mm|Dr|Rn
36 changes: 18 additions & 18 deletions Config/source_data.txt
@@ -1,70 +1,70 @@
System SystemCode MOD_status
Xenopus_Jamboree Xj
Kyotograil Ky
modCB Mg
MIM Mi
sharesCDS Sc
RefSeq Q
Illumina Il
CAS Ca
Ensembl En MOD
RFAM Rf
TubercuList Tb
GadFly Gf
Ens_Tg_translation Ens
Vega_transcript Vr
XenTro Xen
goslim_goa gos
wormpep_id Wp
CCDS Cc
Cint Cj
EPD Epd
ENST Enst
TetNig Tet
TakRub Tak
Symbol Sy
WikiGene Wg
Agilent Ag
Genoscope_pred_gene Gen
miRBase Mb
GPCR Gpcr
EntrezGene L MOD
Vega Vg
CioInt Cio
UniGene Ug
HMDB Ch MOD
AltExon Ae MOD
Osford_FGU Of
ProteinID Pi
TransFac Transfac
FlyBase F
SGD D
Fantom Fa
FlyGrid Fg
KeggCompound Ck
Vega_translation Vt
EMBL Em
DBASS3 Dba
Affymetrix X
MiscArray Ma
CloneID Clb
BDGP_insitu_expr Bd
Platypus_olfactory_receptor Pr
PUBMED Pu
Uniprot S
Zfin Z
HPA Hpa
ChEBI Ce
PDB Pd
IMGT Im
Tgut_symbol Ts
Codelink Co
MEROPS Merops
MGI M
IPI Ip
PubChem Cp
UCSC Ucsc
Sanger Sh
DEDb De
OTT Ot
Wormbase Wb
5 changes: 4 additions & 1 deletion RNASeq.py
@@ -578,7 +578,10 @@ def importBEDFile(bed_dir,root_dir,species,normalize_feature_exp,getReads=False,
if testImport == 'yes': print "Reading the bed file", [fn], condition
### If the BED was manually created on a Mac, will need 'rU' - test this
for line in open(fn,delim).xreadlines(): break
try:
if len(line)>500: delim = 'rU'
except:
pass
for line in open(fn,delim).xreadlines(): ### changed rU to r to remove \r effectively, rather than read as end-lines
data = cleanUpLine(line)
t = string.split(data,'\t')
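The try/except added above guards the line-length sniff against empty or unreadable BED files: a first "line" longer than ~500 characters suggests bare \r (classic Mac) line endings, so the file should be reopened in universal-newline mode. A Python 3 sketch of the same heuristic (the helper name and the 500-character threshold are illustrative, taken from the diff):

```python
def needs_universal_newlines(path, max_line_len=500):
    """Mirror the diff's heuristic: read the first line in binary mode,
    where only \\n terminates a line (like Python 2's plain 'r' mode).
    A very long first "line" suggests bare \\r endings, so the caller
    should reopen the file with newline translation enabled."""
    with open(path, "rb") as f:
        first = f.readline()
    # An empty file yields b'' here; len(b'') is simply 0, which is why
    # this check cannot raise the way the unguarded Python 2 code could.
    return len(first) > max_line_len
```

A BED file saved with classic-Mac endings collapses into one long binary "line" and trips the check; a normal Unix file does not.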
7 changes: 6 additions & 1 deletion UI.py
@@ -5898,8 +5898,13 @@ def rebootAltAnalyzeGUI(selected_parameters,user_variables):
sparse_matrix_file = gu.Results()['input_cel_dir'] # 'filtered_gene_bc_matrices'
def import10XSparseMatrixHeaders(matrix_file):
import csv
import gzip
barcodes_path = string.replace(matrix_file,'matrix.mtx','barcodes.tsv' )
try:
barcodes = [row[0] for row in csv.reader(open(barcodes_path), delimiter="\t")]
except:
print barcodes_path
barcodes = [row[0] for row in csv.reader(gzip.open(barcodes_path), delimiter="\t")]
barcodes = map(lambda x: string.replace(x,'-1',''), barcodes)
return barcodes
def importH5(h5_filename):
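The UI fix above falls back to `gzip.open` when the plain-text read of `barcodes.tsv` fails, which handles 10x Genomics' compressed `barcodes.tsv.gz`. A Python 3 sketch of the same idea (function name hypothetical); sniffing the two-byte gzip magic number avoids relying on the exception path:

```python
import csv
import gzip

def read_barcodes(barcodes_path):
    # 10x Genomics barcode files may be gzip-compressed; check the
    # gzip magic bytes rather than trusting the file extension.
    with open(barcodes_path, "rb") as f:
        is_gzip = f.read(2) == b"\x1f\x8b"
    opener = gzip.open if is_gzip else open
    with opener(barcodes_path, "rt") as handle:
        barcodes = [row[0] for row in csv.reader(handle, delimiter="\t")]
    # Drop the '-1' GEM-well suffix, as the diff does with string.replace.
    return [b.replace("-1", "") for b in barcodes]
```

Either file variant then yields the same barcode list.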
19 changes: 15 additions & 4 deletions import_scripts/sampleIndexSelection.py
@@ -19,7 +19,7 @@ def makeTestFile():
return input_file

def filterFile(input_file,output_file,filter_names,force=False,calculateCentroids=False,comparisons=[],
log2=False,convertPSIUID=False,partialMatch=False):
if calculateCentroids:
filter_names,group_index_db=filter_names

@@ -88,7 +88,16 @@ def filterFile(input_file,output_file,filter_names,force=False,calculateCentroid
filter_names = filter_names2
#filter_names = map(lambda x: string.split(x,'.')[0], filter_names)
#values = map(lambda x: string.split(x,'.')[0], values)
sample_index_list = map(lambda x: values.index(x), filter_names)
elif partialMatch:
filter_names_updated = []
for x in filter_names:
if x not in values:
for y in values:
if x in y:
filter_names_updated.append(y)
filter_names = filter_names_updated
sample_index_list = map(lambda x: values.index(x), filter_names)
else:
temp_count=1
for x in filter_names:
@@ -472,6 +481,7 @@ def transposeMatrix(input_file):
binarize=True
transpose=False
log2=False
partialMatch=False

fileFormat = 'columns'
if len(sys.argv[1:])<=1: ### Indicates that there are insufficient number of command-line arguments
@@ -481,7 +491,7 @@
else:
options, remainder = getopt.getopt(sys.argv[1:],'', ['i=','f=','r=','median=','medoid=', 'fold=', 'folds=',
'centroid=','force=','minGeneCutoff=','expressionCutoff=','geneCountFilter=', 'binarize=',
'transpose=','fileFormat=','log2=','partialMatch='])
#print sys.argv[1:]
for opt, arg in options:
if opt == '--i': input_file=arg
@@ -490,6 +500,7 @@
elif opt == '--median' or opt=='--medoid' or opt=='--centroid': calculateCentroids = True
elif opt == '--fold': returnComparisons = True
elif opt == '--log2': log2 = True
elif opt == '--partialMatch': partialMatch = True
elif opt == '--r':
if arg == 'exclude':
filter_rows=True
@@ -534,5 +545,5 @@ def transposeMatrix(input_file):
filterFile(input_file,output_file,(filter_names,group_index_db),force=force,calculateCentroids=calculateCentroids,comparisons=comparisons)
else:
filter_names = getFilters(filter_file)
filterFile(input_file,output_file,filter_names,force=force,log2=log2,partialMatch=partialMatch)
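The new `--partialMatch` option resolves requested sample names against the file header by substring when no exact column match exists. A standalone sketch of that lookup (names hypothetical); unlike the committed loop, this version also retains names that match exactly:

```python
def resolve_sample_indices(filter_names, header, partial_match=True):
    """Map requested sample names to header column indices; fall back
    to substring matching when a name has no exact column match."""
    resolved = []
    for name in filter_names:
        if name in header:
            resolved.append(name)
        elif partial_match:
            # Append every column containing the requested name, as the
            # committed loop does (it does not stop at the first hit).
            resolved.extend(col for col in header if name in col)
    return [header.index(col) for col in resolved]
```

With partial matching off, unmatched names are simply dropped rather than raising on `values.index(x)`.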

13 changes: 9 additions & 4 deletions stats_scripts/ExpandSampleClusters.py
@@ -295,11 +295,16 @@ def Classify(header,Xobs,output_file,grplst,name,turn,platform,output_dir,root_d
export_class3.write(header[iq+1])
for jq in range(0,len(name)):
export_class1.write("\t"+str(prob_[iq][jq]))
if prob_[iq][jq]==max(prob_[iq,:]):
if ordersamp[header[iq+1]][0]==name[jq]:
if max(prob_[iq,:])>0: ### Increase this value to increase SVM alignment specificity
class_assignment = 1
order.append([header[iq+1],name[jq],prob_[iq][jq],ordersamp[header[iq+1]][1]])
else:
class_assignment = 0 ### The best match is poor, hence, the cell will be excluded from final results!!!
export_class3.write("\t"+str(class_assignment))
else:
export_class3.write("\t"+str(0))

@@ -340,7 +345,7 @@ def Classify(header,Xobs,output_file,grplst,name,turn,platform,output_dir,root_d
export_class2.write("\n")
else:
prob_=regr.fit(Xobs,X[:,0]).decision_function(Y)
#k=list(prob_)

export_class1.write("uid")
#export_class2.write("uid")
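Fix 2 gates each cell's class assignment on its best SVM probability: when even the winning probability fails the cutoff, `class_assignment` is 0 and the cell is excluded from the final results. A plain-Python sketch of that filtering step (the real code works on a NumPy probability matrix; the 0.0 cutoff mirrors the diff, and raising it increases alignment specificity):

```python
def assign_classes(prob_rows, cutoff=0.0):
    """For each cell's probability row, return the winning class index,
    or -1 when even the best probability fails the cutoff -- mirroring
    the diff, where class_assignment = 0 excludes the cell."""
    labels = []
    for row in prob_rows:
        best = max(range(len(row)), key=row.__getitem__)
        labels.append(best if row[best] > cutoff else -1)
    return labels
```

A degenerate all-zero row, which previously could still be written out as a match, is now rejected.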
88 changes: 45 additions & 43 deletions stats_scripts/ICGS_NMF.py
@@ -84,23 +84,16 @@ def estimateK(inputfile):
head=1
continue
else:
val=[]
line=line.rstrip('\r\n')
q= string.split(line,'\t')
#header.append(q[0])
for i in range(1,len(q)):
try:
val.append(float(q[i]))
except Exception:
continue
counter+=1
X.append(val)
#X=zip(*X)
X=np.array(X)
@@ -135,32 +128,28 @@ def estimateK(inputfile):
return k

def caldist(X,i,keys,keylist):
D=[]
Xxd=[]
newlist=[]
for ii in keys:
if ii==i: continue
newlist.append(ii)
Xxd.append(X[ii].tolist())

Xxd=np.array(Xxd)
Xd=X[i]

#Xxd=Xxd.tolist()
Xd=Xd.reshape(1, -1)
D=pairwise_distances(Xd,Xxd,metric='euclidean').tolist()

for q in range(len(np.argsort(D)[0])):
if newlist[q] in keylist:
continue
else:
key1=newlist[q]
break
return key1
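`caldist` returns, for seed cell `i`, the nearest other cell by Euclidean distance that is not already in `keylist`. An equivalent sketch without the scikit-learn dependency (requires Python 3.8+ for `math.dist`; tie-breaking may differ from the `np.argsort` order used above):

```python
import math

def nearest_unvisited(X, i, keys, visited):
    """Return the key from `keys` whose point is closest to X[i],
    skipping i itself and any key already in `visited`."""
    candidates = [k for k in keys if k != i and k not in visited]
    return min(candidates, key=lambda k: math.dist(X[i], X[k]))
```

Filtering the candidates before computing distances gives the same result as computing all distances and then skipping visited keys, with less work.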

def hgvfinder(inputfile,numVarGenes=500):
""" Find the highly variable genes by dispersion """
@@ -1045,7 +1034,7 @@ def CompleteICGSWorkflow(root_dir,processedInputExpFile,EventAnnot,iteration,rho
Guidefile=graphic_links3[-1][-1]
Guidefile=Guidefile[:-4]+'.txt'
else:
Guidefile="/Users/saljh8/Dropbox/Collaborations/KPMP-Eric/3e284c30-51a6-480f-a248-0497b9863486_expression_matrix/ICGS/Clustering-exp.3e284c30-PageRank-downsampled-OutliersRemoved-Guide3-hierarchical_cosine_correlation.txt"

rho_cutoff=0.2
try:
@@ -1078,11 +1067,24 @@
try: NMFResult,BinarizedOutput,Metadata,Annotation=NMF_Analysis.NMFAnalysis(filteredInputExpFile,NMFinput,Rank,platform,iteration)
except:
try:
if Rank>40:
Rank = 30
else:
Rank = 15
print 'Forcing k =',Rank
NMFResult,BinarizedOutput,Metadata,Annotation=NMF_Analysis.NMFAnalysis(filteredInputExpFile,NMFinput,Rank,platform,iteration)
except:
Rank=k
print 'Forcing k =',Rank
NMFResult,BinarizedOutput,Metadata,Annotation=NMF_Analysis.NMFAnalysis(filteredInputExpFile,NMFinput,Rank,platform,iteration)
else:
Rank=k
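The nested try/except above retries NMF with progressively safer ranks when a decomposition fails: a capped rank first (30 when the estimate is large, else 15), then the estimated k itself. The control flow can be sketched as a generic fallback loop (helper names hypothetical; the exact caps mirror the diff):

```python
def run_with_rank_fallback(run_nmf, k):
    """Attempt NMF at rank k; on failure retry with a capped rank
    (30 when k > 40, else 15), and finally with k itself."""
    attempts = [k, 30 if k > 40 else 15, k]
    last_error = None
    for rank in attempts:
        try:
            return run_nmf(rank), rank  # (result, rank actually used)
        except Exception as exc:
            last_error = exc
    raise last_error
```

Collapsing the nested handlers into a list of attempts makes the retry policy explicit and easy to extend.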

@@ -1414,9 +1416,10 @@ def runICGS_NMF(inputExpFile,scaling,platform,species,gsp,enrichmentInput='',dyn
print 'Scaling counts as column normalized log2 values.',
from import_scripts import CountsNormalize
inputExpFile = CountsNormalize.normalizeDropSeqCountsMemoryEfficient(inputExpFile)


if test_mode == False:
print 'Filtering the expression dataset (be patient).',
print_out, inputExpFile = RNASeq.singleCellRNASeqWorkflow(species,platform,inputExpFile,mlp,rpkm_threshold=0,parameters=gsp,reportOnly=True)

print 'Running ICGS-NMF'
### Find the parent dir of the output directory (expression file from the GUI will be stored in the output dir [ExpressionInput])
@@ -1475,7 +1478,6 @@
processedInputExpFile = root_dir+'/ExpressionInput/'+exp_file_name[:-4]+'-PageRank-downsampled.txt' ### down-sampled file
sampleIndexSelection.filterFile(inputExpFile,processedInputExpFile,sampmark)
else:
output_dir = root_dir+'/ExpressionInput'
try: export.createExportFolder(output_dir)
except: pass ### Already exists
@@ -1489,7 +1491,7 @@
else: processedInputExpFile = inputExpFile
else:
### Re-run using a prior produced ICGS2 result
processedInputExpFile = '/Users/saljh8/Dropbox/Collaborations/KPMP-Eric/3e284c30-51a6-480f-a248-0497b9863486_expression_matrix/ExpressionInput/exp.3e284c30-PageRank-downsampled.txt'

flag=True
iteration=1 ### Always equal to 1 for scRNA-Seq but can increment for splice-ICGS