2.1.1 11/25/17

Database build bug fixes, introduced with new Ensembl transcript versioning, python-build specific eUTIL issues, too stringent UCSC isoform filtering and isoform-exon ID coordinate matching.
nsalomonis · Nov 26, 2017 · 0b12094 · 0b12094
1 parent dd16305
commit 0b12094
Show file tree

Hide file tree

Showing 6 changed files with 164 additions and 89 deletions.
diff --git a/build_scripts/EnsemblImport.py b/build_scripts/EnsemblImport.py
@@ -2145,6 +2145,7 @@ def importExonRegionCoordinates(species):
     filename = 'AltDatabase/ensembl/'+species+'/'+species+'_Ensembl_exon.txt'
     fn=filepath(filename); x=0
     exon_region_coord_db={}
+    all_coord_db={}
     exon_region_db={}
     strand_db={}
     for line in open(fn,'rU').xreadlines():
@@ -2153,6 +2154,8 @@ def importExonRegionCoordinates(species):
         if x==0: x+=1
         else:
             gene, exonid, chr, strand, start, stop, constitutive_call, ens_exon_ids, splice_events, splice_junctions = t
+            start_end = [start, stop]; start_end.sort()
+            all_coord_db[gene,exonid] = start_end
             try:
                 ### start and stop should be unique
                 db = exon_region_coord_db[gene]
@@ -2167,8 +2170,7 @@ def importExonRegionCoordinates(species):
             try: exon_region_db[gene].append((exonid,start))
             except Exception: exon_region_db[gene] = [(exonid,start)]
             strand_db[gene] = strand
-    return exon_region_coord_db, exon_region_db, strand_db
-
+    return exon_region_coord_db, all_coord_db, exon_region_db, strand_db
 
 def exportTranscriptExonIDAssociations(species):
     """ Determine the exon region ID composition of all analyzed mRNAs """
@@ -2178,51 +2180,60 @@ def exportTranscriptExonIDAssociations(species):
     from build_scripts import IdentifyAltIsoforms
     seq_files, transcript_protein_db = IdentifyAltIsoforms.importProteinSequences(species,just_get_ids=True) ### get mRNA-protein IDs
 
-    exon_region_coord_db, exon_region_db, strand_db = importExonRegionCoordinates(species)
+    exon_region_coord_db, all_coord_db, exon_region_db, strand_db = importExonRegionCoordinates(species)
     export_file = 'AltDatabase/ensembl/'+species+'/mRNA-ExonIDs.txt'
     export_data = export.ExportFile(export_file)
     transcript_region_db={}
     transcripts_with_missing_regions={}
-    for key in relative_exon_locations:
+    errorCount=0
+    retainedIntrons=0
+    for key in relative_exon_locations:  ### From UCSC or Ensembl transcript coordinates only
+        all_exon_intron_regions = {}
         ens_transcriptid,gene,strand = key
+        #if ens_transcriptid != 'AK185721': continue
         region_db={} ### track the coordinates for cleaning up the order
         coord_db = exon_region_coord_db[gene]
         for (region,start) in exon_region_db[gene]:
+            all_exon_intron_regions[region] = all_coord_db[gene,region]
             region_db[region] = start
-        for exon_data in relative_exon_locations[key]:
+        for exon_data in relative_exon_locations[key]: ### each transcript exon
             regions=[]
             exon_start,exon_end,ens_exonid = exon_data
-            try: r1 = coord_db['s',exon_start]
-            except Exception: r1 = None
-            try: r2 = coord_db['e',exon_end]
-            except Exception: r2 = None ### Occurs with UCSC for Ensembl exons with longer or shorter 5' mRNA and 3' mRNA boundaries
-            if r2 == None and r1 == None:
-                transcripts_with_missing_regions[ens_transcriptid]=None
-                #print 'Error:',key, exon_start,exon_end,ens_exonid; sys.exit() ### Occurs due to a minor bug in EnsemblImport where certain exons are excluded due to length but eliminated longer exons sharing the same start position
-            else:
-                if r1 == None: r1 = str(r2)
-                if r2 == None: r2 = str(r1) ### don't replace the original values
-                regions.append(r1) ### starting exon-region for this Ensembl/UCSC exon
-                all_regions_coord = exon_region_db[gene] ### get all intermediate exon regions
-                all_regions = map(lambda (r,s): r, all_regions_coord)
-                if r1!=r2:
-                    #print gene,exon_start,exon_end
-                    #print r1, r2
-                    ri1 = all_regions.index(r1)
-                    ri2 = all_regions.index(r2)
-                    #print ri1, ri2, all_regions
-                    #print 'alpha',all_regions[ri1+1:ri2]#;sys.exit()
-                    regions+=all_regions[ri1+1:ri2]
-                if r2 not in regions:
-                    regions.append(r2) ### ending exon-region for this Ensembl/UCSC exon
-                if None in regions:
-                    print ens_transcriptid, r1, r2, regions, all_regions[ri1+1:ri2];sys.exit()
-                for r in regions:
+            start_end = [exon_start,exon_end]; start_end.sort(); start,end = start_end
+            partial_matches=[]
+            added=False
+            for region in all_exon_intron_regions: ### search each transcript exon against every annotated region
+                e5,e3 = all_exon_intron_regions[region]
+                annotated = [int(e5),int(e3)]
+                annotated.sort()
+                coords = [start_end[0],start_end[1]]+annotated
+                coords.sort()
+                if coords[0] == start_end[0] and coords[-1] == start_end[-1]:
+                    ### Hence, the exon/intron region is contained within or is equal to the transcript exon
                     if (gene,ens_transcriptid) in transcript_region_db:
-                        if r not in transcript_region_db[gene,ens_transcriptid] and 'I' not in r:
-                            transcript_region_db[gene,ens_transcriptid].append((r))
+                        if region not in transcript_region_db[gene,ens_transcriptid]:
+                            transcript_region_db[gene,ens_transcriptid].append((region))
                     else:
-                        transcript_region_db[gene,ens_transcriptid] = [r]
+                        transcript_region_db[gene,ens_transcriptid] = [region]
+                    added=True
+                    retainedIntrons+=1
+                else:
+                    if annotated[0] == start_end[0] or annotated[-1] == start_end[-1]:
+                        #print exon_start,exon_end, start_end, region, ens_transcriptid
+                        partial_matches.append(region)
+            if added ==False:
+                if len(partial_matches)>0:
+                    ### Occurs typically when a UCSC exon has a longer or shorter 3' or 5'UTR exon boundaries
+                    for region in partial_matches:
+                        if (gene,ens_transcriptid) in transcript_region_db:
+                            if region not in transcript_region_db[gene,ens_transcriptid]:
+                                transcript_region_db[gene,ens_transcriptid].append((region))
+                        else:
+                            transcript_region_db[gene,ens_transcriptid] = [region]
+                else:
+                    transcripts_with_missing_regions[ens_transcriptid]=None
+                    errorCount+=1
+                    #if errorCount<100: print 'Error:',key, exon_start,exon_end,ens_exonid
         if (gene,ens_transcriptid) in transcript_region_db:
             regions = transcript_region_db[gene,ens_transcriptid]
             regions_sort=[]
@@ -2232,14 +2243,14 @@ def exportTranscriptExonIDAssociations(species):
                 regions_sort.append([start,region])
             regions_sort.sort()
             if strand_db[gene] == '-':regions_sort.reverse()
-            transcript_region_db[gene,ens_transcriptid] = map(lambda (s,r): r, regions_sort)    
+            transcript_region_db[gene,ens_transcriptid] = map(lambda (s,r): r, regions_sort)
+            #print gene, ens_transcriptid, transcript_region_db[gene,ens_transcriptid];sys.exit()
     print len(transcripts_with_missing_regions), 'transcripts with missing exon regions out of', len(transcript_region_db)+len(transcripts_with_missing_regions)
 
     t1=[]
     for i in transcripts_with_missing_regions:
         t1.append(i)
     print 'missing:',t1[:15]
-
 
     exon_db={}
     gene_region_db={}
@@ -2632,7 +2643,7 @@ def importComparisonSplicingData4Primers(filename,species):
 
 if __name__ == '__main__':
     ###KNOWN PROBLEMS: the junction analysis program calls exons as cassette-exons if there a new C-terminal exon occurs downstream of that exon in a different transcript (ENSG00000197991).
-    Species = 'Hs'
+    Species = 'Mm'
     test = 'yes'
     Data_type = 'ncRNA'
     Data_type = 'mRNA'
@@ -2641,7 +2652,7 @@ def importComparisonSplicingData4Primers(filename,species):
     filename = '/Users/saljh8/Desktop/dataAnalysis/SalomonisLab/Leucegene/July-2017/PSI/Events-dPSI_0.1_adjp/PSI.U2AF1-like_vs_OthersQPCR.txt'
     #filename = '/Users/saljh8/Desktop/dataAnalysis/Collaborative/Ichi/Combined-junction-exon-evidence.txt'
 
-    #exportTranscriptExonIDAssociations(Species)
+    exportTranscriptExonIDAssociations(Species);sys.exit()
     #createExonRegionSequenceDB(Species,'RNASeq'); sys.exit()
     importComparisonSplicingData4Primers(filename,Species); sys.exit()
 

diff --git a/build_scripts/IdentifyAltIsoforms.py b/build_scripts/IdentifyAltIsoforms.py
@@ -29,6 +29,7 @@
 from Bio import Entrez
 from build_scripts import ExonAnalyze_module
 from build_scripts import mRNASeqAlign
+import traceback
 
 def filepath(filename):
     fn = unique.filepath(filename)
@@ -75,7 +76,7 @@ def importSplicingAnnotationDatabase(array_type,species,import_coordinates):
     if array_type == 'exon' or array_type == 'gene': filename = "AltDatabase/"+species+"/"+array_type+"/"+species+"_Ensembl_probesets.txt"    
     elif array_type == 'junction': filename = "AltDatabase/"+species+"/"+array_type+"/"+species+"_Ensembl_"+array_type+"_probesets.txt"
     elif array_type == 'RNASeq': filename = "AltDatabase/"+species+"/"+array_type+"/"+species+"_Ensembl_exons.txt"
-        
+
     fn=filepath(filename); x=0; probeset_gene_db={}; probeset_coordinate_db={}
     for line in open(fn,'rU').xreadlines():             
         probeset_data = cleanUpLine(line)  #remove endline
@@ -509,7 +510,10 @@ def translateRNAs(unique_transcripts,unique_ens_transcripts,analysis_type):
             output_file = 'AltDatabase/'+species+'/SequenceData/output/sequences/Transcript-Protein_sequences_'+str(2)+'.txt'
             fn=filepath(output_file); os.remove(fn)
         except Exception: null=[]
-        fetchSeq(ac_list,'nucleotide',1)
+        try: fetchSeq(ac_list,'nucleotide',1)
+        except Exception:
+            print traceback.format_exc(),'\n'
+            print 'WARNING!!!!! NCBI webservice connectivity failed due to the above error!!!!!\n'
 
     ###Import protein sequences
     seq_files, transcript_protein_db = importProteinSequences(species,just_get_ids=True)
@@ -528,9 +532,15 @@ def translateRNAs(unique_transcripts,unique_ens_transcripts,analysis_type):
         try: missing_gi_list = searchEntrez(missing_protein_data,'nucleotide')
         except Exception:
             print 'Exception encountered:',e
-            missing_gi_list = searchEntrez(missing_protein_data,'nucleotide')
-
-    fetchSeq(missing_gi_list,'nucleotide',len(seq_files)-2)
+            try: missing_gi_list = searchEntrez(missing_protein_data,'nucleotide')
+            except Exception:
+                print traceback.format_exc(),'\n'
+                print 'WARNING!!!!! NCBI webservice connectivity failed due to the above error!!!!!\n'
+
+    try: fetchSeq(missing_gi_list,'nucleotide',len(seq_files)-2)
+    except Exception:
+        print traceback.format_exc(),'\n'
+        print 'WARNING!!!!! NCBI webservice connectivity failed due to the above error!!!!!\n'
     seq_files, transcript_protein_seq_db = importProteinSequences(species)
     print len(unique_ens_transcripts)+len(unique_transcripts), 'examined transcripts.'
     print len(transcript_protein_seq_db),'transcripts with associated protein sequence.'
@@ -637,6 +647,7 @@ def importProbesetTranscriptMatches(species,array_type,compare_all_features):
         if (array_type == 'junction' or array_type == 'RNASeq') and data_type != 'null': 
             filename = 'AltDatabase/'+species+'/'+array_type+'/'+data_type+'/'+species+'_top-transcript-matches.txt'
         else: filename = 'AltDatabase/'+species+'/'+array_type+'/'+species+'_top-transcript-matches.txt'
+
     fn=filepath(filename); probeset_transcript_db={}; unique_transcripts={}; unique_ens_transcripts={}; all_transcripts={}
     for line in open(fn,'rU').xreadlines():             
         probeset_data = cleanUpLine(line)  #remove endline
@@ -789,6 +800,8 @@ def importEnsemblTranscriptSequence(missing_protein_ACs):
                     #>ENST00000400685 cdna:known supercontig::NT_113903:9607:12778:1 gene:ENSG00000215618
                     t= string.split(data[1:],':'); sequence=''
                     transid_data = string.split(t[0],' '); transid = transid_data[0]
+                    if '.' in transid:
+                        transid = string.split(transid,'.')[0]
                     #try: ensembl_id,chr,strand,transid,prot_id = t
                     #except ValueError: ensembl_id,chr,strand,transid = t
         except IndexError: continue
@@ -812,10 +825,11 @@ def importEnsemblTranscriptSequence(missing_protein_ACs):
     end_time = time.time(); time_diff = int(end_time-start_time)
     print "Ensembl transcript sequences analyzed in %d seconds" % time_diff
 
-    missing_protein_ACs_inSilico={}
+    missing_protein_ACs_inSilico=[]
     for mRNA_AC in missing_protein_ACs:
-        if mRNA_AC not in translated_mRNAs: missing_protein_ACs_inSilico[mRNA_AC]=[]
-    print len(missing_protein_ACs_inSilico), 'Ensembl mRNAs without mRNA sequence that were NOT translated into protein'    
+        if mRNA_AC not in translated_mRNAs:
+            missing_protein_ACs_inSilico.append(mRNA_AC)
+    print len(missing_protein_ACs_inSilico), 'Ensembl mRNAs without mRNA sequence NOT in silico translated (e.g., lncRNAs)', missing_protein_ACs_inSilico[:10]
 
 def importEnsemblProteinSeqData(species,unique_ens_transcripts):
     from build_scripts import FeatureAlignment
@@ -939,16 +953,26 @@ def BuildInSilicoTranslations(mRNA_db):
     translation_db={}
 
     from Bio.Seq import Seq
-    
+
     ### New Biopython methods - http://biopython.org/wiki/Seq
     from Bio.Alphabet import generic_dna
 
     ### Deprecated
     #from Bio.Alphabet import IUPAC
     #from Bio import Translate ### deprecated
+
+    #print 'Begining in silco translation for',len(mRNA_db),'sequences.'
 
-    print 'Begining in silco translation for',len(mRNA_db),'sequences.'
-
+    def cleanSeq(input_seq):
+        """Wrapper for Biopython translate function.  Bio.Seq.translate will complain if input sequence is 
+        not a mulitple of 3.  This wrapper function passes an acceptable input to Bio.Seq.translate in order to
+        avoid this warning."""
+        #https://github.com/broadinstitute/oncotator/pull/265/commits/94b20aabff48741a92b3f9e608e159957af6af30
+        trailing_bases = len(input_seq) % 3
+        if trailing_bases:
+            input_seq = ''.join([input_seq, 'NN']) if trailing_bases == 1 else ''.join([input_seq, 'N'])
+        return input_seq
+
     first_time = 1
     for mRNA_AC in mRNA_db:
         if mRNA_AC == 'AK025306': print '@@@@@@@@@@^^^^AK025306...attempting in silico translation'
@@ -966,7 +990,8 @@ def BuildInSilicoTranslations(mRNA_db):
             sequence_met = sequence[x:]  #x gives us the position where the first Met* is.
 
             ### New Biopython methods - http://biopython.org/wiki/Seq
-            dna_seq = Seq(sequence_met, generic_dna)
+            dna_clean = cleanSeq(sequence_met)
+            dna_seq = Seq(dna_clean, generic_dna)
             prot_seq = dna_seq.translate(to_stop=True)
 
             ### Deprecated code
@@ -975,7 +1000,8 @@ def BuildInSilicoTranslations(mRNA_db):
             #standard_translator = Translate.unambiguous_dna_by_id[1]
             #prot_seq = standard_translator.translate_to_stop(dna_seq) #convert the dna to protein sequence
 
-            prot_seq_string = prot_seq.tostring()
+            #prot_seq_string = prot_seq.tostring()
+            prot_seq_string = str(prot_seq)
             prot_seq_tuple = len(prot_seq_string),y,prot_seq_string,dna_seq  #added DNA sequence to determine which exon we're in later  
             temp_protein_list.append(prot_seq_tuple) #create a list of protein sequences to select the longest one
             sequence = sequence_met[3:]  # sequence_met is the sequence after the first or proceeduring methionine, reset the sequence for the next loop
@@ -1022,7 +1048,8 @@ def BuildInSilicoTranslations(mRNA_db):
                     else: break
                 else: break ###do not need to keep looking
             dl = (len(prot_seq_string))*3  #figure out what the DNA coding sequence length is
-            dna_seq_string_coding_to_end = coding_dna_seq_string.tostring()
+            #dna_seq_string_coding_to_end = coding_dna_seq_string.tostring()
+            dna_seq_string_coding_to_end = str(coding_dna_seq_string)
             coding_dna_seq_string = dna_seq_string_coding_to_end[0:dl]
             cds_location = str(pos1+1)+'..'+str(pos1+len(prot_seq_string)*3+3)
             ### Determine if a stop codon is in the sequence or there's a premature end
@@ -1258,10 +1285,11 @@ def importUCSCSequences(missing_protein_ACs):
     end_time = time.time(); time_diff = int(end_time-start_time)
     print "UCSC mRNA sequences analyzed in %d seconds" % time_diff
 
-    missing_protein_ACs_inSilico={}
+    missing_protein_ACs_inSilico=[]
     for mRNA_AC in missing_protein_ACs:
-        if mRNA_AC not in translated_mRNAs: missing_protein_ACs_inSilico[mRNA_AC]=[]
-    print len(missing_protein_ACs_inSilico), 'mRNAs without mRNA sequence that were NOT translated into protein'
+        if mRNA_AC not in translated_mRNAs:
+            missing_protein_ACs_inSilico.append(mRNA_AC)
+    print len(missing_protein_ACs_inSilico), 'mRNAs without mRNA sequence NOT in silico translated (e.g., lncRNAs)',missing_protein_ACs_inSilico[:10]
     return missing_protein_ACs_inSilico
 
 ############# END Code currently not used (LOCAL PROTEIN SEQUENCE ANALYSIS) ##############
@@ -1306,9 +1334,10 @@ def runProgramTest(Species,Array_type,Data_type,translate_seq,run_seqcomp):
 if __name__ == '__main__':
     species = 'Mm'; array_type = 'AltMouse'; translate='no'; run_seqcomp = 'no'; data_type = 'exon'
     species = 'Mm'; array_type = 'RNASeq'; translate='yes'
-    species = 'Dr'; array_type = 'RNASeq'; translate='yes'; data_type = 'junction'
+    #species = 'Dr'; array_type = 'RNASeq'; translate='yes'; data_type = 'junction'
     test='no'
-
+    a = ['ENSMUST00000138102', 'ENSMUST00000193415', 'ENSMUST00000124412', 'ENSMUST00000200569']
+    importEnsemblTranscriptSequence(a);sys.exit()
     filename = 'AltDatabase/ensembl/'+species+'/'+species+'_Ensembl_transcript-annotations.txt'    
     ens_transcript_exon_db,ens_gene_transcript_db,ens_gene_exon_db = importEnsExonStructureDataSimple(filename,species,{},{},{},{})
     #print len(ens_transcript_exon_db), len(ens_gene_transcript_db), len(ens_gene_exon_db);sys.exit()