Skip to content

Commit

Permalink
2.1.1 11/25/17
Browse files Browse the repository at this point in the history
Database build bug fixes, introduced with new Ensembl transcript
versioning, python-build specific eUTIL issues, too stringent UCSC
isoform filtering and isoform-exon ID coordinate matching.
  • Loading branch information
nsalomonis committed Nov 26, 2017
1 parent dd16305 commit 0b12094
Show file tree
Hide file tree
Showing 6 changed files with 164 additions and 89 deletions.
87 changes: 49 additions & 38 deletions build_scripts/EnsemblImport.py
Expand Up @@ -2145,6 +2145,7 @@ def importExonRegionCoordinates(species):
filename = 'AltDatabase/ensembl/'+species+'/'+species+'_Ensembl_exon.txt'
fn=filepath(filename); x=0
exon_region_coord_db={}
all_coord_db={}
exon_region_db={}
strand_db={}
for line in open(fn,'rU').xreadlines():
Expand All @@ -2153,6 +2154,8 @@ def importExonRegionCoordinates(species):
if x==0: x+=1
else:
gene, exonid, chr, strand, start, stop, constitutive_call, ens_exon_ids, splice_events, splice_junctions = t
start_end = [start, stop]; start_end.sort()
all_coord_db[gene,exonid] = start_end
try:
### start and stop should be unique
db = exon_region_coord_db[gene]
Expand All @@ -2167,8 +2170,7 @@ def importExonRegionCoordinates(species):
try: exon_region_db[gene].append((exonid,start))
except Exception: exon_region_db[gene] = [(exonid,start)]
strand_db[gene] = strand
return exon_region_coord_db, exon_region_db, strand_db

return exon_region_coord_db, all_coord_db, exon_region_db, strand_db

def exportTranscriptExonIDAssociations(species):
""" Determine the exon region ID composition of all analyzed mRNAs """
Expand All @@ -2178,51 +2180,60 @@ def exportTranscriptExonIDAssociations(species):
from build_scripts import IdentifyAltIsoforms
seq_files, transcript_protein_db = IdentifyAltIsoforms.importProteinSequences(species,just_get_ids=True) ### get mRNA-protein IDs

exon_region_coord_db, exon_region_db, strand_db = importExonRegionCoordinates(species)
exon_region_coord_db, all_coord_db, exon_region_db, strand_db = importExonRegionCoordinates(species)
export_file = 'AltDatabase/ensembl/'+species+'/mRNA-ExonIDs.txt'
export_data = export.ExportFile(export_file)
transcript_region_db={}
transcripts_with_missing_regions={}
for key in relative_exon_locations:
errorCount=0
retainedIntrons=0
for key in relative_exon_locations: ### From UCSC or Ensembl transcript coordinates only
all_exon_intron_regions = {}
ens_transcriptid,gene,strand = key
#if ens_transcriptid != 'AK185721': continue
region_db={} ### track the coordinates for cleaning up the order
coord_db = exon_region_coord_db[gene]
for (region,start) in exon_region_db[gene]:
all_exon_intron_regions[region] = all_coord_db[gene,region]
region_db[region] = start
for exon_data in relative_exon_locations[key]:
for exon_data in relative_exon_locations[key]: ### each transcript exon
regions=[]
exon_start,exon_end,ens_exonid = exon_data
try: r1 = coord_db['s',exon_start]
except Exception: r1 = None
try: r2 = coord_db['e',exon_end]
except Exception: r2 = None ### Occurs with UCSC for Ensembl exons with longer or shorter 5' mRNA and 3' mRNA boundaries
if r2 == None and r1 == None:
transcripts_with_missing_regions[ens_transcriptid]=None
#print 'Error:',key, exon_start,exon_end,ens_exonid; sys.exit() ### Occurs due to a minor bug in EnsemblImport where certain exons are excluded due to length but eliminated longer exons sharing the same start position
else:
if r1 == None: r1 = str(r2)
if r2 == None: r2 = str(r1) ### don't replace the original values
regions.append(r1) ### starting exon-region for this Ensembl/UCSC exon
all_regions_coord = exon_region_db[gene] ### get all intermediate exon regions
all_regions = map(lambda (r,s): r, all_regions_coord)
if r1!=r2:
#print gene,exon_start,exon_end
#print r1, r2
ri1 = all_regions.index(r1)
ri2 = all_regions.index(r2)
#print ri1, ri2, all_regions
#print 'alpha',all_regions[ri1+1:ri2]#;sys.exit()
regions+=all_regions[ri1+1:ri2]
if r2 not in regions:
regions.append(r2) ### ending exon-region for this Ensembl/UCSC exon
if None in regions:
print ens_transcriptid, r1, r2, regions, all_regions[ri1+1:ri2];sys.exit()
for r in regions:
start_end = [exon_start,exon_end]; start_end.sort(); start,end = start_end
partial_matches=[]
added=False
for region in all_exon_intron_regions: ### search each transcript exon against every annotated region
e5,e3 = all_exon_intron_regions[region]
annotated = [int(e5),int(e3)]
annotated.sort()
coords = [start_end[0],start_end[1]]+annotated
coords.sort()
if coords[0] == start_end[0] and coords[-1] == start_end[-1]:
### Hence, the exon/intron region is contained within or is equal to the transcript exon
if (gene,ens_transcriptid) in transcript_region_db:
if r not in transcript_region_db[gene,ens_transcriptid] and 'I' not in r:
transcript_region_db[gene,ens_transcriptid].append((r))
if region not in transcript_region_db[gene,ens_transcriptid]:
transcript_region_db[gene,ens_transcriptid].append((region))
else:
transcript_region_db[gene,ens_transcriptid] = [r]
transcript_region_db[gene,ens_transcriptid] = [region]
added=True
retainedIntrons+=1
else:
if annotated[0] == start_end[0] or annotated[-1] == start_end[-1]:
#print exon_start,exon_end, start_end, region, ens_transcriptid
partial_matches.append(region)
if added ==False:
if len(partial_matches)>0:
### Occurs typically when a UCSC exon has a longer or shorter 3' or 5'UTR exon boundaries
for region in partial_matches:
if (gene,ens_transcriptid) in transcript_region_db:
if region not in transcript_region_db[gene,ens_transcriptid]:
transcript_region_db[gene,ens_transcriptid].append((region))
else:
transcript_region_db[gene,ens_transcriptid] = [region]
else:
transcripts_with_missing_regions[ens_transcriptid]=None
errorCount+=1
#if errorCount<100: print 'Error:',key, exon_start,exon_end,ens_exonid
if (gene,ens_transcriptid) in transcript_region_db:
regions = transcript_region_db[gene,ens_transcriptid]
regions_sort=[]
Expand All @@ -2232,14 +2243,14 @@ def exportTranscriptExonIDAssociations(species):
regions_sort.append([start,region])
regions_sort.sort()
if strand_db[gene] == '-':regions_sort.reverse()
transcript_region_db[gene,ens_transcriptid] = map(lambda (s,r): r, regions_sort)
transcript_region_db[gene,ens_transcriptid] = map(lambda (s,r): r, regions_sort)
#print gene, ens_transcriptid, transcript_region_db[gene,ens_transcriptid];sys.exit()
print len(transcripts_with_missing_regions), 'transcripts with missing exon regions out of', len(transcript_region_db)+len(transcripts_with_missing_regions)

t1=[]
for i in transcripts_with_missing_regions:
t1.append(i)
print 'missing:',t1[:15]


exon_db={}
gene_region_db={}
Expand Down Expand Up @@ -2632,7 +2643,7 @@ def importComparisonSplicingData4Primers(filename,species):

if __name__ == '__main__':
###KNOWN PROBLEMS: the junction analysis program calls exons as cassette-exons if there a new C-terminal exon occurs downstream of that exon in a different transcript (ENSG00000197991).
Species = 'Hs'
Species = 'Mm'
test = 'yes'
Data_type = 'ncRNA'
Data_type = 'mRNA'
Expand All @@ -2641,7 +2652,7 @@ def importComparisonSplicingData4Primers(filename,species):
filename = '/Users/saljh8/Desktop/dataAnalysis/SalomonisLab/Leucegene/July-2017/PSI/Events-dPSI_0.1_adjp/PSI.U2AF1-like_vs_OthersQPCR.txt'
#filename = '/Users/saljh8/Desktop/dataAnalysis/Collaborative/Ichi/Combined-junction-exon-evidence.txt'

#exportTranscriptExonIDAssociations(Species)
exportTranscriptExonIDAssociations(Species);sys.exit()
#createExonRegionSequenceDB(Species,'RNASeq'); sys.exit()
importComparisonSplicingData4Primers(filename,Species); sys.exit()

Expand Down
67 changes: 48 additions & 19 deletions build_scripts/IdentifyAltIsoforms.py
Expand Up @@ -29,6 +29,7 @@
from Bio import Entrez
from build_scripts import ExonAnalyze_module
from build_scripts import mRNASeqAlign
import traceback

def filepath(filename):
fn = unique.filepath(filename)
Expand Down Expand Up @@ -75,7 +76,7 @@ def importSplicingAnnotationDatabase(array_type,species,import_coordinates):
if array_type == 'exon' or array_type == 'gene': filename = "AltDatabase/"+species+"/"+array_type+"/"+species+"_Ensembl_probesets.txt"
elif array_type == 'junction': filename = "AltDatabase/"+species+"/"+array_type+"/"+species+"_Ensembl_"+array_type+"_probesets.txt"
elif array_type == 'RNASeq': filename = "AltDatabase/"+species+"/"+array_type+"/"+species+"_Ensembl_exons.txt"

fn=filepath(filename); x=0; probeset_gene_db={}; probeset_coordinate_db={}
for line in open(fn,'rU').xreadlines():
probeset_data = cleanUpLine(line) #remove endline
Expand Down Expand Up @@ -509,7 +510,10 @@ def translateRNAs(unique_transcripts,unique_ens_transcripts,analysis_type):
output_file = 'AltDatabase/'+species+'/SequenceData/output/sequences/Transcript-Protein_sequences_'+str(2)+'.txt'
fn=filepath(output_file); os.remove(fn)
except Exception: null=[]
fetchSeq(ac_list,'nucleotide',1)
try: fetchSeq(ac_list,'nucleotide',1)
except Exception:
print traceback.format_exc(),'\n'
print 'WARNING!!!!! NCBI webservice connectivity failed due to the above error!!!!!\n'

###Import protein sequences
seq_files, transcript_protein_db = importProteinSequences(species,just_get_ids=True)
Expand All @@ -528,9 +532,15 @@ def translateRNAs(unique_transcripts,unique_ens_transcripts,analysis_type):
try: missing_gi_list = searchEntrez(missing_protein_data,'nucleotide')
except Exception:
print 'Exception encountered:',e
missing_gi_list = searchEntrez(missing_protein_data,'nucleotide')

fetchSeq(missing_gi_list,'nucleotide',len(seq_files)-2)
try: missing_gi_list = searchEntrez(missing_protein_data,'nucleotide')
except Exception:
print traceback.format_exc(),'\n'
print 'WARNING!!!!! NCBI webservice connectivity failed due to the above error!!!!!\n'

try: fetchSeq(missing_gi_list,'nucleotide',len(seq_files)-2)
except Exception:
print traceback.format_exc(),'\n'
print 'WARNING!!!!! NCBI webservice connectivity failed due to the above error!!!!!\n'
seq_files, transcript_protein_seq_db = importProteinSequences(species)
print len(unique_ens_transcripts)+len(unique_transcripts), 'examined transcripts.'
print len(transcript_protein_seq_db),'transcripts with associated protein sequence.'
Expand Down Expand Up @@ -637,6 +647,7 @@ def importProbesetTranscriptMatches(species,array_type,compare_all_features):
if (array_type == 'junction' or array_type == 'RNASeq') and data_type != 'null':
filename = 'AltDatabase/'+species+'/'+array_type+'/'+data_type+'/'+species+'_top-transcript-matches.txt'
else: filename = 'AltDatabase/'+species+'/'+array_type+'/'+species+'_top-transcript-matches.txt'

fn=filepath(filename); probeset_transcript_db={}; unique_transcripts={}; unique_ens_transcripts={}; all_transcripts={}
for line in open(fn,'rU').xreadlines():
probeset_data = cleanUpLine(line) #remove endline
Expand Down Expand Up @@ -789,6 +800,8 @@ def importEnsemblTranscriptSequence(missing_protein_ACs):
#>ENST00000400685 cdna:known supercontig::NT_113903:9607:12778:1 gene:ENSG00000215618
t= string.split(data[1:],':'); sequence=''
transid_data = string.split(t[0],' '); transid = transid_data[0]
if '.' in transid:
transid = string.split(transid,'.')[0]
#try: ensembl_id,chr,strand,transid,prot_id = t
#except ValueError: ensembl_id,chr,strand,transid = t
except IndexError: continue
Expand All @@ -812,10 +825,11 @@ def importEnsemblTranscriptSequence(missing_protein_ACs):
end_time = time.time(); time_diff = int(end_time-start_time)
print "Ensembl transcript sequences analyzed in %d seconds" % time_diff

missing_protein_ACs_inSilico={}
missing_protein_ACs_inSilico=[]
for mRNA_AC in missing_protein_ACs:
if mRNA_AC not in translated_mRNAs: missing_protein_ACs_inSilico[mRNA_AC]=[]
print len(missing_protein_ACs_inSilico), 'Ensembl mRNAs without mRNA sequence that were NOT translated into protein'
if mRNA_AC not in translated_mRNAs:
missing_protein_ACs_inSilico.append(mRNA_AC)
print len(missing_protein_ACs_inSilico), 'Ensembl mRNAs without mRNA sequence NOT in silico translated (e.g., lncRNAs)', missing_protein_ACs_inSilico[:10]

def importEnsemblProteinSeqData(species,unique_ens_transcripts):
from build_scripts import FeatureAlignment
Expand Down Expand Up @@ -939,16 +953,26 @@ def BuildInSilicoTranslations(mRNA_db):
translation_db={}

from Bio.Seq import Seq

### New Biopython methods - http://biopython.org/wiki/Seq
from Bio.Alphabet import generic_dna

### Deprecated
#from Bio.Alphabet import IUPAC
#from Bio import Translate ### deprecated

#print 'Begining in silco translation for',len(mRNA_db),'sequences.'

print 'Begining in silco translation for',len(mRNA_db),'sequences.'

def cleanSeq(input_seq):
"""Wrapper for Biopython translate function. Bio.Seq.translate will complain if input sequence is
not a mulitple of 3. This wrapper function passes an acceptable input to Bio.Seq.translate in order to
avoid this warning."""
#https://github.com/broadinstitute/oncotator/pull/265/commits/94b20aabff48741a92b3f9e608e159957af6af30
trailing_bases = len(input_seq) % 3
if trailing_bases:
input_seq = ''.join([input_seq, 'NN']) if trailing_bases == 1 else ''.join([input_seq, 'N'])
return input_seq

first_time = 1
for mRNA_AC in mRNA_db:
if mRNA_AC == 'AK025306': print '@@@@@@@@@@^^^^AK025306...attempting in silico translation'
Expand All @@ -966,7 +990,8 @@ def BuildInSilicoTranslations(mRNA_db):
sequence_met = sequence[x:] #x gives us the position where the first Met* is.

### New Biopython methods - http://biopython.org/wiki/Seq
dna_seq = Seq(sequence_met, generic_dna)
dna_clean = cleanSeq(sequence_met)
dna_seq = Seq(dna_clean, generic_dna)
prot_seq = dna_seq.translate(to_stop=True)

### Deprecated code
Expand All @@ -975,7 +1000,8 @@ def BuildInSilicoTranslations(mRNA_db):
#standard_translator = Translate.unambiguous_dna_by_id[1]
#prot_seq = standard_translator.translate_to_stop(dna_seq) #convert the dna to protein sequence

prot_seq_string = prot_seq.tostring()
#prot_seq_string = prot_seq.tostring()
prot_seq_string = str(prot_seq)
prot_seq_tuple = len(prot_seq_string),y,prot_seq_string,dna_seq #added DNA sequence to determine which exon we're in later
temp_protein_list.append(prot_seq_tuple) #create a list of protein sequences to select the longest one
sequence = sequence_met[3:] # sequence_met is the sequence after the first or proceeduring methionine, reset the sequence for the next loop
Expand Down Expand Up @@ -1022,7 +1048,8 @@ def BuildInSilicoTranslations(mRNA_db):
else: break
else: break ###do not need to keep looking
dl = (len(prot_seq_string))*3 #figure out what the DNA coding sequence length is
dna_seq_string_coding_to_end = coding_dna_seq_string.tostring()
#dna_seq_string_coding_to_end = coding_dna_seq_string.tostring()
dna_seq_string_coding_to_end = str(coding_dna_seq_string)
coding_dna_seq_string = dna_seq_string_coding_to_end[0:dl]
cds_location = str(pos1+1)+'..'+str(pos1+len(prot_seq_string)*3+3)
### Determine if a stop codon is in the sequence or there's a premature end
Expand Down Expand Up @@ -1258,10 +1285,11 @@ def importUCSCSequences(missing_protein_ACs):
end_time = time.time(); time_diff = int(end_time-start_time)
print "UCSC mRNA sequences analyzed in %d seconds" % time_diff

missing_protein_ACs_inSilico={}
missing_protein_ACs_inSilico=[]
for mRNA_AC in missing_protein_ACs:
if mRNA_AC not in translated_mRNAs: missing_protein_ACs_inSilico[mRNA_AC]=[]
print len(missing_protein_ACs_inSilico), 'mRNAs without mRNA sequence that were NOT translated into protein'
if mRNA_AC not in translated_mRNAs:
missing_protein_ACs_inSilico.append(mRNA_AC)
print len(missing_protein_ACs_inSilico), 'mRNAs without mRNA sequence NOT in silico translated (e.g., lncRNAs)',missing_protein_ACs_inSilico[:10]
return missing_protein_ACs_inSilico

############# END Code currently not used (LOCAL PROTEIN SEQUENCE ANALYSIS) ##############
Expand Down Expand Up @@ -1306,9 +1334,10 @@ def runProgramTest(Species,Array_type,Data_type,translate_seq,run_seqcomp):
if __name__ == '__main__':
species = 'Mm'; array_type = 'AltMouse'; translate='no'; run_seqcomp = 'no'; data_type = 'exon'
species = 'Mm'; array_type = 'RNASeq'; translate='yes'
species = 'Dr'; array_type = 'RNASeq'; translate='yes'; data_type = 'junction'
#species = 'Dr'; array_type = 'RNASeq'; translate='yes'; data_type = 'junction'
test='no'

a = ['ENSMUST00000138102', 'ENSMUST00000193415', 'ENSMUST00000124412', 'ENSMUST00000200569']
importEnsemblTranscriptSequence(a);sys.exit()
filename = 'AltDatabase/ensembl/'+species+'/'+species+'_Ensembl_transcript-annotations.txt'
ens_transcript_exon_db,ens_gene_transcript_db,ens_gene_exon_db = importEnsExonStructureDataSimple(filename,species,{},{},{},{})
#print len(ens_transcript_exon_db), len(ens_gene_transcript_db), len(ens_gene_exon_db);sys.exit()
Expand Down

0 comments on commit 0b12094

Please sign in to comment.