/
IdentifyAltIsoforms.py
executable file
·1345 lines (1207 loc) · 79.1 KB
/
IdentifyAltIsoforms.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/local/bin/python2.6
###IdentifyAltIsoforms
#Copyright 2005-2008 J. David Gladstone Institutes, San Francisco California
#Author Nathan Salomonis - nsalomonis@gmail.com
#Permission is hereby granted, free of charge, to any person obtaining a copy
#of this software and associated documentation files (the "Software"), to deal
#in the Software without restriction, including without limitation the rights
#to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
#copies of the Software, and to permit persons to whom the Software is furnished
#to do so, subject to the following conditions:
#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
#INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
#PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
#HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
#OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
#SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
import sys,string,os
sys.path.insert(1, os.path.join(sys.path[0], '..')) ### import parent dir dependencies
import os.path
import unique
import export
import time
import copy
from build_scripts import FeatureAlignment; reload(FeatureAlignment)
from Bio import Entrez
from build_scripts import ExonAnalyze_module
from build_scripts import mRNASeqAlign
import traceback
def filepath(filename):
    """Resolve a relative filename to a full path via the unique module."""
    return unique.filepath(filename)
def read_directory(sub_dir):
    """Return the files in sub_dir that look like data tables or FASTA files.

    Folder names and compressed archives (.gz/.zip) are excluded.
    """
    candidates = unique.read_directory(sub_dir)
    kept = []
    for filename in candidates:
        if '.gz' in filename or '.zip' in filename:
            continue
        # Keep tab/comma-delimited tables and anything FASTA-like
        if filename[-4:] in ('.txt', '.tab', '.csv') or '.fa' in filename:
            kept.append(filename)
    return kept
class GrabFiles:
def setdirectory(self,value):
self.data = value
def display(self):
print self.data
def searchdirectory(self,search_term):
#self is an instance while self.data is the value of the instance
file_dirs = getDirectoryFiles(self.data,str(search_term))
if len(file_dirs)<1: print search_term,'not found',self.data
return file_dirs
def getDirectoryFiles(import_dir, search_term):
    """List every file path under import_dir whose path contains search_term."""
    matching_paths = []
    filenames = read_directory(import_dir)  # identify all files in the directory
    filenames.sort()  # sorted order puts the latest file versions last
    for filename in filenames:
        candidate = import_dir[1:] + '/' + filename
        if search_term in candidate:
            matching_paths.append(candidate)
    return matching_paths
########## End generic file import ##########
def cleanUpLine(line):
    """Strip newline/carriage-return artifacts, literal backslash-c sequences,
    and double quotes from a raw text line."""
    # Replacement order matches the historical behavior of this helper
    for fragment in ('\n', '\\c', '\r', '"'):
        line = line.replace(fragment, '')
    return line
def importSplicingAnnotationDatabase(array_type,species,import_coordinates):
if array_type == 'exon' or array_type == 'gene': filename = "AltDatabase/"+species+"/"+array_type+"/"+species+"_Ensembl_probesets.txt"
elif array_type == 'junction': filename = "AltDatabase/"+species+"/"+array_type+"/"+species+"_Ensembl_"+array_type+"_probesets.txt"
elif array_type == 'RNASeq': filename = "AltDatabase/"+species+"/"+array_type+"/"+species+"_Ensembl_exons.txt"
fn=filepath(filename); x=0; probeset_gene_db={}; probeset_coordinate_db={}
for line in open(fn,'rU').xreadlines():
probeset_data = cleanUpLine(line) #remove endline
if x == 0: x = 1
else:
t=string.split(probeset_data,'\t'); probeset=t[0]; exon_id=t[1]; ens_gene=t[2]; start=int(t[6]); stop=int(t[7])
if array_type != 'RNASeq':
if ':' in probeset: probeset = string.split(probeset,':')[1]
start_stop = [start,stop]; start_stop.sort(); start,stop = start_stop; proceed = 'yes'
#if probeset[-2] != '|': ### These are the 5' and 3' exons for a splice-junction (don't need to analyze)
if test == 'yes':
if ens_gene not in test_genes: proceed = 'no'
if proceed == 'yes':
probeset_gene_db[probeset]=ens_gene,exon_id
if import_coordinates == 'yes': probeset_coordinate_db[probeset] = start,stop
print 'Probeset to Ensembl gene data imported'
if import_coordinates == 'yes': return probeset_gene_db,probeset_coordinate_db
else: return probeset_gene_db
def importAltMouseJunctionDatabase(species,array_type):
    """Build probeset -> (array_gene, critical_exons) associations for the
    AltMouse junction array, keyed by each probeset and the probeset pair.
    """
    ### Import AffyGene to Ensembl associations (e.g., AltMouse array)
    filename = 'AltDatabase/'+species+'/'+array_type+'/'+array_type+'-Ensembl.txt'
    fn = filepath(filename)
    array_ens_db = {}
    header = True
    for line in open(fn,'r').xreadlines():
        data, newline = string.split(line,'\n')
        t = string.split(data,'\t')
        if header:
            header = False
        else:
            array_gene, ens_gene = t
            array_ens_db[array_gene] = ens_gene
    filename = 'AltDatabase/'+species+'/'+array_type+'/'+array_type+'_junction-comparisons.txt'
    fn = filepath(filename)
    probeset_gene_db = {}
    header = True
    for line in open(fn,'r').xreadlines():
        data, newline = string.split(line,'\n')
        t = string.split(data,'\t')
        if header:
            header = False
        else:
            array_gene, probeset1, probeset2, critical_exons = t
            annotation = array_gene, critical_exons
            # Register the pair key plus each probeset individually
            probeset_gene_db[probeset1+'|'+probeset2] = annotation
            probeset_gene_db[probeset1] = annotation
            probeset_gene_db[probeset2] = annotation
    return probeset_gene_db
def importJunctionDatabase(species,array_type):
    """Map junction probesets (individually and as incl|excl pairs) to their
    gene and critical exons, read from the species junction-comparison file.
    """
    if array_type == 'junction':
        filename = 'AltDatabase/' + species + '/'+array_type+'/'+ species + '_junction_comps_updated.txt'
    else:
        filename = 'AltDatabase/' + species + '/'+array_type+'/'+ species + '_junction_comps.txt'
    fn = filepath(filename)
    probeset_gene_db = {}
    header = True
    for line in open(fn,'rU').xreadlines():
        if header:
            header = False
            continue
        data = cleanUpLine(line)
        gene,critical_exons,excl_junction,incl_junction,excl_junction_probeset,incl_junction_probeset,source = string.split(data,'\t')
        annotation = gene, critical_exons
        # Register the inclusion|exclusion pair key plus each probeset individually
        probeset_gene_db[incl_junction_probeset+'|'+excl_junction_probeset] = annotation
        probeset_gene_db[excl_junction_probeset] = annotation
        probeset_gene_db[incl_junction_probeset] = annotation
    return probeset_gene_db
def importEnsExonStructureDataSimple(filename,species,ens_transcript_exon_db,ens_gene_transcript_db,ens_gene_exon_db,filter_transcripts):
fn=filepath(filename); x=0
print fn
"""
if 'Ensembl' not in filename:
original_ensembl_transcripts={} ### Keep track of the original ensembl transcripts
for i in ens_transcript_exon_db: original_ensembl_transcripts[i]=[]"""
for line in open(fn,'rU').xreadlines():
data = cleanUpLine(line)
t = string.split(data,'\t')
if x==0: x=1
else:
gene, chr, strand, exon_start, exon_end, ens_exonid, constitutive_exon, ens_transcriptid = t
exon_start = int(exon_start); exon_end = int(exon_end); proceed = 'yes'
if test == 'yes':
if gene not in test_genes: proceed = 'no'
if (ens_transcriptid in filter_transcripts or len(filter_transcripts)==0) and proceed == 'yes':
### Only import transcripts that are in the set to analyze
try: ens_transcript_exon_db[ens_transcriptid][exon_start,exon_end]=[]
except KeyError: ens_transcript_exon_db[ens_transcriptid] = db = {(exon_start,exon_end):[]}
try: ens_gene_transcript_db[gene][ens_transcriptid]=[]
except KeyError: ens_gene_transcript_db[gene] = db = {ens_transcriptid:[]}
try: ens_gene_exon_db[gene][exon_start,exon_end]=[]
except KeyError: ens_gene_exon_db[gene] = db = {(exon_start,exon_end):[]}
"""
### Some transcripts will have the same coordinates - get rid of these (not implemented yet)
if 'Ensembl' not in filename:
for gene in ens_gene_transcript_db:
exon_structure_db={}
for transcript in ens_gene_transcript_db[gene]:
ls=[]
for coord in ens_transcript_exon_db[transcript]: ls.append(coord)
ls.sort()
try: exon_structure_db[tuple(ls)].append(transcript)
except Exception: exon_structure_db[tuple(ls)] = [transcript]"""
#ens_gene_transcript_db = eliminateRedundant(ens_gene_transcript_db)
#ens_gene_exon_db = eliminateRedundant(ens_gene_exon_db)
#if 'ENSG00000240173' in ens_gene_exon_db: print 'here'
print 'Exon/transcript data imported for %s transcripts' % len(ens_transcript_exon_db), len(ens_gene_exon_db)
return ens_transcript_exon_db,ens_gene_transcript_db,ens_gene_exon_db
def eliminateRedundant(database):
    """Replace each value list in database with a sorted, duplicate-free copy
    (modifies database in place and returns it)."""
    for key in database:
        try:
            deduped = makeUnique(database[key])
            deduped.sort()
        except Exception:
            # Fallback for entries whose elements are unhashable
            deduped = unique.unique(database[key])
        database[key] = deduped
    return database
def makeUnique(item):
    """Return a sorted list of the distinct (hashable) elements of item."""
    distinct = {}
    for element in item:
        distinct[element] = []
    return sorted(distinct)
def getProbesetExonCoordinates(probeset_coordinate_db,probeset_gene_db,ens_gene_exon_db):
    """Map each probeset to the gene exon coordinate pair(s) that fully contain
    the probeset's genomic span. Honors the debug globals 'test'/'test_genes'.
    """
    probeset_exon_coor_db = {}
    for probeset in probeset_gene_db:
        gene = probeset_gene_db[probeset][0]
        start, stop = probeset_coordinate_db[probeset]
        exon_coordinates = ens_gene_exon_db[gene]
        if test == 'yes' and gene not in test_genes:
            continue  # debug mode: restrict to the test gene set
        for (exon_start, exon_stop) in exon_coordinates:
            # The probeset must lie completely within this exon
            if exon_start <= start <= exon_stop and exon_start <= stop <= exon_stop:
                try:
                    probeset_exon_coor_db[probeset].append((exon_start, exon_stop))
                except KeyError:
                    probeset_exon_coor_db[probeset] = [(exon_start, exon_stop)]
    return probeset_exon_coor_db
def compareExonComposition(species,array_type):
    """For each probeset, find a pair of transcripts from the same gene that differ
    by the probeset-aligned exon: one isoform containing it, one lacking it.

    Writes two files: <species>_all-transcript-matches.txt (all aligning and
    non-aligning transcripts per probeset) and <species>_top-transcript-matches.txt
    (the single best isoform pair per probeset). Depends on module globals
    'data_type' and (indirectly) 'test'/'test_genes'.
    """
    probeset_gene_db,probeset_coordinate_db = importSplicingAnnotationDatabase(array_type, species,'yes')
    filename = 'AltDatabase/ensembl/'+species+'/'+species+'_Ensembl_transcript-annotations.txt'
    ens_transcript_exon_db,ens_gene_transcript_db,ens_gene_exon_db = importEnsExonStructureDataSimple(filename,species,{},{},{},{})
    ### Add UCSC transcript data to ens_transcript_exon_db and ens_gene_transcript_db
    try:
        filename = 'AltDatabase/ucsc/'+species+'/'+species+'_UCSC_transcript_structure_COMPLETE-mrna.txt' ### Use the non-filtered database to propperly analyze exon composition
        ens_transcript_exon_db,ens_gene_transcript_db,ens_gene_exon_db = importEnsExonStructureDataSimple(filename,species,ens_transcript_exon_db,ens_gene_transcript_db,ens_gene_exon_db,{})
    except Exception: pass  # UCSC structure file is optional
    ### Derive probeset to exon associations De Novo
    probeset_exon_coor_db = getProbesetExonCoordinates(probeset_coordinate_db,probeset_gene_db,ens_gene_exon_db)
    if (array_type == 'junction' or array_type == 'RNASeq') and data_type == 'exon':
        export_file = 'AltDatabase/'+species+'/'+array_type+'/'+data_type+'/'+species+'_all-transcript-matches.txt'
    else:
        export_file = 'AltDatabase/'+species+'/'+array_type+'/'+species+'_all-transcript-matches.txt'
    data = export.ExportFile(export_file)
    ### Identifying isforms containing and not containing the probeset
    probeset_transcript_db={}; match_pairs_missing=0; valid_transcript_pairs={}; ok_transcript_pairs={}; probesets_not_found=0
    print len(probeset_exon_coor_db)
    print len(ens_transcript_exon_db)
    print len(ens_gene_transcript_db)
    print len(ens_gene_exon_db)
    for probeset in probeset_exon_coor_db:
        geneid = probeset_gene_db[probeset][0]
        transcripts = ens_gene_transcript_db[geneid]
        matching_transcripts=[]; not_matching_transcripts=[]; matching={}; not_matching={}
        # First pass: classify each of the gene's transcripts by whether the
        # probeset-aligned exon coordinates appear in the transcript's exon set.
        for coordinates in probeset_exon_coor_db[probeset]: ### Multiple exons may align
            for transcript in transcripts:
                ### Build a cursory list of matching and non-matching transcripts
                if coordinates in ens_transcript_exon_db[transcript]:
                    matching_transcripts.append(transcript)
                else:
                    not_matching_transcripts.append(transcript)
        ### Filter large non-matching transcript sets to facilate processing
        not_matching_transcripts = mRNASeqAlign.filterNullMatch(not_matching_transcripts,matching_transcripts)
        ### Re-analyze the filtered list for exon content
        transcripts = unique.unique(not_matching_transcripts+matching_transcripts)
        matching_transcripts=[]; not_matching_transcripts=[]
        # Second pass: for each transcript, also record the remainder of its exon
        # structure (all exons NOT aligned to the probeset) with the terminal
        # coordinates zeroed-out, so that structures can be compared directly.
        for coordinates in probeset_exon_coor_db[probeset]: ### Multiple exons may align
            for transcript in transcripts:
                if coordinates in ens_transcript_exon_db[transcript]:
                    matching_transcripts.append(transcript)
                    other_coordinate_list=[]
                    for other_coordinates in ens_transcript_exon_db[transcript]:
                        if coordinates != other_coordinates: ### Add exon coordinates for all exons that DO NOT aligning to the probeset
                            other_coordinate_list.append(other_coordinates)
                    if len(other_coordinate_list)>1:
                        other_coordinate_list.sort()
                        ### Instead of replacing the values in place, we need to do this (otherwise the original object will be modified)
                        other_coordinate_list = [[0,other_coordinate_list[0][1]]]+other_coordinate_list[1:-1]+[[other_coordinate_list[-1][0],0]]
                        #other_coordinate_list[0][0] = 0; other_coordinate_list[-1][-1] = 0
                        other_coordinate_list = convertListsToTuple(other_coordinate_list)
                        matching[tuple(other_coordinate_list)]=transcript
                else:
                    not_matching_transcripts.append(transcript)
                    other_coordinate_list=[]
                    for other_coordinates in ens_transcript_exon_db[transcript]:
                        if coordinates != other_coordinates: ### Add exon coordinates for all exons that DO NOT aligning to the probeset
                            other_coordinate_list.append(other_coordinates)
                    if len(other_coordinate_list)>1:
                        other_coordinate_list.sort()
                        other_coordinate_list = [[0,other_coordinate_list[0][1]]]+other_coordinate_list[1:-1]+[[other_coordinate_list[-1][0],0]]
                        other_coordinate_list = convertListsToTuple(other_coordinate_list)
                        not_matching[tuple(other_coordinate_list)]=transcript
        #print '\n',len(matching_transcripts), len(not_matching_transcripts);kill
        ### Can't have transcripts in matching and not matching
        not_matching_transcripts2=[]; not_matching2={}
        for transcript in not_matching_transcripts:
            if transcript not in matching_transcripts: not_matching_transcripts2.append(transcript)
        for coord in not_matching:
            transcript = not_matching[coord]
            if transcript in not_matching_transcripts2: not_matching2[coord] = not_matching[coord]
        not_matching = not_matching2; not_matching_transcripts = not_matching_transcripts2
        #if probeset == '3431530': print '3431530a', matching_transcripts,not_matching_transcripts
        if len(matching)>0 and len(not_matching)>0:
            perfect_match_found='no'; exon_match_data=[] ### List is only used if more than a single cassette exon difference between transcripts
            for exon_list in matching:
                matching_transcript = matching[exon_list]
                if exon_list in not_matching:
                    # Identical residual structure on both sides: the two isoforms
                    # differ by exactly the probeset exon — a perfect pair.
                    not_matching_transcript = not_matching[exon_list]
                    valid_transcript_pairs[probeset] = matching_transcript, not_matching_transcript
                    perfect_match_found = 'yes'
                    #print probeset,matching_transcript, not_matching_transcript
                    #print ens_transcript_exon_db[matching_transcript],'\n'
                    #print ens_transcript_exon_db[not_matching_transcript]; kill
                else:
                    # Otherwise score every matching/non-matching transcript pair by
                    # counting exons unique to one transcript (exon_count_db[1]) vs
                    # shared by both (exon_count_db[2]); fewest-unique/most-shared wins.
                    unique_exon_count_db={} ### Determine how many exons are common and which are transcript distinct for a pair-wise comp
                    for exon_coor in exon_list:
                        try: unique_exon_count_db[exon_coor] +=1
                        except KeyError: unique_exon_count_db[exon_coor] =1
                    for exon_list in not_matching:
                        not_matching_transcript = not_matching[exon_list]
                        for exon_coor in exon_list:
                            try: unique_exon_count_db[exon_coor] +=1
                            except KeyError: unique_exon_count_db[exon_coor] =1
                        exon_count_db={}
                        for exon_coor in unique_exon_count_db:
                            num_trans_present = unique_exon_count_db[exon_coor]
                            try: exon_count_db[num_trans_present]+=1
                            except KeyError: exon_count_db[num_trans_present]=1
                        try:
                            exon_count_results = [exon_count_db[1],-1*(exon_count_db[2]),matching_transcript,not_matching_transcript]
                            exon_match_data.append(exon_count_results)
                        except KeyError:
                            null =[] ###Occurs if no exons are found in common (2)
                        #if probeset == '3431530': print '3431530b', exon_count_results
            if perfect_match_found == 'no' and len(exon_match_data)>0:
                exon_match_data.sort()  # best (fewest unique, most shared) pair first
                matching_transcript = exon_match_data[0][-2]
                not_matching_transcript = exon_match_data[0][-1]
                ok_transcript_pairs[probeset] = matching_transcript, not_matching_transcript
                #if probeset == '3431530': print '3431530', matching_transcript, not_matching_transcript
        else: match_pairs_missing+=1
        ###Export transcript comparison sets to an external file for different analyses
        matching_transcripts = unique.unique(matching_transcripts)
        not_matching_transcripts = unique.unique(not_matching_transcripts)
        matching_transcripts=string.join(matching_transcripts,'|')
        not_matching_transcripts=string.join(not_matching_transcripts,'|')
        values = string.join([probeset,matching_transcripts,not_matching_transcripts],'\t')+'\n'
        data.write(values)
    print match_pairs_missing,'probesets missing either an alinging or non-alinging transcript'
    print len(valid_transcript_pairs),'probesets with a single exon difference aligning to two isoforms'
    print len(ok_transcript_pairs),'probesets with more than one exon difference aligning to two isoforms'
    data.close()
    if (array_type == 'junction' or array_type == 'RNASeq') and data_type != 'null':
        export_file = 'AltDatabase/'+species+'/'+array_type+'/'+data_type+'/'+species+'_top-transcript-matches.txt'
    else:
        export_file = 'AltDatabase/'+species+'/'+array_type+'/'+species+'_top-transcript-matches.txt'
    export_data = export.ExportFile(export_file)
    for probeset in valid_transcript_pairs:
        matching_transcript,not_matching_transcript = valid_transcript_pairs[probeset]
        values = string.join([probeset,matching_transcript,not_matching_transcript],'\t')+'\n'
        export_data.write(values)
    for probeset in ok_transcript_pairs:
        matching_transcript,not_matching_transcript = ok_transcript_pairs[probeset]
        values = string.join([probeset,matching_transcript,not_matching_transcript],'\t')+'\n'
        export_data.write(values)
        #if probeset == '3431530': print '3431530d', matching_transcript,not_matching_transcript
    export_data.close()
def compareExonCompositionJunctionArray(species,array_type):
    """For junction platforms: pick, per probeset(-pair), the best matching vs
    non-matching transcript pair based on exon-structure similarity.

    Reads previously sequence-aligned transcript matches, scores each candidate
    pair by counting transcript-unique vs shared exons, and writes the top pair
    per probeset to <species>_top-transcript-matches.txt. Depends on the module
    global 'data_type'; sets the global 'transcripts_not_found'.
    """
    ###Import sequence aligned transcript associations for individual or probeset-pairs. Pairs provided for match-match
    probeset_transcript_db,unique_ens_transcripts,unique_transcripts,all_transcripts = importProbesetTranscriptMatches(species,array_type,'yes')
    filename = 'AltDatabase/ensembl/'+species+'/'+species+'_Ensembl_transcript-annotations.txt'
    ens_transcript_exon_db,ens_gene_transcript_db,ens_gene_exon_db = importEnsExonStructureDataSimple(filename,species,{},{},{},all_transcripts)
    ### Add UCSC transcript data to ens_transcript_exon_db and ens_gene_transcript_db
    try:
        filename = 'AltDatabase/ucsc/'+species+'/'+species+'_UCSC_transcript_structure_COMPLETE-mrna.txt' ### Use the non-filtered database to propperly analyze exon composition
        ens_transcript_exon_db,ens_gene_transcript_db,ens_gene_exon_db = importEnsExonStructureDataSimple(filename,species,ens_transcript_exon_db,ens_gene_transcript_db,ens_gene_exon_db,all_transcripts)
    except Exception: pass  # UCSC structure file is optional
    print len(probeset_transcript_db), "probesets with multiple pairs of matching-matching or matching-null transcripts."
    ### Identifying isoforms containing and not containing the probeset
    global transcripts_not_found; marker = 5000; increment = 5000
    match_pairs_missing=0; ok_transcript_pairs={}; transcripts_not_found={}
    for probesets in probeset_transcript_db:
        exon_match_data=[]; start_time = time.time()
        match_transcripts,not_match_transcripts = probeset_transcript_db[probesets]
        # Score every (matching, non-matching) transcript pair for which we have
        # exon structure: exon_count_db[1] counts exons unique to one transcript,
        # exon_count_db[2] counts exons shared by both.
        for matching_transcript in match_transcripts:
            if matching_transcript in ens_transcript_exon_db:
                transcript_match_coord = ens_transcript_exon_db[matching_transcript]
                for not_matching_transcript in not_match_transcripts:
                    if not_matching_transcript in ens_transcript_exon_db:
                        transcript_null_coord = ens_transcript_exon_db[not_matching_transcript]
                        unique_exon_count_db={} ### Determine how many exons are common and which are transcript distinct for a pair-wise comp
                        for exon_coor in transcript_match_coord:
                            try: unique_exon_count_db[tuple(exon_coor)] +=1
                            except KeyError: unique_exon_count_db[tuple(exon_coor)] =1
                        for exon_coor in transcript_null_coord:
                            try: unique_exon_count_db[tuple(exon_coor)] +=1
                            except KeyError: unique_exon_count_db[tuple(exon_coor)] =1
                        exon_count_db={}
                        for exon_coor in unique_exon_count_db:
                            num_trans_present = unique_exon_count_db[exon_coor]
                            try: exon_count_db[num_trans_present]+=1
                            except KeyError: exon_count_db[num_trans_present]=1
                        try:
                            exon_count_results = [exon_count_db[1],-1*(exon_count_db[2]),matching_transcript,not_matching_transcript]
                            exon_match_data.append(exon_count_results)
                        except KeyError:
                            null =[] ###Occurs if no exons are found in common (2)
                        #if probeset == '3431530': print '3431530b', exon_count_results
            else: transcripts_not_found[matching_transcript]=[]
        if len(exon_match_data)>0:
            exon_match_data.sort()  # best pair (fewest unique, most shared exons) first
            matching_transcript = exon_match_data[0][-2]
            not_matching_transcript = exon_match_data[0][-1]
            end_time = time.time()
            #if end_time - start_time>2: print len(ok_transcript_pairs),probesets,match_transcripts,not_match_transcripts;kill
            ok_transcript_pairs[probesets] = matching_transcript, not_matching_transcript
            #if probeset == '3431530': print '3431530', matching_transcript, not_matching_transcript
            if len(ok_transcript_pairs) == marker: marker+= increment; print '*',  # progress indicator
        else: match_pairs_missing+=1
    print match_pairs_missing,'probesets missing either an alinging or non-alinging transcript'
    print len(transcripts_not_found),'transcripts not found'
    print len(ok_transcript_pairs),'probesets with more than one exon difference aligning to two isoforms'
    if (array_type == 'junction' or array_type == 'RNASeq') and data_type != 'null':
        export_file = 'AltDatabase/'+species+'/'+array_type+'/'+data_type+'/'+species+'_top-transcript-matches.txt'
    else: export_file = 'AltDatabase/'+species+'/'+array_type+'/'+species+'_top-transcript-matches.txt'
    export_data = export.ExportFile(export_file)
    for probeset in ok_transcript_pairs:
        matching_transcript,not_matching_transcript = ok_transcript_pairs[probeset]
        values = string.join([probeset,matching_transcript,not_matching_transcript],'\t')+'\n'
        export_data.write(values)
        #if probeset == '3431530': print '3431530d', matching_transcript,not_matching_transcript
    export_data.close()
def compareProteinComposition(species,array_type,translate,compare_all_features):
    """Top-level driver: translate transcript pairs to proteins, then compare
    protein feature/domain composition for each probeset's isoform pair.

    Depends on the module global 'data_type' and sets the global
    'protein_ft_db' consumed by the downstream feature comparison.
    """
    probeset_transcript_db,unique_ens_transcripts,unique_transcripts,all_transcripts = importProbesetTranscriptMatches(species,array_type,compare_all_features)
    all_transcripts=[]  # release memory; not needed past this point
    if translate == 'yes1': ### Used if we want to re-derive all transcript-protein sequences - set to yes1 when we want to disable this option
        transcript_protein_seq_db = translateRNAs(unique_transcripts,unique_ens_transcripts,'fetch')
    else: transcript_protein_seq_db = translateRNAs(unique_transcripts,unique_ens_transcripts,'fetch_new')
    probeset_protein_db,protein_seq_db = convertTranscriptToProteinAssociations(probeset_transcript_db,transcript_protein_seq_db,compare_all_features)
    transcript_protein_seq_db=[]  # release memory
    global protein_ft_db
    # Pick the probeset->gene source appropriate for the platform
    if array_type == 'exon' or array_type == 'gene' or data_type == 'exon':
        probeset_gene_db = importSplicingAnnotationDatabase(array_type,species,'no')
    elif (array_type == 'junction' or array_type == 'RNASeq'):
        probeset_gene_db = importJunctionDatabase(species,array_type)
    elif array_type == 'AltMouse':
        probeset_gene_db = importAltMouseJunctionDatabase(species,array_type)
    genes_being_analyzed={} ### Need to tell 'grab_exon_level_feature_calls' what genes to analyze
    for probeset in probeset_gene_db:
        if probeset in probeset_protein_db: ### Filter for genes that have probesets with matching and non-matching proteins
            gene = probeset_gene_db[probeset][0]; genes_being_analyzed[gene]=[gene]
    protein_ft_db,domain_gene_counts = FeatureAlignment.grab_exon_level_feature_calls(species,array_type,genes_being_analyzed)
    if compare_all_features == 'yes': ### Used when comparing all possible PROTEIN pairs to find minimal domain changes
        compareProteinFeaturesForPairwiseComps(probeset_protein_db,protein_seq_db,probeset_gene_db,species,array_type)
    ExonAnalyze_module.identifyAltIsoformsProteinComp(probeset_gene_db,species,array_type,protein_ft_db,compare_all_features,data_type)
def remoteTranslateRNAs(Species,unique_transcripts,unique_ens_transcripts,analysis_type):
    """Entry point for external callers: set the module-level 'species' global
    (required by translateRNAs and its helpers) before delegating."""
    global species
    species = Species
    translateRNAs(unique_transcripts,unique_ens_transcripts,analysis_type)
def translateRNAs(unique_transcripts,unique_ens_transcripts,analysis_type):
if analysis_type == 'local':
### Get protein ACs for UCSC transcripts if provided by UCSC (NOT CURRENTLY USED BY THE PROGRAM!!!!)
mRNA_protein_db,missing_protein_ACs_UCSC = importUCSCSequenceAssociations(species,unique_transcripts)
### For missing_protein_ACs, check to see if they are in UniProt. If so, export the protein sequence
try: missing_protein_ACs_UniProt = importUniProtSeqeunces(species,mRNA_protein_db,missing_protein_ACs_UCSC)
except Exception: null=[]
### For transcripts with protein ACs, see if we can find sequence from NCBI
#missing_protein_ACs_NCBI = importNCBIProteinSeq(mRNA_protein_db)
### Combine missing_protein_ACs_NCBI and missing_protein_ACs_UniProt
#for mRNA_AC in missing_protein_ACs_NCBI: missing_protein_ACs_UniProt[mRNA_AC] = missing_protein_ACs_NCBI[mRNA_AC]
### Import mRNA sequences for mRNA ACs with no associated protein sequence and submit for in silico translation
missing_protein_ACs_inSilico = importUCSCSequences(missing_protein_ACs_NCBI)
else:
try: missing_protein_ACs_UniProt = importUniProtSeqeunces(species,{},{})
except Exception, e:
print e
null=[]
### Export Ensembl protein sequences for matching isoforms and identify transcripts without protein seqeunce
ensembl_protein_seq_db, missing_protein_ACs_Ensembl = importEnsemblProteinSeqData(species,unique_ens_transcripts)
### Import Ensembl mRNA sequences for mRNA ACs with no associated protein sequence and submit for in silico translation
missing_Ens_protein_ACs_inSilico = importEnsemblTranscriptSequence(missing_protein_ACs_Ensembl)
if analysis_type == 'fetch':
ac_list = []
for ac in unique_transcripts: ac_list.append(ac)
try: ### Get rid of the second file which will not be immediately over-written and reading before regenerating
output_file = 'AltDatabase/'+species+'/SequenceData/output/sequences/Transcript-Protein_sequences_'+str(2)+'.txt'
fn=filepath(output_file); os.remove(fn)
except Exception: null=[]
try: fetchSeq(ac_list,'nucleotide',1)
except Exception:
print traceback.format_exc(),'\n'
print 'WARNING!!!!! NCBI webservice connectivity failed due to the above error!!!!!\n'
###Import protein sequences
seq_files, transcript_protein_db = importProteinSequences(species,just_get_ids=True)
print len(unique_ens_transcripts)+len(unique_transcripts), 'examined transcripts.'
print len(transcript_protein_db),'transcripts with associated protein sequence.'
missing_protein_data=[]
#transcript_protein_db={} ### Use to override existing mRNA-protein annotation files
for ac in unique_transcripts:
if ac not in transcript_protein_db: missing_protein_data.append(ac)
###Search NCBI using the esearch not efetch function to get valid GIs (some ACs make efetch crash)
try: missing_gi_list = searchEntrez(missing_protein_data,'nucleotide')
except Exception,e:
print 'Exception encountered:',e
try: missing_gi_list = searchEntrez(missing_protein_data,'nucleotide')
except Exception:
print 'Exception encountered:',e
try: missing_gi_list = searchEntrez(missing_protein_data,'nucleotide')
except Exception:
print traceback.format_exc(),'\n'
print 'WARNING!!!!! NCBI webservice connectivity failed due to the above error!!!!!\n'
try: fetchSeq(missing_gi_list,'nucleotide',len(seq_files)-2)
except Exception:
print traceback.format_exc(),'\n'
print 'WARNING!!!!! NCBI webservice connectivity failed due to the above error!!!!!\n'
seq_files, transcript_protein_seq_db = importProteinSequences(species)
print len(unique_ens_transcripts)+len(unique_transcripts), 'examined transcripts.'
print len(transcript_protein_seq_db),'transcripts with associated protein sequence.'
return transcript_protein_seq_db
def convertTranscriptToProteinAssociations(probeset_transcript_db,transcript_protein_seq_db,compare_all_features):
    """Translate probeset-to-transcript alignments into probeset-to-protein
    alignments, collecting the sequence of every protein that took part in a
    comparison.

    probeset_transcript_db -- probeset -> (aligning transcripts, non-aligning transcripts)
    transcript_protein_seq_db -- transcript -> (protein_id, protein_seq)
    compare_all_features -- 'no' once top matches are final (triggers file export)

    Returns (probeset_protein_db, protein_seq_db); a probeset is kept only when
    proteins exist on BOTH sides of its comparison.
    """
    probeset_protein_db = {}
    compared_protein_ids = {}
    for probeset, (aligned, nonaligned) in probeset_transcript_db.items():
        aligned_proteins = []
        nonaligned_proteins = []
        for transcript in aligned:
            if transcript in transcript_protein_seq_db:
                prot = transcript_protein_seq_db[transcript][0]
                aligned_proteins.append(prot)
                compared_protein_ids[prot] = []
        for transcript in nonaligned:
            if transcript in transcript_protein_seq_db:
                prot = transcript_protein_seq_db[transcript][0]
                nonaligned_proteins.append(prot)
                compared_protein_ids[prot] = []
        ### Only informative when both sides of the comparison have a protein
        if len(aligned_proteins) > 0 and len(nonaligned_proteins) > 0:
            probeset_protein_db[probeset] = [aligned_proteins, nonaligned_proteins]
    ### Retain sequences only for proteins that entered a comparison
    protein_seq_db = {}
    for transcript in transcript_protein_seq_db:
        prot, prot_seq = transcript_protein_seq_db[transcript]
        if prot in compared_protein_ids:
            protein_seq_db[prot] = [prot_seq]
    transcript_protein_seq_db = [] ### release the large input mapping
    if compare_all_features == 'no': ### If yes, all pairwise comps need to still be examined
        title_row = 'Probeset\tAligned protein_id\tNon-aligned protein_id'
        if (array_type == 'junction' or array_type == 'RNASeq') and data_type != 'null':
            root_dir = 'AltDatabase/'+species+'/'+array_type+'/'+data_type+'/'
        else:
            root_dir = 'AltDatabase/'+species+'/'+array_type+'/'
        exportSimple(probeset_protein_db, root_dir+'probeset-protein-dbase_exoncomp.txt', title_row)
        exportSimple(protein_seq_db, root_dir+'SEQUENCE-protein-dbase_exoncomp.txt', '')
    return probeset_protein_db,protein_seq_db
def importProteinSequences(species,just_get_ids=False):
    """Load every previously exported 'Transcript-' sequence file for this
    species and build an mRNA accession -> protein mapping.

    Returns (seq_files, mapping) where mapping is mRNA_AC -> protein_AC when
    just_get_ids is True, otherwise mRNA_AC -> (protein_AC, protein_seq).
    """
    mrna_to_protein = {}
    import_dir = '/AltDatabase/'+species+'/SequenceData/output/sequences' ### Multi-species file
    grabber = GrabFiles()
    grabber.setdirectory(import_dir)
    seq_files = grabber.searchdirectory('Transcript-')
    for seq_file in seq_files:
        fn = filepath(seq_file)
        for line in open(fn,'rU').xreadlines():
            ### Each row: mRNA accession, protein accession, protein sequence
            mRNA_AC, protein_AC, protein_seq = cleanUpLine(line).split('\t')
            if just_get_ids == True:
                mrna_to_protein[mRNA_AC] = protein_AC
            else:
                mrna_to_protein[mRNA_AC] = protein_AC, protein_seq
    return seq_files, mrna_to_protein
def importCDScoordinates(species):
    """ Read in the CDS locations for Ensembl proteins, NCBI proteins and in silico
    derived proteins and then export to a single file
    (AltDatabase/ensembl/<species>/AllTranscriptCDSPositions.txt).

    Rows are tab-delimited 'mRNA_AC<tab>start<tab>stop' after stripping GenBank
    location decorations (<, >, complement(), join()). Returns a dictionary of
    mRNA_AC -> (int(start), int(stop)); rows that cannot be parsed are only
    counted as errors, never raised. """
    cds_location_db={}
    errors = {}
    import_dir = '/AltDatabase/'+species+'/SequenceData/output/details'
    import_dir2 = '/AltDatabase/ensembl/'+species
    g = GrabFiles(); g.setdirectory(import_dir)
    g2 = GrabFiles(); g2.setdirectory(import_dir2)
    seq_files = g.searchdirectory('Transcript-')
    seq_files += g2.searchdirectory('EnsemblTranscriptCDSPositions')
    output_file = 'AltDatabase/ensembl/'+species +'/AllTranscriptCDSPositions.txt'
    dataw = export.ExportFile(output_file)
    for seq_file in seq_files:
        fn=filepath(seq_file)
        for line in open(fn,'rU').xreadlines():
            line_data = cleanUpLine(line) #remove endline
            line_data = string.replace(line_data,'>','') ### occurs for some weird entries
            line_data = string.replace(line_data,'<','') ### occurs for some weird entries
            if 'join' in line_data:
                ### Occures when the cds entry looks like this: AL137661 - join(203..1267,1267..2187) or 'AL137661\tjoin(203\t1267,1267\t2187)'
                ### NOTE(review): these joined coordinates are parsed but never stored
                ### in cds_location_db nor written to the export - confirm intent
                t = string.split(line_data,'\t')
                mRNA_AC = t[0]; start = t[1][5:]; stop = t[-1][:-1]
            else:
                line_data = string.replace(line_data,'complement','') ### occurs for some weird entries
                line_data = string.replace(line_data,')','') ### occurs for some weird entries
                line_data = string.replace(line_data,'(','') ### occurs for some weird entries
                try:
                    mRNA_AC,start,stop = string.split(line_data,'\t')
                    try:
                        ### Store as integers but write the original strings back out
                        cds_location_db[mRNA_AC] = int(start),int(stop)
                        dataw.write(string.join([mRNA_AC,start,stop],'\t')+'\n')
                    except Exception:
                        ### non-integer coordinates
                        errors[line_data]=[]
                        #print line_data;sys.exit()
                except Exception:
                    ### row does not contain exactly three tab-delimited fields
                    errors[line_data]=[]
                    #print line_data;sys.exit()
    print len(errors),'errors...out of',len(cds_location_db)
    dataw.close()
    return cds_location_db
def importProbesetTranscriptMatches(species,array_type,compare_all_features):
if compare_all_features == 'yes': ### Used when comparing all possible PROTEIN pairs
if (array_type == 'junction' or array_type == 'RNASeq') and data_type != 'null':
filename = 'AltDatabase/'+species+'/'+array_type+'/'+data_type+'/'+species+'_all-transcript-matches.txt'
else: filename = 'AltDatabase/'+species+'/'+array_type+'/'+species+'_all-transcript-matches.txt'
else: ### Used after comparing all possible TRANSCRIPT STRUCTURE pairs
if (array_type == 'junction' or array_type == 'RNASeq') and data_type != 'null':
filename = 'AltDatabase/'+species+'/'+array_type+'/'+data_type+'/'+species+'_top-transcript-matches.txt'
else: filename = 'AltDatabase/'+species+'/'+array_type+'/'+species+'_top-transcript-matches.txt'
fn=filepath(filename); probeset_transcript_db={}; unique_transcripts={}; unique_ens_transcripts={}; all_transcripts={}
for line in open(fn,'rU').xreadlines():
probeset_data = cleanUpLine(line) #remove endline
try: probeset,match_transcripts,not_match_transcripts = string.split(probeset_data,'\t')
except IndexError: print t;kill
#if probeset == '2991483':
match_transcripts = string.split(match_transcripts,'|')
not_match_transcripts = string.split(not_match_transcripts,'|')
### Multiple transcript comparison sets can exist, depending on how many unique exon coordiantes the probeset aligns to
probeset_transcript_db[probeset] = match_transcripts,not_match_transcripts
consider_all_ensembl = 'no'
if array_type == 'RNASeq': ### Needed for Ensembl gene IDs that don't have 'ENS' in them.
if 'ENS' not in probeset: consider_all_ensembl = 'yes'
###Store unique transcripts
for transcript in match_transcripts:
if 'ENS' in transcript or consider_all_ensembl == 'yes': unique_ens_transcripts[transcript]=[]
else: unique_transcripts[transcript]=[]
if consider_all_ensembl == 'yes': unique_transcripts[transcript]=[] ### This redundant, but is the most unbiased way to examine all non-Ensembls as well
all_transcripts[transcript]=[]
for transcript in not_match_transcripts:
if 'ENS' in transcript or consider_all_ensembl == 'yes': unique_ens_transcripts[transcript]=[]
else: unique_transcripts[transcript]=[]
if consider_all_ensembl == 'yes': unique_transcripts[transcript]=[] ### This redundant, but is the most unbiased way to examine all non-Ensembls as well
all_transcripts[transcript]=[]
print 'len(unique_ens_transcripts)',len(unique_ens_transcripts)
print 'len(unique_transcripts)',len(unique_transcripts)
return probeset_transcript_db,unique_ens_transcripts,unique_transcripts,all_transcripts
def searchEntrez(accession_list,bio_type):
start_time = time.time()
Entrez.email = "nsalomonis@gmail.com" # Always tell NCBI who you are
index=0; gi_list=[]
while index<len(accession_list)+20:
try: new_accession_list = accession_list[index:index+20]
except IndexError: new_accession_list = accession_list[index:]
if len(new_accession_list)<1: break
search_handle = Entrez.esearch(db=bio_type,term=string.join(new_accession_list,','))
search_results = Entrez.read(search_handle)
gi_list += search_results["IdList"]
index+=20
end_time = time.time(); time_diff = int(end_time-start_time)
print "finished in %s seconds" % time_diff
return gi_list
def fetchSeq(accession_list,bio_type,version):
output_file = 'AltDatabase/'+species+'/SequenceData/output/sequences/Transcript-Protein_sequences_'+str(version)+'.txt'
datar = export.ExportFile(output_file)
output_file = 'AltDatabase/'+species+'/SequenceData/output/details/Transcript-Protein_sequences_'+str(version)+'.txt'
datad = export.ExportFile(output_file)
print len(accession_list), "mRNA Accession numbers submitted to eUTILs."
start_time = time.time()
Entrez.email = "nsalomonis@gmail.com" # Always tell NCBI who you are
index=0
while index<len(accession_list)+200:
try: new_accession_list = accession_list[index:index+200]
except IndexError: new_accession_list = accession_list[index:]
if len(new_accession_list)<1: break
### epost doesn't concatenate everything into a URL where esearch
try:
search_handle = Entrez.efetch(db=bio_type,id=string.join(new_accession_list,','),retmode="xml") #,usehistory="y"
search_results = Entrez.read(search_handle)
except Exception:
try: ### Make sure it's not due to an internet connection issue
search_handle = Entrez.efetch(db=bio_type,id=string.join(new_accession_list,','),retmode="xml") #,usehistory="y"
search_results = Entrez.read(search_handle)
except Exception: index+=200; continue
for a in search_results: ### list of dictionaries
accession = a['GBSeq_primary-accession']
if bio_type == 'nucleotide':
mRNA_seq=''; proceed = 'no'; get_location = 'no'
### Get translated protein sequence if available
try:
for entry in a["GBSeq_feature-table"]:
for key in entry: ### Key's in dictionary
if key == 'GBFeature_quals': ### composed of a list of dictionaries
for i in entry[key]:
if i['GBQualifier_name'] == 'protein_id': protein_id = i['GBQualifier_value']
if i['GBQualifier_name'] == 'translation': protein_sequence = i['GBQualifier_value']; proceed = 'yes'
if key == 'GBFeature_key':
if entry[key] == 'CDS':
get_location = 'yes'
else: get_location = 'no' ### Get CDS location (occurs only following the GBFeature_key CDS, but not after)
if key == 'GBFeature_location' and get_location == 'yes':
cds_location = entry[key]
except KeyError: null = []
alt_seq='no'
if proceed == 'yes':
if protein_sequence[0] != 'M': proceed = 'no'; alt_seq = 'yes'
else:
values = [accession,protein_id,protein_sequence]; values = string.join(values,'\t')+'\n'; datar.write(values)
datad.write(accession+'\t'+string.replace(cds_location,'..','\t')+'\n')
else: ### Translate mRNA seq to protein
mRNA_seq = a['GBSeq_sequence']
mRNA_db={}; mRNA_db[accession] = '',mRNA_seq
translation_db = BuildInSilicoTranslations(mRNA_db)
for mRNA_AC in translation_db: ### Export in silico protein predictions
protein_id, protein_sequence,cds_location = translation_db[mRNA_AC]
values = [mRNA_AC,protein_id,protein_sequence]; values = string.join(values,'\t')+'\n'; datar.write(values)
datad.write(mRNA_AC+'\t'+string.replace(cds_location,'..','\t')+'\n')
if len(translation_db)==0 and alt_seq == 'yes': ### If no protein sequence starting with an "M" found, write the listed seq
values = [accession,protein_id,protein_sequence]; values = string.join(values,'\t')+'\n'; datar.write(values)
datad.write(accession+'\t'+string.replace(cds_location,'..','\t')+'\n')
else: protein_sequence = a['GBSeq_sequence']
index+=200
"""print 'accession',accession
print 'protein_id',protein_id
print 'protein_sequence',protein_sequence
print 'mRNA_seq',mRNA_seq,'\n' """
end_time = time.time(); time_diff = int(end_time-start_time)
print "finished in %s seconds" % time_diff
datar.close()
datad.close()
def importEnsemblTranscriptSequence(missing_protein_ACs):
    """Scan the newest Ensembl cDNA fasta (*.cdna.all.fa) and, for every
    transcript listed in missing_protein_ACs, attempt an in silico translation
    of its cDNA; predictions are exported to the
    Transcript-EnsInSilicoProt_sequences.txt files (sequences/ and details/).

    missing_protein_ACs -- Ensembl transcript IDs that lack a known protein
    Returns None; transcripts that could not be translated (e.g., lncRNAs) are
    only reported to stdout.
    """
    import_dir = '/AltDatabase/'+species+'/SequenceData' ### Multi-species file
    g = GrabFiles(); g.setdirectory(import_dir)
    seq_files = g.searchdirectory('.cdna.all.fa'); seq_files.sort(); filename = seq_files[-1] ### the newest release sorts last
    #filename = 'AltDatabase/'+species+'/SequenceData/'+species+'_ensembl_cDNA.fasta.txt'
    start_time = time.time()
    output_file = 'AltDatabase/'+species+'/SequenceData/output/sequences/Transcript-EnsInSilicoProt_sequences.txt'
    datar = export.ExportFile(output_file)
    output_file = 'AltDatabase/'+species+'/SequenceData/output/details/Transcript-EnsInSilicoProt_sequences.txt'
    datad = export.ExportFile(output_file)
    print "Begining generic fasta import of",filename
    fn=filepath(filename); translated_mRNAs={}; sequence = ''
    for line in open(fn,'r').xreadlines():
        try: data, newline= string.split(line,'\n') ### strip the trailing newline
        except ValueError: continue
        try:
            if data[0] == '>': ### fasta header: flush the previous record first
                if len(sequence) > 0:
                    if transid in missing_protein_ACs:
                        ### Perform in silico translation
                        ### NOTE(review): sequence[1:] drops the first accumulated base;
                        ### harmless for ORF discovery (the translator scans for ATG)
                        ### but confirm the offset is intentional
                        mRNA_db = {}; mRNA_db[transid] = '',sequence[1:]
                        translation_db = BuildInSilicoTranslations(mRNA_db)
                        for mRNA_AC in translation_db: ### Export in silico protein predictions
                            protein_id, protein_seq, cds_location = translation_db[mRNA_AC]
                            values = [mRNA_AC,protein_id,protein_seq]; values = string.join(values,'\t')+'\n'; datar.write(values)
                            datad.write(mRNA_AC+'\t'+string.replace(cds_location,'..','\t')+'\n')
                            translated_mRNAs[mRNA_AC]=[]
                ### Parse new line
                #>ENST00000400685 cdna:known supercontig::NT_113903:9607:12778:1 gene:ENSG00000215618
                t= string.split(data[1:],':'); sequence=''
                transid_data = string.split(t[0],' '); transid = transid_data[0]
                if '.' in transid:
                    transid = string.split(transid,'.')[0] ### strip the transcript version suffix
                #try: ensembl_id,chr,strand,transid,prot_id = t
                #except ValueError: ensembl_id,chr,strand,transid = t
        except IndexError: continue ### empty line
        try:
            if data[0] != '>': sequence = sequence + data ### accumulate the sequence body
        except IndexError: continue
    #### Add the last entry (same flush logic as above)
    if len(sequence) > 0:
        if transid in missing_protein_ACs:
            ### Perform in silico translation
            mRNA_db = {}; mRNA_db[transid] = '',sequence[1:]
            translation_db = BuildInSilicoTranslations(mRNA_db)
            for mRNA_AC in translation_db: ### Export in silico protein predictions
                protein_id, protein_seq, cds_location = translation_db[mRNA_AC]
                values = [mRNA_AC,protein_id,protein_seq]; values = string.join(values,'\t')+'\n'; datar.write(values)
                datad.write(mRNA_AC+'\t'+string.replace(cds_location,'..','\t')+'\n')
                translated_mRNAs[mRNA_AC]=[]
    datar.close()
    datad.close()
    end_time = time.time(); time_diff = int(end_time-start_time)
    print "Ensembl transcript sequences analyzed in %d seconds" % time_diff
    missing_protein_ACs_inSilico=[]
    for mRNA_AC in missing_protein_ACs:
        if mRNA_AC not in translated_mRNAs:
            missing_protein_ACs_inSilico.append(mRNA_AC)
    print len(missing_protein_ACs_inSilico), 'Ensembl mRNAs without mRNA sequence NOT in silico translated (e.g., lncRNAs)', missing_protein_ACs_inSilico[:10]
def importEnsemblProteinSeqData(species,unique_ens_transcripts):
from build_scripts import FeatureAlignment
protein_relationship_file,protein_feature_file,protein_seq_fasta,null = FeatureAlignment.getEnsemblRelationshipDirs(species)
ens_transcript_protein_db = FeatureAlignment.importEnsemblRelationships(protein_relationship_file,'transcript')
unique_ens_proteins = {}
for transcript in ens_transcript_protein_db:
if transcript in unique_ens_transcripts:
protein_id = ens_transcript_protein_db[transcript]
if len(protein_id)>1:
unique_ens_proteins[protein_id] = transcript
ensembl_protein_seq_db = importEnsemblProtSeq(protein_seq_fasta,unique_ens_proteins)
transcript_with_prot_seq = {}
for protein_id in ensembl_protein_seq_db:
if protein_id in unique_ens_proteins:
transcript = unique_ens_proteins[protein_id]
transcript_with_prot_seq[transcript]=[]
missing_ens_proteins={}
for transcript in unique_ens_transcripts:
if transcript not in transcript_with_prot_seq: missing_ens_proteins[transcript]=[]
print len(ensembl_protein_seq_db),'Ensembl transcripts linked to protein sequence and',len(missing_ens_proteins), 'transcripts missing protein sequence.'
return ensembl_protein_seq_db, missing_ens_proteins
def importEnsemblProtSeq(filename,unique_ens_proteins):
    """Parse the Ensembl protein fasta and export the sequences of the requested
    proteins to Transcript-EnsProt_sequences.txt (mRNA_AC, protein_AC, sequence).

    filename -- path to the Ensembl peptide fasta file
    unique_ens_proteins -- protein_id -> transcript_id for the proteins wanted
    Returns protein_id -> [] for every protein whose sequence was exported.
    """
    export_file = 'AltDatabase/'+species+'/SequenceData/output/sequences/Transcript-EnsProt_sequences.txt'
    export_data = export.ExportFile(export_file)
    fn=filepath(filename); ensembl_protein_seq_db={}; sequence = ''; y=0
    for line in open(fn,'r').xreadlines():
        try: data, newline= string.split(line,'\n'); y+=1 ### y counts lines for the diagnostic below
        except ValueError: continue
        try:
            if data[0] == '>': ### fasta header: flush the previous record first
                if len(sequence) > 0:
                    ### Sanity check that a header was parsed earlier; 'kill' is this
                    ### module's intentional NameError crash idiom.
                    ### NOTE(review): 't' would also be undefined on a first-record
                    ### failure, so the print itself can raise
                    try: ensembl_prot = ensembl_prot
                    except Exception: print data,y,t;kill
                    if ensembl_prot in unique_ens_proteins:
                        mRNA_AC = unique_ens_proteins[ensembl_prot]
                        values = string.join([mRNA_AC,ensembl_prot,sequence],'\t')+'\n'
                        export_data.write(values); ensembl_protein_seq_db[ensembl_prot] = []
                ### Parse new line
                t= string.split(data[1:],' '); sequence=''
                ensembl_prot = t[0]
                if '.' in ensembl_prot:
                    ensembl_prot = string.split(ensembl_prot,'.')[0] ### Added to Ensembl after version 77
        except IndexError: continue ### empty line
        try:
            if data[0] != '>': sequence = sequence + data ### accumulate the sequence body
        except IndexError: continue
    ### Flush the final record
    ### NOTE(review): raises NameError if the fasta contained no header at all
    if ensembl_prot in unique_ens_proteins:
        mRNA_AC = unique_ens_proteins[ensembl_prot]
        values = string.join([mRNA_AC,ensembl_prot,sequence],'\t')+'\n'
        export_data.write(values); ensembl_protein_seq_db[ensembl_prot] = []
    export_data.close()
    return ensembl_protein_seq_db
def importUniProtSeqeunces(species,transcripts_with_uniprots,transcripts_to_analyze):
    """Extract UniProt protein sequences linked to the mRNAs under analysis.

    Reads AltDatabase/uniprot/<species>/uniprot_sequence.txt and exports
    mRNA_AC/protein_AC/sequence rows to Transcript-UniProt_sequences.txt. Also
    (re)populates the module globals n_terminal_seq / c_terminal_seq with the
    first and last 5 residues of every UniProt sequence seen.

    transcripts_with_uniprots -- uniprot_AC -> list of associated mRNA ACs
    transcripts_to_analyze -- mRNA ACs needing a protein sequence
    Returns mRNA_AC -> [] for every submitted mRNA that still has no UniProt match.
    """
    global n_terminal_seq; global c_terminal_seq
    n_terminal_seq={}; c_terminal_seq={}
    export_file = 'AltDatabase/'+species+'/SequenceData/output/sequences/Transcript-UniProt_sequences.txt'
    export_data = export.ExportFile(export_file)
    #filename = 'AltDatabase/'+species+'/SequenceData/'+'uniprot_trembl_sequence.txt'
    filename = 'AltDatabase/uniprot/'+species+'/uniprot_sequence.txt'
    fn=filepath(filename); transcript_to_uniprot={}
    unigene_ensembl_up = {} # NOTE(review): never populated or read below - apparently vestigial
    for line in open(fn,'r').readlines():
        data, newline= string.split(line,'\n')
        t = string.split(data,'\t')
        ### Column layout: 0=UniProt id, 1=accessions, 2=sequence, 4=Ensembl ids,
        ### 6=annotation type, 7=UniGene ids, 9=EMBL mRNA accessions
        id=t[0];ac=t[1];ensembls=t[4];seq=t[2];type=t[6];unigenes=t[7];embls=t[9]
        ac=string.split(ac,','); embls=string.split(embls,',') #; ensembls=string.split(ensembls,','); unigenes=string.split(unigenes,',')
        if type != 'swissprot1': ### unclear why this condition was excluding swissprot so added 1 - version 2.1.1 (condition is now effectively always true)
            ### Note: These associations are based on of UCSC, which in some cases don't look correct: see AY429540 and Q75N08 from the KgXref file.
            ### Possibly exclude
            ac = ac[0] ### keep only the primary accession
            if ac in transcripts_with_uniprots:
                mRNA_ACs = transcripts_with_uniprots[ac]
                for mRNA_AC in mRNA_ACs:
                    transcript_to_uniprot[mRNA_AC] = []
                    values = string.join([mRNA_AC,ac,seq],'\t')+'\n'; export_data.write(values)
        for embl in embls:
            proceed = 'no'
            if (len(embl)>0) and type == 'fragment': ###NOTE: Fragment annotated seem to be the only protein IDs that contain direct references to a specific mRNA rather than promiscous (as opposed to Swissprot and Variant)
                if embl in transcripts_to_analyze: proceed = 'yes'
                elif embl in transcripts_with_uniprots: proceed = 'yes'
            if proceed == 'yes':
                if embl not in transcript_to_uniprot: ### export each EMBL mRNA only once
                    transcript_to_uniprot[embl] = []
                    values = string.join([embl,id,seq],'\t')+'\n'; export_data.write(values)
        ### Record known protein termini for every UniProt entry (used elsewhere
        ### to validate in silico translations)
        n_terminal_seq[seq[:5]] = []
        c_terminal_seq[seq[-5:]] = []
    export_data.close()
    ### Collect mRNAs that obtained no UniProt sequence from either pathway
    missing_protein_ACs={}
    for mRNA_AC in transcripts_to_analyze:
        if mRNA_AC not in transcript_to_uniprot: missing_protein_ACs[mRNA_AC]=[]
    for protein_AC in transcripts_with_uniprots:
        mRNA_ACs = transcripts_with_uniprots[protein_AC]
        for mRNA_AC in mRNA_ACs:
            if mRNA_AC not in transcript_to_uniprot: missing_protein_ACs[mRNA_AC]=[]
    if len(transcripts_to_analyze)>0: ### Have to submitt ACs to report them
        print len(missing_protein_ACs), 'missing protein ACs for associated UniProt mRNAs and', len(transcript_to_uniprot), 'found.'
        print len(n_terminal_seq),len(c_terminal_seq),'N and C terminal, respectively...'
    return missing_protein_ACs
def BuildInSilicoTranslations(mRNA_db):
"""
BI517798
Seq('ATGTGGCCAGGAGACGCCACTGGAGAACATGCTGTTCGCCTCCTTCTACCTTCTGGATTT ...', IUPACUnambiguousDNA())
213 MWPGDATGEHAVRLLLPSGFYPGFSWQYPGSVAFHPRPQVRDPGQRVPDASGRGRLVVRAGPAHPPGLPLLWEPLAIWGNRMPSHRLPLLPQHVRQHLLPHLHQRRPFPGHCAPGQVPQAPQAPLRTPGLCLPVGGGGCGHGPAAGEPTDRADKHTVGLPAAVPGEGSNMPGVPWQWPSLPVHHQVTCTVIIRSCGRPRVEKALRTRQGHESP
211 MNGLEVAPPGLITNFSLATAEQCGQETPLENMLFASFYLLDFILALVGNTLALWLFIRDHKSGTPANVFLMHLAVADLSCVLVLPTRLVYHFSGNHWPFGEIACRLTGFLFYLNMYASIYFLTCISADRFLAIVHPVKSLKLRRPLYAHLACAFLWVVVAVAMAPLLVSPQTVQTNTRWVCLQLYREKAPTCLVSLGSGLHFPFITRSRVL
"""
translation_db={}
from Bio.Seq import Seq
### New Biopython methods - http://biopython.org/wiki/Seq
from Bio.Alphabet import generic_dna
### Deprecated
#from Bio.Alphabet import IUPAC
#from Bio import Translate ### deprecated
#print 'Begining in silco translation for',len(mRNA_db),'sequences.'
def cleanSeq(input_seq):
"""Wrapper for Biopython translate function. Bio.Seq.translate will complain if input sequence is
not a mulitple of 3. This wrapper function passes an acceptable input to Bio.Seq.translate in order to
avoid this warning."""
#https://github.com/broadinstitute/oncotator/pull/265/commits/94b20aabff48741a92b3f9e608e159957af6af30
trailing_bases = len(input_seq) % 3
if trailing_bases:
input_seq = ''.join([input_seq, 'NN']) if trailing_bases == 1 else ''.join([input_seq, 'N'])
return input_seq
first_time = 1
for mRNA_AC in mRNA_db:
if mRNA_AC == 'AK025306': print '@@@@@@@@@@^^^^AK025306...attempting in silico translation'
temp_protein_list=[]; y=0
protein_id,sequence = mRNA_db[mRNA_AC]
if protein_id == '': protein_id = mRNA_AC+'-PEP'
original_seqeunce = sequence
sequence = string.upper(sequence)
loop=0
while (string.find(sequence,'ATG')) != -1: #while there is still a methionine in the DNA sequence, reload this DNA sequence for translation: find the longest ORF
x = string.find(sequence,'ATG') #before doing this, need to find the start codon ourselves
y += x #maintain a running count of the sequence position
if loop!=0: y+=3 ### This accounts for the loss in sequence_met
#if y<300: print original_seqeunce[:y+2], x
sequence_met = sequence[x:] #x gives us the position where the first Met* is.
### New Biopython methods - http://biopython.org/wiki/Seq
dna_clean = cleanSeq(sequence_met)
dna_seq = Seq(dna_clean, generic_dna)
prot_seq = dna_seq.translate(to_stop=True)
### Deprecated code
#seq_type = IUPAC.unambiguous_dna
#dna_seq = Seq(sequence_met,seq_type)
#standard_translator = Translate.unambiguous_dna_by_id[1]