-
Notifications
You must be signed in to change notification settings - Fork 1
/
bedBlastLift
executable file
·110 lines (98 loc) · 3.8 KB
/
bedBlastLift
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/usr/bin/python
from sys import *
from optparse import OptionParser
# === COMMAND LINE INTERFACE, OPTIONS AND HELP ===
parser = OptionParser("%prog [options] blastfile bedfile : Use a tblastx-tabular-file to lift annotations between sequences (approximatively), very slow")
#parser.add_option("-l", "--wordlength", dest="motiflength", action="store", help="length of word [default: %default, hexamers]", type="int", metavar="NUMBER", default="6")
parser.add_option("-s", "--suppressIntersection", dest="noIntersection", action="store_true", help="suppress printing length of intersection", default=True)
parser.add_option("-a", "--outputAll", dest="outputAll", action="store", help="instead of dropping everything that can not be mapped, output it with NAME as description. ", metavar="NAME", default="", type="string")
(options, args) = parser.parse_args()
# ==== FUNCTIONs =====
class hit:
""" a hit from a blast tabular output file """
def __repr__(self):
return self.line
def parseBlast(file):
hits = []
for l in file:
if l.startswith("#"):
continue
fs = l.split()
if len(fs)!=12:
stderr.write("error: blast output does not have 12 fields. Make sure that you ran blast with -m 9 or -D T (for bl2seq)\n")
exit(1)
h = hit()
(h.qName, h.sName, h.percId, h.alnLen, h.mismatches, h.gapOpen, h.qStart, h.qEnd, h.sStart, h.sEnd, h.eVal, h.alnScore) = fs
h.sStart = int(h.sStart)
h.sEnd = int(h.sEnd)
h.qStart = int(h.qStart)
h.qEnd = int(h.qEnd)
if h.qEnd < h.qStart:
(h.qStart, h.qEnd) = (h.qEnd, h.qStart)
if h.sEnd < h.sStart:
(h.sStart, h.sEnd) = (h.sEnd, h.sStart)
h.line = l.strip()
hits.append(h)
return hits
class bed:
""" a feature from a bed file """
def overlaps(self, hit):
""" returns number of overlapping bp if two Features overlap or
false if no overlap """
if (( hit.sStart <= self.start and hit.sEnd > self.start) or \
(hit.sStart < self.end and hit.sEnd >= self.end) or
(hit.sStart > self.start and hit.sStart < self.end)):
return (min (hit.sEnd, self.end) - max (hit.sStart, self.start))
else:
return 0
def __repr__(self):
return self.line
def parseBed(file):
beds = []
for l in file:
l = l.strip()
b = bed()
if l.startswith("#"):
continue
fs = l.split()
if not len(fs) >= 4:
stderr.write("error: bed features need names.\n")
exit(1)
(b.chrom, b.start, b.end, b.name) = fs[0:4]
b.start = int(b.start)
b.end = int(b.end)
b.line = l
beds.append(b)
return beds
# ----------- MAIN --------------
if args==[]:
parser.print_help()
exit(1)
defaultname = options.outputAll
noIntersection = options.noIntersection
stderr.write("Reading blast output...\n")
f = open(args[0], "r")
hits = parseBlast(f)
stderr.write("Reading bedfile ...\n")
f = open(args[1], "r")
beds = parseBed(f)
stderr.write("Searching for overlaps bed-blast...\n")
for b in beds:
overlaps = []
for h in hits:
length = b.overlaps(h)
if length > 0:
h.overlapLen=length
overlaps.append(h)
# choose highest scoring overlap
if len(overlaps)==0:
if defaultname!="":
print "%s\t%s\t%s\t%s" % (b.chrom, b.start, b.end,defaultname)
continue
#overlaps.sort(key=lambda x: x.eVal, reverse=True) # py cannot parse eval
overlaps.sort(key=lambda x: x.alnScore)
best = overlaps[0]
suffix = ""
if (not noIntersection):
suffix="_ints"+str(best.overlapLen), str(best.eVal)
print "\t".join([best.qName, str(best.qStart), str(best.qEnd), b.name+suffix])