-
Notifications
You must be signed in to change notification settings - Fork 1
/
bedfastaextract
executable file
·93 lines (81 loc) · 2.65 KB
/
bedfastaextract
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#!/usr/bin/env python
from sys import *
import sys
import re
def readfasta(lines):
## will read from lines
## will return a dict
seqs = {}
for line in lines:
if line.startswith(">"):
seqname = line.strip(">").split()[0]
seqs[seqname]=""
else:
seqs[seqname]+=line.replace(" ","").strip()
return seqs
def revComp(seq):
table = { "a" : "t", "A" : "T", "t" : "a", "T" : "A", "c" : "g", "C":"G", "g":"c", "G":"C", "-":"-" }
newseq = ""
for nucl in reversed(seq):
newseq += table[nucl]
return newseq
def readBed(lines):
features = []
for line in lines:
if line.startswith("#"):
continue
if line.startswith("track"):
continue
if line.startswith("browser"):
continue
parts = line.split()
if len(parts)>=6:
seq,start,end,name,score,strand = parts[0], int(parts[1]), int(parts[2]), parts[3], parts[4], parts[5]
else:
seq,start,end,name= parts[0], int(parts[1]), int(parts[2]), parts[3]
features.append([seq,start,end,name,score,strand])
return features
if len(argv)==1:
print " Will extract sequences of features of a multi fasta sequence file."
print " The parts to extract have to be specified as a bed file."
print " Will print the negative strand if bed-strand is '-'. "
print " Results are printed in fasta-format."
print " "
print " Syntax: "
print " cat test.bed | fastabedextract test.fa "
print " Options:"
print " -b : output as bed, do not output as fasta but annotation name field"
exit()
bedoutput=False
for a in argv[1:]:
if a=="-b":
bedoutput=True
else:
filename=a
sys.stderr.write("Reading fasta...\n")
f = open(filename, 'r')
lines = f.readlines()
seqs = readfasta(lines)
lines = sys.stdin.readlines()
features = readBed(lines)
del lines
sys.stderr.write("Processing beds...\n")
for f in features:
if f[1] < 0:
sys.stderr.write("Warning: "+str(f[1])+" is smaller than 0!\n")
if f[0] not in seqs:
sys.stderr.write("Warning: dropping feature "+f[0]+" "+str(f[1])+" as sequence not found in fasta\n")
continue
if f[2] > len(seqs[f[0]])+1:
sys.stderr.write("Warning: "+str(f[2])+" is greater than seqlen + 1. Seq="+f[0]+", seqlen="+str(len(seqs[f[0]]))+"\n")
f[2] = len(seqs[f[0]])
seq = seqs[f[0]][f[1]:f[2]]
if f[5]=="-":
seq = revComp(seq)
if bedoutput==False:
print ">"+f[0]+"_"+str(f[1])+"-"+str(f[2])+" "+str(f[3])
print seq
else:
#f = [str(s) for s in f]
print "%s\t%s\t%s\t%s_%s\t%s\t%s" % (f[0],f[1],f[2],f[3],seq,f[4],f[5])
#print "\t".join(f[0-2]) + "_"+seq + "\t%s\t%s" % (f[3],f[4],f[5])