Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
81 lines (61 sloc) 2.29 KB
#! /usr/bin/env python
Usage: reference.fa query.x.reference.blastn minmatch query.fa
calc-blast-cover calculates the fraction of bases in 'reference.fa' that are
covered by BLAST matches from 'query.fa', for sequence in 'query.fa' that are
longer than 'minmatch'.
import sys
import blastparser
import screed
if len(sys.argv) != 5:
print>>sys.stderr, "Usage: b.seqs a.x.b matchlen a.seqs"
MIN_QUERY_LEN = int(sys.argv[3])
# load in the query sequences into a list
query_seqs = set([ for record in[4]) \
if len(record.sequence) >= MIN_QUERY_LEN ])
# create empty lists representing the total number of bases in the reference
covs = {}
for n, record in enumerate([1])):
if n % 1000 == 0:
covs[] = [0] * len(record.sequence)
# run through the BLAST records in the query, and calculate how much of
# the reference is covered by the query.
for n, record in enumerate(blastparser.parse_fp(open(sys.argv[2]))):
if n % 100 == 0:
if record.query_name not in query_seqs:
for hit in record.hits:
for match in hit.matches:
if match.score < MIN_SCORE:
cov = covs.get(hit.subject_name)
if not cov:
start = min(match.subject_start, match.subject_end) - 1
end = max(match.subject_start, match.subject_end)
for i in range(start, end):
cov[i] = 1
print ''
# print out summary statistics for each of the reference.
coved = 0
total = 0
for name in covs:
coved += sum(covs[name])
total += len(covs[name])
f = sum(covs[name]) / float(len(covs[name]))
#print name, sum(covs[name]), len(covs[name]), f
print 'total bases in reference:', total
print 'total ref bases covered :', coved
print 'fraction :', coved / float(total)
print 'reference :', sys.argv[1]
print 'blast file :', sys.argv[2]
print 'query sequences :', sys.argv[4]
#print coved, total, coved / float(total), sys.argv[1], sys.argv[2], MIN_QUERY_LEN