Skip to content

Commit

Permalink
regionify - few tools to summarize gff region sizes
Browse files Browse the repository at this point in the history
  • Loading branch information
Mark Fiers committed Jul 19, 2012
1 parent 881758f commit a9276ac
Showing 1 changed file with 128 additions and 0 deletions.
128 changes: 128 additions & 0 deletions bin/gff_regionify
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
#!/usr/bin/env python

import os
import sys
import gzip
import time
import cPickle
import argparse
import collections
import numpy as np

seqs = {}
types = []

parser = argparse.ArgumentParser(description='Do fancy region stuff based on GFFs.')

parser.add_argument('-b', dest='base', help='basename for output', default='gff_regionify')
parser.add_argument('lengths', help='file with the lengths of all sequences')
parser.add_argument('gff', help='input gff')
parser.add_argument('--vcf', dest='vcf', help='vcf file to categorize')

args = parser.parse_args()

starttime= time.time()

print 'creating numpy structure'
totn = 0
with open(args.lengths) as F:
for i, line in enumerate(F):
si, ln = line.split()
ln = int(ln)
totn += ln
seqs[si] = np.zeros(ln, dtype=np.uint16)
if (i+1) % 72 == 0:
sys.stdout.write('\r%d ' % i)
sys.stdout.flush()
print
print 'generated space for %d nt' % totn

print 'loading annotations'
with open(args.gff) as F:
for i, line in enumerate(F):
line = line.strip()
if not line: continue
if line[0] == '#': continue
ls = line.strip().split()
atype = ls[2]

if not atype in types:
print 'adding type', atype
types.append(atype)

sqid = ls[0]
start = int(ls[3])
stop = int(ls[4])
seqs[sqid][start-1:stop-1] = \
np.bitwise_or(seqs[sqid][start-1:stop-1], 2 ** types.index(atype))

if i % (72*9) == 0 and i > 1000:
sys.stdout.write('\r%d ' % i)
sys.stdout.flush()
print

print 'summarizing'
summ = collections.defaultdict(int)
tlen = 0
for j, sid in enumerate(seqs):
tlen += len(seqs[sid])
for i, t in enumerate(types):
summ[t] += len(np.nonzero(np.bitwise_and(seqs[sid], 2 ** i))[0])
if j % 72 == 0:
sys.stdout.write('\r%d ' % i)
sys.stdout.flush()
print

with open(args.base + '.wg.stats', 'w') as F:
print "total\t%s" % (tlen)
F.write("total\t%s\n" % (tlen))
for t in summ:
print "%s\t%s" % (t, summ[t])
F.write("%s\t%s\n" % (t, summ[t]))

if not args.vcf:
sys.exit()

def vcfreader(filename):
with open(filename) as F:
for line in F:
line = line.strip()
if not line: continue
if line[0] == '#': continue
ls = line.split()
if len(ls) < 2: continue
yield ls[0], int(ls[1])


vsumm = collections.defaultdict(int)
vcfi = 0
print '\nProcessing VCF %s' % args.vcf
for sid, pos in vcfreader(args.vcf):
pos = pos -1 #vcf is 1 based!!
try:
m = seqs[sid][pos]
except IndexError:
print "cannot find mask value for %s/%s" % pos, sid
raise
vcfi += 1
for i, t in enumerate(types):
inni = np.bitwise_and(m, 2 ** i)
if inni > 0:
#print "%s:%s mask %2d type %2d/%10s inni %d" % (
# sid, pos, m, i, t, inni)
vsumm[t] += 1

if vcfi % 72 == 0:
sys.stdout.write('\r%d' % vcfi)
sys.stdout.flush()

with open(args.base + '.vcf.stats', 'w') as F:
print "total\t%s" % (vcfi)
F.write("total\t%s\n" % (vcfi))
for t in vsumm:
print "%s\t%s" % (t, vsumm[t])
F.write("%s\t%s\n" % (t, vsumm[t]))




0 comments on commit a9276ac

Please sign in to comment.