Skip to content

Commit

Permalink
Merge branch 'master' of bitbucket.org:lpryszcz/bin
Browse files Browse the repository at this point in the history
  • Loading branch information
Leszek Pryszcz committed Aug 18, 2014
2 parents 506a7b6 + 0025581 commit dd3ddcf
Show file tree
Hide file tree
Showing 5 changed files with 546 additions and 2 deletions.
6 changes: 4 additions & 2 deletions bed2stats.py
Expand Up @@ -20,8 +20,10 @@
def print_stats( lengths,addon="" ):
"""
"""
print "%.2f kb in %s fragments%s (mean: %.1f kb +-%.1f )" % ( sum(lengths)/10.0**3,len(lengths),addon, \
np.mean(lengths)/10.0**3,np.std(lengths)/10.0**3 )
info = "%.2f kb in %s fragments%s (median: %s bp; mean: %.3f kb +-%.3f )"
print info%(sum(lengths)/10.0**3, len(lengths), addon, \
int(np.median(lengths)), np.mean(lengths)/10.0**3,
np.std(lengths)/10.0**3)

def bed2stats( handle,simple,verbose ):
"""Parse BED and print stats."""
Expand Down
153 changes: 153 additions & 0 deletions get_complete_genomes.NCBI.py
@@ -0,0 +1,153 @@
#!/usr/bin/env python
"""
Download genomes from NCBI for given taxid.
USAGE: get_complete_genomes.NCBI.py taxid nuccore
"""

import os, sys
from datetime import datetime
from Bio import Entrez, SeqIO
Entrez.email = 'lpryszcz@crg.eu'

def split_seq( seq,length=70 ):
"""Return list of sequence fragnments having given length.
"""
return [seq[i:i + length] for i in range(0, len(seq), length)]

def gb2cds( handle,outDir='out' ):
"""
"""
#get genome sequence as GenBank SeqIO object
for gb in SeqIO.parse( handle,"gb" ): #Entrez.efetch( db=db,id=gi,rettype="gb" )
###get sp name
acc = gb.id
spName= gb.description.split(',')[0]
spName= spName.strip().replace(' ','_').replace('/','_').replace(':','_')

#check if outfile exists
outFn = '%s/%s.%s.chromosome.fna' % (outDir,spName,acc)
#if os.path.isfile( outFn ):
# continue

###save genome fasta
seq = '\n'.join( split_seq( str(gb.seq) ) )
out = open( outFn,'w' )
out.write( '>%s\n%s\n' % ( acc,seq ) )
out.close()

###save genes, CDS and translations
#genesOut = open( '%s/%s.%s.genes.fna' % (outDir,spName,acc),'w' )
cdsOut = open( '%s/%s.%s.cds.fna' % (outDir,spName,acc),'w' )
aaOut = open( '%s/%s.%s.aa.faa' % (outDir,spName,acc),'w' )
for feature in gb.features:
if feature.type == 'CDS':
#header
header = '>'
if 'db_xref' in feature.qualifiers:
extID = feature.qualifiers['db_xref'][0]
extID = extID.replace(':','|')
header += "%s|" % extID

if 'protein_id' in feature.qualifiers:
header += "%s" % feature.qualifiers['protein_id'][0]
header += '|'

if 'gene' in feature.qualifiers:
header += ' %s ' % feature.qualifiers['gene'][0]
header += '|'

#add position info
header += ' %s-%s %s' % ( feature.location.start.position,feature.location.end.position,feature.strand )

#cds
cdsSeq = feature.extract( gb.seq )
cds = '\n'.join( split_seq( str( cdsSeq ) ) )
cdsOut.write( '%s\n%s\n' % ( header,cds) )

#protein
if 'translation' in feature.qualifiers:
aa = '\n'.join( split_seq( feature.qualifiers['translation'][0] ) )
else:
aa = '\n'.join( split_seq( str( cdsSeq.translate() ) ) )
aaOut.write( '%s\n%s\n' % ( header,aa ) )

#close out files
cdsOut.close()
aaOut.close()

def get_cds( outDir = 'cds' ):
"""Get protein and cds sequence for every chromosome
"""
#create outdir
if not os.path.isdir( outDir ):
os.makedirs( outDir )

print "Retrieving CDS & proteins from GenBank files..."
fnames = filter( lambda x: x.endswith('.gb'), os.walk('.').next()[2] )
fnames.sort()
for fn in fnames:
tNow=str(datetime.now())
print "%s\t%s / %s\t%s" % ( tNow.split('.')[0],fnames.index(fn)+1,len(fnames),fn )
gb2cds( open(fn),outDir )

def get_genomes( taxid,db ):
"""Fetch complete genomes in GenBank format
"""
q='txid%s[organism] AND complete genome[title] NOT plasmid[title]' % taxid #'txid%s[organism] AND complete genome[title]' % taxid
handle = Entrez.esearch( db=db,term=q,retmax=1000000 )
record = Entrez.read( handle )
GIs=record['IdList']

print "Downloading %s complete genomes...\n cmd: %s" % ( len(GIs),q )
#fecth all genomes
for gi in GIs:
#skip if file already exits
outFn = '%s.gb' % gi
if os.path.isfile( outFn ):
continue

#print info
tNow=str(datetime.now())
print "%s\t%s / %s\t%s" % ( tNow.split('.')[0],GIs.index(gi)+1,len(GIs),gi )

#get GenBank string and write
error = 1
while error:
try:
gb = Entrez.efetch( db=db,id=gi,rettype="gb" ).read()
error = 0
except:
error += 1
print "\terror %s" % error

#skip fragmented entries like: http://www.ncbi.nlm.nih.gov/nuccore/AL935263.1
#if len(gb)<10**5:
# continue

#open outfile, write, close
out = open( outFn,'w' ); out.write( gb ); out.close()

#remove wrong entries
#try:
# r = SeqIO.parse( open(outFn),'gb' ).next()
#except:
# print " Wrong genbank file: %s. Removed!" % outFn
# os.unlink( outFn )

def main(groupTaxid, db="nuccore"):
"""
"""
###get taxIDs of strains with complete genome, groupTaxid=2 for Bacteria
get_genomes( groupTaxid,db )

###get CDSs and proteins
get_cds()

if __name__=='__main__':
t0=datetime.now()
groupTaxid = int(sys.argv[1])
db=sys.argv[2]
main(groupTaxid, db)
dt=datetime.now()-t0
sys.stderr.write( "#Time elapsed: %s\n" % dt )
62 changes: 62 additions & 0 deletions loh2chrom.py
@@ -0,0 +1,62 @@
#!/usr/bin/env python
"""Generate chromosome characteristics plot for testing for meiosis-associated
recombination.
USAGE: loh2chrom.py ../../ref/CANOR.chromosomes.fa.fai Ch_T3_7318.bam.gatk.homo.100bp.cov_flt.bed
"""

import os, sys
import numpy as np
from scipy import stats

ref_fai, bed = sys.argv[1:3]

#load chrom sizes
chrom2size = {}
for l in open(ref_fai):
chrom, size = l.split()[:2]
size = int(size)
chrom2size[chrom] = size

chrom2lohs = {chrom: [] for chrom in chrom2size}
for l in open(bed):
chrom, s, e = l.split()[:3]
s, e = int(s), int(e)
chrom2lohs[chrom].append(e-s)

chrs = [size for chrom, size in sorted(chrom2size.items())]
loh = [np.mean(lohs) for chrom, lohs in sorted(chrom2lohs.items())]
hetero = [100-100.0*sum(lohs)/size for (chrom, size), (chrom, lohs) in zip(sorted(chrom2size.items()), sorted(chrom2lohs.items()))]

#print hetero
#hetero=[18.68,15.37,19.49,26.98,15.01,12.13,9.20,12.58]
#loh=[1980.42,2321.94,2039.60,1214.59,2649.85,3190.51,3765.92,2806.15]
loh=np.array(loh) / 1e3
#chrs=[2936865,2433673,1644167,1591921,1474802,1027593,937312,613068]
chrs=np.array(chrs) / 1e6

print "#sample\tchromosome size\tLOH total\tLOH median\tLOH mean\tLOH stdev"
for (chrom, size), (chrom, lohs) in zip(sorted(chrom2size.items()), sorted(chrom2lohs.items())):
print "%s\t%s\t%s\t%s\t%s\t%s"%(chrom, size, sum(lohs), np.median(lohs), np.mean(lohs), np.std(lohs))

print "LOH mean size vs Chromosome size"
print " Pearson: r=%s p=%s" % stats.pearsonr(loh, chrs)
print " Spearman: r=%s p=%s" % stats.spearmanr(loh, chrs)
print "Heterozygous % vs Chromosome size"
print " Pearson: r=%s p=%s" % stats.pearsonr(hetero, chrs)
print " Spearman: r=%s p=%s" % stats.spearmanr(hetero, chrs)

import matplotlib.pyplot as plt

fig = plt.figure()
ax1 = fig.add_subplot(2,1,1)
ax2 = fig.add_subplot(2,1,2)
p1, = ax1.plot(chrs, hetero, "bo", label="Heterozygous")
ax1.set_ylabel("Heterozygous [%]")
ax2.set_xlabel("Chromosome size [Mb]")
p2, = ax2.plot(chrs, loh, "ro", label="LOH mean size")
ax2.set_ylabel("Mean size [kb]")
plt.show()



35 changes: 35 additions & 0 deletions newick2rooted.py
@@ -0,0 +1,35 @@
#!/usr/bin/env python

import sys
import ete2
rooters = False
try:
from rooted_phylomes import ROOTED_PHYLOMES
rooters = True
except:
sys.stderr.write("No rooters (rooted_phylomes.py)\n")
try:
phyid, seed = int(sys.argv[1]), sys.argv[2]
except:
sys.stderr.write("No phylome_id and/or seed given\n")
rooters = False

def _get_spcode(protid):
"""Species naming function compatible with phylome_db3"""
return protid.split('_')[-1]

for i, nw in enumerate(sys.stdin, 1):
if not i%100:
sys.stderr.write(" %i \r"%i)
t = ete2.PhyloTree(nw)
t.set_species_naming_function(_get_spcode)
try:
seedNode = t.get_leaves_by_name(seed)[0]
seedNode.get_farthest_oldest_node(ROOTED_PHYLOMES[phyid])
if rooters:
sys.stderr.write("[WARNING] Cannot root tree %s with rooter\n"%i)
except:
t.set_outgroup(t.get_midpoint_outgroup())
print t.write(format=9)


0 comments on commit dd3ddcf

Please sign in to comment.