Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'master' of bitbucket.org:lpryszcz/bin
- Loading branch information
Showing
5 changed files
with
546 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,153 @@ | ||
#!/usr/bin/env python | ||
""" | ||
Download genomes from NCBI for given taxid. | ||
USAGE: get_complete_genomes.NCBI.py taxid nuccore | ||
""" | ||
|
||
import os, sys | ||
from datetime import datetime | ||
from Bio import Entrez, SeqIO | ||
Entrez.email = 'lpryszcz@crg.eu' | ||
|
||
def split_seq( seq,length=70 ): | ||
"""Return list of sequence fragnments having given length. | ||
""" | ||
return [seq[i:i + length] for i in range(0, len(seq), length)] | ||
|
||
def gb2cds( handle,outDir='out' ): | ||
""" | ||
""" | ||
#get genome sequence as GenBank SeqIO object | ||
for gb in SeqIO.parse( handle,"gb" ): #Entrez.efetch( db=db,id=gi,rettype="gb" ) | ||
###get sp name | ||
acc = gb.id | ||
spName= gb.description.split(',')[0] | ||
spName= spName.strip().replace(' ','_').replace('/','_').replace(':','_') | ||
|
||
#check if outfile exists | ||
outFn = '%s/%s.%s.chromosome.fna' % (outDir,spName,acc) | ||
#if os.path.isfile( outFn ): | ||
# continue | ||
|
||
###save genome fasta | ||
seq = '\n'.join( split_seq( str(gb.seq) ) ) | ||
out = open( outFn,'w' ) | ||
out.write( '>%s\n%s\n' % ( acc,seq ) ) | ||
out.close() | ||
|
||
###save genes, CDS and translations | ||
#genesOut = open( '%s/%s.%s.genes.fna' % (outDir,spName,acc),'w' ) | ||
cdsOut = open( '%s/%s.%s.cds.fna' % (outDir,spName,acc),'w' ) | ||
aaOut = open( '%s/%s.%s.aa.faa' % (outDir,spName,acc),'w' ) | ||
for feature in gb.features: | ||
if feature.type == 'CDS': | ||
#header | ||
header = '>' | ||
if 'db_xref' in feature.qualifiers: | ||
extID = feature.qualifiers['db_xref'][0] | ||
extID = extID.replace(':','|') | ||
header += "%s|" % extID | ||
|
||
if 'protein_id' in feature.qualifiers: | ||
header += "%s" % feature.qualifiers['protein_id'][0] | ||
header += '|' | ||
|
||
if 'gene' in feature.qualifiers: | ||
header += ' %s ' % feature.qualifiers['gene'][0] | ||
header += '|' | ||
|
||
#add position info | ||
header += ' %s-%s %s' % ( feature.location.start.position,feature.location.end.position,feature.strand ) | ||
|
||
#cds | ||
cdsSeq = feature.extract( gb.seq ) | ||
cds = '\n'.join( split_seq( str( cdsSeq ) ) ) | ||
cdsOut.write( '%s\n%s\n' % ( header,cds) ) | ||
|
||
#protein | ||
if 'translation' in feature.qualifiers: | ||
aa = '\n'.join( split_seq( feature.qualifiers['translation'][0] ) ) | ||
else: | ||
aa = '\n'.join( split_seq( str( cdsSeq.translate() ) ) ) | ||
aaOut.write( '%s\n%s\n' % ( header,aa ) ) | ||
|
||
#close out files | ||
cdsOut.close() | ||
aaOut.close() | ||
|
||
def get_cds( outDir = 'cds' ): | ||
"""Get protein and cds sequence for every chromosome | ||
""" | ||
#create outdir | ||
if not os.path.isdir( outDir ): | ||
os.makedirs( outDir ) | ||
|
||
print "Retrieving CDS & proteins from GenBank files..." | ||
fnames = filter( lambda x: x.endswith('.gb'), os.walk('.').next()[2] ) | ||
fnames.sort() | ||
for fn in fnames: | ||
tNow=str(datetime.now()) | ||
print "%s\t%s / %s\t%s" % ( tNow.split('.')[0],fnames.index(fn)+1,len(fnames),fn ) | ||
gb2cds( open(fn),outDir ) | ||
|
||
def get_genomes( taxid,db ): | ||
"""Fetch complete genomes in GenBank format | ||
""" | ||
q='txid%s[organism] AND complete genome[title] NOT plasmid[title]' % taxid #'txid%s[organism] AND complete genome[title]' % taxid | ||
handle = Entrez.esearch( db=db,term=q,retmax=1000000 ) | ||
record = Entrez.read( handle ) | ||
GIs=record['IdList'] | ||
|
||
print "Downloading %s complete genomes...\n cmd: %s" % ( len(GIs),q ) | ||
#fecth all genomes | ||
for gi in GIs: | ||
#skip if file already exits | ||
outFn = '%s.gb' % gi | ||
if os.path.isfile( outFn ): | ||
continue | ||
|
||
#print info | ||
tNow=str(datetime.now()) | ||
print "%s\t%s / %s\t%s" % ( tNow.split('.')[0],GIs.index(gi)+1,len(GIs),gi ) | ||
|
||
#get GenBank string and write | ||
error = 1 | ||
while error: | ||
try: | ||
gb = Entrez.efetch( db=db,id=gi,rettype="gb" ).read() | ||
error = 0 | ||
except: | ||
error += 1 | ||
print "\terror %s" % error | ||
|
||
#skip fragmented entries like: http://www.ncbi.nlm.nih.gov/nuccore/AL935263.1 | ||
#if len(gb)<10**5: | ||
# continue | ||
|
||
#open outfile, write, close | ||
out = open( outFn,'w' ); out.write( gb ); out.close() | ||
|
||
#remove wrong entries | ||
#try: | ||
# r = SeqIO.parse( open(outFn),'gb' ).next() | ||
#except: | ||
# print " Wrong genbank file: %s. Removed!" % outFn | ||
# os.unlink( outFn ) | ||
|
||
def main(groupTaxid, db="nuccore"): | ||
""" | ||
""" | ||
###get taxIDs of strains with complete genome, groupTaxid=2 for Bacteria | ||
get_genomes( groupTaxid,db ) | ||
|
||
###get CDSs and proteins | ||
get_cds() | ||
|
||
if __name__=='__main__': | ||
t0=datetime.now() | ||
groupTaxid = int(sys.argv[1]) | ||
db=sys.argv[2] | ||
main(groupTaxid, db) | ||
dt=datetime.now()-t0 | ||
sys.stderr.write( "#Time elapsed: %s\n" % dt ) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
#!/usr/bin/env python | ||
"""Generate chromosome characteristics plot for testing for meiosis-associated | ||
recombination. | ||
USAGE: loh2chrom.py ../../ref/CANOR.chromosomes.fa.fai Ch_T3_7318.bam.gatk.homo.100bp.cov_flt.bed | ||
""" | ||
|
||
import os, sys | ||
import numpy as np | ||
from scipy import stats | ||
|
||
ref_fai, bed = sys.argv[1:3] | ||
|
||
#load chrom sizes | ||
chrom2size = {} | ||
for l in open(ref_fai): | ||
chrom, size = l.split()[:2] | ||
size = int(size) | ||
chrom2size[chrom] = size | ||
|
||
chrom2lohs = {chrom: [] for chrom in chrom2size} | ||
for l in open(bed): | ||
chrom, s, e = l.split()[:3] | ||
s, e = int(s), int(e) | ||
chrom2lohs[chrom].append(e-s) | ||
|
||
chrs = [size for chrom, size in sorted(chrom2size.items())] | ||
loh = [np.mean(lohs) for chrom, lohs in sorted(chrom2lohs.items())] | ||
hetero = [100-100.0*sum(lohs)/size for (chrom, size), (chrom, lohs) in zip(sorted(chrom2size.items()), sorted(chrom2lohs.items()))] | ||
|
||
#print hetero | ||
#hetero=[18.68,15.37,19.49,26.98,15.01,12.13,9.20,12.58] | ||
#loh=[1980.42,2321.94,2039.60,1214.59,2649.85,3190.51,3765.92,2806.15] | ||
loh=np.array(loh) / 1e3 | ||
#chrs=[2936865,2433673,1644167,1591921,1474802,1027593,937312,613068] | ||
chrs=np.array(chrs) / 1e6 | ||
|
||
print "#sample\tchromosome size\tLOH total\tLOH median\tLOH mean\tLOH stdev" | ||
for (chrom, size), (chrom, lohs) in zip(sorted(chrom2size.items()), sorted(chrom2lohs.items())): | ||
print "%s\t%s\t%s\t%s\t%s\t%s"%(chrom, size, sum(lohs), np.median(lohs), np.mean(lohs), np.std(lohs)) | ||
|
||
print "LOH mean size vs Chromosome size" | ||
print " Pearson: r=%s p=%s" % stats.pearsonr(loh, chrs) | ||
print " Spearman: r=%s p=%s" % stats.spearmanr(loh, chrs) | ||
print "Heterozygous % vs Chromosome size" | ||
print " Pearson: r=%s p=%s" % stats.pearsonr(hetero, chrs) | ||
print " Spearman: r=%s p=%s" % stats.spearmanr(hetero, chrs) | ||
|
||
import matplotlib.pyplot as plt | ||
|
||
fig = plt.figure() | ||
ax1 = fig.add_subplot(2,1,1) | ||
ax2 = fig.add_subplot(2,1,2) | ||
p1, = ax1.plot(chrs, hetero, "bo", label="Heterozygous") | ||
ax1.set_ylabel("Heterozygous [%]") | ||
ax2.set_xlabel("Chromosome size [Mb]") | ||
p2, = ax2.plot(chrs, loh, "ro", label="LOH mean size") | ||
ax2.set_ylabel("Mean size [kb]") | ||
plt.show() | ||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
#!/usr/bin/env python | ||
|
||
import sys | ||
import ete2 | ||
rooters = False | ||
try: | ||
from rooted_phylomes import ROOTED_PHYLOMES | ||
rooters = True | ||
except: | ||
sys.stderr.write("No rooters (rooted_phylomes.py)\n") | ||
try: | ||
phyid, seed = int(sys.argv[1]), sys.argv[2] | ||
except: | ||
sys.stderr.write("No phylome_id and/or seed given\n") | ||
rooters = False | ||
|
||
def _get_spcode(protid): | ||
"""Species naming function compatible with phylome_db3""" | ||
return protid.split('_')[-1] | ||
|
||
for i, nw in enumerate(sys.stdin, 1): | ||
if not i%100: | ||
sys.stderr.write(" %i \r"%i) | ||
t = ete2.PhyloTree(nw) | ||
t.set_species_naming_function(_get_spcode) | ||
try: | ||
seedNode = t.get_leaves_by_name(seed)[0] | ||
seedNode.get_farthest_oldest_node(ROOTED_PHYLOMES[phyid]) | ||
if rooters: | ||
sys.stderr.write("[WARNING] Cannot root tree %s with rooter\n"%i) | ||
except: | ||
t.set_outgroup(t.get_midpoint_outgroup()) | ||
print t.write(format=9) | ||
|
||
|
Oops, something went wrong.