In [8]:
def batch_iterator(iterator, batch_size):
    """Returns lists of length batch_size.

    This can be used on any iterator, for example to batch up
    SeqRecord objects from Bio.SeqIO.parse(...), or to batch
    Alignment objects from Bio.AlignIO.parse(...), or simply
    lines from a file handle.

    This is a generator function, and it returns lists of the
    entries from the supplied iterator.  Each list will have
    batch_size entries, although the final list may be shorter.
    """
    entry = True  # Make sure we loop once
    while entry:
        batch = []
        while len(batch) < batch_size:
            try:
                entry = iterator.next()
            except StopIteration:
                entry = None
            if entry is None:
                # End of file
                break
            batch.append(entry)
        if batch:
            yield batch



In [9]:
%%time
from Bio import SeqIO

record_iter = SeqIO.parse(open("../2517287028.genes.faa"),"fasta")#m.rosea
for i, batch in enumerate(batch_iterator(record_iter, 1000)):
    filename = "group_%i.fasta" % (i + 1)
    with open(filename, "w") as handle:
        count = SeqIO.write(batch, handle, "fasta")
    print("Wrote %i records to %s" % (count, filename))



Wrote 500 records to group_1.fasta
Wrote 500 records to group_2.fasta
Wrote 500 records to group_3.fasta
Wrote 500 records to group_4.fasta
Wrote 500 records to group_5.fasta
Wrote 500 records to group_6.fasta
Wrote 500 records to group_7.fasta
Wrote 391 records to group_8.fasta
CPU times: user 93 ms, sys: 7.56 ms, total: 101 ms
Wall time: 109 ms


# Trying temporary files
[https://docs.python.org/3/library/tempfile.html]

In [3]:
import tempfile

# create a temporary file and write some data to it
fp = tempfile.TemporaryFile()
fp.write(b'Hello world!')
# read data from file
fp.seek(0)

txt=fp.read()
print(txt)
# close the file, it will be removed
fp.close()
#print(txt)
# # create a temporary file using a context manager
# with tempfile.TemporaryFile() as fp:
#     fp.write(b'Hello world!')
#     fp.seek(0)
#     fp.read()

# # file is now closed and removed

# # create a temporary directory using the context manager
# with tempfile.TemporaryDirectory() as tmpdirname:
#     print('created temporary directory', tmpdirname)

# directory and contents have been removed

Hello world!


In [11]:
import tempfile
from Bio import SeqIO

record_iter = SeqIO.parse(open("../76969.assembled.faa"),"fasta")
for i, batch in enumerate(batch_iterator(record_iter, 30000)):
    
    filename = "group_%i.fasta" % (i + 1)
    print(filename)
    filename = tempfile.TemporaryFile()
    filename.seek(0)
    count = SeqIO.write(batch, filename, "fasta")
#     with open(filename, "w") as handle:
#         count = SeqIO.write(batch, handle, "fasta")
    print("Wrote %i records to %s" % (count, filename))

group_1.fasta
Wrote 30000 records to <open file '<fdopen>', mode 'w+b' at 0x7f5bc00dde40>
group_2.fasta
Wrote 30000 records to <open file '<fdopen>', mode 'w+b' at 0x7f5bc81c4540>
group_3.fasta
Wrote 30000 records to <open file '<fdopen>', mode 'w+b' at 0x7f5bc00dded0>
group_4.fasta
Wrote 30000 records to <open file '<fdopen>', mode 'w+b' at 0x7f5bc005fc90>
group_5.fasta
Wrote 30000 records to <open file '<fdopen>', mode 'w+b' at 0x7f5bc00ddd20>
group_6.fasta
Wrote 30000 records to <open file '<fdopen>', mode 'w+b' at 0x7f5bc005fb70>
group_7.fasta
Wrote 30000 records to <open file '<fdopen>', mode 'w+b' at 0x7f5bc00ddae0>
group_8.fasta
Wrote 30000 records to <open file '<fdopen>', mode 'w+b' at 0x7f5bc005f9c0>
group_9.fasta
Wrote 30000 records to <open file '<fdopen>', mode 'w+b' at 0x7f5bc005f9c0>
group_10.fasta
Wrote 30000 records to <open file '<fdopen>', mode 'w+b' at 0x7f5bc005f9c0>
group_11.fasta
Wrote 30000 records to <open file '<fdopen>', mode 'w+b' at 0x7f5bc005f9c0>
group_12

## Named temporary files

[https://docs.python.org/2/library/tempfile.html]

In [64]:
files

[('group_1.fasta', <closed file '<fdopen>', mode 'w+b' at 0x7f5ba498ded0>),
 ('group_2.fasta', <closed file '<fdopen>', mode 'w+b' at 0x7f5ba539b0c0>),
 ('group_3.fasta', <closed file '<fdopen>', mode 'w+b' at 0x7f5ba539b420>),
 ('group_4.fasta', <closed file '<fdopen>', mode 'w+b' at 0x7f5ba539b4b0>)]

In [13]:
import tempfile
from Bio import SeqIO

files=list()
#input_fasta="../76969.assembled.faa"
input_fasta="../2517287028.genes.faa"#m.rosea
record_iter = SeqIO.parse(open(input_fasta),"fasta")
for i, batch in enumerate(batch_iterator(record_iter, 1000)):
    
    label = "group_%i.fasta" % (i + 1)
    #f = tempfile.NamedTemporaryFile(delete=False)#exists on closing
    f = tempfile.NamedTemporaryFile()#deleted after f.close()
    files.append((label,f))
    #f.seek(0)
    count = SeqIO.write(batch, f, "fasta")
    f.flush() #this solves the EOF problem
#     with open(filename, "w") as handle:
#         count = SeqIO.write(batch, handle, "fasta")
    print("Wrote %i records to %s" % (count, f.name))

Wrote 1000 records to /tmp/tmpDjX8B8
Wrote 1000 records to /tmp/tmp6fEZVA
Wrote 1000 records to /tmp/tmprxg_Kg
Wrote 891 records to /tmp/tmpG2VDVP


In [12]:
for (l,f) in files:
    print(l)
    print(f.name)
    f.close()

group_1.fasta
/tmp/tmpRFGDVJ
group_2.fasta
/tmp/tmpgzDt19
group_3.fasta
/tmp/tmpo4107w
group_4.fasta
/tmp/tmpPamqdd


## hmmering tempfiles

In [63]:
%%time
import os
#hmmscan --tblout amocab_hmmscan_mros.tab  amoCAB 2517287028.genes.faa > /dev/null
#os.system("makeblastdb -in "+input_ref_0+" -dbtype nucl -title "+title_db+" -out "+outfile_db+" -parse_seqids")
for (l,f) in files:
    #print(l)
    #print(f.name)
    
    #print('Exists after close:', os.path.exists(f.name))
    command="hmmscan --tblout "+l+".tab " + "../amoCAB " + f.name+""
    print(command)
    os.system(command)
    

for (l,f) in files:
    print(l)
    print(f.name)
    f.close()
    print('Exists after close:', os.path.exists(f.name))

hmmscan --tblout group_1.fasta.tab ../amoCAB /tmp/tmp4WuJSN
hmmscan --tblout group_2.fasta.tab ../amoCAB /tmp/tmpCBMFY1
hmmscan --tblout group_3.fasta.tab ../amoCAB /tmp/tmp9ABMDY
hmmscan --tblout group_4.fasta.tab ../amoCAB /tmp/tmp4g8aDl
group_1.fasta
/tmp/tmp4WuJSN
('Exists after close:', False)
group_2.fasta
/tmp/tmpCBMFY1
('Exists after close:', False)
group_3.fasta
/tmp/tmp9ABMDY
('Exists after close:', False)
group_4.fasta
/tmp/tmp4g8aDl
('Exists after close:', False)
CPU times: user 4.65 ms, sys: 4.08 ms, total: 8.74 ms
Wall time: 24.7 ms


In [59]:
%%time
import os
#hmmscan --tblout amocab_hmmscan_mros.tab  amoCAB 2517287028.genes.faa > /dev/null
#os.system("makeblastdb -in "+input_ref_0+" -dbtype nucl -title "+title_db+" -out "+outfile_db+" -parse_seqids")
for (l,f) in files:
    #print(l)
    #print(f.name)
    
    #print('Exists after close:', os.path.exists(f.name))
    #
    command="hmmscan "+"--cpu 8 --noali"+" --tblout "+l+".tab " + "../amoCAB " + f.name
    print(command)
    os.system(command)

for (l,f) in files:
    print(l)
    print(f.name)
    f.close()
    #print('Exists after close:', os.path.exists(f.name))

hmmscan --cpu 8 --noali --tblout group_1.fasta.tab ../amoCAB /tmp/tmpM_ZFS9
hmmscan --cpu 8 --noali --tblout group_2.fasta.tab ../amoCAB /tmp/tmp6sxP79
hmmscan --cpu 8 --noali --tblout group_3.fasta.tab ../amoCAB /tmp/tmpCgusGR
hmmscan --cpu 8 --noali --tblout group_4.fasta.tab ../amoCAB /tmp/tmpVnIz1M
group_1.fasta
/tmp/tmpM_ZFS9
('Exists after close:', False)
group_2.fasta
/tmp/tmp6sxP79
('Exists after close:', False)
group_3.fasta
/tmp/tmpCgusGR
('Exists after close:', False)
group_4.fasta
/tmp/tmpVnIz1M
('Exists after close:', False)
CPU times: user 646 µs, sys: 8.05 ms, total: 8.7 ms
Wall time: 12.5 s


## Subprocess version

In [45]:
%%time
import os
import subprocess
#hmmscan --tblout amocab_hmmscan_mros.tab  amoCAB 2517287028.genes.faa > /dev/null
#os.system("makeblastdb -in "+input_ref_0+" -dbtype nucl -title "+title_db+" -out "+outfile_db+" -parse_seqids")
for (l,f) in files:
    #print(l)
    #print(f.name)
    
    #print('Exists after close:', os.path.exists(f.name))
    command="hmmscan "+"--cpu 8"+" --tblout "+l+".tab " + "../amoCAB " + f.name
    print(command)
    subprocess.Popen(command, shell=True)

for (l,f) in files:
    print(l)
    print(f.name)
    f.close()
    print('Exists after close:', os.path.exists(f.name))

hmmscan --cpu 8 --tblout group_1.fasta.tab ../amoCAB /tmp/tmpWxz75u
hmmscan --cpu 8 --tblout group_2.fasta.tab ../amoCAB /tmp/tmp_a05vQ
hmmscan --cpu 8 --tblout group_3.fasta.tab ../amoCAB /tmp/tmp0tsj4Z
hmmscan --cpu 8 --tblout group_4.fasta.tab ../amoCAB /tmp/tmpZO5t_L
group_1.fasta
/tmp/tmpWxz75u
('Exists after close:', False)
group_2.fasta
/tmp/tmp_a05vQ
('Exists after close:', False)
group_3.fasta
/tmp/tmp0tsj4Z
('Exists after close:', False)
group_4.fasta
/tmp/tmpZO5t_L
('Exists after close:', False)
CPU times: user 6.91 ms, sys: 4.17 ms, total: 11.1 ms
Wall time: 18.5 ms


## Subprocess verision of entire file, with no splitting

In [16]:
%%time
from Bio import SeqIO
import os
import subprocess
import shlex

#filename="../76969.assembled.faa"
filename="../2517287028.genes.faa"#m.rosea
#fout=entire_mg1_2.tab
fout="entire_mros.tab"
#fh = SeqIO.parse(open(filename),"fasta")#m.rosea
# with open(filename, "r") as handle:
#     count = SeqIO.write(batch, handle, "fasta")
# print("Wrote %i records to %s" % (count, filename))




#hmmscan --tblout amocab_hmmscan_mros.tab  amoCAB 2517287028.genes.faa > /dev/null
#os.system("makeblastdb -in "+input_ref_0+" -dbtype nucl -title "+title_db+" -out "+outfile_db+" -parse_seqids")

    #print('Exists after close:', os.path.exists(f.name))
# command="hmmscan "+"--cpu 8"+" -o /dev/null --tblout "+"entire_mg1"+".tab " + "../amoCAB " + filename
# print(command)
# subprocess.Popen(command, shell=True)

cmd="hmmscan --cpu 8 -o /dev/null --tblout "+fout+" ../amoCAB "+filename
print(cmd)
p=subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
# print(cmd)
# args = shlex.split(cmd)
#os.system(cmd)
#os.system(cmd)
#print(p.stdout.read())
#output=p.stdout.read()
#os.system(cmd)
#command="parallel -j 8 hmmscan "+"--cpu 1"+" --tblout {}.tab " + "../amoCAB {} ::: "+fnames

#p=subprocess.Popen(args,stdout=subprocess.PIPE)
#print(p.stdout.read())
output=p.stdout.read()

hmmscan --cpu 8 -o /dev/null --tblout entire_mros.tab ../amoCAB ../2517287028.genes.faa
CPU times: user 1.93 ms, sys: 235 µs, total: 2.16 ms
Wall time: 1.51 s


In [30]:
with open(fout, "r") as handle:
    mros_entire=handle.read()


In [31]:
mros_entire=mros_entire.split('\n')
for i in mros_entire:
    print(i)

#                                                               --- full sequence ---- --- best 1 domain ---- --- domain number estimation ----
# target name        accession  query name           accession    E-value  score  bias   E-value  score  bias   exp reg clu  ov env dom rep inc description of target
#------------------- ---------- -------------------- ---------- --------- ------ ----- --------- ------ -----   --- --- --- --- --- --- --- --- ---------------------
AmoC                 PF04896.11 2517401456           -           5.6e-120  386.8  15.8  7.2e-120  386.4  15.8   1.0   1   0   0   1   1   1   1 Ammonia monooxygenase/methane monooxygenase, subunit C
AmoC                 PF04896.11 2517402628           -           5.8e-122  393.3  14.7    7e-122  393.0  14.7   1.0   1   0   0   1   1   1   1 Ammonia monooxygenase/methane monooxygenase, subunit C
AMO                  PF02461.15 2517402629           -           4.8e-124  400.4  24.0  5.4e-124  400.3  24.0   1.0   1   0   

In [28]:
mros_entire[3:-11]

['AmoC                 PF04896.11 2517401456           -           5.6e-120  386.8  15.8  7.2e-120  386.4  15.8   1.0   1   0   0   1   1   1   1 Ammonia monooxygenase/methane monooxygenase, subunit C',
 'AmoC                 PF04896.11 2517402628           -           5.8e-122  393.3  14.7    7e-122  393.0  14.7   1.0   1   0   0   1   1   1   1 Ammonia monooxygenase/methane monooxygenase, subunit C',
 'AMO                  PF02461.15 2517402629           -           4.8e-124  400.4  24.0  5.4e-124  400.3  24.0   1.0   1   0   0   1   1   1   1 Ammonia monooxygenase',
 'Monooxygenase_B      PF04744.11 2517402630           -           3.3e-170  553.6   0.0  3.7e-170  553.4   0.0   1.0   1   0   0   1   1   1   1 Monooxygenase subunit B protein',
 'AmoC                 PF04896.11 2517403201           -           5.8e-122  393.3  14.7    7e-122  393.0  14.7   1.0   1   0   0   1   1   1   1 Ammonia monooxygenase/methane monooxygenase, subunit C',
 'Monooxygenase_B      PF04744.11 2517403

In [34]:
mros_entire[4].split()

['AmoC',
 'PF04896.11',
 '2517402628',
 '-',
 '5.8e-122',
 '393.3',
 '14.7',
 '7e-122',
 '393.0',
 '14.7',
 '1.0',
 '1',
 '0',
 '0',
 '1',
 '1',
 '1',
 '1',
 'Ammonia',
 'monooxygenase/methane',
 'monooxygenase,',
 'subunit',
 'C']

In [7]:
for i,j in files:
    print(i, j.name)

NameError: name 'files' is not defined

In [None]:
%%time
import os
import subprocess
import shlex
#hmmscan --tblout amocab_hmmscan_mros.tab  amoCAB 2517287028.genes.faa > /dev/null
#os.system("makeblastdb -in "+input_ref_0+" -dbtype nucl -title "+title_db+" -out "+outfile_db+" -parse_seqids")
fnames=""
flabels=""
for (l,f) in files:
    fnames+=f.name+" "
    flabels+=l+" "
#print flabels
    #print(l)
    #print(f.name)
    #parallel -j $1 blastall "-p blastn -d nt/nt -i "{}" -o "{.}".xml -e 1e-10 -m 7 -K 100 -b 200" ::: *.fa
    #print('Exists after close:', os.path.exists(f.name))
#this prints nicely to stdout, and is caught by p.stdout.read()
#cmd="parallel -j 8 hmmscan "+"-o /dev/null --noali --cpu 1"+" --tblout >(tee /dev/stdout)  " + "../amoCAB {} ::: "+fnames
#this prints to the file and stdout
#0.tab should be deleted if exists
if os.path.exists("0.tab"):
    os.remove("0.tab")
cmd="parallel -j 8 hmmscan "+"-o /dev/null --noali --cpu 1"+" --tblout >(tee -a 0.tab)  " + "../amoCAB {} ::: "+fnames
args = shlex.split(cmd)
#os.system(cmd)
p=subprocess.Popen(args,stdout=subprocess.PIPE)
#print(p.stdout.read())
output=p.stdout.read()
#os.system(cmd)
#command="parallel -j 8 hmmscan "+"--cpu 1"+" --tblout {}.tab " + "../amoCAB {} ::: "+fnames
print(cmd)

## Trying GNU parallel

In [85]:
files

[('group_1.fasta', <open file '<fdopen>', mode 'w+b' at 0x7f5ba498d540>),
 ('group_2.fasta', <open file '<fdopen>', mode 'w+b' at 0x7f5ba539be40>),
 ('group_3.fasta', <open file '<fdopen>', mode 'w+b' at 0x7f5ba498dc90>),
 ('group_4.fasta', <open file '<fdopen>', mode 'w+b' at 0x7f5ba49a80c0>)]

In [8]:
#this works
#parallel -j 8 hmmscan --cpu 1 --tblout {.}.tab ../amoCAB {} ::: *.fasta > /dev/null


In [35]:
%%time
import os
#hmmscan --tblout amocab_hmmscan_mros.tab  amoCAB 2517287028.genes.faa > /dev/null
#os.system("makeblastdb -in "+input_ref_0+" -dbtype nucl -title "+title_db+" -out "+outfile_db+" -parse_seqids")
fnames=""
flabels=""
for (l,f) in files:
    fnames+=f.name+" "
    flabels+=l+" "
#print flabels
    #print(l)
    #print(f.name)
    #parallel -j $1 blastall "-p blastn -d nt/nt -i "{}" -o "{.}".xml -e 1e-10 -m 7 -K 100 -b 200" ::: *.fa
    #print('Exists after close:', os.path.exists(f.name))
    #this does print in the output:
    #hmmscan --cpu 8 --noali --tblout >(tee 1.tab)  ../amoCAB /tmp/tmp4WuJSN > /dev/null

cmd="parallel -j 8 hmmscan "+"--cpu 1"+" --tblout >(tee {#}.tab) " + "../amoCAB {} ::: "+fnames+" > /dev/null"
os.system(cmd)
#command="parallel -j 8 hmmscan "+"--cpu 1"+" --tblout {}.tab " + "../amoCAB {} ::: "+fnames
print(cmd)
    #os.system(command)

# for (l,f) in files:
#     print(l)
#     print(f.name)
#     f.close()
#     print('Exists after close:', os.path.exists(f.name))

parallel -j 8 hmmscan --cpu 1 --tblout >(tee {#}.tab) ../amoCAB {} ::: /tmp/tmpDjX8B8 /tmp/tmp6fEZVA /tmp/tmprxg_Kg /tmp/tmpG2VDVP  > /dev/null
CPU times: user 928 µs, sys: 122 µs, total: 1.05 ms
Wall time: 1.63 ms


119

# Rewriting to use subprocess module

In [None]:
# x = subprocess.Popen(['touch', 'xyz'])
# >>> print x

In [150]:
%%time
import os
import subprocess
#hmmscan --tblout amocab_hmmscan_mros.tab  amoCAB 2517287028.genes.faa > /dev/null
#os.system("makeblastdb -in "+input_ref_0+" -dbtype nucl -title "+title_db+" -out "+outfile_db+" -parse_seqids")
fnames=""
flabels=""
for (l,f) in files:
    fnames+=f.name+" "
    flabels+=l+" "
#print flabels
    #print(l)
    #print(f.name)
    #parallel -j $1 blastall "-p blastn -d nt/nt -i "{}" -o "{.}".xml -e 1e-10 -m 7 -K 100 -b 200" ::: *.fa
    #print('Exists after close:', os.path.exists(f.name))
cmd="parallel -j 8 hmmscan "+" --noali --cpu 1"+" --tblout >0.tab  " + "../amoCAB {} ::: "+fnames+" > /dev/null"
p=subprocess.Popen(cmd, shell=True,stdout=subprocess.PIPE)
print(p.stdout.read())
#os.system(cmd)
#command="parallel -j 8 hmmscan "+"--cpu 1"+" --tblout {}.tab " + "../amoCAB {} ::: "+fnames
print(cmd)


parallel -j 8 hmmscan  --noali --cpu 1 --tblout >0.tab  ../amoCAB {} ::: /tmp/tmp7100qP /tmp/tmpnD7HAz /tmp/tmpoKVoBZ /tmp/tmpm1kpsd  > /dev/null
CPU times: user 1.03 ms, sys: 4.03 ms, total: 5.06 ms
Wall time: 93.7 ms


## os.system version for output redirection

In [152]:
files

[('group_1.fasta', <open file '<fdopen>', mode 'w+b' at 0x7f5ba498db70>),
 ('group_2.fasta', <open file '<fdopen>', mode 'w+b' at 0x7f5ba49a8150>),
 ('group_3.fasta', <open file '<fdopen>', mode 'w+b' at 0x7f5ba498d1e0>),
 ('group_4.fasta', <open file '<fdopen>', mode 'w+b' at 0x7f5ba7996420>)]

## Finally it works: --tblout >(tee >> 0.tab) to the rescue

In [36]:
%%time
import os
import subprocess
import shlex
#hmmscan --tblout amocab_hmmscan_mros.tab  amoCAB 2517287028.genes.faa > /dev/null
#os.system("makeblastdb -in "+input_ref_0+" -dbtype nucl -title "+title_db+" -out "+outfile_db+" -parse_seqids")
fnames=""
flabels=""
for (l,f) in files:
    fnames+=f.name+" "
    flabels+=l+" "
#print flabels
    #print(l)
    #print(f.name)
    #parallel -j $1 blastall "-p blastn -d nt/nt -i "{}" -o "{.}".xml -e 1e-10 -m 7 -K 100 -b 200" ::: *.fa
    #print('Exists after close:', os.path.exists(f.name))
#this prints nicely to stdout, and is caught by p.stdout.read()
#cmd="parallel -j 8 hmmscan "+"-o /dev/null --noali --cpu 1"+" --tblout >(tee /dev/stdout)  " + "../amoCAB {} ::: "+fnames
#this prints to the file and stdout
#0.tab should be deleted if exists
if os.path.exists("0.tab"):
    os.remove("0.tab")
cmd="parallel -j 8 hmmscan "+"-o /dev/null --noali --cpu 1"+" --tblout >(tee -a 0.tab)  " + "../amoCAB {} ::: "+fnames
args = shlex.split(cmd)
#os.system(cmd)
p=subprocess.Popen(args,stdout=subprocess.PIPE)
#print(p.stdout.read())
output=p.stdout.read()
#os.system(cmd)
#command="parallel -j 8 hmmscan "+"--cpu 1"+" --tblout {}.tab " + "../amoCAB {} ::: "+fnames
print(cmd)

parallel -j 8 hmmscan -o /dev/null --noali --cpu 1 --tblout >(tee -a 0.tab)  ../amoCAB {} ::: /tmp/tmpDjX8B8 /tmp/tmp6fEZVA /tmp/tmprxg_Kg /tmp/tmpG2VDVP 
CPU times: user 0 ns, sys: 4.02 ms, total: 4.02 ms
Wall time: 373 ms


In [39]:
len(files)

4

In [37]:
split_line="#                                                               --- full sequence ---- --- best 1 domain ---- --- domain number estimation ----"
chunks=output.split(split_line)

In [38]:
len(chunks)

5

In [42]:
total=[]
for chunk in chunks:
    total+=chunk.split("\n")[3:-11]

In [43]:
for i in total:
    print(i)

AmoC                 PF04896.11 2517404225           -           5.6e-111  357.3  11.5  6.6e-111  357.1  11.5   1.0   1   0   0   1   1   1   1 Ammonia monooxygenase/methane monooxygenase, subunit C
AmoC                 PF04896.11 2517404670           -            1.7e-30   93.8   0.0   2.2e-30   93.3   0.0   1.1   1   0   0   1   1   1   1 Ammonia monooxygenase/methane monooxygenase, subunit C
AmoC                 PF04896.11 2517404865           -            4.2e-93  298.8  17.7   5.1e-93  298.5  17.7   1.0   1   0   0   1   1   1   1 Ammonia monooxygenase/methane monooxygenase, subunit C
Monooxygenase_B      PF04744.11 2517404866           -           7.5e-155  503.0   0.0  8.4e-155  502.9   0.0   1.0   1   0   0   1   1   1   1 Monooxygenase subunit B protein
AMO                  PF02461.15 2517404867           -           6.5e-114  367.3  13.8  7.7e-114  367.0  13.8   1.0   1   0   0   1   1   1   1 Ammonia monooxygenase
AmoC                 PF04896.11 2517401456           -       

In [207]:
for i in chunks:
    print(i)



# target name        accession  query name           accession    E-value  score  bias   E-value  score  bias   exp reg clu  ov env dom rep inc description of target
#------------------- ---------- -------------------- ---------- --------- ------ ----- --------- ------ -----   --- --- --- --- --- --- --- --- ---------------------
AmoC                 PF04896.11 Ga0073928_1000040165 -            1.8e-05   11.8   0.5   2.9e-05   11.1   0.5   1.3   1   0   0   1   1   1   1 Ammonia monooxygenase/methane monooxygenase, subunit C
AmoC                 PF04896.11 Ga0073928_100004499  -            4.3e-06   13.8   0.2   4.6e-06   13.7   0.2   1.0   1   0   0   1   1   1   1 Ammonia monooxygenase/methane monooxygenase, subunit C
Monooxygenase_B      PF04744.11 Ga0073928_1000054854 -            9.1e-99  318.4   0.6   1.1e-98  318.1   0.6   1.0   1   0   0   1   1   1   1 Monooxygenase subunit B protein
AMO                  PF02461.15 Ga0073928_1000054856 -            1.2e-66  212.6  23.8   1.4

In [148]:
pipe = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

In [146]:
pipe.communicate()

('', '/bin/sh: 1: Syntax error: "(" unexpected\n')

In [199]:
#process = subprocess.Popen(['ls','-l'], stdout=subprocess.PIPE)
#print(process.stdout.read())

In [201]:
%%time
import os
#hmmscan --tblout amocab_hmmscan_mros.tab  amoCAB 2517287028.genes.faa > /dev/null
#os.system("makeblastdb -in "+input_ref_0+" -dbtype nucl -title "+title_db+" -out "+outfile_db+" -parse_seqids")
fnames=""
flabels=""
for (l,f) in files:
    fnames+=f.name+" "
    flabels+=l+" "
#print flabels
    #print(l)
    #print(f.name)
    #parallel -j $1 blastall "-p blastn -d nt/nt -i "{}" -o "{.}".xml -e 1e-10 -m 7 -K 100 -b 200" ::: *.fa
    #print('Exists after close:', os.path.exists(f.name))
cmd="parallel -j 8 hmmscan "+"--cpu 1"+" --tblout {#}.tab " + "../amoCAB {} ::: "+fnames+" > /dev/null"

#command="parallel -j 8 hmmscan "+"--cpu 1"+" --tblout {}.tab " + "../amoCAB {} ::: "+fnames
print(cmd)

parallel -j 8 hmmscan --cpu 1 --tblout {#}.tab ../amoCAB {} ::: /tmp/tmpFw8MBK /tmp/tmp_oy5yT /tmp/tmpM458tk /tmp/tmpVeuHbH  > /dev/null
CPU times: user 38 µs, sys: 6 µs, total: 44 µs
Wall time: 41 µs


In [153]:
c

NameError: name 'c' is not defined

In [187]:
cmd="parallel -j 8 hmmscan "+"--cpu 1"+" --tblout {#}.tab " + "../amoCAB {} ::: "+fnames+" > /dev/null"
#flabels
print(cmd)

parallel -j 8 hmmscan --cpu 1 --tblout {#}.tab ../amoCAB {} ::: /tmp/tmpHvDyKk /tmp/tmpaHwyEP /tmp/tmpeH4u_D /tmp/tmpzsOzgd  > /dev/null


In [81]:
filename

<open file '<fdopen>', mode 'w+b' at 0x7faa94788150>

In [58]:
/tmp/tmprOkRVb.close()

SyntaxError: invalid syntax (<ipython-input-58-02dd9a23c7c6>, line 1)

In [12]:
filename.name

'<fdopen>'

In [None]:
txt=filename.seek(0)
txt=filename.read()
#txt

In [27]:
filename

<open file '<fdopen>', mode 'w+b' at 0x000000000655A030>

In [24]:
txt2

''

In [22]:
#Coool

In [26]:
txt2=filename.read()

## In-memory files
[https://stackoverflow.com/questions/23028071/open-a-file-in-memory]

In [11]:
#import io##for python3
import StringIO

output = StringIO.StringIO()
output.write('First line.\n')
print >>output, 'Second line.'

# Retrieve file contents -- this will be
# 'First line.\nSecond line.\n'
contents = output.getvalue()
print(contents)
# Close object and discard memory buffer --
# .getvalue() will now raise an exception.
output.close()


First line.
Second line.



## Redoing generator with an in-memory implementation

In [15]:
import StringIO

In [21]:
%%time
from Bio import SeqIO
import StringIO
#output = io.StringIO()
record_iter = SeqIO.parse(open("../2517287028.genes.faa"),"fasta")
for i, batch in enumerate(batch_iterator(record_iter, 50000)):
    filename = "group_%i.fastq" % (i + 1)
    filename = StringIO.StringIO()
    
    count = SeqIO.write(batch, filename, "fasta")
    print("Wrote %i records to %s" % (count, filename))

Wrote 3891 records to <StringIO.StringIO instance at 0x7f5bc2c582d8>
CPU times: user 58.6 ms, sys: 128 µs, total: 58.7 ms
Wall time: 58.7 ms


In [24]:
dir(filename)

['__doc__',
 '__init__',
 '__iter__',
 '__module__',
 'buf',
 'buflist',
 'close',
 'closed',
 'flush',
 'getvalue',
 'isatty',
 'len',
 'next',
 'pos',
 'read',
 'readline',
 'readlines',
 'seek',
 'softspace',
 'tell',
 'truncate',
 'write',
 'writelines']

## memory map
[https://docs.python.org/2/library/mmap.html]

In [None]:
import mmap
from Bio import SeqIO
#import StringIO

record_iter = SeqIO.parse(open("../76969.assembled.faa"),"fasta")
for i, batch in enumerate(batch_iterator(record_iter, 50000)):
    filename = "group_%i.fastq" % (i + 1)
    with open(filename, "w") as handle:
        count = SeqIO.write(batch, handle, "fasta")
    print("Wrote %i records to %s" % (count, filename))