In [1]:
def batch_iterator(iterator, batch_size):
    """Returns lists of length batch_size.

    This can be used on any iterator, for example to batch up
    SeqRecord objects from Bio.SeqIO.parse(...), or to batch
    Alignment objects from Bio.AlignIO.parse(...), or simply
    lines from a file handle.

    This is a generator function, and it returns lists of the
    entries from the supplied iterator.  Each list will have
    batch_size entries, although the final list may be shorter.
    """
    entry = True  # Make sure we loop once
    while entry:
        batch = []
        while len(batch) < batch_size:
            try:
                entry = iterator.next()
            except StopIteration:
                entry = None
            if entry is None:
                # End of file
                break
            batch.append(entry)
        if batch:
            yield batch



In [9]:
%%time
from Bio import SeqIO

record_iter = SeqIO.parse(open("../2517287028.genes.faa"),"fasta")#m.rosea
for i, batch in enumerate(batch_iterator(record_iter, 1000)):
    filename = "group_%i.fasta" % (i + 1)
    with open(filename, "w") as handle:
        count = SeqIO.write(batch, handle, "fasta")
    print("Wrote %i records to %s" % (count, filename))



Wrote 500 records to group_1.fasta
Wrote 500 records to group_2.fasta
Wrote 500 records to group_3.fasta
Wrote 500 records to group_4.fasta
Wrote 500 records to group_5.fasta
Wrote 500 records to group_6.fasta
Wrote 500 records to group_7.fasta
Wrote 391 records to group_8.fasta
CPU times: user 93 ms, sys: 7.56 ms, total: 101 ms
Wall time: 109 ms


In [321]:
files[0][1].name

'/tmp/tmpSqwnnp'

In [326]:
%%time


with open('/tmp/tmpSqwnnp') as handle:
    #m.rosea
    record_dict = SeqIO.index("example.fasta", "fasta")
    print(record_dict["2517401456"])



    
    #     filename = "group_%i.fasta" % (i + 1)
#     with open(filename, "w") as handle:
#         count = SeqIO.write(batch, handle, "fasta")
#     print("Wrote %i records to %s" % (count, filename))

ID: 2517401456
Name: 2517401456
Description: 2517401456 A3OODRAFT_0323 methane monooxygenase/ammonia monooxygenase, subunit C [Methylocystis rosea SV97T]
Number of features: 0
Seq('MSVTTQSAAGAMDRAQPIVELKGMWIGLAVLNGFYLVVRIYEQIYGWRAGLDSF...SSC', SingleLetterAlphabet())
CPU times: user 4.34 ms, sys: 21 µs, total: 4.36 ms
Wall time: 4.31 ms


# Trying temporary files
[https://docs.python.org/3/library/tempfile.html]

In [3]:
import tempfile

# create a temporary file and write some data to it
fp = tempfile.TemporaryFile()
fp.write(b'Hello world!')
# read data from file
fp.seek(0)

txt=fp.read()
print(txt)
# close the file, it will be removed
fp.close()
#print(txt)
# # create a temporary file using a context manager
# with tempfile.TemporaryFile() as fp:
#     fp.write(b'Hello world!')
#     fp.seek(0)
#     fp.read()

# # file is now closed and removed

# # create a temporary directory using the context manager
# with tempfile.TemporaryDirectory() as tmpdirname:
#     print('created temporary directory', tmpdirname)

# directory and contents have been removed

Hello world!


In [None]:
import tempfile
from Bio import SeqIO

record_iter = SeqIO.parse(open("../76969.assembled.faa"),"fasta")
for i, batch in enumerate(batch_iterator(record_iter, 30000)):
    
    filename = "group_%i.fasta" % (i + 1)
    print(filename)
    filename = tempfile.TemporaryFile()
    filename.seek(0)
    count = SeqIO.write(batch, filename, "fasta")
#     with open(filename, "w") as handle:
#         count = SeqIO.write(batch, handle, "fasta")
    print("Wrote %i records to %s" % (count, filename))

## Named temporary files

[https://docs.python.org/2/library/tempfile.html]

In [303]:
files

[(0, <closed file '<fdopen>', mode 'w+b' at 0x7fbba162bf60>),
 (1, <closed file '<fdopen>', mode 'w+b' at 0x7fbba162be40>),
 (2, <closed file '<fdopen>', mode 'w+b' at 0x7fbba0baee40>),
 (3, <closed file '<fdopen>', mode 'w+b' at 0x7fbba0baef60>)]

In [304]:
import tempfile
from Bio import SeqIO

files=list()
sizes={}
#input_fasta="../76969.assembled.faa"
input_fasta="../2517287028.genes.faa"#m.rosea
record_iter = SeqIO.parse(open(input_fasta),"fasta")
for i, batch in enumerate(batch_iterator(record_iter, 1000)):
    
    label = i
    #f = tempfile.NamedTemporaryFile(delete=False)#exists on closing
    f = tempfile.NamedTemporaryFile()#deleted after f.close()
    files.append((label,f))
    #f.seek(0)
    count = SeqIO.write(batch, f, "fasta")
    f.flush() #this solves the EOF problem
#     with open(filename, "w") as handle:
#         count = SeqIO.write(batch, handle, "fasta")
    print("Wrote %i records to %s" % (count, f.name))
    sizes[f.name]=count

Wrote 1000 records to /tmp/tmpSqwnnp
Wrote 1000 records to /tmp/tmpd_zEcV
Wrote 1000 records to /tmp/tmpNdc_YO
Wrote 891 records to /tmp/tmpO4S3j2


In [287]:
for (l,f) in files:
    print(l)
    print(f.name)
    f.close()

0
/tmp/tmpwib0Ns
1
/tmp/tmpIrHmsO
2
/tmp/tmpNS0g6z
3
/tmp/tmpnelKVM


In [288]:
sizes

{'/tmp/tmpIrHmsO': 1000,
 '/tmp/tmpNS0g6z': 1000,
 '/tmp/tmpnelKVM': 891,
 '/tmp/tmpwib0Ns': 1000}

In [30]:
fna

NameError: name 'fna' is not defined

## hmmering tempfiles

In [305]:
files

[(0, <open file '<fdopen>', mode 'w+b' at 0x7fbba162bc90>),
 (1, <open file '<fdopen>', mode 'w+b' at 0x7fbba14111e0>),
 (2, <open file '<fdopen>', mode 'w+b' at 0x7fbba0bfbed0>),
 (3, <open file '<fdopen>', mode 'w+b' at 0x7fbb792be930>)]

In [63]:
%%time
import os
#hmmscan --tblout amocab_hmmscan_mros.tab  amoCAB 2517287028.genes.faa > /dev/null
#os.system("makeblastdb -in "+input_ref_0+" -dbtype nucl -title "+title_db+" -out "+outfile_db+" -parse_seqids")
for (l,f) in files:
    #print(l)
    #print(f.name)
    
    #print('Exists after close:', os.path.exists(f.name))
    command="hmmscan --tblout "+l+".tab " + "../amoCAB " + f.name+""
    print(command)
    os.system(command)
    

for (l,f) in files:
    print(l)
    print(f.name)
    f.close()
    print('Exists after close:', os.path.exists(f.name))

hmmscan --tblout group_1.fasta.tab ../amoCAB /tmp/tmp4WuJSN
hmmscan --tblout group_2.fasta.tab ../amoCAB /tmp/tmpCBMFY1
hmmscan --tblout group_3.fasta.tab ../amoCAB /tmp/tmp9ABMDY
hmmscan --tblout group_4.fasta.tab ../amoCAB /tmp/tmp4g8aDl
group_1.fasta
/tmp/tmp4WuJSN
('Exists after close:', False)
group_2.fasta
/tmp/tmpCBMFY1
('Exists after close:', False)
group_3.fasta
/tmp/tmp9ABMDY
('Exists after close:', False)
group_4.fasta
/tmp/tmp4g8aDl
('Exists after close:', False)
CPU times: user 4.65 ms, sys: 4.08 ms, total: 8.74 ms
Wall time: 24.7 ms


In [59]:
%%time
import os
#hmmscan --tblout amocab_hmmscan_mros.tab  amoCAB 2517287028.genes.faa > /dev/null
#os.system("makeblastdb -in "+input_ref_0+" -dbtype nucl -title "+title_db+" -out "+outfile_db+" -parse_seqids")
for (l,f) in files:
    #print(l)
    #print(f.name)
    
    #print('Exists after close:', os.path.exists(f.name))
    #
    command="hmmscan "+"--cpu 8 --noali"+" --tblout "+l+".tab " + "../amoCAB " + f.name
    print(command)
    os.system(command)

for (l,f) in files:
    print(l)
    print(f.name)
    f.close()
    #print('Exists after close:', os.path.exists(f.name))

hmmscan --cpu 8 --noali --tblout group_1.fasta.tab ../amoCAB /tmp/tmpM_ZFS9
hmmscan --cpu 8 --noali --tblout group_2.fasta.tab ../amoCAB /tmp/tmp6sxP79
hmmscan --cpu 8 --noali --tblout group_3.fasta.tab ../amoCAB /tmp/tmpCgusGR
hmmscan --cpu 8 --noali --tblout group_4.fasta.tab ../amoCAB /tmp/tmpVnIz1M
group_1.fasta
/tmp/tmpM_ZFS9
('Exists after close:', False)
group_2.fasta
/tmp/tmp6sxP79
('Exists after close:', False)
group_3.fasta
/tmp/tmpCgusGR
('Exists after close:', False)
group_4.fasta
/tmp/tmpVnIz1M
('Exists after close:', False)
CPU times: user 646 µs, sys: 8.05 ms, total: 8.7 ms
Wall time: 12.5 s


## Subprocess version

In [45]:
%%time
import os
import subprocess
#hmmscan --tblout amocab_hmmscan_mros.tab  amoCAB 2517287028.genes.faa > /dev/null
#os.system("makeblastdb -in "+input_ref_0+" -dbtype nucl -title "+title_db+" -out "+outfile_db+" -parse_seqids")
for (l,f) in files:
    #print(l)
    #print(f.name)
    
    #print('Exists after close:', os.path.exists(f.name))
    command="hmmscan "+"--cpu 8"+" --tblout "+l+".tab " + "../amoCAB " + f.name
    print(command)
    subprocess.Popen(command, shell=True)

for (l,f) in files:
    print(l)
    print(f.name)
    f.close()
    print('Exists after close:', os.path.exists(f.name))

hmmscan --cpu 8 --tblout group_1.fasta.tab ../amoCAB /tmp/tmpWxz75u
hmmscan --cpu 8 --tblout group_2.fasta.tab ../amoCAB /tmp/tmp_a05vQ
hmmscan --cpu 8 --tblout group_3.fasta.tab ../amoCAB /tmp/tmp0tsj4Z
hmmscan --cpu 8 --tblout group_4.fasta.tab ../amoCAB /tmp/tmpZO5t_L
group_1.fasta
/tmp/tmpWxz75u
('Exists after close:', False)
group_2.fasta
/tmp/tmp_a05vQ
('Exists after close:', False)
group_3.fasta
/tmp/tmp0tsj4Z
('Exists after close:', False)
group_4.fasta
/tmp/tmpZO5t_L
('Exists after close:', False)
CPU times: user 6.91 ms, sys: 4.17 ms, total: 11.1 ms
Wall time: 18.5 ms


In [757]:
filename

'../2517287028.genes.faa'

## Subprocess verision of entire file, with no splitting

In [786]:
%%time
from Bio import SeqIO
import os
import subprocess
import shlex

#filename="../76969.assembled.faa"
filename="../2517287028.genes.faa"#m.rosea
#fout=entire_mg1_2.tab
fout="entire_mros.tab"
#fh = SeqIO.parse(open(filename),"fasta")#m.rosea
# with open(filename, "r") as handle:
#     count = SeqIO.write(batch, handle, "fasta")
# print("Wrote %i records to %s" % (count, filename))




#hmmscan --tblout amocab_hmmscan_mros.tab  amoCAB 2517287028.genes.faa > /dev/null
#os.system("makeblastdb -in "+input_ref_0+" -dbtype nucl -title "+title_db+" -out "+outfile_db+" -parse_seqids")

    #print('Exists after close:', os.path.exists(f.name))
# command="hmmscan "+"--cpu 8"+" -o /dev/null --tblout "+"entire_mg1"+".tab " + "../amoCAB " + filename
# print(command)
# subprocess.Popen(command, shell=True)

cmd="hmmscan --cpu 8 -o /dev/null --tblout "+fout+" ../amoCAB "+filename
print(cmd)
p=subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
# print(cmd)
# args = shlex.split(cmd)
#os.system(cmd)
#os.system(cmd)
#print(p.stdout.read())
#output=p.stdout.read()
#os.system(cmd)
#command="parallel -j 8 hmmscan "+"--cpu 1"+" --tblout {}.tab " + "../amoCAB {} ::: "+fnames

#p=subprocess.Popen(args,stdout=subprocess.PIPE)
#print(p.stdout.read())
output=p.stdout.read()

hmmscan --cpu 8 -o /dev/null --tblout entire_mros.tab ../amoCAB ../2517287028.genes.faa
CPU times: user 2.53 ms, sys: 429 µs, total: 2.96 ms
Wall time: 1.48 s


In [787]:
with open(fout, "r") as handle:
    mros_entire=handle.read()


In [788]:
mros_entire=mros_entire.split('\n')
for i in mros_entire:
    print(i)

#                                                               --- full sequence ---- --- best 1 domain ---- --- domain number estimation ----
# target name        accession  query name           accession    E-value  score  bias   E-value  score  bias   exp reg clu  ov env dom rep inc description of target
#------------------- ---------- -------------------- ---------- --------- ------ ----- --------- ------ -----   --- --- --- --- --- --- --- --- ---------------------
AmoC                 PF04896.11 2517401456           -           5.6e-120  386.8  15.8  7.2e-120  386.4  15.8   1.0   1   0   0   1   1   1   1 Ammonia monooxygenase/methane monooxygenase, subunit C
AmoC                 PF04896.11 2517402628           -           5.8e-122  393.3  14.7    7e-122  393.0  14.7   1.0   1   0   0   1   1   1   1 Ammonia monooxygenase/methane monooxygenase, subunit C
AMO                  PF02461.15 2517402629           -           4.8e-124  400.4  24.0  5.4e-124  400.3  24.0   1.0   1   0   

In [789]:
header=mros_entire[1:2]
header[0].split()

['#',
 'target',
 'name',
 'accession',
 'query',
 'name',
 'accession',
 'E-value',
 'score',
 'bias',
 'E-value',
 'score',
 'bias',
 'exp',
 'reg',
 'clu',
 'ov',
 'env',
 'dom',
 'rep',
 'inc',
 'description',
 'of',
 'target']

In [790]:
len(header[0].split())

24

In [791]:
header

['# target name        accession  query name           accession    E-value  score  bias   E-value  score  bias   exp reg clu  ov env dom rep inc description of target']

In [792]:
mros_entire[3:-11]

['AmoC                 PF04896.11 2517401456           -           5.6e-120  386.8  15.8  7.2e-120  386.4  15.8   1.0   1   0   0   1   1   1   1 Ammonia monooxygenase/methane monooxygenase, subunit C',
 'AmoC                 PF04896.11 2517402628           -           5.8e-122  393.3  14.7    7e-122  393.0  14.7   1.0   1   0   0   1   1   1   1 Ammonia monooxygenase/methane monooxygenase, subunit C',
 'AMO                  PF02461.15 2517402629           -           4.8e-124  400.4  24.0  5.4e-124  400.3  24.0   1.0   1   0   0   1   1   1   1 Ammonia monooxygenase',
 'Monooxygenase_B      PF04744.11 2517402630           -           3.3e-170  553.6   0.0  3.7e-170  553.4   0.0   1.0   1   0   0   1   1   1   1 Monooxygenase subunit B protein',
 'AmoC                 PF04896.11 2517403201           -           5.8e-122  393.3  14.7    7e-122  393.0  14.7   1.0   1   0   0   1   1   1   1 Ammonia monooxygenase/methane monooxygenase, subunit C',
 'Monooxygenase_B      PF04744.11 2517403

In [793]:
mros_entire[4].split()[0]

'AmoC'

# Creating a dataframe

In [794]:
#d = df[[p, p.team, p.passing_att, p.passer_rating()] for p in game.players.passing()]

In [795]:
import pandas as pd

In [796]:
[p[5] for p=line.split() in mros_entire[3:-11]]

SyntaxError: invalid syntax (<ipython-input-796-c2fdd2690845>, line 1)

In [797]:
#[(x[1],x[2]) for x in (x.split(";") for x in a.split("\n")) if x[1] != 5]

In [798]:
[[x[0], x[1]] for x in mros_entire[3:-11] for x in (x.split())]

IndexError: string index out of range

In [799]:
a = "1;2;4\n3;4;5"
[(x[1],x[2]) for x in (x.split(";") for x in a.split("\n")) if x[1] != 5]

[('2', '4'), ('4', '5')]

In [800]:
df=pd.DataFrame([x.split() for x in mros_entire[3:-11]])
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,AmoC,PF04896.11,2517401456,-,5.6e-120,386.8,15.8,7.199999999999999e-120,386.4,15.8,...,0,1,1,1,1,Ammonia,monooxygenase/methane,"monooxygenase,",subunit,C
1,AmoC,PF04896.11,2517402628,-,5.8e-122,393.3,14.7,7e-122,393.0,14.7,...,0,1,1,1,1,Ammonia,monooxygenase/methane,"monooxygenase,",subunit,C
2,AMO,PF02461.15,2517402629,-,4.800000000000001e-124,400.4,24.0,5.4e-124,400.3,24.0,...,0,1,1,1,1,Ammonia,monooxygenase,,,
3,Monooxygenase_B,PF04744.11,2517402630,-,3.3e-170,553.6,0.0,3.7e-170,553.4,0.0,...,0,1,1,1,1,Monooxygenase,subunit,B,protein,
4,AmoC,PF04896.11,2517403201,-,5.8e-122,393.3,14.7,7e-122,393.0,14.7,...,0,1,1,1,1,Ammonia,monooxygenase/methane,"monooxygenase,",subunit,C
5,Monooxygenase_B,PF04744.11,2517403996,-,3.3e-170,553.6,0.0,3.7e-170,553.4,0.0,...,0,1,1,1,1,Monooxygenase,subunit,B,protein,
6,AMO,PF02461.15,2517403997,-,4.800000000000001e-124,400.4,24.0,5.4e-124,400.3,24.0,...,0,1,1,1,1,Ammonia,monooxygenase,,,
7,AmoC,PF04896.11,2517403998,-,5.8e-122,393.3,14.7,7e-122,393.0,14.7,...,0,1,1,1,1,Ammonia,monooxygenase/methane,"monooxygenase,",subunit,C
8,AmoC,PF04896.11,2517404225,-,5.600000000000001e-111,357.3,11.5,6.6000000000000005e-111,357.1,11.5,...,0,1,1,1,1,Ammonia,monooxygenase/methane,"monooxygenase,",subunit,C
9,AmoC,PF04896.11,2517404670,-,1.7e-30,93.8,0.0,2.2000000000000002e-30,93.3,0.0,...,0,1,1,1,1,Ammonia,monooxygenase/methane,"monooxygenase,",subunit,C


In [801]:
def joinDescriptionColumns(descr_columns):
    merged=''
    for row in descr_columns:
        if row != None:
            merged+=row+' '
    #print(descr_columns)
    return merged.strip()

In [802]:
#df['period'] = df[['Year', 'quarter']].apply(lambda x: ''.join(x), axis=1)
df[18]=df.loc[:,18:].apply(joinDescriptionColumns, axis=1)
df=df.loc[:,:18]
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,AmoC,PF04896.11,2517401456,-,5.6e-120,386.8,15.8,7.199999999999999e-120,386.4,15.8,1.0,1,0,0,1,1,1,1,"Ammonia monooxygenase/methane monooxygenase, s..."
1,AmoC,PF04896.11,2517402628,-,5.8e-122,393.3,14.7,7e-122,393.0,14.7,1.0,1,0,0,1,1,1,1,"Ammonia monooxygenase/methane monooxygenase, s..."
2,AMO,PF02461.15,2517402629,-,4.800000000000001e-124,400.4,24.0,5.4e-124,400.3,24.0,1.0,1,0,0,1,1,1,1,Ammonia monooxygenase
3,Monooxygenase_B,PF04744.11,2517402630,-,3.3e-170,553.6,0.0,3.7e-170,553.4,0.0,1.0,1,0,0,1,1,1,1,Monooxygenase subunit B protein
4,AmoC,PF04896.11,2517403201,-,5.8e-122,393.3,14.7,7e-122,393.0,14.7,1.0,1,0,0,1,1,1,1,"Ammonia monooxygenase/methane monooxygenase, s..."
5,Monooxygenase_B,PF04744.11,2517403996,-,3.3e-170,553.6,0.0,3.7e-170,553.4,0.0,1.0,1,0,0,1,1,1,1,Monooxygenase subunit B protein
6,AMO,PF02461.15,2517403997,-,4.800000000000001e-124,400.4,24.0,5.4e-124,400.3,24.0,1.0,1,0,0,1,1,1,1,Ammonia monooxygenase
7,AmoC,PF04896.11,2517403998,-,5.8e-122,393.3,14.7,7e-122,393.0,14.7,1.0,1,0,0,1,1,1,1,"Ammonia monooxygenase/methane monooxygenase, s..."
8,AmoC,PF04896.11,2517404225,-,5.600000000000001e-111,357.3,11.5,6.6000000000000005e-111,357.1,11.5,1.0,1,0,0,1,1,1,1,"Ammonia monooxygenase/methane monooxygenase, s..."
9,AmoC,PF04896.11,2517404670,-,1.7e-30,93.8,0.0,2.2000000000000002e-30,93.3,0.0,1.1,1,0,0,1,1,1,1,"Ammonia monooxygenase/methane monooxygenase, s..."


In [259]:
%%time
import os
import subprocess
import shlex
#hmmscan --tblout amocab_hmmscan_mros.tab  amoCAB 2517287028.genes.faa > /dev/null
#os.system("makeblastdb -in "+input_ref_0+" -dbtype nucl -title "+title_db+" -out "+outfile_db+" -parse_seqids")
fnames=""
flabels=""
for (l,f) in files:
    fnames+=f.name+" "
    flabels+=l+" "
#print flabels
    #print(l)
    #print(f.name)
    #parallel -j $1 blastall "-p blastn -d nt/nt -i "{}" -o "{.}".xml -e 1e-10 -m 7 -K 100 -b 200" ::: *.fa
    #print('Exists after close:', os.path.exists(f.name))
#this prints nicely to stdout, and is caught by p.stdout.read()
#cmd="parallel -j 8 hmmscan "+"-o /dev/null --noali --cpu 1"+" --tblout >(tee /dev/stdout)  " + "../amoCAB {} ::: "+fnames
#this prints to the file and stdout
#0.tab should be deleted if exists
if os.path.exists("0.tab"):
    os.remove("0.tab")
cmd="parallel -j 8 hmmscan "+"-o /dev/null --noali --cpu 1"+" --tblout >(tee -a 0.tab)  " + "../amoCAB {} ::: "+fnames
args = shlex.split(cmd)
#os.system(cmd)
p=subprocess.Popen(args,stdout=subprocess.PIPE)
#print(p.stdout.read())
output=p.stdout.read()
#os.system(cmd)
#command="parallel -j 8 hmmscan "+"--cpu 1"+" --tblout {}.tab " + "../amoCAB {} ::: "+fnames
print(cmd)

parallel -j 8 hmmscan -o /dev/null --noali --cpu 1 --tblout >(tee -a 0.tab)  ../amoCAB {} ::: /tmp/tmpsNB_xI /tmp/tmp9XW3yx /tmp/tmpyMn37J /tmp/tmp4h4Gou 
CPU times: user 2.3 ms, sys: 331 µs, total: 2.63 ms
Wall time: 356 ms


## Trying GNU parallel

In [260]:
files

[('group_1.fasta', <open file '<fdopen>', mode 'w+b' at 0x7fbba15f5ed0>),
 ('group_2.fasta', <open file '<fdopen>', mode 'w+b' at 0x7fbb79bbcd20>),
 ('group_3.fasta', <open file '<fdopen>', mode 'w+b' at 0x7fbba15f5f60>),
 ('group_4.fasta', <open file '<fdopen>', mode 'w+b' at 0x7fbb792be030>)]

In [8]:
#this works
#parallel -j 8 hmmscan --cpu 1 --tblout {.}.tab ../amoCAB {} ::: *.fasta > /dev/null


In [35]:
%%time
import os
#hmmscan --tblout amocab_hmmscan_mros.tab  amoCAB 2517287028.genes.faa > /dev/null
#os.system("makeblastdb -in "+input_ref_0+" -dbtype nucl -title "+title_db+" -out "+outfile_db+" -parse_seqids")
fnames=""
flabels=""
for (l,f) in files:
    fnames+=f.name+" "
    flabels+=l+" "
#print flabels
    #print(l)
    #print(f.name)
    #parallel -j $1 blastall "-p blastn -d nt/nt -i "{}" -o "{.}".xml -e 1e-10 -m 7 -K 100 -b 200" ::: *.fa
    #print('Exists after close:', os.path.exists(f.name))
    #this does print in the output:
    #hmmscan --cpu 8 --noali --tblout >(tee 1.tab)  ../amoCAB /tmp/tmp4WuJSN > /dev/null

cmd="parallel -j 8 hmmscan "+"--cpu 1"+" --tblout >(tee {#}.tab) " + "../amoCAB {} ::: "+fnames+" > /dev/null"
os.system(cmd)
#command="parallel -j 8 hmmscan "+"--cpu 1"+" --tblout {}.tab " + "../amoCAB {} ::: "+fnames
print(cmd)
    #os.system(command)

# for (l,f) in files:
#     print(l)
#     print(f.name)
#     f.close()
#     print('Exists after close:', os.path.exists(f.name))

parallel -j 8 hmmscan --cpu 1 --tblout >(tee {#}.tab) ../amoCAB {} ::: /tmp/tmpDjX8B8 /tmp/tmp6fEZVA /tmp/tmprxg_Kg /tmp/tmpG2VDVP  > /dev/null
CPU times: user 928 µs, sys: 122 µs, total: 1.05 ms
Wall time: 1.63 ms


119

# Rewriting to use subprocess module

In [None]:
# x = subprocess.Popen(['touch', 'xyz'])
# >>> print x

In [150]:
%%time
import os
import subprocess
#hmmscan --tblout amocab_hmmscan_mros.tab  amoCAB 2517287028.genes.faa > /dev/null
#os.system("makeblastdb -in "+input_ref_0+" -dbtype nucl -title "+title_db+" -out "+outfile_db+" -parse_seqids")
fnames=""
flabels=""
for (l,f) in files:
    fnames+=f.name+" "
    flabels+=l+" "
#print flabels
    #print(l)
    #print(f.name)
    #parallel -j $1 blastall "-p blastn -d nt/nt -i "{}" -o "{.}".xml -e 1e-10 -m 7 -K 100 -b 200" ::: *.fa
    #print('Exists after close:', os.path.exists(f.name))
cmd="parallel -j 8 hmmscan "+" --noali --cpu 1"+" --tblout >0.tab  " + "../amoCAB {} ::: "+fnames+" > /dev/null"
p=subprocess.Popen(cmd, shell=True,stdout=subprocess.PIPE)
print(p.stdout.read())
#os.system(cmd)
#command="parallel -j 8 hmmscan "+"--cpu 1"+" --tblout {}.tab " + "../amoCAB {} ::: "+fnames
print(cmd)


parallel -j 8 hmmscan  --noali --cpu 1 --tblout >0.tab  ../amoCAB {} ::: /tmp/tmp7100qP /tmp/tmpnD7HAz /tmp/tmpoKVoBZ /tmp/tmpm1kpsd  > /dev/null
CPU times: user 1.03 ms, sys: 4.03 ms, total: 5.06 ms
Wall time: 93.7 ms


## os.system version for output redirection

In [21]:
files

[('group_1.fasta', <open file '<fdopen>', mode 'w+b' at 0x7fbba15f5e40>),
 ('group_2.fasta', <open file '<fdopen>', mode 'w+b' at 0x7fbbafb60150>),
 ('group_3.fasta', <open file '<fdopen>', mode 'w+b' at 0x7fbba15f5c90>),
 ('group_4.fasta', <open file '<fdopen>', mode 'w+b' at 0x7fbbafb600c0>)]

In [24]:
fnames

NameError: name 'fnames' is not defined

## Finally it works: --tblout >(tee >> 0.tab) to the rescue

In [306]:
files

[(0, <open file '<fdopen>', mode 'w+b' at 0x7fbba162bc90>),
 (1, <open file '<fdopen>', mode 'w+b' at 0x7fbba14111e0>),
 (2, <open file '<fdopen>', mode 'w+b' at 0x7fbba0bfbed0>),
 (3, <open file '<fdopen>', mode 'w+b' at 0x7fbb792be930>)]

In [307]:
files2={j.name:i for i,j in files}
files2

{'/tmp/tmpNdc_YO': 2,
 '/tmp/tmpO4S3j2': 3,
 '/tmp/tmpSqwnnp': 0,
 '/tmp/tmpd_zEcV': 1}

In [308]:
%%time
import os
import subprocess
import shlex
#hmmscan --tblout amocab_hmmscan_mros.tab  amoCAB 2517287028.genes.faa > /dev/null
#os.system("makeblastdb -in "+input_ref_0+" -dbtype nucl -title "+title_db+" -out "+outfile_db+" -parse_seqids")
fnames=""
flabels=""
for (l,f) in files:
    fnames+=f.name+" "
    flabels+=str(l)+" "
#print flabels
    #print(l)
    #print(f.name)
    #parallel -j $1 blastall "-p blastn -d nt/nt -i "{}" -o "{.}".xml -e 1e-10 -m 7 -K 100 -b 200" ::: *.fa
    #print('Exists after close:', os.path.exists(f.name))
#this prints nicely to stdout, and is caught by p.stdout.read()
#cmd="parallel -j 8 hmmscan "+"-o /dev/null --noali --cpu 1"+" --tblout >(tee /dev/stdout)  " + "../amoCAB {} ::: "+fnames
#this prints to the file and stdout
#0.tab should be deleted if exists
if os.path.exists("0.tab"):
    os.remove("0.tab")
cmd="parallel -j 8 hmmscan "+"-o /dev/null --noali --cpu 1"+" --tblout >(tee -a 0.tab)  " + "../amoCAB {} ::: "+fnames
args = shlex.split(cmd)
#os.system(cmd)
p=subprocess.Popen(args,stdout=subprocess.PIPE)
#print(p.stdout.read())
output=p.stdout.read()
#os.system(cmd)
#command="parallel -j 8 hmmscan "+"--cpu 1"+" --tblout {}.tab " + "../amoCAB {} ::: "+fnames
print(cmd)

parallel -j 8 hmmscan -o /dev/null --noali --cpu 1 --tblout >(tee -a 0.tab)  ../amoCAB {} ::: /tmp/tmpSqwnnp /tmp/tmpd_zEcV /tmp/tmpNdc_YO /tmp/tmpO4S3j2 
CPU times: user 0 ns, sys: 7.66 ms, total: 7.66 ms
Wall time: 361 ms


In [309]:
len(files)

4

In [310]:
split_line="#                                                               --- full sequence ---- --- best 1 domain ---- --- domain number estimation ----"
chunks=output.split(split_line)

In [311]:
chunks

['',
 '\n# target name        accession  query name           accession    E-value  score  bias   E-value  score  bias   exp reg clu  ov env dom rep inc description of target\n#------------------- ---------- -------------------- ---------- --------- ------ ----- --------- ------ -----   --- --- --- --- --- --- --- --- ---------------------\nAmoC                 PF04896.11 2517404225           -           5.6e-111  357.3  11.5  6.6e-111  357.1  11.5   1.0   1   0   0   1   1   1   1 Ammonia monooxygenase/methane monooxygenase, subunit C\nAmoC                 PF04896.11 2517404670           -            1.7e-30   93.8   0.0   2.2e-30   93.3   0.0   1.1   1   0   0   1   1   1   1 Ammonia monooxygenase/methane monooxygenase, subunit C\nAmoC                 PF04896.11 2517404865           -            4.2e-93  298.8  17.7   5.1e-93  298.5  17.7   1.0   1   0   0   1   1   1   1 Ammonia monooxygenase/methane monooxygenase, subunit C\nMonooxygenase_B      PF04744.11 2517404866           -   

In [300]:
chunks[2]

IndexError: list index out of range

In [111]:
import re
re.findall('# Query file:\s*(.*)', chunks[0])

[]

In [108]:
chunks

['',
 '\n# target name        accession  query name           accession    E-value  score  bias   E-value  score  bias   exp reg clu  ov env dom rep inc description of target\n#------------------- ---------- -------------------- ---------- --------- ------ ----- --------- ------ -----   --- --- --- --- --- --- --- --- ---------------------\nAmoC                 PF04896.11 2517404225           -           5.6e-111  357.3  11.5  6.6e-111  357.1  11.5   1.0   1   0   0   1   1   1   1 Ammonia monooxygenase/methane monooxygenase, subunit C\nAmoC                 PF04896.11 2517404670           -            1.7e-30   93.8   0.0   2.2e-30   93.3   0.0   1.1   1   0   0   1   1   1   1 Ammonia monooxygenase/methane monooxygenase, subunit C\nAmoC                 PF04896.11 2517404865           -            4.2e-93  298.8  17.7   5.1e-93  298.5  17.7   1.0   1   0   0   1   1   1   1 Ammonia monooxygenase/methane monooxygenase, subunit C\nMonooxygenase_B      PF04744.11 2517404866           -   

In [92]:
mros_entire

['#                                                               --- full sequence ---- --- best 1 domain ---- --- domain number estimation ----',
 '# target name        accession  query name           accession    E-value  score  bias   E-value  score  bias   exp reg clu  ov env dom rep inc description of target',
 '#------------------- ---------- -------------------- ---------- --------- ------ ----- --------- ------ -----   --- --- --- --- --- --- --- --- ---------------------',
 'AmoC                 PF04896.11 2517401456           -           5.6e-120  386.8  15.8  7.2e-120  386.4  15.8   1.0   1   0   0   1   1   1   1 Ammonia monooxygenase/methane monooxygenase, subunit C',
 'AmoC                 PF04896.11 2517402628           -           5.8e-122  393.3  14.7    7e-122  393.0  14.7   1.0   1   0   0   1   1   1   1 Ammonia monooxygenase/methane monooxygenase, subunit C',
 'AMO                  PF02461.15 2517402629           -           4.8e-124  400.4  24.0  5.4e-124  400.3 

In [94]:
for chunk in chunks:
    print(chunk)



# target name        accession  query name           accession    E-value  score  bias   E-value  score  bias   exp reg clu  ov env dom rep inc description of target
#------------------- ---------- -------------------- ---------- --------- ------ ----- --------- ------ -----   --- --- --- --- --- --- --- --- ---------------------
AmoC                 PF04896.11 2517404225           -           5.6e-111  357.3  11.5  6.6e-111  357.1  11.5   1.0   1   0   0   1   1   1   1 Ammonia monooxygenase/methane monooxygenase, subunit C
AmoC                 PF04896.11 2517404670           -            1.7e-30   93.8   0.0   2.2e-30   93.3   0.0   1.1   1   0   0   1   1   1   1 Ammonia monooxygenase/methane monooxygenase, subunit C
AmoC                 PF04896.11 2517404865           -            4.2e-93  298.8  17.7   5.1e-93  298.5  17.7   1.0   1   0   0   1   1   1   1 Ammonia monooxygenase/methane monooxygenase, subunit C
Monooxygenase_B      PF04744.11 2517404866           -           7.5e

In [None]:
# df=pd.DataFrame([x.split() for x in mros_entire[3:-11]])

# #df['period'] = df[['Year', 'quarter']].apply(lambda x: ''.join(x), axis=1)
# df[18]=df.loc[:,18:].apply(joinDescriptionColumns, axis=1)
# df=df.loc[:,:18]
# df

In [434]:
total=[]
df_scramb=pd.DataFrame()
for chunk in chunks:
    if chunk != '':
        #print((chunk))
        data=chunk.split('\n')[3:-11]
        #print(data)
        d=pd.DataFrame([x.split() for x in data])
        
        query=re.findall('# Query file:\s*(.*)', chunk)[0]
        label=files2[query]
        #print(query)
        #d['query file']=query
        #df.insert(loc=idx, column='A', value=new_col)
        d.insert(loc=0, column='query file', value=query)
        d.insert(loc=1, column='label', value=label)
        #d['label']=files2[query]
        df_scramb=df_scramb.append(d)
        #total+=chunk.split("\n")[3:-11]
df_scramb.set_index(['query file', 'label'], inplace=True)

#df_scramb[18]

df_scramb.iloc[:,18]=df_scramb.iloc[:,18:].apply(joinDescriptionColumns, axis=1)
df_scramb=df_scramb.iloc[:,:19]
df_scramb.reset_index(inplace=True)
df_scramb.sort_values(by='label', inplace=True)

In [435]:
df2=df_scramb.iloc[:,2:]
df2.reset_index(inplace=True, drop=True)
df2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,AmoC,PF04896.11,2517401456,-,5.6e-120,386.8,15.8,7.199999999999999e-120,386.4,15.8,1.0,1,0,0,1,1,1,1,"Ammonia monooxygenase/methane monooxygenase, s..."
1,AmoC,PF04896.11,2517402628,-,5.8e-122,393.3,14.7,7e-122,393.0,14.7,1.0,1,0,0,1,1,1,1,"Ammonia monooxygenase/methane monooxygenase, s..."
2,AMO,PF02461.15,2517402629,-,4.800000000000001e-124,400.4,24.0,5.4e-124,400.3,24.0,1.0,1,0,0,1,1,1,1,Ammonia monooxygenase
3,Monooxygenase_B,PF04744.11,2517402630,-,3.3e-170,553.6,0.0,3.7e-170,553.4,0.0,1.0,1,0,0,1,1,1,1,Monooxygenase subunit B protein
4,AmoC,PF04896.11,2517403201,-,5.8e-122,393.3,14.7,7e-122,393.0,14.7,1.0,1,0,0,1,1,1,1,"Ammonia monooxygenase/methane monooxygenase, s..."
5,Monooxygenase_B,PF04744.11,2517403996,-,3.3e-170,553.6,0.0,3.7e-170,553.4,0.0,1.0,1,0,0,1,1,1,1,Monooxygenase subunit B protein
6,AMO,PF02461.15,2517403997,-,4.800000000000001e-124,400.4,24.0,5.4e-124,400.3,24.0,1.0,1,0,0,1,1,1,1,Ammonia monooxygenase
7,AmoC,PF04896.11,2517403998,-,5.8e-122,393.3,14.7,7e-122,393.0,14.7,1.0,1,0,0,1,1,1,1,"Ammonia monooxygenase/methane monooxygenase, s..."
8,AmoC,PF04896.11,2517404225,-,5.600000000000001e-111,357.3,11.5,6.6000000000000005e-111,357.1,11.5,1.0,1,0,0,1,1,1,1,"Ammonia monooxygenase/methane monooxygenase, s..."
9,AmoC,PF04896.11,2517404670,-,1.7e-30,93.8,0.0,2.2000000000000002e-30,93.3,0.0,1.1,1,0,0,1,1,1,1,"Ammonia monooxygenase/methane monooxygenase, s..."


In [441]:
df

Unnamed: 0,2,2_x,0_x,1_x,2_x.1,3_x,4,5,6,7,...,1_y,2_y,3_y,0,1,2_y.1,3,diff,diff1,diff2
0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,320,1155
1,0,0,0,0,0,0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1155,1
2,2517402629,2517402629,AMO,PF02461.15,2517402629,-,4.800000000000001e-124,400.4,24.0,5.4e-124,...,1476,2517402629 A3OODRAFT_1496 methane monooxygenas...,<bound method Seq.tostring of Seq('MSQSKSGGAVG...,2517402629,1476,2517402629 A3OODRAFT_1496 methane monooxygenas...,<bound method Seq.tostring of Seq('MSQSKSGGAVG...,1,1,1
3,0,0,0,0,0,0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,566
4,0,0,0,0,0,0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,566,773
5,0,0,0,0,0,0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,773,1
6,2517403997,2517403997,AMO,PF02461.15,2517403997,-,4.800000000000001e-124,400.4,24.0,5.4e-124,...,2817,2517403997 A3OODRAFT_2864 methane monooxygenas...,<bound method Seq.tostring of Seq('MSQSKSGGAVG...,2517403997,2817,2517403997 A3OODRAFT_2864 methane monooxygenas...,<bound method Seq.tostring of Seq('MSQSKSGGAVG...,1,1,1
7,0,0,0,0,0,0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,224
8,0,0,0,0,0,0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,224,440
9,0,0,0,0,0,0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,440,192


In [439]:
df2[18][0]

'Ammonia monooxygenase/methane monooxygenase, subunit C'

In [803]:
df==df2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
1,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
2,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
3,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
4,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
5,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
6,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
7,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
8,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
9,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True


In [317]:
df_scramb

Unnamed: 0,query file,label,0,1,2,3,4,5,6,7,...,9,10,11,12,13,14,15,16,17,18
5,/tmp/tmpSqwnnp,0,AmoC,PF04896.11,2517401456,-,5.6e-120,386.8,15.8,7.199999999999999e-120,...,15.8,1.0,1,0,0,1,1,1,1,"Ammonia monooxygenase/methane monooxygenase, s..."
6,/tmp/tmpd_zEcV,1,AmoC,PF04896.11,2517402628,-,5.8e-122,393.3,14.7,7e-122,...,14.7,1.0,1,0,0,1,1,1,1,"Ammonia monooxygenase/methane monooxygenase, s..."
7,/tmp/tmpd_zEcV,1,AMO,PF02461.15,2517402629,-,4.800000000000001e-124,400.4,24.0,5.4e-124,...,24.0,1.0,1,0,0,1,1,1,1,Ammonia monooxygenase
8,/tmp/tmpd_zEcV,1,Monooxygenase_B,PF04744.11,2517402630,-,3.3e-170,553.6,0.0,3.7e-170,...,0.0,1.0,1,0,0,1,1,1,1,Monooxygenase subunit B protein
9,/tmp/tmpNdc_YO,2,AmoC,PF04896.11,2517403201,-,5.8e-122,393.3,14.7,7e-122,...,14.7,1.0,1,0,0,1,1,1,1,"Ammonia monooxygenase/methane monooxygenase, s..."
10,/tmp/tmpNdc_YO,2,Monooxygenase_B,PF04744.11,2517403996,-,3.3e-170,553.6,0.0,3.7e-170,...,0.0,1.0,1,0,0,1,1,1,1,Monooxygenase subunit B protein
11,/tmp/tmpNdc_YO,2,AMO,PF02461.15,2517403997,-,4.800000000000001e-124,400.4,24.0,5.4e-124,...,24.0,1.0,1,0,0,1,1,1,1,Ammonia monooxygenase
12,/tmp/tmpNdc_YO,2,AmoC,PF04896.11,2517403998,-,5.8e-122,393.3,14.7,7e-122,...,14.7,1.0,1,0,0,1,1,1,1,"Ammonia monooxygenase/methane monooxygenase, s..."
0,/tmp/tmpO4S3j2,3,AmoC,PF04896.11,2517404225,-,5.600000000000001e-111,357.3,11.5,6.6000000000000005e-111,...,11.5,1.0,1,0,0,1,1,1,1,"Ammonia monooxygenase/methane monooxygenase, s..."
1,/tmp/tmpO4S3j2,3,AmoC,PF04896.11,2517404670,-,1.7e-30,93.8,0.0,2.2000000000000002e-30,...,0.0,1.1,1,0,0,1,1,1,1,"Ammonia monooxygenase/methane monooxygenase, s..."


In [249]:
# def sizes(s):
#     s['size_kb'] = locale.format("%.1f", s['size'] / 1024.0, grouping=True) + ' KB'
#     s['size_mb'] = locale.format("%.1f", s['size'] / 1024.0 ** 2, grouping=True) + ' MB'
#     s['size_gb'] = locale.format("%.1f", s['size'] / 1024.0 ** 3, grouping=True) + ' GB'
#     return s

# df_test = df_test.append(rows_list)
# df_test = df_test.apply(sizes, axis=1)

In [285]:
files2['/tmp/tmp4h4Gou']

'group_4.fasta'

# Putting it all together

## Adding sequence and position - entire version

In [804]:
filename

'../2517287028.genes.faa'

In [805]:
names=df[2].tolist()

In [806]:
def extractFeatures(ids):
    features=[]
    pos=0
    for record in SeqIO.parse(filename,"fasta"):
        pos+=1
        if record.name in ids:
            features.append({'qid':record.name, 'position':pos, 'descr':record.description, 'seq':str(record.seq)})
    return pd.DataFrame(features)
        

In [814]:
df=df.merge(extractFeatures(names), left_on=2, right_on='qid', suffixes=('',''))
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,qid_x,seq_x,descr_y,position_y,qid_y,seq_y,descr,position,qid,seq
0,AmoC,PF04896.11,2517401456,-,5.6e-120,386.8,15.8,7.199999999999999e-120,386.4,15.8,...,2517401456,MSVTTQSAAGAMDRAQPIVELKGMWIGLAVLNGFYLVVRIYEQIYG...,2517401456 A3OODRAFT_0323 methane monooxygenas...,320,2517401456,MSVTTQSAAGAMDRAQPIVELKGMWIGLAVLNGFYLVVRIYEQIYG...,2517401456 A3OODRAFT_0323 methane monooxygenas...,320,2517401456,MSVTTQSAAGAMDRAQPIVELKGMWIGLAVLNGFYLVVRIYEQIYG...
1,AmoC,PF04896.11,2517402628,-,5.8e-122,393.3,14.7,7e-122,393.0,14.7,...,2517402628,MSSTTDTAARAAAGTEAVVDLKGMWIGLAVLNGFYLVVRIYEQVFG...,2517402628 A3OODRAFT_1495 methane monooxygenas...,1475,2517402628,MSSTTDTAARAAAGTEAVVDLKGMWIGLAVLNGFYLVVRIYEQVFG...,2517402628 A3OODRAFT_1495 methane monooxygenas...,1475,2517402628,MSSTTDTAARAAAGTEAVVDLKGMWIGLAVLNGFYLVVRIYEQVFG...
2,AMO,PF02461.15,2517402629,-,4.800000000000001e-124,400.4,24.0,5.4e-124,400.3,24.0,...,2517402629,MSQSKSGGAVGPFNSVAEAAGCVQTVDWMLLVLLFFAVLGGYHVHF...,2517402629 A3OODRAFT_1496 methane monooxygenas...,1476,2517402629,MSQSKSGGAVGPFNSVAEAAGCVQTVDWMLLVLLFFAVLGGYHVHF...,2517402629 A3OODRAFT_1496 methane monooxygenas...,1476,2517402629,MSQSKSGGAVGPFNSVAEAAGCVQTVDWMLLVLLFFAVLGGYHVHF...
3,Monooxygenase_B,PF04744.11,2517402630,-,3.3e-170,553.6,0.0,3.7e-170,553.4,0.0,...,2517402630,MKKFVKLAAIGAAAAVAATLGAVAPASAHGEKSQQAFLRMRTLNWY...,2517402630 A3OODRAFT_1497 methane monooxygenas...,1477,2517402630,MKKFVKLAAIGAAAAVAATLGAVAPASAHGEKSQQAFLRMRTLNWY...,2517402630 A3OODRAFT_1497 methane monooxygenas...,1477,2517402630,MKKFVKLAAIGAAAAVAATLGAVAPASAHGEKSQQAFLRMRTLNWY...
4,AmoC,PF04896.11,2517403201,-,5.8e-122,393.3,14.7,7e-122,393.0,14.7,...,2517403201,MSSTTDTAARAAAGTEAVVDLKGMWIGLAVLNGFYLVVRIYEQVFG...,2517403201 A3OODRAFT_2068 methane monooxygenas...,2043,2517403201,MSSTTDTAARAAAGTEAVVDLKGMWIGLAVLNGFYLVVRIYEQVFG...,2517403201 A3OODRAFT_2068 methane monooxygenas...,2043,2517403201,MSSTTDTAARAAAGTEAVVDLKGMWIGLAVLNGFYLVVRIYEQVFG...
5,Monooxygenase_B,PF04744.11,2517403996,-,3.3e-170,553.6,0.0,3.7e-170,553.4,0.0,...,2517403996,MKKFVKLAAIGAAAAVAATLGAVAPASAHGEKSQQAFLRMRTLNWY...,2517403996 A3OODRAFT_2863 methane monooxygenas...,2816,2517403996,MKKFVKLAAIGAAAAVAATLGAVAPASAHGEKSQQAFLRMRTLNWY...,2517403996 A3OODRAFT_2863 methane monooxygenas...,2816,2517403996,MKKFVKLAAIGAAAAVAATLGAVAPASAHGEKSQQAFLRMRTLNWY...
6,AMO,PF02461.15,2517403997,-,4.800000000000001e-124,400.4,24.0,5.4e-124,400.3,24.0,...,2517403997,MSQSKSGGAVGPFNSVAEAAGCVQTVDWMLLVLLFFAVLGGYHVHF...,2517403997 A3OODRAFT_2864 methane monooxygenas...,2817,2517403997,MSQSKSGGAVGPFNSVAEAAGCVQTVDWMLLVLLFFAVLGGYHVHF...,2517403997 A3OODRAFT_2864 methane monooxygenas...,2817,2517403997,MSQSKSGGAVGPFNSVAEAAGCVQTVDWMLLVLLFFAVLGGYHVHF...
7,AmoC,PF04896.11,2517403998,-,5.8e-122,393.3,14.7,7e-122,393.0,14.7,...,2517403998,MSSTTDTAARAAAGTEAVVDLKGMWIGLAVLNGFYLVVRIYEQVFG...,2517403998 A3OODRAFT_2865 methane monooxygenas...,2818,2517403998,MSSTTDTAARAAAGTEAVVDLKGMWIGLAVLNGFYLVVRIYEQVFG...,2517403998 A3OODRAFT_2865 methane monooxygenas...,2818,2517403998,MSSTTDTAARAAAGTEAVVDLKGMWIGLAVLNGFYLVVRIYEQVFG...
8,AmoC,PF04896.11,2517404225,-,5.600000000000001e-111,357.3,11.5,6.6000000000000005e-111,357.1,11.5,...,2517404225,MSMTKTEARSAARSAERIIDTRPVLIGVPALMLFVAILRLYEQLFA...,2517404225 A3OODRAFT_3092 Ammonia monooxygenas...,3042,2517404225,MSMTKTEARSAARSAERIIDTRPVLIGVPALMLFVAILRLYEQLFA...,2517404225 A3OODRAFT_3092 Ammonia monooxygenas...,3042,2517404225,MSMTKTEARSAARSAERIIDTRPVLIGVPALMLFVAILRLYEQLFA...
9,AmoC,PF04896.11,2517404670,-,1.7e-30,93.8,0.0,2.2000000000000002e-30,93.3,0.0,...,2517404670,MSETTQSAVGAIERAEPIVDLRGMWIGLAVLNGFYLVVRIYEQVFG...,2517404670 A3OODRAFT_3537 Ammonia monooxygenas...,3482,2517404670,MSETTQSAVGAIERAEPIVDLRGMWIGLAVLNGFYLVVRIYEQVFG...,2517404670 A3OODRAFT_3537 Ammonia monooxygenas...,3482,2517404670,MSETTQSAVGAIERAEPIVDLRGMWIGLAVLNGFYLVVRIYEQVFG...


In [818]:
l

[320, 1475, 1476, 1477, 2043, 2816, 2817, 2818, 3042, 3482, 3674, 3675, 3676]

## Counting, identifying and annotating operons

In [819]:
max_distance=2

model2type={'AmoC': 'C', 'AMO': 'A', 'Monooxygenase_B': 'B'}

def is_operon(x):
    return x['diff1'] or x['diff2']

def rel_coordinates(x):
    if ((x['diff1'] < max_distance) or (x['diff2']) <max_distance):
        return min(x['diff1'],x['diff2'])
    else:
        return 0

l=df['position'].tolist()

df['diff1']=abs((np.array(l)-np.array([0] + l[:-1])))#shift to left
df['diff2']=abs(np.array(l)-np.array(l[1:]+[0]))#shift to right
#df[(df['diff1'] <= 2) | (df['diff2'] <= 2)]

df['is_operon']=df[['diff1', 'diff2']].apply(lambda x: x < max_distance).apply(is_operon, axis=1)

df['rel_coordinates']=df[['diff1', 'diff2']].apply(rel_coordinates, axis=1)

df['operon_count']=operonCount(df['is_operon'].tolist())
df['type']=df[0].apply(lambda x: model2type[x])
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,descr,position,qid,seq,diff1,diff2,is_operon,rel_coordinates,operon_count,type
0,AmoC,PF04896.11,2517401456,-,5.6e-120,386.8,15.8,7.199999999999999e-120,386.4,15.8,...,2517401456 A3OODRAFT_0323 methane monooxygenas...,320,2517401456,MSVTTQSAAGAMDRAQPIVELKGMWIGLAVLNGFYLVVRIYEQIYG...,320,1155,False,0,False,C
1,AmoC,PF04896.11,2517402628,-,5.8e-122,393.3,14.7,7e-122,393.0,14.7,...,2517402628 A3OODRAFT_1495 methane monooxygenas...,1475,2517402628,MSSTTDTAARAAAGTEAVVDLKGMWIGLAVLNGFYLVVRIYEQVFG...,1155,1,True,1,1,C
2,AMO,PF02461.15,2517402629,-,4.800000000000001e-124,400.4,24.0,5.4e-124,400.3,24.0,...,2517402629 A3OODRAFT_1496 methane monooxygenas...,1476,2517402629,MSQSKSGGAVGPFNSVAEAAGCVQTVDWMLLVLLFFAVLGGYHVHF...,1,1,True,1,1,A
3,Monooxygenase_B,PF04744.11,2517402630,-,3.3e-170,553.6,0.0,3.7e-170,553.4,0.0,...,2517402630 A3OODRAFT_1497 methane monooxygenas...,1477,2517402630,MKKFVKLAAIGAAAAVAATLGAVAPASAHGEKSQQAFLRMRTLNWY...,1,566,True,1,1,B
4,AmoC,PF04896.11,2517403201,-,5.8e-122,393.3,14.7,7e-122,393.0,14.7,...,2517403201 A3OODRAFT_2068 methane monooxygenas...,2043,2517403201,MSSTTDTAARAAAGTEAVVDLKGMWIGLAVLNGFYLVVRIYEQVFG...,566,773,False,0,False,C
5,Monooxygenase_B,PF04744.11,2517403996,-,3.3e-170,553.6,0.0,3.7e-170,553.4,0.0,...,2517403996 A3OODRAFT_2863 methane monooxygenas...,2816,2517403996,MKKFVKLAAIGAAAAVAATLGAVAPASAHGEKSQQAFLRMRTLNWY...,773,1,True,1,2,B
6,AMO,PF02461.15,2517403997,-,4.800000000000001e-124,400.4,24.0,5.4e-124,400.3,24.0,...,2517403997 A3OODRAFT_2864 methane monooxygenas...,2817,2517403997,MSQSKSGGAVGPFNSVAEAAGCVQTVDWMLLVLLFFAVLGGYHVHF...,1,1,True,1,2,A
7,AmoC,PF04896.11,2517403998,-,5.8e-122,393.3,14.7,7e-122,393.0,14.7,...,2517403998 A3OODRAFT_2865 methane monooxygenas...,2818,2517403998,MSSTTDTAARAAAGTEAVVDLKGMWIGLAVLNGFYLVVRIYEQVFG...,1,224,True,1,2,C
8,AmoC,PF04896.11,2517404225,-,5.600000000000001e-111,357.3,11.5,6.6000000000000005e-111,357.1,11.5,...,2517404225 A3OODRAFT_3092 Ammonia monooxygenas...,3042,2517404225,MSMTKTEARSAARSAERIIDTRPVLIGVPALMLFVAILRLYEQLFA...,224,440,False,0,False,C
9,AmoC,PF04896.11,2517404670,-,1.7e-30,93.8,0.0,2.2000000000000002e-30,93.3,0.0,...,2517404670 A3OODRAFT_3537 Ammonia monooxygenas...,3482,2517404670,MSETTQSAVGAIERAEPIVDLRGMWIGLAVLNGFYLVVRIYEQVFG...,440,192,False,0,False,C


In [820]:
import numpy as np

In [821]:
df['is_operon'].tolist()
        

[False,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 False,
 False,
 True,
 True,
 True]

In [822]:
def operonCount(lst):
    opCnt=0
    state=False
    for i in xrange(len(lst)):
        if lst[i]:
            newState=True
            if (state==False) and (newState==True):
                opCnt+=1
            lst[i]=opCnt
            state=newState
        else:
            state=False
    return lst
            

In [824]:
opCnt

[0, 1, 1, 1, 0, 2, 2, 2, 0, 0, 3, 3, 3]

In [825]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,descr,position,qid,seq,diff1,diff2,is_operon,rel_coordinates,operon_count,type
0,AmoC,PF04896.11,2517401456,-,5.6e-120,386.8,15.8,7.199999999999999e-120,386.4,15.8,...,2517401456 A3OODRAFT_0323 methane monooxygenas...,320,2517401456,MSVTTQSAAGAMDRAQPIVELKGMWIGLAVLNGFYLVVRIYEQIYG...,320,1155,False,0,False,C
1,AmoC,PF04896.11,2517402628,-,5.8e-122,393.3,14.7,7e-122,393.0,14.7,...,2517402628 A3OODRAFT_1495 methane monooxygenas...,1475,2517402628,MSSTTDTAARAAAGTEAVVDLKGMWIGLAVLNGFYLVVRIYEQVFG...,1155,1,True,1,1,C
2,AMO,PF02461.15,2517402629,-,4.800000000000001e-124,400.4,24.0,5.4e-124,400.3,24.0,...,2517402629 A3OODRAFT_1496 methane monooxygenas...,1476,2517402629,MSQSKSGGAVGPFNSVAEAAGCVQTVDWMLLVLLFFAVLGGYHVHF...,1,1,True,1,1,A
3,Monooxygenase_B,PF04744.11,2517402630,-,3.3e-170,553.6,0.0,3.7e-170,553.4,0.0,...,2517402630 A3OODRAFT_1497 methane monooxygenas...,1477,2517402630,MKKFVKLAAIGAAAAVAATLGAVAPASAHGEKSQQAFLRMRTLNWY...,1,566,True,1,1,B
4,AmoC,PF04896.11,2517403201,-,5.8e-122,393.3,14.7,7e-122,393.0,14.7,...,2517403201 A3OODRAFT_2068 methane monooxygenas...,2043,2517403201,MSSTTDTAARAAAGTEAVVDLKGMWIGLAVLNGFYLVVRIYEQVFG...,566,773,False,0,False,C


In [826]:
operons[1]

Index([                 2,             u'0_x',             u'1_x',
                   u'2_x',             u'3_x',                  4,
                        5,                  6,                  7,
                        8,                  9,                 10,
                       11,                 12,                 13,
                       14,                 15,                 16,
                       17,                 18,             u'0_y',
                   u'1_y',             u'2_y',             u'3_y',
                 u'diff1',           u'diff2',       u'is_operon',
          u'operon_count', u'rel_coordinates',            u'type',
                        2,             u'0_x',             u'1_x',
                   u'2_x',             u'3_x',                  4,
                        5,                  6,                  7,
                        8,                  9,                 10,
                       11,                 12,                

In [683]:
# df = df.stack().to_frame().T
# df.columns = ['{}_{}'.format(*c) for c in df.columns]

In [865]:
operons=list()
output=pd.DataFrame()
for count,frame in df.groupby('operon_count'):
    if count > 0:
        #print("***********"+str(count)+"*****************************")
        #print(frame)
        #print('===========================================================')
        #frame.columns = ['{}_{}'.format(*c) for c in df.columns]
        frame=frame.stack().to_frame().T
        operons.append(frame.copy())
        frame.columns=frame.columns.droplevel(0)
        output=output.append(frame, ignore_index=True)

In [866]:
def concatValues(row):
    concatVal=''
    for i in range(output['type'].iloc[1].size):
        concatVal+=str(row.iloc[i])
    return concatVal

In [883]:
def concatSeparatedValues(row, sep="|"):
    concatVal=''
    for i in range(output['type'].iloc[1].size):
        concatVal+=str(row.iloc[i])+sep
    concatVal=concatVal[:-1]
    return concatVal

In [884]:
output['type'].apply(lambda x: x.iloc[0]+x.iloc[1]+x.iloc[2] ,axis=1)

0    CAB
1    BAC
2    CBA
dtype: object

In [869]:
output['type']

Unnamed: 0,type,type.1,type.2
0,C,A,B
1,B,A,C
2,C,B,A


In [870]:
output['type'].iloc[1].size

3

In [871]:
output['type'].apply(concatValues,axis=1)


0    CAB
1    BAC
2    CBA
dtype: object

In [872]:
output

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,descr,position,qid,seq,diff1,diff2,is_operon,rel_coordinates,operon_count,type
0,AmoC,PF04896.11,2517402628,-,5.8e-122,393.3,14.7,7e-122,393.0,14.7,...,2517402630 A3OODRAFT_1497 methane monooxygenas...,1477,2517402630,MKKFVKLAAIGAAAAVAATLGAVAPASAHGEKSQQAFLRMRTLNWY...,1,566,True,1,1,B
1,Monooxygenase_B,PF04744.11,2517403996,-,3.3e-170,553.6,0.0,3.7e-170,553.4,0.0,...,2517403998 A3OODRAFT_2865 methane monooxygenas...,2818,2517403998,MSSTTDTAARAAAGTEAVVDLKGMWIGLAVLNGFYLVVRIYEQVFG...,1,224,True,1,2,C
2,AmoC,PF04896.11,2517404865,-,4.2e-93,298.8,17.7,5.1e-93,298.5,17.7,...,2517404867 A3OODRAFT_3735 Ammonia monooxygenas...,3676,2517404867,MTGKSLDIPARPYTGEKSRLSRAYDYLILVLALFLFIGSFHLHVAL...,1,3676,True,1,3,A


In [873]:
output['descr']

Unnamed: 0,descr,descr.1,descr.2
0,2517402628 A3OODRAFT_1495 methane monooxygenas...,2517402629 A3OODRAFT_1496 methane monooxygenas...,2517402630 A3OODRAFT_1497 methane monooxygenas...
1,2517403996 A3OODRAFT_2863 methane monooxygenas...,2517403997 A3OODRAFT_2864 methane monooxygenas...,2517403998 A3OODRAFT_2865 methane monooxygenas...
2,2517404865 A3OODRAFT_3733 methane monooxygenas...,2517404866 A3OODRAFT_3734 methane monooxygenas...,2517404867 A3OODRAFT_3735 Ammonia monooxygenas...


In [874]:
operons[0].columns

MultiIndex(levels=[[1, 2, 3], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, u'descr', u'descr_x', u'descr_y', u'diff1', u'diff2', u'is_operon', u'operon_count', u'position', u'position_x', u'position_y', u'qid', u'qid_x', u'qid_y', u'rel_coordinates', u'seq', u'seq_x', u'seq_y', u'type']],
           labels=[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 27, 30, 34, 21, 28, 31, 35, 20, 27, 30, 34, 21, 28, 31, 35, 19, 26, 29, 33, 22, 23, 24, 32, 25, 36, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 27, 30, 34, 21, 28, 3

In [878]:
output['operon_type']=output['type'].apply(concatValues, axis=1)
output['rel_distance']=output['rel_coordinates'].apply(concatValues, axis=1)
output['concat_ids']=output[2].apply(concatSeparatedValues, axis=1)
output['concat_descr']=output['descr'].apply(concatSeparatedValues, axis=1)
output['concat_sequences']=output['seq'].apply(concatValues, axis=1)

In [879]:
output

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,is_operon,rel_coordinates,operon_count,type,operon_type,rel_distance,concat_ids,concat_descr,concat_operon,concat_sequences
0,AmoC,PF04896.11,2517402628,-,5.8e-122,393.3,14.7,7e-122,393.0,14.7,...,True,1,1,B,CAB,111,2517402628|2517402629|2517402630,2517402628 A3OODRAFT_1495 methane monooxygenas...,MSSTTDTAARAAAGTEAVVDLKGMWIGLAVLNGFYLVVRIYEQVFG...,MSSTTDTAARAAAGTEAVVDLKGMWIGLAVLNGFYLVVRIYEQVFG...
1,Monooxygenase_B,PF04744.11,2517403996,-,3.3e-170,553.6,0.0,3.7e-170,553.4,0.0,...,True,1,2,C,BAC,111,2517403996|2517403997|2517403998,2517403996 A3OODRAFT_2863 methane monooxygenas...,MKKFVKLAAIGAAAAVAATLGAVAPASAHGEKSQQAFLRMRTLNWY...,MKKFVKLAAIGAAAAVAATLGAVAPASAHGEKSQQAFLRMRTLNWY...
2,AmoC,PF04896.11,2517404865,-,4.2e-93,298.8,17.7,5.1e-93,298.5,17.7,...,True,1,3,A,CBA,111,2517404865|2517404866|2517404867,2517404865 A3OODRAFT_3733 methane monooxygenas...,MSTTAQTISQSTPQAPFNLPWYLRDLPKYLLTFGVMTVIYVGFRMY...,MSTTAQTISQSTPQAPFNLPWYLRDLPKYLLTFGVMTVIYVGFRMY...


## Exporting values

In [880]:
export_columns=['operon_type', 'rel_distance', 'concat_ids', 'concat_descr', 'concat_sequences']

In [900]:
outfile1 = "out.csv"
outfile2 = "out.fasta"
output[export_columns].to_csv(outfile1, sep='\t', header=True)

In [901]:
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC

In [907]:


ids = output['concat_ids'].tolist()
sequences=output['concat_sequences'].tolist()
descriptions=output['concat_descr'].tolist()
types=output['operon_type'].tolist()
#if len(ids)==len(sequences):
records=list()
for i in range(len(ids)):
    record=SeqRecord(Seq(sequences[i], IUPAC.protein), id=ids[i],name=types[i] , description=types[i]+" "+descriptions[i])
    records.append(record)
with open(outfile2, "w") as output_handle:
    SeqIO.write(records, output_handle, "fasta")


In [908]:
records

[SeqRecord(seq=Seq('MSSTTDTAARAAAGTEAVVDLKGMWIGLAVLNGFYLVVRIYEQVFGWRAGLDSF...DMP', IUPACProtein()), id='2517402628|2517402629|2517402630', name='CAB', description='CAB 2517402628 A3OODRAFT_1495 methane monooxygenase/ammonia monooxygenase, subunit C [Methylocystis rosea SV97T]|2517402629 A3OODRAFT_1496 methane monooxygenase/ammonia monooxygenase, subunit A [Methylocystis rosea SV97T]|2517402630 A3OODRAFT_1497 methane monooxygenase/ammonia monooxygenase, subunit B [Methylocystis rosea SV97T]', dbxrefs=[]),
 SeqRecord(seq=Seq('MKKFVKLAAIGAAAAVAATLGAVAPASAHGEKSQQAFLRMRTLNWYDVQWSKTT...LTE', IUPACProtein()), id='2517403996|2517403997|2517403998', name='BAC', description='BAC 2517403996 A3OODRAFT_2863 methane monooxygenase/ammonia monooxygenase, subunit B [Methylocystis rosea SV97T]|2517403997 A3OODRAFT_2864 methane monooxygenase/ammonia monooxygenase, subunit A [Methylocystis rosea SV97T]|2517403998 A3OODRAFT_2865 methane monooxygenase/ammonia monooxygenase, subunit C [Methylocystis rosea SV

In [887]:
my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")

In [897]:

record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF",
                       IUPAC.protein),
                   id="YP_025292.1", name="HokC",
                   description="toxic membrane protein, small")

140443830384016

In [891]:
my_rna.id='haha'

In [893]:
my_rna.id

'haha'

In [895]:
my_rna.des='hehe'

In [896]:
my_rna.des

'hehe'

In [710]:
# for record in SeqIO.parse(filename,"fasta"):
#     print(record.description)

In [695]:
def locationAndSequence(s):
    s['loc']=

SyntaxError: invalid syntax (<ipython-input-695-c917f59625ee>, line 2)

In [696]:
# for i in chunks:
#     print(i)

In [697]:
pipe = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

In [698]:
pipe.communicate()

('', '/bin/sh: 1: Syntax error: "(" unexpected\n')

In [699]:
#process = subprocess.Popen(['ls','-l'], stdout=subprocess.PIPE)
#print(process.stdout.read())

In [700]:
%%time
import os
#hmmscan --tblout amocab_hmmscan_mros.tab  amoCAB 2517287028.genes.faa > /dev/null
#os.system("makeblastdb -in "+input_ref_0+" -dbtype nucl -title "+title_db+" -out "+outfile_db+" -parse_seqids")
fnames=""
flabels=""
for (l,f) in files:
    fnames+=f.name+" "
    flabels+=l+" "
#print flabels
    #print(l)
    #print(f.name)
    #parallel -j $1 blastall "-p blastn -d nt/nt -i "{}" -o "{.}".xml -e 1e-10 -m 7 -K 100 -b 200" ::: *.fa
    #print('Exists after close:', os.path.exists(f.name))
cmd="parallel -j 8 hmmscan "+"--cpu 1"+" --tblout {#}.tab " + "../amoCAB {} ::: "+fnames+" > /dev/null"

#command="parallel -j 8 hmmscan "+"--cpu 1"+" --tblout {}.tab " + "../amoCAB {} ::: "+fnames
print(cmd)

TypeError: unsupported operand type(s) for +: 'int' and 'str'

In [701]:
c

2

In [702]:
cmd="parallel -j 8 hmmscan "+"--cpu 1"+" --tblout {#}.tab " + "../amoCAB {} ::: "+fnames+" > /dev/null"
#flabels
print(cmd)

parallel -j 8 hmmscan --cpu 1 --tblout {#}.tab ../amoCAB {} ::: /tmp/tmpSqwnnp  > /dev/null


In [703]:
filename

'../2517287028.genes.faa'

In [704]:
/tmp/tmprOkRVb.close()

SyntaxError: invalid syntax (<ipython-input-704-02dd9a23c7c6>, line 1)

In [705]:
filename.name

AttributeError: 'str' object has no attribute 'name'

In [706]:
txt=filename.seek(0)
txt=filename.read()
#txt

AttributeError: 'str' object has no attribute 'seek'

In [707]:
filename

'../2517287028.genes.faa'

In [708]:
txt2

NameError: name 'txt2' is not defined

In [709]:
#Coool

In [26]:
txt2=filename.read()

## In-memory files
[https://stackoverflow.com/questions/23028071/open-a-file-in-memory]

In [11]:
#import io##for python3
import StringIO

output = StringIO.StringIO()
output.write('First line.\n')
print >>output, 'Second line.'

# Retrieve file contents -- this will be
# 'First line.\nSecond line.\n'
contents = output.getvalue()
print(contents)
# Close object and discard memory buffer --
# .getvalue() will now raise an exception.
output.close()


First line.
Second line.



## Redoing generator with an in-memory implementation

In [15]:
import StringIO

In [21]:
%%time
from Bio import SeqIO
import StringIO
#output = io.StringIO()
record_iter = SeqIO.parse(open("../2517287028.genes.faa"),"fasta")
for i, batch in enumerate(batch_iterator(record_iter, 50000)):
    filename = "group_%i.fastq" % (i + 1)
    filename = StringIO.StringIO()
    
    count = SeqIO.write(batch, filename, "fasta")
    print("Wrote %i records to %s" % (count, filename))

Wrote 3891 records to <StringIO.StringIO instance at 0x7f5bc2c582d8>
CPU times: user 58.6 ms, sys: 128 µs, total: 58.7 ms
Wall time: 58.7 ms


In [24]:
dir(filename)

['__doc__',
 '__init__',
 '__iter__',
 '__module__',
 'buf',
 'buflist',
 'close',
 'closed',
 'flush',
 'getvalue',
 'isatty',
 'len',
 'next',
 'pos',
 'read',
 'readline',
 'readlines',
 'seek',
 'softspace',
 'tell',
 'truncate',
 'write',
 'writelines']

## memory map
[https://docs.python.org/2/library/mmap.html]

In [None]:
import mmap
from Bio import SeqIO
#import StringIO

record_iter = SeqIO.parse(open("../76969.assembled.faa"),"fasta")
for i, batch in enumerate(batch_iterator(record_iter, 50000)):
    filename = "group_%i.fastq" % (i + 1)
    with open(filename, "w") as handle:
        count = SeqIO.write(batch, handle, "fasta")
    print("Wrote %i records to %s" % (count, filename))