In [1]:
def batch_iterator(iterator, batch_size):
    """Returns lists of length batch_size.

    This can be used on any iterator, for example to batch up
    SeqRecord objects from Bio.SeqIO.parse(...), or to batch
    Alignment objects from Bio.AlignIO.parse(...), or simply
    lines from a file handle.

    This is a generator function, and it returns lists of the
    entries from the supplied iterator.  Each list will have
    batch_size entries, although the final list may be shorter.
    """
    entry = True  # Make sure we loop once
    while entry:
        
        batch = []
        while len(batch) < batch_size:
            try:
                #origianlly was iterator.next()
                #changed for py3 compatibility
                #needs python 2.6+
                entry = next(iterator)
            except StopIteration:
                entry = None
            if entry is None:
                # End of file
                break
            batch.append(entry)
        if batch:
            yield batch



In [2]:
#extracts names as well
def batch_iterator2(iterator, batch_size):
    """Returns lists of length batch_size.

    This can be used on any iterator, for example to batch up
    SeqRecord objects from Bio.SeqIO.parse(...), or to batch
    Alignment objects from Bio.AlignIO.parse(...), or simply
    lines from a file handle.

    This is a generator function, and it returns lists of the
    entries from the supplied iterator.  Each list will have
    batch_size entries, although the final list may be shorter.
    """
    entry = True  # Make sure we loop once
    while entry:
        batch_names=[]
        batch = []
        while len(batch) < batch_size:
            try:
                #origianlly was iterator.next()
                #changed for py3 compatibility
                #needs python 2.6+
                entry = next(iterator)
            except StopIteration:
                entry = None
            if entry is None:
                # End of file
                break
            batch.append(entry)
            batch_names.append(entry.name)
        if batch:
            yield batch,batch_names



In [3]:
##Worked!!!
##### Super Exiting ######

#extracts names as well
def batch_iterator3(iterator, batch_size):
    """Returns lists of length batch_size.

    This can be used on any iterator, for example to batch up
    SeqRecord objects from Bio.SeqIO.parse(...), or to batch
    Alignment objects from Bio.AlignIO.parse(...), or simply
    lines from a file handle.

    This is a generator function, and it returns lists of the
    entries from the supplied iterator.  Each list will have
    batch_size entries, although the final list may be shorter.
    """
    entry = True  # Make sure we loop once
    counter=0
    while entry:
        batch_names=[]
        batch = []
        while len(batch) < batch_size:
            try:
                #origianlly was iterator.next()
                #changed for py3 compatibility
                #needs python 2.6+
                entry = next(iterator)
            except StopIteration:
                entry = None
            if entry is None:
                # End of file
                break
            counter+=1
            #entry._id=entry.id
            entry.id=str(counter)
            batch.append(entry)
            
        if batch:
            yield batch

In [8]:
%%time
from Bio import SeqIO

record_iter = SeqIO.parse(open("../2517287028.genes.faa"),"fasta")#m.rosea
for i, batch in enumerate(batch_iterator3(record_iter, 1000)):
    filename = "group_%i.fasta" % (i + 1)
    with open(filename, "w") as handle:
        count = SeqIO.write(batch, handle, "fasta")#accessing sequences
    print("Wrote %i records to %s" % (count, filename))



Wrote 1000 records to group_1.fasta
Wrote 1000 records to group_2.fasta
Wrote 1000 records to group_3.fasta
Wrote 891 records to group_4.fasta
Wall time: 96 ms


In [45]:
len(batch[1])

891

In [321]:
files[0][1].name

'/tmp/tmpSqwnnp'

In [4]:
%%time


with open('/tmp/tmpSqwnnp') as handle:
    #m.rosea
    record_dict = SeqIO.index("example.fasta", "fasta")
    print(record_dict["2517401456"])



    
    #     filename = "group_%i.fasta" % (i + 1)
#     with open(filename, "w") as handle:
#         count = SeqIO.write(batch, handle, "fasta")
#     print("Wrote %i records to %s" % (count, filename))

FileNotFoundError: [Errno 2] No such file or directory: '/tmp/tmpSqwnnp'

# Trying temporary files
[https://docs.python.org/3/library/tempfile.html]

In [3]:
import tempfile

# create a temporary file and write some data to it
fp = tempfile.TemporaryFile()
fp.write(b'Hello world!')
# read data from file
fp.seek(0)

txt=fp.read()
print(txt)
# close the file, it will be removed
fp.close()
#print(txt)
# # create a temporary file using a context manager
# with tempfile.TemporaryFile() as fp:
#     fp.write(b'Hello world!')
#     fp.seek(0)
#     fp.read()

# # file is now closed and removed

# # create a temporary directory using the context manager
# with tempfile.TemporaryDirectory() as tmpdirname:
#     print('created temporary directory', tmpdirname)

# directory and contents have been removed

Hello world!


In [None]:
import tempfile
from Bio import SeqIO

record_iter = SeqIO.parse(open("../76969.assembled.faa"),"fasta")
for i, batch in enumerate(batch_iterator(record_iter, 30000)):
    
    filename = "group_%i.fasta" % (i + 1)
    print(filename)
    filename = tempfile.TemporaryFile()
    filename.seek(0)
    count = SeqIO.write(batch, filename, "fasta")
#     with open(filename, "w") as handle:
#         count = SeqIO.write(batch, handle, "fasta")
    print("Wrote %i records to %s" % (count, filename))

## Named temporary files

[https://docs.python.org/2/library/tempfile.html]

In [4]:
filename

'group_4.fasta'

In [5]:
os.path.basename(filename)

NameError: name 'os' is not defined

In [114]:
files

[(0, <tempfile._TemporaryFileWrapper at 0x7f3dbe71abe0>),
 (1, <tempfile._TemporaryFileWrapper at 0x7f3dbfd91400>),
 (2, <tempfile._TemporaryFileWrapper at 0x7f3dbe38b208>),
 (3, <tempfile._TemporaryFileWrapper at 0x7f3dc05f10f0>),
 (4, <tempfile._TemporaryFileWrapper at 0x7f3dbe71aeb8>),
 (5, <tempfile._TemporaryFileWrapper at 0x7f3dc1d5ca90>),
 (6, <tempfile._TemporaryFileWrapper at 0x7f3dbe9ab128>),
 (7, <tempfile._TemporaryFileWrapper at 0x7f3dbe4dff98>),
 (8, <tempfile._TemporaryFileWrapper at 0x7f3dbe9ab208>),
 (9, <tempfile._TemporaryFileWrapper at 0x7f3dbeca2b00>),
 (10, <tempfile._TemporaryFileWrapper at 0x7f3dbe9aba90>),
 (11, <tempfile._TemporaryFileWrapper at 0x7f3dbe74ef98>),
 (12, <tempfile._TemporaryFileWrapper at 0x7f3dbe3806d8>),
 (13, <tempfile._TemporaryFileWrapper at 0x7f3dc1178ef0>),
 (14, <tempfile._TemporaryFileWrapper at 0x7f3dbe7f50b8>),
 (15, <tempfile._TemporaryFileWrapper at 0x7f3dc230d710>),
 (16, <tempfile._TemporaryFileWrapper at 0x7f3dbe318a58>),
 (17, <

In [113]:
import tempfile
from Bio import SeqIO

files=list()
sizes={}
input_fasta="../76969.assembled.faa"
#input_fasta="../mrosea_proteins.fasta"
#input_fasta="../2517287028.genes.faa"#m.rosea
record_iter = SeqIO.parse(open(input_fasta),"fasta")
#for i, batch in enumerate(batch_iterator3(record_iter, 30000)):
for i, batch in enumerate(batch_iterator3(record_iter, 30000)):
    
    label = i
    #f = tempfile.NamedTemporaryFile(delete=False)#exists on closing
    f = tempfile.NamedTemporaryFile(mode='w+t')#deleted after f.close()
    files.append((label,f))
    #f.seek(0)
    count = SeqIO.write(batch, f, "fasta")
    f.flush() #this solves the EOF problem
#     with open(filename, "w") as handle:
#         count = SeqIO.write(batch, handle, "fasta")
    #print("Wrote %i records to %s" % (count, f.name))
    sizes[f.name]=count

In [112]:
for (l,f) in files:
    #print(l)
    #print(f.name)
    f.close()

In [115]:
sizes

{'/tmp/tmp0ifa7fhr': 30000,
 '/tmp/tmp0zr06t4w': 30000,
 '/tmp/tmp1_592eid': 30000,
 '/tmp/tmp1fkph2jw': 30000,
 '/tmp/tmp1itgy8v9': 30000,
 '/tmp/tmp1te8x7fz': 30000,
 '/tmp/tmp1y1_zlaz': 30000,
 '/tmp/tmp28758qiu': 30000,
 '/tmp/tmp2pt0n5uu': 30000,
 '/tmp/tmp35pfxpew': 30000,
 '/tmp/tmp35spj29e': 30000,
 '/tmp/tmp3829di35': 30000,
 '/tmp/tmp3ao49lex': 30000,
 '/tmp/tmp3qvhmve6': 30000,
 '/tmp/tmp3u8mf83p': 30000,
 '/tmp/tmp47l4almu': 30000,
 '/tmp/tmp4k2gck0b': 30000,
 '/tmp/tmp4xgzr196': 30000,
 '/tmp/tmp4xpfb1he': 30000,
 '/tmp/tmp52d1gh4w': 30000,
 '/tmp/tmp52jd5p0f': 30000,
 '/tmp/tmp544mvn8q': 30000,
 '/tmp/tmp59rpnwml': 30000,
 '/tmp/tmp5gda1pem': 30000,
 '/tmp/tmp5v6tewhb': 30000,
 '/tmp/tmp626m5oem': 30000,
 '/tmp/tmp6o18bihe': 30000,
 '/tmp/tmp6pux_65w': 30000,
 '/tmp/tmp7_qf1bue': 30000,
 '/tmp/tmp7wwklmmb': 30000,
 '/tmp/tmp8pgkwpo1': 30000,
 '/tmp/tmp9dndu8ro': 30000,
 '/tmp/tmp9hldbodb': 30000,
 '/tmp/tmp_whqcbr5': 30000,
 '/tmp/tmpb1w07r9s': 30000,
 '/tmp/tmpb7law7w7':

In [7]:
!cat /tmp/tmp13o6_f9t

>3001 2517404183 A3OODRAFT_3050 NADH dehydrogenase, FAD-containing subunit [Methylocystis rosea SV97T]
MQHSSAAERQPRVVIIGGGFAGIAAAQALDGAGARIFILDSNNHHCFQPLLYQVATAALA
SPDVAWPIRHIVRRQKDVTVLMLTVESVDVARKVVKTDKCEIGYDFLVVATGATHSYFGH
NWASLAPGLKSISDATLIRRRLLLAFERAEVSVDPVERERLITIIIVGGGPTGVELAGAI
SELARRTLPPEFRRVDPRKARIILLEAGPRILASFPERLSHYARNSLEAKGVKVLTDTPV
DQIFEDRIVAGEKEIPAGVILWAAGVRASPAANWLGVEGDRTGRIPVGEDLTVPGLPDVY
VIGDLALLTGPDGAPVPALAASAKQMGKYAGRAIRLRLKGRSPRKPFRYRDYGNLATIGR
NSAIVKLGRLELTGFPGWLFWSVVHIYFLVNLRSRILVAISWIATYLTGNRGSRLITR
>3002 2517404184 A3OODRAFT_3051 hypothetical protein [Methylocystis rosea SV97T]
MDKLSRIGLSALVTGSVASVISTAALAALATLEGRGALEPTNATSHWLWGRKSRGRRDVD
AAHTGVGYATHHASAVFWALPFEAWLAMQPPRSTFELIGDAAMMSGIAATVDYGVAPKRI
TPGWEFALSDRSMIAAFAALAVGLACGAAASRALLGGEELELR
>3003 2517404185 A3OODRAFT_3052 primosomal protein N' [Methylocystis rosea SV97T]
MSAAQDEEERARVVEILAPVAVDSTYSYLAPAAMRLSPGDSVTAPLGRREAYGVVWSIDG
DSGPGGNLKTVSARLDRPPLAQNLRSFIDWLARYTLTPRGMALRLATRAAEDAGPEAPRM
LYRATG

In [30]:
fna

NameError: name 'fna' is not defined

## hmmering tempfiles

In [59]:
files

[(0, <open file '<fdopen>', mode 'w+b' at 0x7fd26afdb4b0>)]

In [63]:
%%time
import os
#hmmscan --tblout amocab_hmmscan_mros.tab  amoCAB 2517287028.genes.faa > /dev/null
#os.system("makeblastdb -in "+input_ref_0+" -dbtype nucl -title "+title_db+" -out "+outfile_db+" -parse_seqids")
for (l,f) in files:
    #print(l)
    #print(f.name)
    
    #print('Exists after close:', os.path.exists(f.name))
    command="hmmscan --tblout "+l+".tab " + "../amoCAB " + f.name+""
    print(command)
    os.system(command)
    

for (l,f) in files:
    print(l)
    print(f.name)
    f.close()
    print('Exists after close:', os.path.exists(f.name))

hmmscan --tblout group_1.fasta.tab ../amoCAB /tmp/tmp4WuJSN
hmmscan --tblout group_2.fasta.tab ../amoCAB /tmp/tmpCBMFY1
hmmscan --tblout group_3.fasta.tab ../amoCAB /tmp/tmp9ABMDY
hmmscan --tblout group_4.fasta.tab ../amoCAB /tmp/tmp4g8aDl
group_1.fasta
/tmp/tmp4WuJSN
('Exists after close:', False)
group_2.fasta
/tmp/tmpCBMFY1
('Exists after close:', False)
group_3.fasta
/tmp/tmp9ABMDY
('Exists after close:', False)
group_4.fasta
/tmp/tmp4g8aDl
('Exists after close:', False)
CPU times: user 4.65 ms, sys: 4.08 ms, total: 8.74 ms
Wall time: 24.7 ms


In [59]:
%%time
import os
#hmmscan --tblout amocab_hmmscan_mros.tab  amoCAB 2517287028.genes.faa > /dev/null
#os.system("makeblastdb -in "+input_ref_0+" -dbtype nucl -title "+title_db+" -out "+outfile_db+" -parse_seqids")
for (l,f) in files:
    #print(l)
    #print(f.name)
    
    #print('Exists after close:', os.path.exists(f.name))
    #
    command="hmmscan "+"--cpu 8 --noali"+" --tblout "+l+".tab " + "../amoCAB " + f.name
    print(command)
    os.system(command)

for (l,f) in files:
    print(l)
    print(f.name)
    f.close()
    #print('Exists after close:', os.path.exists(f.name))

hmmscan --cpu 8 --noali --tblout group_1.fasta.tab ../amoCAB /tmp/tmpM_ZFS9
hmmscan --cpu 8 --noali --tblout group_2.fasta.tab ../amoCAB /tmp/tmp6sxP79
hmmscan --cpu 8 --noali --tblout group_3.fasta.tab ../amoCAB /tmp/tmpCgusGR
hmmscan --cpu 8 --noali --tblout group_4.fasta.tab ../amoCAB /tmp/tmpVnIz1M
group_1.fasta
/tmp/tmpM_ZFS9
('Exists after close:', False)
group_2.fasta
/tmp/tmp6sxP79
('Exists after close:', False)
group_3.fasta
/tmp/tmpCgusGR
('Exists after close:', False)
group_4.fasta
/tmp/tmpVnIz1M
('Exists after close:', False)
CPU times: user 646 µs, sys: 8.05 ms, total: 8.7 ms
Wall time: 12.5 s


## Subprocess version

In [45]:
%%time
import os
import subprocess
#hmmscan --tblout amocab_hmmscan_mros.tab  amoCAB 2517287028.genes.faa > /dev/null
#os.system("makeblastdb -in "+input_ref_0+" -dbtype nucl -title "+title_db+" -out "+outfile_db+" -parse_seqids")
for (l,f) in files:
    #print(l)
    #print(f.name)
    
    #print('Exists after close:', os.path.exists(f.name))
    command="hmmscan "+"--cpu 8"+" --tblout "+l+".tab " + "../amoCAB " + f.name
    print(command)
    subprocess.Popen(command, shell=True)

for (l,f) in files:
    print(l)
    print(f.name)
    f.close()
    print('Exists after close:', os.path.exists(f.name))

hmmscan --cpu 8 --tblout group_1.fasta.tab ../amoCAB /tmp/tmpWxz75u
hmmscan --cpu 8 --tblout group_2.fasta.tab ../amoCAB /tmp/tmp_a05vQ
hmmscan --cpu 8 --tblout group_3.fasta.tab ../amoCAB /tmp/tmp0tsj4Z
hmmscan --cpu 8 --tblout group_4.fasta.tab ../amoCAB /tmp/tmpZO5t_L
group_1.fasta
/tmp/tmpWxz75u
('Exists after close:', False)
group_2.fasta
/tmp/tmp_a05vQ
('Exists after close:', False)
group_3.fasta
/tmp/tmp0tsj4Z
('Exists after close:', False)
group_4.fasta
/tmp/tmpZO5t_L
('Exists after close:', False)
CPU times: user 6.91 ms, sys: 4.17 ms, total: 11.1 ms
Wall time: 18.5 ms


In [757]:
filename

'../2517287028.genes.faa'

## Subprocess verision of entire file, with no splitting

In [9]:
%%time
from Bio import SeqIO
import os
import subprocess
import shlex

#filename="../76969.assembled.faa"
filename="../2517287028.genes.faa"#m.rosea
#fout=entire_mg1_2.tab
fout="entire_mros.tab"
#fh = SeqIO.parse(open(filename),"fasta")#m.rosea
# with open(filename, "r") as handle:
#     count = SeqIO.write(batch, handle, "fasta")
# print("Wrote %i records to %s" % (count, filename))




#hmmscan --tblout amocab_hmmscan_mros.tab  amoCAB 2517287028.genes.faa > /dev/null
#os.system("makeblastdb -in "+input_ref_0+" -dbtype nucl -title "+title_db+" -out "+outfile_db+" -parse_seqids")

    #print('Exists after close:', os.path.exists(f.name))
# command="hmmscan "+"--cpu 8"+" -o /dev/null --tblout "+"entire_mg1"+".tab " + "../amoCAB " + filename
# print(command)
# subprocess.Popen(command, shell=True)

cmd="hmmscan --cpu 8 -o /dev/null --tblout "+fout+" ../amoCAB "+filename
print(cmd)
p=subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
# print(cmd)
# args = shlex.split(cmd)
#os.system(cmd)
#os.system(cmd)
#print(p.stdout.read())
#output=p.stdout.read()
#os.system(cmd)
#command="parallel -j 8 hmmscan "+"--cpu 1"+" --tblout {}.tab " + "../amoCAB {} ::: "+fnames

#p=subprocess.Popen(args,stdout=subprocess.PIPE)
#print(p.stdout.read())
output=p.stdout.read()

hmmscan --cpu 8 -o /dev/null --tblout entire_mros.tab ../amoCAB ../2517287028.genes.faa
CPU times: user 2.19 ms, sys: 0 ns, total: 2.19 ms
Wall time: 1.51 s


In [10]:
with open(fout, "r") as handle:
    mros_entire=handle.read()


In [11]:
mros_entire=mros_entire.split('\n')
for i in mros_entire:
    print(i)

#                                                               --- full sequence ---- --- best 1 domain ---- --- domain number estimation ----
# target name        accession  query name           accession    E-value  score  bias   E-value  score  bias   exp reg clu  ov env dom rep inc description of target
#------------------- ---------- -------------------- ---------- --------- ------ ----- --------- ------ -----   --- --- --- --- --- --- --- --- ---------------------
AmoC                 PF04896.11 2517401456           -           5.6e-120  386.8  15.8  7.2e-120  386.4  15.8   1.0   1   0   0   1   1   1   1 Ammonia monooxygenase/methane monooxygenase, subunit C
AmoC                 PF04896.11 2517402628           -           5.8e-122  393.3  14.7    7e-122  393.0  14.7   1.0   1   0   0   1   1   1   1 Ammonia monooxygenase/methane monooxygenase, subunit C
AMO                  PF02461.15 2517402629           -           4.8e-124  400.4  24.0  5.4e-124  400.3  24.0   1.0   1   0   

In [789]:
header=mros_entire[1:2]
header[0].split()

['#',
 'target',
 'name',
 'accession',
 'query',
 'name',
 'accession',
 'E-value',
 'score',
 'bias',
 'E-value',
 'score',
 'bias',
 'exp',
 'reg',
 'clu',
 'ov',
 'env',
 'dom',
 'rep',
 'inc',
 'description',
 'of',
 'target']

In [790]:
len(header[0].split())

24

In [791]:
header

['# target name        accession  query name           accession    E-value  score  bias   E-value  score  bias   exp reg clu  ov env dom rep inc description of target']

In [792]:
mros_entire[3:-11]

['AmoC                 PF04896.11 2517401456           -           5.6e-120  386.8  15.8  7.2e-120  386.4  15.8   1.0   1   0   0   1   1   1   1 Ammonia monooxygenase/methane monooxygenase, subunit C',
 'AmoC                 PF04896.11 2517402628           -           5.8e-122  393.3  14.7    7e-122  393.0  14.7   1.0   1   0   0   1   1   1   1 Ammonia monooxygenase/methane monooxygenase, subunit C',
 'AMO                  PF02461.15 2517402629           -           4.8e-124  400.4  24.0  5.4e-124  400.3  24.0   1.0   1   0   0   1   1   1   1 Ammonia monooxygenase',
 'Monooxygenase_B      PF04744.11 2517402630           -           3.3e-170  553.6   0.0  3.7e-170  553.4   0.0   1.0   1   0   0   1   1   1   1 Monooxygenase subunit B protein',
 'AmoC                 PF04896.11 2517403201           -           5.8e-122  393.3  14.7    7e-122  393.0  14.7   1.0   1   0   0   1   1   1   1 Ammonia monooxygenase/methane monooxygenase, subunit C',
 'Monooxygenase_B      PF04744.11 2517403

In [793]:
mros_entire[4].split()[0]

'AmoC'

# Creating a dataframe

In [794]:
#d = df[[p, p.team, p.passing_att, p.passer_rating()] for p in game.players.passing()]

In [795]:
import pandas as pd

In [796]:
[p[5] for p=line.split() in mros_entire[3:-11]]

SyntaxError: invalid syntax (<ipython-input-796-c2fdd2690845>, line 1)

In [797]:
#[(x[1],x[2]) for x in (x.split(";") for x in a.split("\n")) if x[1] != 5]

In [798]:
[[x[0], x[1]] for x in mros_entire[3:-11] for x in (x.split())]

IndexError: string index out of range

In [799]:
a = "1;2;4\n3;4;5"
[(x[1],x[2]) for x in (x.split(";") for x in a.split("\n")) if x[1] != 5]

[('2', '4'), ('4', '5')]

In [800]:
df=pd.DataFrame([x.split() for x in mros_entire[3:-11]])
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,AmoC,PF04896.11,2517401456,-,5.6e-120,386.8,15.8,7.199999999999999e-120,386.4,15.8,...,0,1,1,1,1,Ammonia,monooxygenase/methane,"monooxygenase,",subunit,C
1,AmoC,PF04896.11,2517402628,-,5.8e-122,393.3,14.7,7e-122,393.0,14.7,...,0,1,1,1,1,Ammonia,monooxygenase/methane,"monooxygenase,",subunit,C
2,AMO,PF02461.15,2517402629,-,4.800000000000001e-124,400.4,24.0,5.4e-124,400.3,24.0,...,0,1,1,1,1,Ammonia,monooxygenase,,,
3,Monooxygenase_B,PF04744.11,2517402630,-,3.3e-170,553.6,0.0,3.7e-170,553.4,0.0,...,0,1,1,1,1,Monooxygenase,subunit,B,protein,
4,AmoC,PF04896.11,2517403201,-,5.8e-122,393.3,14.7,7e-122,393.0,14.7,...,0,1,1,1,1,Ammonia,monooxygenase/methane,"monooxygenase,",subunit,C
5,Monooxygenase_B,PF04744.11,2517403996,-,3.3e-170,553.6,0.0,3.7e-170,553.4,0.0,...,0,1,1,1,1,Monooxygenase,subunit,B,protein,
6,AMO,PF02461.15,2517403997,-,4.800000000000001e-124,400.4,24.0,5.4e-124,400.3,24.0,...,0,1,1,1,1,Ammonia,monooxygenase,,,
7,AmoC,PF04896.11,2517403998,-,5.8e-122,393.3,14.7,7e-122,393.0,14.7,...,0,1,1,1,1,Ammonia,monooxygenase/methane,"monooxygenase,",subunit,C
8,AmoC,PF04896.11,2517404225,-,5.600000000000001e-111,357.3,11.5,6.6000000000000005e-111,357.1,11.5,...,0,1,1,1,1,Ammonia,monooxygenase/methane,"monooxygenase,",subunit,C
9,AmoC,PF04896.11,2517404670,-,1.7e-30,93.8,0.0,2.2000000000000002e-30,93.3,0.0,...,0,1,1,1,1,Ammonia,monooxygenase/methane,"monooxygenase,",subunit,C


In [943]:
def joinDescriptionColumns(descr_columns):
    merged=''
    for row in descr_columns:
        if (row != None):
            merged+=row+' '
    #print(descr_columns)
    return merged.strip()

In [944]:
#df['period'] = df[['Year', 'quarter']].apply(lambda x: ''.join(x), axis=1)
df[18]=df.loc[:,18:].apply(joinDescriptionColumns, axis=1)
df=df.loc[:,:18]
df

TypeError: cannot do slice indexing on <class 'pandas.indexes.base.Index'> with these indexers [18] of <type 'int'>

In [259]:
%%time
import os
import subprocess
import shlex
#hmmscan --tblout amocab_hmmscan_mros.tab  amoCAB 2517287028.genes.faa > /dev/null
#os.system("makeblastdb -in "+input_ref_0+" -dbtype nucl -title "+title_db+" -out "+outfile_db+" -parse_seqids")
fnames=""
flabels=""
for (l,f) in files:
    fnames+=f.name+" "
    flabels+=l+" "
#print flabels
    #print(l)
    #print(f.name)
    #parallel -j $1 blastall "-p blastn -d nt/nt -i "{}" -o "{.}".xml -e 1e-10 -m 7 -K 100 -b 200" ::: *.fa
    #print('Exists after close:', os.path.exists(f.name))
#this prints nicely to stdout, and is caught by p.stdout.read()
#cmd="parallel -j 8 hmmscan "+"-o /dev/null --noali --cpu 1"+" --tblout >(tee /dev/stdout)  " + "../amoCAB {} ::: "+fnames
#this prints to the file and stdout
#0.tab should be deleted if exists
if os.path.exists("0.tab"):
    os.remove("0.tab")
cmd="parallel -j 8 hmmscan "+"-o /dev/null --noali --cpu 1"+" --tblout >(tee -a 0.tab)  " + "../amoCAB {} ::: "+fnames
args = shlex.split(cmd)
#os.system(cmd)
p=subprocess.Popen(args,stdout=subprocess.PIPE)
#print(p.stdout.read())
output=p.stdout.read()
#os.system(cmd)
#command="parallel -j 8 hmmscan "+"--cpu 1"+" --tblout {}.tab " + "../amoCAB {} ::: "+fnames
print(cmd)

parallel -j 8 hmmscan -o /dev/null --noali --cpu 1 --tblout >(tee -a 0.tab)  ../amoCAB {} ::: /tmp/tmpsNB_xI /tmp/tmp9XW3yx /tmp/tmpyMn37J /tmp/tmp4h4Gou 
CPU times: user 2.3 ms, sys: 331 µs, total: 2.63 ms
Wall time: 356 ms


## Trying GNU parallel

In [260]:
files

[('group_1.fasta', <open file '<fdopen>', mode 'w+b' at 0x7fbba15f5ed0>),
 ('group_2.fasta', <open file '<fdopen>', mode 'w+b' at 0x7fbb79bbcd20>),
 ('group_3.fasta', <open file '<fdopen>', mode 'w+b' at 0x7fbba15f5f60>),
 ('group_4.fasta', <open file '<fdopen>', mode 'w+b' at 0x7fbb792be030>)]

In [8]:
#this works
#parallel -j 8 hmmscan --cpu 1 --tblout {.}.tab ../amoCAB {} ::: *.fasta > /dev/null


In [35]:
%%time
import os
#hmmscan --tblout amocab_hmmscan_mros.tab  amoCAB 2517287028.genes.faa > /dev/null
#os.system("makeblastdb -in "+input_ref_0+" -dbtype nucl -title "+title_db+" -out "+outfile_db+" -parse_seqids")
fnames=""
flabels=""
for (l,f) in files:
    fnames+=f.name+" "
    flabels+=l+" "
#print flabels
    #print(l)
    #print(f.name)
    #parallel -j $1 blastall "-p blastn -d nt/nt -i "{}" -o "{.}".xml -e 1e-10 -m 7 -K 100 -b 200" ::: *.fa
    #print('Exists after close:', os.path.exists(f.name))
    #this does print in the output:
    #hmmscan --cpu 8 --noali --tblout >(tee 1.tab)  ../amoCAB /tmp/tmp4WuJSN > /dev/null

cmd="parallel -j 8 hmmscan "+"--cpu 1"+" --tblout >(tee {#}.tab) " + "../amoCAB {} ::: "+fnames+" > /dev/null"
os.system(cmd)
#command="parallel -j 8 hmmscan "+"--cpu 1"+" --tblout {}.tab " + "../amoCAB {} ::: "+fnames
print(cmd)
    #os.system(command)

# for (l,f) in files:
#     print(l)
#     print(f.name)
#     f.close()
#     print('Exists after close:', os.path.exists(f.name))

parallel -j 8 hmmscan --cpu 1 --tblout >(tee {#}.tab) ../amoCAB {} ::: /tmp/tmpDjX8B8 /tmp/tmp6fEZVA /tmp/tmprxg_Kg /tmp/tmpG2VDVP  > /dev/null
CPU times: user 928 µs, sys: 122 µs, total: 1.05 ms
Wall time: 1.63 ms


119

# Rewriting to use subprocess module

In [None]:
# x = subprocess.Popen(['touch', 'xyz'])
# >>> print x

In [150]:
%%time
import os
import subprocess
#hmmscan --tblout amocab_hmmscan_mros.tab  amoCAB 2517287028.genes.faa > /dev/null
#os.system("makeblastdb -in "+input_ref_0+" -dbtype nucl -title "+title_db+" -out "+outfile_db+" -parse_seqids")
fnames=""
flabels=""
for (l,f) in files:
    fnames+=f.name+" "
    flabels+=l+" "
#print flabels
    #print(l)
    #print(f.name)
    #parallel -j $1 blastall "-p blastn -d nt/nt -i "{}" -o "{.}".xml -e 1e-10 -m 7 -K 100 -b 200" ::: *.fa
    #print('Exists after close:', os.path.exists(f.name))
cmd="parallel -j 8 hmmscan "+" --noali --cpu 1"+" --tblout >0.tab  " + "../amoCAB {} ::: "+fnames+" > /dev/null"
p=subprocess.Popen(cmd, shell=True,stdout=subprocess.PIPE)
print(p.stdout.read())
#os.system(cmd)
#command="parallel -j 8 hmmscan "+"--cpu 1"+" --tblout {}.tab " + "../amoCAB {} ::: "+fnames
print(cmd)


parallel -j 8 hmmscan  --noali --cpu 1 --tblout >0.tab  ../amoCAB {} ::: /tmp/tmp7100qP /tmp/tmpnD7HAz /tmp/tmpoKVoBZ /tmp/tmpm1kpsd  > /dev/null
CPU times: user 1.03 ms, sys: 4.03 ms, total: 5.06 ms
Wall time: 93.7 ms


## os.system version for output redirection

In [21]:
files

[('group_1.fasta', <open file '<fdopen>', mode 'w+b' at 0x7fbba15f5e40>),
 ('group_2.fasta', <open file '<fdopen>', mode 'w+b' at 0x7fbbafb60150>),
 ('group_3.fasta', <open file '<fdopen>', mode 'w+b' at 0x7fbba15f5c90>),
 ('group_4.fasta', <open file '<fdopen>', mode 'w+b' at 0x7fbbafb600c0>)]

In [24]:
fnames

NameError: name 'fnames' is not defined

## Finally it works: --tblout >(tee >> 0.tab) to the rescue

In [8]:
files

[(0, <tempfile._TemporaryFileWrapper at 0x7f3dde9c70b8>),
 (1, <tempfile._TemporaryFileWrapper at 0x7f3dded38a58>)]

In [116]:
files2={j.name:i for i,j in files}
files2

{'/tmp/tmp0ifa7fhr': 49,
 '/tmp/tmp0zr06t4w': 10,
 '/tmp/tmp1_592eid': 24,
 '/tmp/tmp1fkph2jw': 23,
 '/tmp/tmp1itgy8v9': 110,
 '/tmp/tmp1te8x7fz': 7,
 '/tmp/tmp1y1_zlaz': 76,
 '/tmp/tmp28758qiu': 29,
 '/tmp/tmp2pt0n5uu': 117,
 '/tmp/tmp35pfxpew': 100,
 '/tmp/tmp35spj29e': 1,
 '/tmp/tmp3829di35': 106,
 '/tmp/tmp3ao49lex': 79,
 '/tmp/tmp3qvhmve6': 101,
 '/tmp/tmp3u8mf83p': 99,
 '/tmp/tmp47l4almu': 66,
 '/tmp/tmp4k2gck0b': 53,
 '/tmp/tmp4xgzr196': 46,
 '/tmp/tmp4xpfb1he': 3,
 '/tmp/tmp52d1gh4w': 96,
 '/tmp/tmp52jd5p0f': 84,
 '/tmp/tmp544mvn8q': 2,
 '/tmp/tmp59rpnwml': 102,
 '/tmp/tmp5gda1pem': 32,
 '/tmp/tmp5v6tewhb': 82,
 '/tmp/tmp626m5oem': 88,
 '/tmp/tmp6o18bihe': 107,
 '/tmp/tmp6pux_65w': 64,
 '/tmp/tmp7_qf1bue': 26,
 '/tmp/tmp7wwklmmb': 65,
 '/tmp/tmp8pgkwpo1': 8,
 '/tmp/tmp9dndu8ro': 89,
 '/tmp/tmp9hldbodb': 5,
 '/tmp/tmp_whqcbr5': 31,
 '/tmp/tmpb1w07r9s': 85,
 '/tmp/tmpb7law7w7': 12,
 '/tmp/tmpbf48snt_': 40,
 '/tmp/tmpbhqrb20e': 28,
 '/tmp/tmpbtkpqh70': 56,
 '/tmp/tmpcyc330jn': 112

In [117]:
%%time
import os
import subprocess
import shlex
#hmmscan --tblout amocab_hmmscan_mros.tab  amoCAB 2517287028.genes.faa > /dev/null
#os.system("makeblastdb -in "+input_ref_0+" -dbtype nucl -title "+title_db+" -out "+outfile_db+" -parse_seqids")
fnames=""
flabels=""
for (l,f) in files:
    fnames+=f.name+" "
    flabels+=str(l)+" "
#print flabels
    #print(l)
    #print(f.name)
    #parallel -j $1 blastall "-p blastn -d nt/nt -i "{}" -o "{.}".xml -e 1e-10 -m 7 -K 100 -b 200" ::: *.fa
    #print('Exists after close:', os.path.exists(f.name))
#this prints nicely to stdout, and is caught by p.stdout.read()
#cmd="parallel -j 8 hmmscan "+"-o /dev/null --noali --cpu 1"+" --tblout >(tee /dev/stdout)  " + "../amoCAB {} ::: "+fnames
#this prints to the file and stdout
#0.tab should be deleted if exists
if os.path.exists("0.tab"):
    os.remove("0.tab")
cmd="parallel -j 8 hmmscan "+"-o /dev/null --noali --cpu 1"+" --tblout >(tee -a 0.tab)  " + "../amoCAB {} ::: "+fnames
args = shlex.split(cmd)
#os.system(cmd)
p=subprocess.Popen(args,stdout=subprocess.PIPE)
#print(p.stdout.read())
output=p.stdout.read()
#os.system(cmd)
#command="parallel -j 8 hmmscan "+"--cpu 1"+" --tblout {}.tab " + "../amoCAB {} ::: "+fnames
print(cmd)

parallel -j 8 hmmscan -o /dev/null --noali --cpu 1 --tblout >(tee -a 0.tab)  ../amoCAB {} ::: /tmp/tmpkmnxyh54 /tmp/tmp35spj29e /tmp/tmp544mvn8q /tmp/tmp4xpfb1he /tmp/tmpg9djl04p /tmp/tmp9hldbodb /tmp/tmpg5c71gkh /tmp/tmp1te8x7fz /tmp/tmp8pgkwpo1 /tmp/tmpqlji72nb /tmp/tmp0zr06t4w /tmp/tmpdjakbemx /tmp/tmpb7law7w7 /tmp/tmpvff9ck70 /tmp/tmpksea5fvb /tmp/tmpnvw8lv14 /tmp/tmpuihu7z6g /tmp/tmpo310ptue /tmp/tmpe15zo4nn /tmp/tmpll9ixewh /tmp/tmpm9tmjwx7 /tmp/tmpgl2qif1j /tmp/tmpe38e3s67 /tmp/tmp1fkph2jw /tmp/tmp1_592eid /tmp/tmppsmthbz5 /tmp/tmp7_qf1bue /tmp/tmpzwd50atk /tmp/tmpbhqrb20e /tmp/tmp28758qiu /tmp/tmpht6f0a92 /tmp/tmp_whqcbr5 /tmp/tmp5gda1pem /tmp/tmpmgx3hg8l /tmp/tmprsztfba7 /tmp/tmpz_7ytxdv /tmp/tmpuwd6s_hx /tmp/tmpt888hzu0 /tmp/tmpp44zgftp /tmp/tmpt2vnbetx /tmp/tmpbf48snt_ /tmp/tmptghqn3p2 /tmp/tmph5tx84o1 /tmp/tmpq7umltwx /tmp/tmpr7bir7r9 /tmp/tmpn7rj2y8k /tmp/tmp4xgzr196 /tmp/tmpk5jav5eu /tmp/tmpkblv6zei /tmp/tmp0ifa7fhr /tmp/tmpsk7ob8vt /tmp/tmpzv7r6fn0 /tmp/tmpyz9c3jfb /tmp/

In [118]:
output

b'#                                                               --- full sequence ---- --- best 1 domain ---- --- domain number estimation ----\n# target name        accession  query name           accession    E-value  score  bias   E-value  score  bias   exp reg clu  ov env dom rep inc description of target\n#------------------- ---------- -------------------- ---------- --------- ------ ----- --------- ------ -----   --- --- --- --- --- --- --- --- ---------------------\nMonooxygenase_B      PF04744.11 25186                -            1.3e-05   11.8   0.0   1.6e-05   11.5   0.0   1.0   1   0   0   1   1   1   1 Monooxygenase subunit B protein\n#\n# Program:         hmmscan\n# Version:         3.1b2 (February 2015)\n# Pipeline mode:   SCAN\n# Query file:      /tmp/tmpkmnxyh54\n# Target file:     ../amoCAB\n# Option settings: hmmscan -o /dev/null --tblout /dev/fd/63 --noali --cpu 1 ../amoCAB /tmp/tmpkmnxyh54 \n# Current dir:     /media/andriy/5E8D984477029FC2/scripts/operon_finder/

In [119]:
len(files)

119

In [120]:
len(output)

177533

In [9]:
split_line="#                                                               --- full sequence ---- --- best 1 domain ---- --- domain number estimation ----"
split_line="#                                                                --- full sequence ---- --- best 1 domain ---- --- domain number estimation ----"
chunks=output.split(split_line)

In [121]:
#imporved split with regex
import re

chunks=re.split(r"#\s+--- full sequence ---- --- best 1 domain ---- --- domain number estimation ----", output.decode())


In [16]:
#output

In [17]:
#chunks

In [102]:
#chunks

In [13]:
import re
re.findall('# Query file:\s*(.*)', chunks[0])

[]

In [14]:
#chunks

In [15]:
#mros_entire

In [1]:
# for i in chunk.split('\n'):
#     print('+'+i)

In [None]:
'#                                                                --- full sequence ---- --- best 1 domain ---- --- domain number estimation ----'

In [2]:
# for chunk in chunks:
#     print('<')
#     print(chunk)
#     print(len(chunk))
#     print('>')

In [104]:
#files2

In [None]:
# df=pd.DataFrame([x.split() for x in mros_entire[3:-11]])

# #df['period'] = df[['Year', 'quarter']].apply(lambda x: ''.join(x), axis=1)
# df[18]=df.loc[:,18:].apply(joinDescriptionColumns, axis=1)
# df=df.loc[:,:18]
# df

In [987]:
#o=output.split('\n')
#o

In [983]:
#df_scramb

In [122]:
def joinDescriptionColumns(descr_columns):
    merged=''
    for row in descr_columns.dropna():
        if (row != None):
            #print(merged)
            merged+=row+' '
    #print(descr_columns)
    return merged.strip()

In [123]:
del df_scramb

In [107]:
df_scramb

NameError: name 'df_scramb' is not defined

In [124]:
import pandas as pd

In [137]:
total=[]
df_scramb=pd.DataFrame()
test_cnt=0
for chunk in chunks:
    if chunk != '':
        #print((chunk))

        data=chunk.split('\n')[3:-11]
#         if test_cnt <2:
#             print(data)
        test_cnt+=1
        #print(data)
        if data:
            d=pd.DataFrame([x.split() for x in data])
        else:
            continue
        
        query=re.findall('# Query file:\s*(.*)', chunk)[0]
        label=files2[query]
        #print(query)
        #d['query file']=query
        #df.insert(loc=idx, column='A', value=new_col)
        d.insert(loc=0, column='query file', value=query)
        d.insert(loc=1, column='label', value=label)
        #d['label']=files2[query]
        df_scramb=df_scramb.append(d)
        #total+=chunk.split("\n")[3:-11]
#seems to have fixed the performance warning
df_scramb.sort_values(by='query file', inplace=True)
        
df_scramb.set_index(['query file', 'label'], inplace=True)

#df_scramb[18]

df_scramb.iloc[:,18]=df_scramb.iloc[:,18:].apply(joinDescriptionColumns, axis=1)
df_scramb=df_scramb.iloc[:,:19]
#df_scramb.reset_index(inplace=True)
#df_scramb.sort_index(level=1, inplace=True)#this seems unnecessary since later we are sorting by position

df_scramb[2]=df_scramb[2].astype(int)
df_scramb.reset_index(inplace=True)
df_scramb.sort_values(by=2, inplace=True)


In [138]:
df_scramb

Unnamed: 0,query file,label,0,1,2,3,4,5,6,7,...,9,10,11,12,13,14,15,16,17,18
216,/tmp/tmpkmnxyh54,0,Monooxygenase_B,PF04744.11,25186,-,1.3e-05,11.8,0.0,1.6e-05,...,0.0,1.0,1,0,0,1,1,1,1,Monooxygenase subunit B protein
56,/tmp/tmp35spj29e,1,Monooxygenase_B,PF04744.11,54949,-,0.0021,4.6,6.9,0.00044,...,3.1,1.6,2,0,0,2,2,2,1,Monooxygenase subunit B protein
83,/tmp/tmp544mvn8q,2,AmoC,PF04896.11,67923,-,1.8e-05,11.8,0.5,2.9e-05,...,0.5,1.3,1,0,0,1,1,1,1,"Ammonia monooxygenase/methane monooxygenase, s..."
82,/tmp/tmp544mvn8q,2,AmoC,PF04896.11,72299,-,4.3e-06,13.8,0.2,4.6e-06,...,0.2,1.0,1,0,0,1,1,1,1,"Ammonia monooxygenase/methane monooxygenase, s..."
84,/tmp/tmp544mvn8q,2,Monooxygenase_B,PF04744.11,80785,-,9.1e-99,318.4,0.6,1.1e-98,...,0.6,1.0,1,0,0,1,1,1,1,Monooxygenase subunit B protein
87,/tmp/tmp544mvn8q,2,AMO,PF02461.15,80787,-,1.2e-66,212.6,23.8,1.4e-66,...,23.8,1.0,1,0,0,1,1,1,1,Ammonia monooxygenase
85,/tmp/tmp544mvn8q,2,AmoC,PF04896.11,80788,-,1.4e-55,175.9,16.2,1.6e-55,...,16.2,1.0,1,0,0,1,1,1,1,"Ammonia monooxygenase/methane monooxygenase, s..."
86,/tmp/tmp544mvn8q,2,AmoC,PF04896.11,87047,-,1.5e-06,15.3,1.6,3e-06,...,1.6,1.6,1,0,0,1,1,1,1,"Ammonia monooxygenase/methane monooxygenase, s..."
77,/tmp/tmp4xpfb1he,3,AmoC,PF04896.11,94490,-,3.6e-07,17.3,0.8,3.6e-07,...,0.8,2.4,2,1,0,2,2,2,1,"Ammonia monooxygenase/methane monooxygenase, s..."
76,/tmp/tmp4xpfb1he,3,AmoC,PF04896.11,116259,-,0.015,2.2,15.5,0.00011,...,4.5,2.8,3,1,0,3,3,3,0,"Ammonia monooxygenase/methane monooxygenase, s..."


In [1006]:
fragment="""# target name        accession  query name           accession    E-value  score  bias   E-value  score  bias   exp reg clu  ov env dom rep inc description of target
#------------------- ---------- -------------------- ---------- --------- ------ ----- --------- ------ -----   --- --- --- --- --- --- --- --- ---------------------
Monooxygenase_B      PF04744.11 Ga0073928_1000008053 -            1.3e-05   11.8   0.0   1.6e-05   11.5   0.0   1.0   1   0   0   1   1   1   1 Monooxygenase subunit B protein
#
# Program:         hmmscan
# Version:         3.1b2 (February 2015)
# Pipeline mode:   SCAN
# Query file:      /tmp/tmpkdKejz
# Target file:     ../amoCAB
# Option settings: hmmscan -o /dev/null --tblout /dev/fd/61 --noali --cpu 1 ../amoCAB /tmp/tmpkdKejz 
# Current dir:     /media/andriy/5E8D984477029FC2/scripts/operon_finder/gen_func_file_parsing
# Date:            Thu Oct  5 22:29:52 2017
# [ok]

"""

In [192]:
def n(a):
    global b
    return a-b
def m():
    =4
    b=3
    
    return a+b

SyntaxError: invalid syntax (<ipython-input-192-9ad4ce2e54b4>, line 5)

In [191]:
()

7

In [1089]:
#df_scramb

In [22]:
#df

In [24]:
#df2[18][0]

In [803]:
df==df2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
1,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
2,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
3,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
4,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
5,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
6,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
7,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
8,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
9,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True


In [28]:
df_scramb

Unnamed: 0,query file,label,0,1,2,3,4,5,6,7,...,9,10,11,12,13,14,15,16,17,18
0,/tmp/tmp0xm_7iqz,0,AmoC,PF04896.11,320,-,5.6e-120,386.8,15.8,7.199999999999999e-120,...,15.8,1.0,1,0,0,1,1,1,1,"Ammonia monooxygenase/methane monooxygenase, s..."
1,/tmp/tmp0xm_7iqz,0,AmoC,PF04896.11,1475,-,5.8e-122,393.3,14.7,7e-122,...,14.7,1.0,1,0,0,1,1,1,1,"Ammonia monooxygenase/methane monooxygenase, s..."
2,/tmp/tmp0xm_7iqz,0,AMO,PF02461.15,1476,-,4.800000000000001e-124,400.4,24.0,5.4e-124,...,24.0,1.0,1,0,0,1,1,1,1,Ammonia monooxygenase
3,/tmp/tmp0xm_7iqz,0,Monooxygenase_B,PF04744.11,1477,-,3.3e-170,553.6,0.0,3.7e-170,...,0.0,1.0,1,0,0,1,1,1,1,Monooxygenase subunit B protein
4,/tmp/tmp0xm_7iqz,0,AmoC,PF04896.11,2043,-,5.8e-122,393.3,14.7,7e-122,...,14.7,1.0,1,0,0,1,1,1,1,"Ammonia monooxygenase/methane monooxygenase, s..."
5,/tmp/tmp0xm_7iqz,0,Monooxygenase_B,PF04744.11,2816,-,3.3e-170,553.6,0.0,3.7e-170,...,0.0,1.0,1,0,0,1,1,1,1,Monooxygenase subunit B protein
6,/tmp/tmp0xm_7iqz,0,AMO,PF02461.15,2817,-,4.800000000000001e-124,400.4,24.0,5.4e-124,...,24.0,1.0,1,0,0,1,1,1,1,Ammonia monooxygenase
7,/tmp/tmp0xm_7iqz,0,AmoC,PF04896.11,2818,-,5.8e-122,393.3,14.7,7e-122,...,14.7,1.0,1,0,0,1,1,1,1,"Ammonia monooxygenase/methane monooxygenase, s..."
8,/tmp/tmp13o6_f9t,1,AmoC,PF04896.11,3042,-,5.600000000000001e-111,357.3,11.5,6.6000000000000005e-111,...,11.5,1.0,1,0,0,1,1,1,1,"Ammonia monooxygenase/methane monooxygenase, s..."
9,/tmp/tmp13o6_f9t,1,AmoC,PF04896.11,3482,-,1.7e-30,93.8,0.0,2.2000000000000002e-30,...,0.0,1.1,1,0,0,1,1,1,1,"Ammonia monooxygenase/methane monooxygenase, s..."


In [249]:
# def sizes(s):
#     s['size_kb'] = locale.format("%.1f", s['size'] / 1024.0, grouping=True) + ' KB'
#     s['size_mb'] = locale.format("%.1f", s['size'] / 1024.0 ** 2, grouping=True) + ' MB'
#     s['size_gb'] = locale.format("%.1f", s['size'] / 1024.0 ** 3, grouping=True) + ' GB'
#     return s

# df_test = df_test.append(rows_list)
# df_test = df_test.apply(sizes, axis=1)

In [285]:
files2['/tmp/tmp4h4Gou']

'group_4.fasta'

# Putting it all together

## Split file version - needs a fix

In [27]:
filename

NameError: name 'filename' is not defined

In [1091]:
#df_scramb

### A rookie fix with scanning the entire mg1 instead of combining chunks

In [133]:
filename='../2517287028.genes.faa'
filename='../76969.assembled.faa'

In [134]:


#names

In [139]:
def extractFeatures(ids):
    features=[]
    pos=0
    for record in SeqIO.parse(filename,"fasta"):
        pos+=1
        if record.name in ids:
            features.append({'qid':record.id, 'position':pos, 'descr':record.description, 'seq':str(record.seq)})
    return pd.DataFrame(features)

def extractFeatures2(ids):#putting in a seq object instead of string
    features=[]
    pos=0
    for record in SeqIO.parse(filename,"fasta"):
        pos+=1
        if record.name in ids:
            features.append({'qid':record.id, 'position':pos, 'descr':record.description, 'seq':record})
    return pd.DataFrame(features)

def extractFeatures3(ids):#putting in a seq object instead of string
    features=[]
    pos=0
    for record in SeqIO.parse(filename,"fasta"):
        pos+=1
        if record.name in ids:
            features.append({'qid':record.id, 'position':pos, 'descr':record.description, 'seq':record})
    return pd.DataFrame(features)

def is_operon(x):
    return x['diff1'] or x['diff2']

def rel_coordinates(x):
    if ((x['diff1'] < max_distance) or (x['diff2']) <max_distance):
        return min(x['diff1'],x['diff2'])
    else:
        return 0

#this function combines adjacent operons(fragments) into one
def operonCount(lst):
    opCnt=0
    state=False
    for i in xrange(len(lst)):
        if lst[i]:
            newState=True
            if (state==False) and (newState==True):
                opCnt+=1
            lst[i]=opCnt
            state=newState
        else:
            state=False
    return lst

#this function splits adjacent operons, based on distance
#for the operons of large size it will artificially break it into pieces

def operonCount2(lst, pos):
    opCnt=0
    state=False
    position=0
    
    for i in range(len(lst)):
        if lst[i]:
            newState=True
            #position=pos[i]
            if (state==False) and (newState==True):
                position=pos[i]
                opCnt+=1
            if (pos[i]>position+max_distance+min_operon_size+5):#!!! needs a fix so large operons won't be affected
                opCnt+=1
    
            lst[i]=opCnt
            state=newState
        else:
            state=False
    return lst

In [140]:
names=df_scramb[2].tolist()
names

[25186,
 54949,
 67923,
 72299,
 80785,
 80787,
 80788,
 87047,
 94490,
 116259,
 117995,
 141868,
 150445,
 150642,
 150643,
 150644,
 150645,
 150941,
 165876,
 165877,
 165879,
 177743,
 180794,
 187576,
 191730,
 200529,
 216390,
 216725,
 226146,
 227425,
 246097,
 255637,
 270559,
 272936,
 277115,
 301546,
 302538,
 314457,
 314458,
 314459,
 319844,
 326405,
 342438,
 355939,
 360372,
 378380,
 410777,
 432475,
 441716,
 456425,
 485659,
 488205,
 488206,
 513998,
 526114,
 530509,
 530792,
 560549,
 569326,
 579432,
 611428,
 617055,
 618212,
 624564,
 633544,
 650252,
 692980,
 708958,
 715358,
 718061,
 753688,
 760351,
 768877,
 782393,
 803081,
 803082,
 811556,
 814898,
 834627,
 845893,
 845894,
 845895,
 852087,
 854309,
 864966,
 892604,
 892605,
 892606,
 902819,
 905808,
 905809,
 920255,
 929162,
 935308,
 953386,
 965056,
 971025,
 975259,
 993079,
 999318,
 1002023,
 1007208,
 1022832,
 1039444,
 1039445,
 1051777,
 1097273,
 1105031,
 1108955,
 1108956,
 1114924,

In [141]:
def extractFeatures4(positions, generator):#putting in a seq object instead of string
    #file_iter = SeqIO.parse(open(filename),"fasta")#m.rosea
    offset=0
    features=[]
    #entries=list()
    for i in range(len(positions)):
        position=names[i]-offset
        offset=names[i]
        record=next(islice(generator, position-1,position))
        
        features.append({'qid':record.id, 'position':positions[i], 'descr':record.description, 'seq':record})
    return pd.DataFrame(features)

In [80]:
#df_scramb

In [142]:


# file_iter = SeqIO.parse(open(filename),"fasta")#m.rosea
# next(islice(file_iter, 319,319+1))
# #extract_features4(names, file_iter)


In [143]:
# file_iter = SeqIO.parse(open(filename),"fasta")#m.rosea
# offset=0
# entries=list()
# for i in range(len(names)):
#     position=names[i]-offset
#     offset=names[i]
#     entries.append(next(islice(file_iter, position-1,position)))

In [144]:
# for entry in entries:
#     print(entry)

In [145]:
# from itertools import islice
# next(islice(file_iter, 10,11))

In [146]:
# next(file_iter)

In [98]:
# file_iter = SeqIO.parse(open(filename),"fasta")

# extractFeatures4(names, file_iter)

Unnamed: 0,descr,position,qid,seq
0,2517401456 A3OODRAFT_0323 methane monooxygenas...,320,2517401456,"(M, S, V, T, T, Q, S, A, A, G, A, M, D, R, A, ..."
1,2517402628 A3OODRAFT_1495 methane monooxygenas...,1475,2517402628,"(M, S, S, T, T, D, T, A, A, R, A, A, A, G, T, ..."
2,2517402629 A3OODRAFT_1496 methane monooxygenas...,1476,2517402629,"(M, S, Q, S, K, S, G, G, A, V, G, P, F, N, S, ..."
3,2517402630 A3OODRAFT_1497 methane monooxygenas...,1477,2517402630,"(M, K, K, F, V, K, L, A, A, I, G, A, A, A, A, ..."
4,2517403201 A3OODRAFT_2068 methane monooxygenas...,2043,2517403201,"(M, S, S, T, T, D, T, A, A, R, A, A, A, G, T, ..."
5,2517403996 A3OODRAFT_2863 methane monooxygenas...,2816,2517403996,"(M, K, K, F, V, K, L, A, A, I, G, A, A, A, A, ..."
6,2517403997 A3OODRAFT_2864 methane monooxygenas...,2817,2517403997,"(M, S, Q, S, K, S, G, G, A, V, G, P, F, N, S, ..."
7,2517403998 A3OODRAFT_2865 methane monooxygenas...,2818,2517403998,"(M, S, S, T, T, D, T, A, A, R, A, A, A, G, T, ..."
8,2517404225 A3OODRAFT_3092 Ammonia monooxygenas...,3042,2517404225,"(M, S, M, T, K, T, E, A, R, S, A, A, R, S, A, ..."
9,2517404670 A3OODRAFT_3537 Ammonia monooxygenas...,3482,2517404670,"(M, S, E, T, T, Q, S, A, V, G, A, I, E, R, A, ..."


In [147]:
# df_scramb

In [54]:
df_scramb1=df_scramb.rename(columns={2:'qid'})

In [None]:
df_scramb1.merge(extractFeatures2(names), left_index=True, right_index=True)#doesn't work for a scrambled file

In [148]:
file_iter = SeqIO.parse(open(filename),"fasta")
names=df_scramb[2].tolist()
df_scramb=df_scramb.merge(extractFeatures4(names, file_iter), left_on=2, how='inner', right_on='position', suffixes=('',''))
#merging on index to fix dataframe duplication when ids are duplicated
#df_scramb=df_scramb.merge(extractFeatures2(names), left_index=True, how='inner', right_index=True, suffixes=('',''))
df_scramb.sort_values('position', inplace=True)
df_scramb

Unnamed: 0,query file,label,0,1,2,3,4,5,6,7,...,13,14,15,16,17,18,descr,position,qid,seq
0,/tmp/tmpkmnxyh54,0,Monooxygenase_B,PF04744.11,25186,-,1.3e-05,11.8,0.0,1.6e-05,...,0,1,1,1,1,Monooxygenase subunit B protein,Ga0073928_1000008053,25186,Ga0073928_1000008053,"(M, R, N, W, L, P, P, V, L, I, L, L, I, A, G, ..."
1,/tmp/tmp35spj29e,1,Monooxygenase_B,PF04744.11,54949,-,0.0021,4.6,6.9,0.00044,...,0,2,2,2,1,Monooxygenase subunit B protein,Ga0073928_1000028111,54949,Ga0073928_1000028111,"(M, T, A, T, N, G, P, A, L, T, V, W, F, A, L, ..."
2,/tmp/tmp544mvn8q,2,AmoC,PF04896.11,67923,-,1.8e-05,11.8,0.5,2.9e-05,...,0,1,1,1,1,"Ammonia monooxygenase/methane monooxygenase, s...",Ga0073928_1000040165,67923,Ga0073928_1000040165,"(L, R, P, L, T, L, S, R, L, L, T, H, C, A, I, ..."
3,/tmp/tmp544mvn8q,2,AmoC,PF04896.11,72299,-,4.3e-06,13.8,0.2,4.6e-06,...,0,1,1,1,1,"Ammonia monooxygenase/methane monooxygenase, s...",Ga0073928_100004499,72299,Ga0073928_100004499,"(M, E, Q, R, R, S, L, L, A, V, G, F, L, A, V, ..."
4,/tmp/tmp544mvn8q,2,Monooxygenase_B,PF04744.11,80785,-,9.1e-99,318.4,0.6,1.1e-98,...,0,1,1,1,1,Monooxygenase subunit B protein,Ga0073928_1000054854,80785,Ga0073928_1000054854,"(M, R, I, R, F, P, V, L, S, A, A, L, L, I, L, ..."
5,/tmp/tmp544mvn8q,2,AMO,PF02461.15,80787,-,1.2e-66,212.6,23.8,1.4e-66,...,0,1,1,1,1,Ammonia monooxygenase,Ga0073928_1000054856,80787,Ga0073928_1000054856,"(M, A, T, A, P, S, I, A, A, A, A, A, A, E, R, ..."
6,/tmp/tmp544mvn8q,2,AmoC,PF04896.11,80788,-,1.4e-55,175.9,16.2,1.6e-55,...,0,1,1,1,1,"Ammonia monooxygenase/methane monooxygenase, s...",Ga0073928_1000054857,80788,Ga0073928_1000054857,"(M, R, S, M, E, E, V, A, A, G, R, P, P, T, G, ..."
7,/tmp/tmp544mvn8q,2,AmoC,PF04896.11,87047,-,1.5e-06,15.3,1.6,3e-06,...,0,1,1,1,1,"Ammonia monooxygenase/methane monooxygenase, s...",Ga0073928_1000062722,87047,Ga0073928_1000062722,"(M, T, T, V, D, V, M, N, L, F, V, S, A, V, L, ..."
8,/tmp/tmp4xpfb1he,3,AmoC,PF04896.11,94490,-,3.6e-07,17.3,0.8,3.6e-07,...,0,2,2,2,1,"Ammonia monooxygenase/methane monooxygenase, s...",Ga0073928_1000073153,94490,Ga0073928_1000073153,"(M, P, A, P, S, R, L, S, E, I, D, V, T, R, G, ..."
9,/tmp/tmp4xpfb1he,3,AmoC,PF04896.11,116259,-,0.015,2.2,15.5,0.00011,...,0,3,3,3,0,"Ammonia monooxygenase/methane monooxygenase, s...",Ga0073928_1000110012,116259,Ga0073928_1000110012,"(M, S, S, A, A, S, A, P, A, A, L, E, V, S, S, ..."


In [149]:
import numpy as np
max_distance=3
min_operon_size=2

#df_scramb.sort_values('position')
model2type={'AmoC': 'C', 'AMO': 'A', 'Monooxygenase_B': 'B'}



l=df_scramb['position'].tolist()

df_scramb['diff1']=abs((np.array(l)-np.array([0] + l[:-1])))#shift to left
df_scramb['diff2']=abs(np.array(l)-np.array(l[1:]+[0]))#shift to right
#df[(df['diff1'] <= 2) | (df['diff2'] <= 2)]

df_scramb['is_operon']=df_scramb[['diff1', 'diff2']].apply(lambda x: x < max_distance).apply(is_operon, axis=1)

df_scramb['rel_coordinates']=df_scramb[['diff1', 'diff2']].apply(rel_coordinates, axis=1)

#df_scramb['operon_count']=operonCount(df_scramb['is_operon'].tolist(), df_scramb['position'].tolist())
df_scramb['operon_count']=operonCount2(df_scramb['is_operon'].tolist(), df_scramb['position'].tolist())

df_scramb['type']=df_scramb[0].apply(lambda x: model2type[x])
df_scramb

Unnamed: 0,query file,label,0,1,2,3,4,5,6,7,...,descr,position,qid,seq,diff1,diff2,is_operon,rel_coordinates,operon_count,type
0,/tmp/tmpkmnxyh54,0,Monooxygenase_B,PF04744.11,25186,-,1.3e-05,11.8,0.0,1.6e-05,...,Ga0073928_1000008053,25186,Ga0073928_1000008053,"(M, R, N, W, L, P, P, V, L, I, L, L, I, A, G, ...",25186,29763,False,0,False,B
1,/tmp/tmp35spj29e,1,Monooxygenase_B,PF04744.11,54949,-,0.0021,4.6,6.9,0.00044,...,Ga0073928_1000028111,54949,Ga0073928_1000028111,"(M, T, A, T, N, G, P, A, L, T, V, W, F, A, L, ...",29763,12974,False,0,False,B
2,/tmp/tmp544mvn8q,2,AmoC,PF04896.11,67923,-,1.8e-05,11.8,0.5,2.9e-05,...,Ga0073928_1000040165,67923,Ga0073928_1000040165,"(L, R, P, L, T, L, S, R, L, L, T, H, C, A, I, ...",12974,4376,False,0,False,C
3,/tmp/tmp544mvn8q,2,AmoC,PF04896.11,72299,-,4.3e-06,13.8,0.2,4.6e-06,...,Ga0073928_100004499,72299,Ga0073928_100004499,"(M, E, Q, R, R, S, L, L, A, V, G, F, L, A, V, ...",4376,8486,False,0,False,C
4,/tmp/tmp544mvn8q,2,Monooxygenase_B,PF04744.11,80785,-,9.1e-99,318.4,0.6,1.1e-98,...,Ga0073928_1000054854,80785,Ga0073928_1000054854,"(M, R, I, R, F, P, V, L, S, A, A, L, L, I, L, ...",8486,2,True,2,1,B
5,/tmp/tmp544mvn8q,2,AMO,PF02461.15,80787,-,1.2e-66,212.6,23.8,1.4e-66,...,Ga0073928_1000054856,80787,Ga0073928_1000054856,"(M, A, T, A, P, S, I, A, A, A, A, A, A, E, R, ...",2,1,True,1,1,A
6,/tmp/tmp544mvn8q,2,AmoC,PF04896.11,80788,-,1.4e-55,175.9,16.2,1.6e-55,...,Ga0073928_1000054857,80788,Ga0073928_1000054857,"(M, R, S, M, E, E, V, A, A, G, R, P, P, T, G, ...",1,6259,True,1,1,C
7,/tmp/tmp544mvn8q,2,AmoC,PF04896.11,87047,-,1.5e-06,15.3,1.6,3e-06,...,Ga0073928_1000062722,87047,Ga0073928_1000062722,"(M, T, T, V, D, V, M, N, L, F, V, S, A, V, L, ...",6259,7443,False,0,False,C
8,/tmp/tmp4xpfb1he,3,AmoC,PF04896.11,94490,-,3.6e-07,17.3,0.8,3.6e-07,...,Ga0073928_1000073153,94490,Ga0073928_1000073153,"(M, P, A, P, S, R, L, S, E, I, D, V, T, R, G, ...",7443,21769,False,0,False,C
9,/tmp/tmp4xpfb1he,3,AmoC,PF04896.11,116259,-,0.015,2.2,15.5,0.00011,...,Ga0073928_1000110012,116259,Ga0073928_1000110012,"(M, S, S, A, A, S, A, P, A, A, L, E, V, S, S, ...",21769,1736,False,0,False,C


In [150]:
operons=list()
output=pd.DataFrame()
for count,frame in df_scramb.groupby('operon_count', sort=False):
    if count > 0:
        #print("***********"+str(count)+"*****************************")
        #print(frame)
#         
        #frame.columns = ['{}_{}'.format(*c) for c in df.columns]

        #frame=frame.stack().to_frame().T
        
        #frame.columns=frame.columns.droplevel(0)
        operons.append(frame.copy())
        #output=output.append(frame, ignore_index=True)
        #print('===========================================================')

In [152]:
operons[1]

Unnamed: 0,query file,label,0,1,2,3,4,5,6,7,...,descr,position,qid,seq,diff1,diff2,is_operon,rel_coordinates,operon_count,type
13,/tmp/tmp9hldbodb,5,Monooxygenase_B,PF04744.11,150642,-,3.4000000000000003e-118,382.4,0.0,3.8e-118,...,Ga0073928_1000187914,150642,Ga0073928_1000187914,"(M, A, N, W, P, L, E, I, P, S, P, A, G, L, T, ...",197,1,True,1,2,B
14,/tmp/tmp9hldbodb,5,Monooxygenase_B,PF04744.11,150643,-,3.3e-15,43.4,0.0,3.6e-15,...,Ga0073928_1000187915,150643,Ga0073928_1000187915,"(V, T, L, P, A, L, L, A, A, V, I, A, L, G, F, ...",1,1,True,1,2,B
15,/tmp/tmp9hldbodb,5,AMO,PF02461.15,150644,-,2.2e-93,300.1,22.5,2.4e-93,...,Ga0073928_1000187916,150644,Ga0073928_1000187916,"(M, T, A, A, R, M, A, R, S, Q, P, A, I, T, S, ...",1,1,True,1,2,A
16,/tmp/tmp9hldbodb,5,AmoC,PF04896.11,150645,-,1.9e-93,299.9,15.3,2.4e-93,...,Ga0073928_1000187917,150645,Ga0073928_1000187917,"(M, A, T, A, L, N, T, A, K, L, G, A, P, A, Q, ...",1,296,True,1,2,C


In [104]:
''.join(operons[1].reset_index(drop=True)['type'].sort_index(ascending=False).tolist())

'CABB'

In [105]:
tmp=operons[1].reset_index(drop=True)
tmp

Unnamed: 0,query file,label,0,1,2,3,4,5,6,7,...,descr,position,qid,seq,diff1,diff2,is_operon,rel_coordinates,operon_count,type
0,/tmp/tmpYiCF3t,5,Monooxygenase_B,PF04744.11,Ga0073928_1000187914,-,3.4000000000000003e-118,382.4,0.0,3.8e-118,...,Ga0073928_1000187914,150642,Ga0073928_1000187914,"(M, A, N, W, P, L, E, I, P, S, P, A, G, L, T, ...",197,1,True,1,2,B
1,/tmp/tmpYiCF3t,5,Monooxygenase_B,PF04744.11,Ga0073928_1000187915,-,3.3e-15,43.4,0.0,3.6e-15,...,Ga0073928_1000187915,150643,Ga0073928_1000187915,"(V, T, L, P, A, L, L, A, A, V, I, A, L, G, F, ...",1,1,True,1,2,B
2,/tmp/tmpYiCF3t,5,AMO,PF02461.15,Ga0073928_1000187916,-,2.2e-93,300.1,22.5,2.4e-93,...,Ga0073928_1000187916,150644,Ga0073928_1000187916,"(M, T, A, A, R, M, A, R, S, Q, P, A, I, T, S, ...",1,1,True,1,2,A
3,/tmp/tmpYiCF3t,5,AmoC,PF04896.11,Ga0073928_1000187917,-,1.9e-93,299.9,15.3,2.4e-93,...,Ga0073928_1000187917,150645,Ga0073928_1000187917,"(M, A, T, A, L, N, T, A, K, L, G, A, P, A, Q, ...",1,296,True,1,2,C


In [1197]:
operon_structure=''.join(tmp['type'].tolist())
if ('BAC' or 'CAB') not in operon_structure:    
    print('no operon')
elif 'BAC' in operon_structure:
    print('reversed operon')
    tmp.sort_index(ascending=False, inplace=True)
elif 'CAB' in operon_structure:
    print('forward operon')

reversed operon


In [1249]:
for i,f in tmp.groupby('type', sort=False):
    print(i, f.sort_values(4).head(1))

('C',        query file  label     0           1                     2  3        4  \
3  /tmp/tmplLe78K      5  AmoC  PF04896.11  Ga0073928_1000187917  -  1.9e-93   

       5     6        7 ...                  descr position  \
3  299.9  15.3  2.4e-93 ...   Ga0073928_1000187917   150645   

                    qid                                                seq  \
3  Ga0073928_1000187917  MATALNTAKLGAPAQAPPLAIRSDDYRWISPTLMVLVCGIVGILMI...   

  diff1 diff2 is_operon rel_coordinates operon_count type  
3     1   296      True               1            2    C  

[1 rows x 31 columns])
('A',        query file  label    0           1                     2  3        4  \
2  /tmp/tmplLe78K      5  AMO  PF02461.15  Ga0073928_1000187916  -  2.2e-93   

       5     6        7 ...                  descr position  \
2  300.1  22.5  2.4e-93 ...   Ga0073928_1000187916   150644   

                    qid                                                seq  \
2  Ga0073928_1000187916  MTAARMARSQ

In [107]:
operons[11]

Unnamed: 0,query file,label,0,1,2,3,4,5,6,7,...,descr,position,qid,seq,diff1,diff2,is_operon,rel_coordinates,operon_count,type
125,/tmp/tmpmG3LIw,41,Monooxygenase_B,PF04744.11,Ga0073928_102923531,-,5.3e-65,207.3,2.1,6.5e-65,...,Ga0073928_102923531,1236689,Ga0073928_102923531,"(M, R, I, R, F, A, V, I, L, S, L, V, A, V, L, ...",7399,2,True,2,12,B
124,/tmp/tmpmG3LIw,41,AMO,PF02461.15,Ga0073928_102923533,-,5.1e-15,43.5,0.2,5.1e-15,...,Ga0073928_102923533,1236691,Ga0073928_102923533,"(Q, G, I, A, Y, I, R, S, Q, T, P, E, Y, L, R, ...",2,8428,True,2,12,A


## That's how it is suggested to process operons if full_operons=True

In [33]:
#
[str(x.seq) for x in operons[1]['seq'].values]

['MANWPLEIPSPAGLTFLNVGVPGPVFIRTASTVNGVSGASSMSLGLGHDYEYRIVLRGRIPGRYHVHPMLDVRDTGPILGPGKWVSVTGNAAAFTDPVTTLTGRRLDLARYGLGAIAGWHLVWVVVALLWLGYWLRKPLLMPRYEAVQAGDGDRLITRPDRIAAAVILAITLIVATGETVLTNIAFPVTIPLQSNQTKVPPLVAAPAAITLVPEQVTYLVPGRTVQFRLRVTNLSVKPIRFGEFTTANLRFLNRAVVKAESDYPQDLIAPAGLVVRPGEPIAPGVSEEVYIEATSSVWETDRLTALMSDPTNRIGGLFMFYAPDGERSLVEFSSPIVPTFTQPVRDAE*',
 'VTLPALLAAVIALGFSANALAHGERAQEPSLRLSTIQWYDTKWSSDRIGVNQELTLSGAST*',
 'MTAARMARSQPAITSAEDLRRAYRWFDLVVAVLLFFLFMGLYHLITMLTVGDWDFWSDWKDSRWWLSFTPIVEIAFPAAITFIFWSKFRLPFAATFLTLCLVIAEWINRICNMHGWAYFPLNFVWPATLIPGGLALDTVLVLTESYLFTAIIGGMLWAILFYLANWPLISHFLLPVNLNGILMSLADLQGFHYIRTGTPEYIRIINRGTLRTFGQDPIYLSVAFAGFMSMLVYMLFLGLGWIFTRTQFKWLKRL*',
 'MATALNTAKLGAPAQAPPLAIRSDDYRWISPTLMVLVCGIVGILMIAARLYQQVYAWSAGLDATSPEFRTYWMNLLFAQFAVEGLAAASIWGYVWFSRPKDLGRVQTREELRRFGVFVALIFCYVFAVFWAASFFAEEDASWHQTVVRDTSFTPSHIILFYWAFPIYIILGITAYLYGMTRLPRFAARHSLPWIIGVAGPFMLLPVVAYNEWAHSFWILEERFAAPVHWWFMIFAWSGLGLGGLLLQVCDYLHELTTGEAQTYAAEDDPAVDPALEFEGEVG*']

In [108]:
operons[1]

Unnamed: 0,query file,label,0,1,2,3,4,5,6,7,...,descr,position,qid,seq,diff1,diff2,is_operon,rel_coordinates,operon_count,type
19,/tmp/tmpYiCF3t,5,Monooxygenase_B,PF04744.11,Ga0073928_1000187914,-,3.4000000000000003e-118,382.4,0.0,3.8e-118,...,Ga0073928_1000187914,150642,Ga0073928_1000187914,"(M, A, N, W, P, L, E, I, P, S, P, A, G, L, T, ...",197,1,True,1,2,B
18,/tmp/tmpYiCF3t,5,Monooxygenase_B,PF04744.11,Ga0073928_1000187915,-,3.3e-15,43.4,0.0,3.6e-15,...,Ga0073928_1000187915,150643,Ga0073928_1000187915,"(V, T, L, P, A, L, L, A, A, V, I, A, L, G, F, ...",1,1,True,1,2,B
17,/tmp/tmpYiCF3t,5,AMO,PF02461.15,Ga0073928_1000187916,-,2.2e-93,300.1,22.5,2.4e-93,...,Ga0073928_1000187916,150644,Ga0073928_1000187916,"(M, T, A, A, R, M, A, R, S, Q, P, A, I, T, S, ...",1,1,True,1,2,A
16,/tmp/tmpYiCF3t,5,AmoC,PF04896.11,Ga0073928_1000187917,-,1.9e-93,299.9,15.3,2.4e-93,...,Ga0073928_1000187917,150645,Ga0073928_1000187917,"(M, A, T, A, L, N, T, A, K, L, G, A, P, A, Q, ...",1,296,True,1,2,C


In [109]:
operons[1][4]

19    3.4e-118
18     3.3e-15
17     2.2e-93
16     1.9e-93
Name: 4, dtype: object

In [121]:

idxb=operons[1][operons[1].loc[:,'type']=='B'].sort_values(4, ascending=True).head(1).index[0]
operons[1].loc[[idxb]]['seq'].values[0]

SeqRecord(seq=Seq('MANWPLEIPSPAGLTFLNVGVPGPVFIRTASTVNGVSGASSMSLGLGHDYEYRI...AE*', SingleLetterAlphabet()), id='Ga0073928_1000187914', name='Ga0073928_1000187914', description='Ga0073928_1000187914', dbxrefs=[])

In [153]:
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC


ops=list(operons)

def analyzeStructure(name_string, filter_operons={'CAB':0,'ABC':0}):#fix this for the extended version
    for key in filter_operons.keys():
    
        if key in name_string:
            filter_operons[key]=1
            return (key, 1)
        if key[::-1] in name_string:
            filter_operons[key]=-1
            return (key, -1)
    return (None, None)

def concatSeparatedValues(row, sep=" "):
    concatVal=''
    for i in range(output['type'].iloc[1].size):
        concatVal+=str(row.iloc[i])+sep
    concatVal=concatVal[:-len(sep)]#removes last unnesessary separator
    return concatVal.rstrip()

def operonParser(op_lst, op_type=False, full_operons=False):
    output_lst=list()#list of seq elements for fasta exporting
    
    if op_lst == []:
        return 1
    if full_operons:
        out= pd.DataFrame(columns=['operon #','input_file','type',  'rel_distance','size', 'concat_ids','concat_eval', 'concat_descr', 'C', 'A', 'B', 'full_operon'])
    out= pd.DataFrame(columns=['operon #','input_file','type','concat_eval','concat_ids','size',  'concat_descr', 'C', 'A', 'B'])
    
    for operon in op_lst:
        
        #print(operon)
        operon_structure=''.join(operon['type'].tolist())
        #print(operon_structure)
        op_type,forward=analyzeStructure(operon_structure)
        if op_type:
            #converting evalue to float
            operon[4]=operon[4].astype(float)#for correct sorting
            #reverse operon if needed
            #print(operon)
            if forward == -1:
                #''.join(operons[1].reset_index(drop=True)['type'].sort_index(ascending=False).tolist())
                operon=operon.reset_index(drop=True).sort_index(ascending=False)#rookie way   
            else:
                operon=operon.reset_index(drop=True) #else clause is a bit unnecessary
            print(type(operon))
            cnt=int(operon['operon_count'].mean())#!!! needs an assert for integer value
            inp_file=filename
            
            #ot=operon_structure #inclues extended understanding of an operon
            ot=op_type
            
            
            rd=''.join(operon.loc[:, 'rel_coordinates'].astype(str).tolist())
            sz=len(operon.index)
            
            
            idx_a=operon[operon.loc[:,'type']=='A'].sort_values(4, ascending=True).head(1).index[0]
            idx_b=operon[operon.loc[:,'type']=='B'].sort_values(4, ascending=True).head(1).index[0]
            idx_c=operon[operon.loc[:,'type']=='C'].sort_values(4, ascending=True).head(1).index[0]
            
            
            
            concat_eval= ";".join([str(x) for x in operon.loc[[idx_c, idx_a, idx_b]][4].astype(str)])
            #peron
            
            
            concat_ids=";".join([x.id for x in operon.loc[[idx_c, idx_a, idx_b]]['seq']])
            concat_descr=";".join([x.description for x in operon.loc[[idx_c, idx_a, idx_b]]['seq']])
            #str(operons[1][operons[1]['type']=='C']['seq'].values[0].seq)
            #takes one value from operon with the smallest e-vaue if multiple values are present
            #str(record.seq)
#             c_seq=str(operon[operon.loc[:,'type']=='C'].sort_values(4, ascending=True).head(1)['seq'].values[0].seq)
#             a_seq=str(operon[operon.loc[:,'type']=='A'].sort_values(4, ascending=True).head(1)['seq'].values[0].seq)
#             b_seq=str(operon[operon.loc[:,'type']=='B'].sort_values(4, ascending=True).head(1)['seq'].values[0].seq)
            c_seq=str(operon.loc[idx_c]['seq'].seq)
            a_seq=str(operon.loc[idx_a]['seq'].seq)
            b_seq=str(operon.loc[idx_b]['seq'].seq)
    
            record=SeqRecord(Seq(c_seq+a_seq+b_seq, IUPAC.protein), id=concat_ids, description=concat_descr)
#           records.append(record)
            
            output_lst.append(record)
#             if full_operons:
#                 full_op=
            
            
            out=out.append(pd.Series({'operon #':cnt, 
                                  'input_file':inp_file, 
                                  'type':ot, 
                                  #'rel_distance':rd, 
                                  'size': sz,
                                  'concat_eval': concat_eval,
                                  'concat_ids': concat_ids,
                                  'concat_descr': concat_descr,
                                  'C':c_seq,
                                  'A':a_seq,
                                  'B':b_seq}), ignore_index=True)
            
            
    return out, output_lst

In [111]:
operons[2]['type']

14    C
21    A
12    B
Name: type, dtype: object

In [113]:
for op in operons:
    print(''.join(op['type'].tolist()))

BAC
BBAC
CAB
CBA
BB
BA
BAC
BBA
AC
CA
BC
BA
AC
A
B
CA
BC
C
A
AC
AB
BC
BA
CA
AB
AB
AB


In [154]:
final_frame, output_list=operonParser(operons)

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [108]:
final_frame

Unnamed: 0,operon #,input_file,type,concat_eval,concat_ids,size,concat_descr,C,A,B
0,1,../2517287028.genes.faa,CAB,5.8e-122;4.8e-124;3.3e-170,2517402628;2517402629;2517402630,3,2517402628 A3OODRAFT_1495 methane monooxygenas...,MSSTTDTAARAAAGTEAVVDLKGMWIGLAVLNGFYLVVRIYEQVFG...,MSQSKSGGAVGPFNSVAEAAGCVQTVDWMLLVLLFFAVLGGYHVHF...,MKKFVKLAAIGAAAAVAATLGAVAPASAHGEKSQQAFLRMRTLNWY...
1,2,../2517287028.genes.faa,CAB,5.8e-122;4.8e-124;3.3e-170,2517403998;2517403997;2517403996,3,2517403998 A3OODRAFT_2865 methane monooxygenas...,MSSTTDTAARAAAGTEAVVDLKGMWIGLAVLNGFYLVVRIYEQVFG...,MSQSKSGGAVGPFNSVAEAAGCVQTVDWMLLVLLFFAVLGGYHVHF...,MKKFVKLAAIGAAAAVAATLGAVAPASAHGEKSQQAFLRMRTLNWY...
2,3,../2517287028.genes.faa,ABC,4.2e-93;6.5e-114;7.5e-155,2517404865;2517404867;2517404866,3,2517404865 A3OODRAFT_3733 methane monooxygenas...,MSTTAQTISQSTPQAPFNLPWYLRDLPKYLLTFGVMTVIYVGFRMY...,MTGKSLDIPARPYTGEKSRLSRAYDYLILVLALFLFIGSFHLHVAL...,MKKSLKSLFSAVYLGLCAGALAPEAALAHGEKALEPFVRMRTIQWY...


In [89]:
output_list

[SeqRecord(seq=Seq('MSSTTDTAARAAAGTEAVVDLKGMWIGLAVLNGFYLVVRIYEQVFGWRAGLDSF...DMP', IUPACProtein()), id='WP_026222791.1;WP_014892304.1;WP_018408664.1', name='<unknown name>', description='WP_026222791.1 methane monooxygenase/ammonia monooxygenase subunit C;WP_014892304.1 methane monooxygenase/ammonia monooxygenase subunit A;WP_018408664.1 methane monooxygenase/ammonia monooxygenase subunit B', dbxrefs=[]),
 SeqRecord(seq=Seq('MSSTTDTAARAAAGTEAVVDLKGMWIGLAVLNGFYLVVRIYEQVFGWRAGLDSF...DMP', IUPACProtein()), id='WP_026222791.1;WP_014892304.1;WP_018408664.1', name='<unknown name>', description='WP_026222791.1 methane monooxygenase/ammonia monooxygenase subunit C;WP_014892304.1 methane monooxygenase/ammonia monooxygenase subunit A;WP_018408664.1 methane monooxygenase/ammonia monooxygenase subunit B', dbxrefs=[]),
 SeqRecord(seq=Seq('MSTTAQTISQSTPQAPFNLPWYLRDLPKYLLTFGVMTVIYVGFRMYQGAYAIST...KFY', IUPACProtein()), id='WP_018409559.1;WP_026223175.1;WP_018409560.1', name='<unknown name>', descript

In [155]:



#export_columns=['operon_type', 'rel_distance', 'concat_ids', 'concat_descr', 'concat_sequences']
outfile1 = "out5.csv"
outfile2 = "out5.fasta"
final_frame.to_csv(outfile1, sep='\t', header=True)

#if len(ids)==len(sequences):
records=list()

with open(outfile2, "w") as output_handle:
    SeqIO.write(output_list, output_handle, "fasta")

In [52]:
operons[2]

Unnamed: 0,query file,label,0,1,2,3,4,5,6,7,...,descr,position,qid,seq,diff1,diff2,is_operon,rel_coordinates,operon_count,type
18,/tmp/tmpYiCF3t,5,AmoC,PF04896.11,Ga0073928_100023124,-,2e-54,172.1,18.2,2.3999999999999998e-54,...,Ga0073928_100023124,165876,Ga0073928_100023124,"(M, D, S, T, R, Q, A, L, G, L, E, E, G, R, Y, ...",14935,1,True,1,3,C
19,/tmp/tmpYiCF3t,5,AMO,PF02461.15,Ga0073928_100023125,-,1.3e-65,209.2,21.2,1.5e-65,...,Ga0073928_100023125,165877,Ga0073928_100023125,"(M, A, S, T, P, A, A, L, T, A, A, E, R, Q, H, ...",1,2,True,1,3,A
20,/tmp/tmpYiCF3t,5,Monooxygenase_B,PF04744.11,Ga0073928_100023127,-,1.9e-97,314.0,0.7,2.4e-97,...,Ga0073928_100023127,165879,Ga0073928_100023127,"(M, R, K, W, F, V, I, S, I, P, A, I, L, L, L, ...",2,11864,True,2,3,B


In [53]:
#sum_sales_dept.astype(str)
operons[1][operons[1]['type']=='B'].sort_values(4, ascending=True)

Unnamed: 0,query file,label,0,1,2,3,4,5,6,7,...,descr,position,qid,seq,diff1,diff2,is_operon,rel_coordinates,operon_count,type
13,/tmp/tmpYiCF3t,5,Monooxygenase_B,PF04744.11,Ga0073928_1000187914,-,3.4000000000000003e-118,382.4,0.0,3.8e-118,...,Ga0073928_1000187914,150642,Ga0073928_1000187914,"(M, A, N, W, P, L, E, I, P, S, P, A, G, L, T, ...",197,1,True,1,2,B
14,/tmp/tmpYiCF3t,5,Monooxygenase_B,PF04744.11,Ga0073928_1000187915,-,3.3e-15,43.4,0.0,3.6e-15,...,Ga0073928_1000187915,150643,Ga0073928_1000187915,"(V, T, L, P, A, L, L, A, A, V, I, A, L, G, F, ...",1,1,True,1,2,B


In [77]:
operons[1][4]

13    3.400000e-118
14     3.300000e-15
15     2.200000e-93
16     1.900000e-93
Name: 4, dtype: float64

In [1244]:
i

Unnamed: 0,query file,label,0,1,2,3,4,5,6,7,...,descr,position,qid,seq,diff1,diff2,is_operon,rel_coordinates,operon_count,type


In [1248]:
f.sort_values(4).head(1)

Unnamed: 0,query file,label,0,1,2,3,4,5,6,7,...,descr,position,qid,seq,diff1,diff2,is_operon,rel_coordinates,operon_count,type
3,/tmp/tmplLe78K,5,AmoC,PF04896.11,Ga0073928_1000187917,-,1.9e-93,299.9,15.3,2.4e-93,...,Ga0073928_1000187917,150645,Ga0073928_1000187917,MATALNTAKLGAPAQAPPLAIRSDDYRWISPTLMVLVCGIVGILMI...,1,296,True,1,2,C


In [919]:
output['operon_type']=output['type'].apply(concatValues, axis=1)
output['rel_distance']=output['rel_coordinates'].apply(concatValues, axis=1)
output['concat_ids']=output[2].apply(concatSeparatedValues, axis=1)
output['concat_descr']=output['descr'].apply(concatSeparatedValues, axis=1)
output['concat_sequences']=output['seq'].apply(concatValues, axis=1)

In [920]:
output

Unnamed: 0,query file,label,0,1,2,3,4,5,6,7,...,diff2,is_operon,rel_coordinates,operon_count,type,operon_type,rel_distance,concat_ids,concat_descr,concat_sequences
0,/tmp/tmpd_zEcV,1,AmoC,PF04896.11,2517402628,-,5.8e-122,393.3,14.7,7e-122,...,566,True,1,1,B,CAB,111,2517402628|2517402629|2517402630,2517402628 A3OODRAFT_1495 methane monooxygenas...,MSSTTDTAARAAAGTEAVVDLKGMWIGLAVLNGFYLVVRIYEQVFG...
1,/tmp/tmpNdc_YO,2,Monooxygenase_B,PF04744.11,2517403996,-,3.3e-170,553.6,0.0,3.7e-170,...,224,True,1,2,C,BAC,111,2517403996|2517403997|2517403998,2517403996 A3OODRAFT_2863 methane monooxygenas...,MKKFVKLAAIGAAAAVAATLGAVAPASAHGEKSQQAFLRMRTLNWY...
2,/tmp/tmpO4S3j2,3,AmoC,PF04896.11,2517404865,-,4.2e-93,298.8,17.7,5.1e-93,...,3676,True,1,3,A,CBA,111,2517404865|2517404866|2517404867,2517404865 A3OODRAFT_3733 methane monooxygenas...,MSTTAQTISQSTPQAPFNLPWYLRDLPKYLLTFGVMTVIYVGFRMY...


In [None]:
op

In [921]:
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC


export_columns=['operon_type', 'rel_distance', 'concat_ids', 'concat_descr', 'concat_sequences']
outfile1 = "out2.csv"
outfile2 = "out2.fasta"
output[export_columns].to_csv(outfile1, sep='\t', header=True)

ids = output['concat_ids'].tolist()
sequences=output['concat_sequences'].tolist()
descriptions=output['concat_descr'].tolist()
types=output['operon_type'].tolist()
#if len(ids)==len(sequences):
records=list()
for i in range(len(ids)):
    record=SeqRecord(Seq(sequences[i], IUPAC.protein), id=ids[i],name=types[i] , description=types[i]+" "+descriptions[i])
    records.append(record)
with open(outfile2, "w") as output_handle:
    SeqIO.write(records, output_handle, "fasta")

## Adding sequence and position - entire version

In [804]:
filename

'../2517287028.genes.faa'

In [805]:
names=df[2].tolist()

In [806]:
def extractFeatures(ids):
    features=[]
    pos=0
    for record in SeqIO.parse(filename,"fasta"):
        pos+=1
        if record.name in ids:
            features.append({'qid':record.name, 'position':pos, 'descr':record.description, 'seq':str(record.seq)})
    return pd.DataFrame(features)
        

In [814]:
df=df.merge(extractFeatures(names), left_on=2, right_on='qid', suffixes=('',''))
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,qid_x,seq_x,descr_y,position_y,qid_y,seq_y,descr,position,qid,seq
0,AmoC,PF04896.11,2517401456,-,5.6e-120,386.8,15.8,7.199999999999999e-120,386.4,15.8,...,2517401456,MSVTTQSAAGAMDRAQPIVELKGMWIGLAVLNGFYLVVRIYEQIYG...,2517401456 A3OODRAFT_0323 methane monooxygenas...,320,2517401456,MSVTTQSAAGAMDRAQPIVELKGMWIGLAVLNGFYLVVRIYEQIYG...,2517401456 A3OODRAFT_0323 methane monooxygenas...,320,2517401456,MSVTTQSAAGAMDRAQPIVELKGMWIGLAVLNGFYLVVRIYEQIYG...
1,AmoC,PF04896.11,2517402628,-,5.8e-122,393.3,14.7,7e-122,393.0,14.7,...,2517402628,MSSTTDTAARAAAGTEAVVDLKGMWIGLAVLNGFYLVVRIYEQVFG...,2517402628 A3OODRAFT_1495 methane monooxygenas...,1475,2517402628,MSSTTDTAARAAAGTEAVVDLKGMWIGLAVLNGFYLVVRIYEQVFG...,2517402628 A3OODRAFT_1495 methane monooxygenas...,1475,2517402628,MSSTTDTAARAAAGTEAVVDLKGMWIGLAVLNGFYLVVRIYEQVFG...
2,AMO,PF02461.15,2517402629,-,4.800000000000001e-124,400.4,24.0,5.4e-124,400.3,24.0,...,2517402629,MSQSKSGGAVGPFNSVAEAAGCVQTVDWMLLVLLFFAVLGGYHVHF...,2517402629 A3OODRAFT_1496 methane monooxygenas...,1476,2517402629,MSQSKSGGAVGPFNSVAEAAGCVQTVDWMLLVLLFFAVLGGYHVHF...,2517402629 A3OODRAFT_1496 methane monooxygenas...,1476,2517402629,MSQSKSGGAVGPFNSVAEAAGCVQTVDWMLLVLLFFAVLGGYHVHF...
3,Monooxygenase_B,PF04744.11,2517402630,-,3.3e-170,553.6,0.0,3.7e-170,553.4,0.0,...,2517402630,MKKFVKLAAIGAAAAVAATLGAVAPASAHGEKSQQAFLRMRTLNWY...,2517402630 A3OODRAFT_1497 methane monooxygenas...,1477,2517402630,MKKFVKLAAIGAAAAVAATLGAVAPASAHGEKSQQAFLRMRTLNWY...,2517402630 A3OODRAFT_1497 methane monooxygenas...,1477,2517402630,MKKFVKLAAIGAAAAVAATLGAVAPASAHGEKSQQAFLRMRTLNWY...
4,AmoC,PF04896.11,2517403201,-,5.8e-122,393.3,14.7,7e-122,393.0,14.7,...,2517403201,MSSTTDTAARAAAGTEAVVDLKGMWIGLAVLNGFYLVVRIYEQVFG...,2517403201 A3OODRAFT_2068 methane monooxygenas...,2043,2517403201,MSSTTDTAARAAAGTEAVVDLKGMWIGLAVLNGFYLVVRIYEQVFG...,2517403201 A3OODRAFT_2068 methane monooxygenas...,2043,2517403201,MSSTTDTAARAAAGTEAVVDLKGMWIGLAVLNGFYLVVRIYEQVFG...
5,Monooxygenase_B,PF04744.11,2517403996,-,3.3e-170,553.6,0.0,3.7e-170,553.4,0.0,...,2517403996,MKKFVKLAAIGAAAAVAATLGAVAPASAHGEKSQQAFLRMRTLNWY...,2517403996 A3OODRAFT_2863 methane monooxygenas...,2816,2517403996,MKKFVKLAAIGAAAAVAATLGAVAPASAHGEKSQQAFLRMRTLNWY...,2517403996 A3OODRAFT_2863 methane monooxygenas...,2816,2517403996,MKKFVKLAAIGAAAAVAATLGAVAPASAHGEKSQQAFLRMRTLNWY...
6,AMO,PF02461.15,2517403997,-,4.800000000000001e-124,400.4,24.0,5.4e-124,400.3,24.0,...,2517403997,MSQSKSGGAVGPFNSVAEAAGCVQTVDWMLLVLLFFAVLGGYHVHF...,2517403997 A3OODRAFT_2864 methane monooxygenas...,2817,2517403997,MSQSKSGGAVGPFNSVAEAAGCVQTVDWMLLVLLFFAVLGGYHVHF...,2517403997 A3OODRAFT_2864 methane monooxygenas...,2817,2517403997,MSQSKSGGAVGPFNSVAEAAGCVQTVDWMLLVLLFFAVLGGYHVHF...
7,AmoC,PF04896.11,2517403998,-,5.8e-122,393.3,14.7,7e-122,393.0,14.7,...,2517403998,MSSTTDTAARAAAGTEAVVDLKGMWIGLAVLNGFYLVVRIYEQVFG...,2517403998 A3OODRAFT_2865 methane monooxygenas...,2818,2517403998,MSSTTDTAARAAAGTEAVVDLKGMWIGLAVLNGFYLVVRIYEQVFG...,2517403998 A3OODRAFT_2865 methane monooxygenas...,2818,2517403998,MSSTTDTAARAAAGTEAVVDLKGMWIGLAVLNGFYLVVRIYEQVFG...
8,AmoC,PF04896.11,2517404225,-,5.600000000000001e-111,357.3,11.5,6.6000000000000005e-111,357.1,11.5,...,2517404225,MSMTKTEARSAARSAERIIDTRPVLIGVPALMLFVAILRLYEQLFA...,2517404225 A3OODRAFT_3092 Ammonia monooxygenas...,3042,2517404225,MSMTKTEARSAARSAERIIDTRPVLIGVPALMLFVAILRLYEQLFA...,2517404225 A3OODRAFT_3092 Ammonia monooxygenas...,3042,2517404225,MSMTKTEARSAARSAERIIDTRPVLIGVPALMLFVAILRLYEQLFA...
9,AmoC,PF04896.11,2517404670,-,1.7e-30,93.8,0.0,2.2000000000000002e-30,93.3,0.0,...,2517404670,MSETTQSAVGAIERAEPIVDLRGMWIGLAVLNGFYLVVRIYEQVFG...,2517404670 A3OODRAFT_3537 Ammonia monooxygenas...,3482,2517404670,MSETTQSAVGAIERAEPIVDLRGMWIGLAVLNGFYLVVRIYEQVFG...,2517404670 A3OODRAFT_3537 Ammonia monooxygenas...,3482,2517404670,MSETTQSAVGAIERAEPIVDLRGMWIGLAVLNGFYLVVRIYEQVFG...


In [818]:
l

[320, 1475, 1476, 1477, 2043, 2816, 2817, 2818, 3042, 3482, 3674, 3675, 3676]

## Counting, identifying and annotating operons

In [819]:
max_distance=2

model2type={'AmoC': 'C', 'AMO': 'A', 'Monooxygenase_B': 'B'}

def is_operon(x):
    return x['diff1'] or x['diff2']

def rel_coordinates(x):
    if ((x['diff1'] < max_distance) or (x['diff2']) <max_distance):
        return min(x['diff1'],x['diff2'])
    else:
        return 0

l=df['position'].tolist()

df['diff1']=abs((np.array(l)-np.array([0] + l[:-1])))#shift to left
df['diff2']=abs(np.array(l)-np.array(l[1:]+[0]))#shift to right
#df[(df['diff1'] <= 2) | (df['diff2'] <= 2)]

df['is_operon']=df[['diff1', 'diff2']].apply(lambda x: x < max_distance).apply(is_operon, axis=1)

df['rel_coordinates']=df[['diff1', 'diff2']].apply(rel_coordinates, axis=1)

df['operon_count']=operonCount(df['is_operon'].tolist())
df['type']=df[0].apply(lambda x: model2type[x])
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,descr,position,qid,seq,diff1,diff2,is_operon,rel_coordinates,operon_count,type
0,AmoC,PF04896.11,2517401456,-,5.6e-120,386.8,15.8,7.199999999999999e-120,386.4,15.8,...,2517401456 A3OODRAFT_0323 methane monooxygenas...,320,2517401456,MSVTTQSAAGAMDRAQPIVELKGMWIGLAVLNGFYLVVRIYEQIYG...,320,1155,False,0,False,C
1,AmoC,PF04896.11,2517402628,-,5.8e-122,393.3,14.7,7e-122,393.0,14.7,...,2517402628 A3OODRAFT_1495 methane monooxygenas...,1475,2517402628,MSSTTDTAARAAAGTEAVVDLKGMWIGLAVLNGFYLVVRIYEQVFG...,1155,1,True,1,1,C
2,AMO,PF02461.15,2517402629,-,4.800000000000001e-124,400.4,24.0,5.4e-124,400.3,24.0,...,2517402629 A3OODRAFT_1496 methane monooxygenas...,1476,2517402629,MSQSKSGGAVGPFNSVAEAAGCVQTVDWMLLVLLFFAVLGGYHVHF...,1,1,True,1,1,A
3,Monooxygenase_B,PF04744.11,2517402630,-,3.3e-170,553.6,0.0,3.7e-170,553.4,0.0,...,2517402630 A3OODRAFT_1497 methane monooxygenas...,1477,2517402630,MKKFVKLAAIGAAAAVAATLGAVAPASAHGEKSQQAFLRMRTLNWY...,1,566,True,1,1,B
4,AmoC,PF04896.11,2517403201,-,5.8e-122,393.3,14.7,7e-122,393.0,14.7,...,2517403201 A3OODRAFT_2068 methane monooxygenas...,2043,2517403201,MSSTTDTAARAAAGTEAVVDLKGMWIGLAVLNGFYLVVRIYEQVFG...,566,773,False,0,False,C
5,Monooxygenase_B,PF04744.11,2517403996,-,3.3e-170,553.6,0.0,3.7e-170,553.4,0.0,...,2517403996 A3OODRAFT_2863 methane monooxygenas...,2816,2517403996,MKKFVKLAAIGAAAAVAATLGAVAPASAHGEKSQQAFLRMRTLNWY...,773,1,True,1,2,B
6,AMO,PF02461.15,2517403997,-,4.800000000000001e-124,400.4,24.0,5.4e-124,400.3,24.0,...,2517403997 A3OODRAFT_2864 methane monooxygenas...,2817,2517403997,MSQSKSGGAVGPFNSVAEAAGCVQTVDWMLLVLLFFAVLGGYHVHF...,1,1,True,1,2,A
7,AmoC,PF04896.11,2517403998,-,5.8e-122,393.3,14.7,7e-122,393.0,14.7,...,2517403998 A3OODRAFT_2865 methane monooxygenas...,2818,2517403998,MSSTTDTAARAAAGTEAVVDLKGMWIGLAVLNGFYLVVRIYEQVFG...,1,224,True,1,2,C
8,AmoC,PF04896.11,2517404225,-,5.600000000000001e-111,357.3,11.5,6.6000000000000005e-111,357.1,11.5,...,2517404225 A3OODRAFT_3092 Ammonia monooxygenas...,3042,2517404225,MSMTKTEARSAARSAERIIDTRPVLIGVPALMLFVAILRLYEQLFA...,224,440,False,0,False,C
9,AmoC,PF04896.11,2517404670,-,1.7e-30,93.8,0.0,2.2000000000000002e-30,93.3,0.0,...,2517404670 A3OODRAFT_3537 Ammonia monooxygenas...,3482,2517404670,MSETTQSAVGAIERAEPIVDLRGMWIGLAVLNGFYLVVRIYEQVFG...,440,192,False,0,False,C


In [820]:
import numpy as np

In [821]:
df['is_operon'].tolist()
        

[False,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 False,
 False,
 True,
 True,
 True]

In [822]:
def operonCount(lst):
    opCnt=0
    state=False
    for i in xrange(len(lst)):
        if lst[i]:
            newState=True
            if (state==False) and (newState==True):
                opCnt+=1
            lst[i]=opCnt
            state=newState
        else:
            state=False
    return lst
            

In [824]:
opCnt

[0, 1, 1, 1, 0, 2, 2, 2, 0, 0, 3, 3, 3]

In [23]:
df

NameError: name 'df' is not defined

In [826]:
operons[1]

Index([                 2,             u'0_x',             u'1_x',
                   u'2_x',             u'3_x',                  4,
                        5,                  6,                  7,
                        8,                  9,                 10,
                       11,                 12,                 13,
                       14,                 15,                 16,
                       17,                 18,             u'0_y',
                   u'1_y',             u'2_y',             u'3_y',
                 u'diff1',           u'diff2',       u'is_operon',
          u'operon_count', u'rel_coordinates',            u'type',
                        2,             u'0_x',             u'1_x',
                   u'2_x',             u'3_x',                  4,
                        5,                  6,                  7,
                        8,                  9,                 10,
                       11,                 12,                

In [683]:
# df = df.stack().to_frame().T
# df.columns = ['{}_{}'.format(*c) for c in df.columns]

In [865]:
operons=list()
output=pd.DataFrame()
for count,frame in df.groupby('operon_count'):
    if count > 0:
        #print("***********"+str(count)+"*****************************")
        #print(frame)
        #print('===========================================================')
        #frame.columns = ['{}_{}'.format(*c) for c in df.columns]
        frame=frame.stack().to_frame().T
        operons.append(frame.copy())
        frame.columns=frame.columns.droplevel(0)
        output=output.append(frame, ignore_index=True)


In [866]:
def concatValues(row):
    concatVal=''
    for i in range(output['type'].iloc[1].size):
        concatVal+=str(row.iloc[i])
    return concatVal

In [883]:
def concatSeparatedValues(row, sep="|"):
    concatVal=''
    for i in range(output['type'].iloc[1].size):
        concatVal+=str(row.iloc[i])+sep
    concatVal=concatVal[:-1]
    return concatVal

In [884]:
output['type'].apply(lambda x: x.iloc[0]+x.iloc[1]+x.iloc[2] ,axis=1)

0    CAB
1    BAC
2    CBA
dtype: object

In [869]:
output['type']

Unnamed: 0,type,type.1,type.2
0,C,A,B
1,B,A,C
2,C,B,A


In [870]:
output['type'].iloc[1].size

3

In [871]:
output['type'].apply(concatValues,axis=1)


0    CAB
1    BAC
2    CBA
dtype: object

In [872]:
output

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,descr,position,qid,seq,diff1,diff2,is_operon,rel_coordinates,operon_count,type
0,AmoC,PF04896.11,2517402628,-,5.8e-122,393.3,14.7,7e-122,393.0,14.7,...,2517402630 A3OODRAFT_1497 methane monooxygenas...,1477,2517402630,MKKFVKLAAIGAAAAVAATLGAVAPASAHGEKSQQAFLRMRTLNWY...,1,566,True,1,1,B
1,Monooxygenase_B,PF04744.11,2517403996,-,3.3e-170,553.6,0.0,3.7e-170,553.4,0.0,...,2517403998 A3OODRAFT_2865 methane monooxygenas...,2818,2517403998,MSSTTDTAARAAAGTEAVVDLKGMWIGLAVLNGFYLVVRIYEQVFG...,1,224,True,1,2,C
2,AmoC,PF04896.11,2517404865,-,4.2e-93,298.8,17.7,5.1e-93,298.5,17.7,...,2517404867 A3OODRAFT_3735 Ammonia monooxygenas...,3676,2517404867,MTGKSLDIPARPYTGEKSRLSRAYDYLILVLALFLFIGSFHLHVAL...,1,3676,True,1,3,A


In [873]:
output['descr']

Unnamed: 0,descr,descr.1,descr.2
0,2517402628 A3OODRAFT_1495 methane monooxygenas...,2517402629 A3OODRAFT_1496 methane monooxygenas...,2517402630 A3OODRAFT_1497 methane monooxygenas...
1,2517403996 A3OODRAFT_2863 methane monooxygenas...,2517403997 A3OODRAFT_2864 methane monooxygenas...,2517403998 A3OODRAFT_2865 methane monooxygenas...
2,2517404865 A3OODRAFT_3733 methane monooxygenas...,2517404866 A3OODRAFT_3734 methane monooxygenas...,2517404867 A3OODRAFT_3735 Ammonia monooxygenas...


In [874]:
operons[0].columns

MultiIndex(levels=[[1, 2, 3], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, u'descr', u'descr_x', u'descr_y', u'diff1', u'diff2', u'is_operon', u'operon_count', u'position', u'position_x', u'position_y', u'qid', u'qid_x', u'qid_y', u'rel_coordinates', u'seq', u'seq_x', u'seq_y', u'type']],
           labels=[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 27, 30, 34, 21, 28, 31, 35, 20, 27, 30, 34, 21, 28, 31, 35, 19, 26, 29, 33, 22, 23, 24, 32, 25, 36, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 27, 30, 34, 21, 28, 3

In [878]:
output['operon_type']=output['type'].apply(concatValues, axis=1)
output['rel_distance']=output['rel_coordinates'].apply(concatValues, axis=1)
output['concat_ids']=output[2].apply(concatSeparatedValues, axis=1)
output['concat_descr']=output['descr'].apply(concatSeparatedValues, axis=1)
output['concat_sequences']=output['seq'].apply(concatValues, axis=1)

In [879]:
output

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,is_operon,rel_coordinates,operon_count,type,operon_type,rel_distance,concat_ids,concat_descr,concat_operon,concat_sequences
0,AmoC,PF04896.11,2517402628,-,5.8e-122,393.3,14.7,7e-122,393.0,14.7,...,True,1,1,B,CAB,111,2517402628|2517402629|2517402630,2517402628 A3OODRAFT_1495 methane monooxygenas...,MSSTTDTAARAAAGTEAVVDLKGMWIGLAVLNGFYLVVRIYEQVFG...,MSSTTDTAARAAAGTEAVVDLKGMWIGLAVLNGFYLVVRIYEQVFG...
1,Monooxygenase_B,PF04744.11,2517403996,-,3.3e-170,553.6,0.0,3.7e-170,553.4,0.0,...,True,1,2,C,BAC,111,2517403996|2517403997|2517403998,2517403996 A3OODRAFT_2863 methane monooxygenas...,MKKFVKLAAIGAAAAVAATLGAVAPASAHGEKSQQAFLRMRTLNWY...,MKKFVKLAAIGAAAAVAATLGAVAPASAHGEKSQQAFLRMRTLNWY...
2,AmoC,PF04896.11,2517404865,-,4.2e-93,298.8,17.7,5.1e-93,298.5,17.7,...,True,1,3,A,CBA,111,2517404865|2517404866|2517404867,2517404865 A3OODRAFT_3733 methane monooxygenas...,MSTTAQTISQSTPQAPFNLPWYLRDLPKYLLTFGVMTVIYVGFRMY...,MSTTAQTISQSTPQAPFNLPWYLRDLPKYLLTFGVMTVIYVGFRMY...


## Exporting values

In [880]:
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC


export_columns=['operon_type', 'rel_distance', 'concat_ids', 'concat_descr', 'concat_sequences']
outfile1 = "out.csv"
outfile2 = "out.fasta"
output[export_columns].to_csv(outfile1, sep='\t', header=True)

In [907]:


ids = output['concat_ids'].tolist()
sequences=output['concat_sequences'].tolist()
descriptions=output['concat_descr'].tolist()
types=output['operon_type'].tolist()
#if len(ids)==len(sequences):
records=list()
for i in range(len(ids)):
    record=SeqRecord(Seq(sequences[i], IUPAC.protein), id=ids[i],name=types[i] , description=types[i]+" "+descriptions[i])
    records.append(record)
with open(outfile2, "w") as output_handle:
    SeqIO.write(records, output_handle, "fasta")


In [908]:
records

[SeqRecord(seq=Seq('MSSTTDTAARAAAGTEAVVDLKGMWIGLAVLNGFYLVVRIYEQVFGWRAGLDSF...DMP', IUPACProtein()), id='2517402628|2517402629|2517402630', name='CAB', description='CAB 2517402628 A3OODRAFT_1495 methane monooxygenase/ammonia monooxygenase, subunit C [Methylocystis rosea SV97T]|2517402629 A3OODRAFT_1496 methane monooxygenase/ammonia monooxygenase, subunit A [Methylocystis rosea SV97T]|2517402630 A3OODRAFT_1497 methane monooxygenase/ammonia monooxygenase, subunit B [Methylocystis rosea SV97T]', dbxrefs=[]),
 SeqRecord(seq=Seq('MKKFVKLAAIGAAAAVAATLGAVAPASAHGEKSQQAFLRMRTLNWYDVQWSKTT...LTE', IUPACProtein()), id='2517403996|2517403997|2517403998', name='BAC', description='BAC 2517403996 A3OODRAFT_2863 methane monooxygenase/ammonia monooxygenase, subunit B [Methylocystis rosea SV97T]|2517403997 A3OODRAFT_2864 methane monooxygenase/ammonia monooxygenase, subunit A [Methylocystis rosea SV97T]|2517403998 A3OODRAFT_2865 methane monooxygenase/ammonia monooxygenase, subunit C [Methylocystis rosea SV

In [887]:
my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")

In [897]:

record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF",
                       IUPAC.protein),
                   id="YP_025292.1", name="HokC",
                   description="toxic membrane protein, small")

140443830384016

In [891]:
my_rna.id='haha'

In [893]:
my_rna.id

'haha'

In [895]:
my_rna.des='hehe'

In [896]:
my_rna.des

'hehe'

In [710]:
# for record in SeqIO.parse(filename,"fasta"):
#     print(record.description)

In [695]:
def locationAndSequence(s):
    s['loc']=

SyntaxError: invalid syntax (<ipython-input-695-c917f59625ee>, line 2)

In [696]:
# for i in chunks:
#     print(i)

In [697]:
pipe = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

In [698]:
pipe.communicate()

('', '/bin/sh: 1: Syntax error: "(" unexpected\n')

In [699]:
#process = subprocess.Popen(['ls','-l'], stdout=subprocess.PIPE)
#print(process.stdout.read())

In [700]:
%%time
import os
#hmmscan --tblout amocab_hmmscan_mros.tab  amoCAB 2517287028.genes.faa > /dev/null
#os.system("makeblastdb -in "+input_ref_0+" -dbtype nucl -title "+title_db+" -out "+outfile_db+" -parse_seqids")
fnames=""
flabels=""
for (l,f) in files:
    fnames+=f.name+" "
    flabels+=l+" "
#print flabels
    #print(l)
    #print(f.name)
    #parallel -j $1 blastall "-p blastn -d nt/nt -i "{}" -o "{.}".xml -e 1e-10 -m 7 -K 100 -b 200" ::: *.fa
    #print('Exists after close:', os.path.exists(f.name))
cmd="parallel -j 8 hmmscan "+"--cpu 1"+" --tblout {#}.tab " + "../amoCAB {} ::: "+fnames+" > /dev/null"

#command="parallel -j 8 hmmscan "+"--cpu 1"+" --tblout {}.tab " + "../amoCAB {} ::: "+fnames
print(cmd)

TypeError: unsupported operand type(s) for +: 'int' and 'str'

In [701]:
c

2

In [702]:
cmd="parallel -j 8 hmmscan "+"--cpu 1"+" --tblout {#}.tab " + "../amoCAB {} ::: "+fnames+" > /dev/null"
#flabels
print(cmd)

parallel -j 8 hmmscan --cpu 1 --tblout {#}.tab ../amoCAB {} ::: /tmp/tmpSqwnnp  > /dev/null


In [703]:
filename

'../2517287028.genes.faa'

In [704]:
/tmp/tmprOkRVb.close()

SyntaxError: invalid syntax (<ipython-input-704-02dd9a23c7c6>, line 1)

In [705]:
filename.name

AttributeError: 'str' object has no attribute 'name'

In [706]:
txt=filename.seek(0)
txt=filename.read()
#txt

AttributeError: 'str' object has no attribute 'seek'

In [707]:
filename

'../2517287028.genes.faa'

In [708]:
txt2

NameError: name 'txt2' is not defined

In [709]:
#Coool

In [26]:
txt2=filename.read()

## In-memory files
[https://stackoverflow.com/questions/23028071/open-a-file-in-memory]

In [11]:
#import io##for python3
import StringIO

output = StringIO.StringIO()
output.write('First line.\n')
print >>output, 'Second line.'

# Retrieve file contents -- this will be
# 'First line.\nSecond line.\n'
contents = output.getvalue()
print(contents)
# Close object and discard memory buffer --
# .getvalue() will now raise an exception.
output.close()


First line.
Second line.



## Redoing generator with an in-memory implementation

In [15]:
import StringIO

In [21]:
%%time
from Bio import SeqIO
import StringIO
#output = io.StringIO()
record_iter = SeqIO.parse(open("../2517287028.genes.faa"),"fasta")
for i, batch in enumerate(batch_iterator(record_iter, 50000)):
    filename = "group_%i.fastq" % (i + 1)
    filename = StringIO.StringIO()
    
    count = SeqIO.write(batch, filename, "fasta")
    print("Wrote %i records to %s" % (count, filename))

Wrote 3891 records to <StringIO.StringIO instance at 0x7f5bc2c582d8>
CPU times: user 58.6 ms, sys: 128 µs, total: 58.7 ms
Wall time: 58.7 ms


In [24]:
dir(filename)

['__doc__',
 '__init__',
 '__iter__',
 '__module__',
 'buf',
 'buflist',
 'close',
 'closed',
 'flush',
 'getvalue',
 'isatty',
 'len',
 'next',
 'pos',
 'read',
 'readline',
 'readlines',
 'seek',
 'softspace',
 'tell',
 'truncate',
 'write',
 'writelines']

## memory map
[https://docs.python.org/2/library/mmap.html]

In [None]:
import mmap
from Bio import SeqIO
#import StringIO

record_iter = SeqIO.parse(open("../76969.assembled.faa"),"fasta")
for i, batch in enumerate(batch_iterator(record_iter, 50000)):
    filename = "group_%i.fastq" % (i + 1)
    with open(filename, "w") as handle:
        count = SeqIO.write(batch, handle, "fasta")
    print("Wrote %i records to %s" % (count, filename))