### Create TSA Submission Template

https://submit.ncbi.nlm.nih.gov/genbank/template/submission/



In [None]:
%run ../config/init.py
import itertools
from goenrichment.go import parse_go_obo
from Bio.ExPASy import Enzyme
THREADS = 30

### Download latest copy of tbl2asn

In [None]:
if not os.path.exists(os.path.join(BIN,'tbl2asn')):
    !wget -O {BIN}/tbl2asn.gz https://ftp.ncbi.nih.gov/toolbox/ncbi_tools/converters/by_program/tbl2asn/linux.tbl2asn.gz
    !gunzip -v {BIN}/tbl2asn.gz
    !chmod a+x {BIN}/tbl2asn

### Download enzyme database

In [None]:
if not os.path.exists(os.path.join(DATA,'enzyme.dat')):
    !wget -O {DATA}/enzyme.dat ftp://ftp.expasy.org/databases/enzyme/enzyme.dat
enzyme = {}
with open(os.path.join(DATA,'enzyme.dat'), 'r') as e_fin:
    for r in Enzyme.parse(e_fin):
        n = r['DE'][:-1]
        if n[0].isupper() and n[1] != '-' and not n[1].isupper():
            n = n[0].lower() + n[1:]
        
        enzyme[r['ID']] = n
print('{} enzymes loaded'.format(len(enzyme)))

def validate_ec_number(enzyme, ec):
    """Return an EC number usable for annotation, following transferred entries.

    Args:
        enzyme: mapping of EC number to its description text.
        ec: the EC number to validate; it must be a key of ``enzyme``.

    Returns:
        ``ec`` itself when its description does not start with
        ``'transferred entry'``; otherwise the first EC number listed in the
        transfer description that is also a key of ``enzyme``.

    Raises:
        KeyError: if ``ec`` is not a key of ``enzyme``.
        ValueError: if the entry was transferred but none of the listed
            targets is a key of ``enzyme``.
    """
    description = enzyme[ec]
    if not description.startswith('transferred entry'):
        return ec

    targets = description.replace('transferred entry:', '', 1)
    # Targets are written like "1.1.1.198, 1.1.1.227 and 1.1.1.228": split on
    # commas and whitespace so that no candidate keeps a trailing comma.
    for candidate in targets.replace(',', ' ').split():
        # Filler words such as "and" are never keys of `enzyme`, so the
        # membership test skips them as well as unknown EC numbers.
        if candidate in enzyme:
            return candidate
    raise ValueError('no known replacement for transferred entry {}'.format(ec))


### Loading GO graph

In [None]:
# Build a directed graph from the parsed OBO entries; each entry is unpacked
# into a node part and an iterable of edges.
go = nx.DiGraph()
entries = parse_go_obo(os.path.join(DATA, 'go-basic.obo'))
nodes, edges = zip(*entries)
go.add_nodes_from(nodes)
go.add_edges_from(itertools.chain.from_iterable(edges))

# A root is a node whose name equals its own namespace; index the roots by name.
roots = {}
for node_id, attrs in go._node.items():
    if 'name' not in attrs:
        continue
    if attrs['name'] == attrs['namespace']:
        roots[attrs['name']] = node_id
go.graph['roots'] = roots

### Loading sample list from GCP operations


In [None]:
# Keep only the 'sample' column of the operations table.
operations_tsv = os.path.join(RESULTS, DATASET, 'annotation', 'gcp', 'operations.tsv')
samples = pandas.read_csv(operations_tsv, sep='\t')['sample']
bar_length = len(samples)

In [None]:
# Directory that receives every submission file; the notebook works from here.
result_dir = os.path.join(RESULTS, DATASET, 'submission')
# makedirs(exist_ok=True) also creates missing parents and cannot fail when
# the directory appears between an existence check and the creation.
os.makedirs(result_dir, exist_ok=True)
os.chdir(result_dir)

### Loading BlastP, blast2GO, blast2CDD and Enzyme

In [None]:
# Headerless BlastP table, ordered by columns 0 and 4.
blastp_tsv = os.path.join(RESULTS, DATASET, 'annotation', 'blastp.tsv.gz')
blastp = pandas.read_csv(blastp_tsv, header=None, sep='\t').sort_values(by=[0, 4])
blastp.head()

In [None]:
# Headerless BlastP-to-GO table, ordered by columns 0 and 2.
blast_go_tsv = os.path.join(RESULTS, DATASET, 'blastp_go.tsv.gz')
blast_go = pandas.read_csv(blast_go_tsv, header=None, sep='\t').sort_values(by=[0, 2])
def format_go(go, g):
    """Turn a GO identifier into a (qualifier, value) pair for the feature table.

    Args:
        go: graph-like object whose ``nodes()`` result maps a GO identifier to
            a dict with ``'namespace'``, ``'name'`` and ``'id'`` entries.
        g: the GO identifier to look up.

    Returns:
        ``('go_<aspect>', '<name>|<id without its first three characters>||')``
        where ``<aspect>`` is the part of the namespace after the first
        underscore. If the identifier is unknown or its node lacks the expected
        fields, ``g`` is returned unchanged.
    """
    try:
        # Keep the looked-up node in its own name: the fallback below must
        # return the original identifier, not the node's attribute dict.
        node = go.nodes()[g]
        aspect = node['namespace'].split('_')[1]
        return ('go_{}'.format(aspect),
                '{}|{}||'.format(node['name'], node['id'][3:]))
    except (KeyError, IndexError, TypeError, AttributeError):
        return g
def _go_qualifier_for_row(row):
    """Format the GO identifier held in column 1 of a blast_go row."""
    return format_go(go, row[1])


# Column 4 receives the formatted qualifier (or the raw value on failure).
blast_go[4] = blast_go.apply(_go_qualifier_for_row, axis=1)
blast_go.head()

In [None]:
# Headerless CDD table; missing values become empty strings.
cdd_tsv = os.path.join(RESULTS, DATASET, 'cdd.tsv.gz')
cdd = pandas.read_csv(cdd_tsv, header=None, sep='\t').replace(np.nan, '', regex=True)
cdd.head()


### Process the Contamination.txt file returned after a submission to clean up the submission files

In [None]:
%%bash -s "$RESULTS" "$DATASET"

# $1 = results root, $2 = dataset name (passed in by the cell magic).
# Quote the path so spaces do not split it, and stop if the directory is
# missing rather than filtering in whatever directory we happen to be in.
cd "$1/$2/submission" || exit 1
if [ -e "Contamination.txt" ]
then
    # Keep the first field of every line that starts with TRINITY_.
    awk '/^TRINITY_/ {print $1}' Contamination.txt > trans_to_remove.tsv
fi

In [None]:
os.chdir(result_dir)
# Transcript IDs to exclude from the submission; empty when nothing was flagged.
id_err = pandas.DataFrame(columns=[0])
removal_list = 'trans_to_remove.tsv'
# The file exists but is empty when Contamination.txt had no TRINITY_ lines;
# read_csv raises EmptyDataError on an empty file, so keep the empty frame.
if os.path.exists(removal_list) and os.path.getsize(removal_list) > 0:
    id_err = pandas.read_csv(removal_list, header=None, sep=' ')
display(id_err.head())

### Creating TBL file

In [None]:
%%time

def _annotate_orf(transcript_id, orf_id, hits, df_blast_go, df_cdd):
    """Collect the BlastP, GO, EC and CDD evidence for a single ORF.

    ``hits`` holds the BlastP rows of this ORF. The returned dict carries the
    keys ``g_in``, ``r_in``, ``notes``, ``g``, ``e``, ``c`` and ``cc``; every
    one of them is a (possibly empty) list.
    """
    # '.' and '_' must be matched literally. As a regular expression '.'
    # matches any character, which made every subject look versioned and left
    # `notes` permanently empty.
    has_dot = hits[2].str.contains('.', regex=False)
    has_underscore = hits[2].str.contains('_', regex=False)

    def top_subjects(frame, limit):
        return frame.sort_values(by=4, ascending=False).head(limit)[2].to_list()

    record = {
        'g_in': top_subjects(hits[~has_underscore & has_dot], 15),
        'r_in': top_subjects(hits[has_underscore & has_dot], 15),
        'notes': top_subjects(hits[~has_dot], 10),
        'g': [],
        'e': [],
        'c': [],
        'cc': [],
    }
    # GO and CDD evidence is only attached to ORFs that have BlastP hits.
    if hits.empty:
        return record

    go_rows = df_blast_go[(df_blast_go[0] == transcript_id) & (df_blast_go[2] == orf_id)]
    if not go_rows.empty:
        record['g'] = go_rows[[0, 4]].drop_duplicates()[4].tolist()
        record['e'] = go_rows[[0, 3]].drop_duplicates()[3].dropna().tolist()

    # CDD rows either name this ORF in column 4 or leave that column empty.
    same_transcript = df_cdd[0] == transcript_id
    cdd_rows = pandas.concat([df_cdd[same_transcript & (df_cdd[4] == orf_id)],
                              df_cdd[same_transcript & (df_cdd[4] == '')]])
    if not cdd_rows.empty:
        record['c'] = cdd_rows[2].unique().tolist()
        record['cc'] = (cdd_rows.sort_values(by=3, ascending=False)[[0, 1]]
                        .drop_duplicates().head(15)[1].to_list())
    return record


def _write_product(tbl_output, ec_terms):
    """Write the product (and EC_number) qualifiers for one CDS.

    The enzyme name is used only when exactly one of ``ec_terms`` is a complete
    four-field EC number that resolves in ``enzyme``; in every other case the
    product is 'hypothetical protein'.
    """
    ec_numbers = [term.replace('EC:', '') for term in ec_terms]
    ec_numbers = [ec for ec in ec_numbers if len(ec.split('.')) == 4]
    if len(ec_numbers) == 1:
        try:
            ec = validate_ec_number(enzyme, ec_numbers[0])
            product = enzyme[ec]
        except Exception:
            # Unknown or unresolvable EC number: fall through to the default.
            pass
        else:
            tbl_output.write('\t\t\tproduct\t{}\n'.format(product))
            tbl_output.write('\t\t\tEC_number\t{}\n'.format(ec))
            return
    tbl_output.write('\t\t\tproduct\thypothetical protein\n')


def worker(s):
    """Write the feature table (.tbl) and FASTA (.fsa) files for one sample.

    Args:
        s: sample name; it selects ``<s>_nocont.fsa.gz`` and
            ``<s>_nocont_transdecoder.fsa.gz`` under
            ``RESULTS/DATASET/annotation/blasts/<s>/``.

    Reads the module-level ``blastp``, ``blast_go``, ``cdd``, ``id_err``,
    ``enzyme`` and ``result_dir``. Writes ``<s>.tbl`` and ``<s>.fsa`` into
    ``result_dir`` and prints the number of CDS features. Returns ``None``.

    One ORF is kept per transcript: a 'complete' ORF is never replaced, and a
    non-complete one is replaced by a later ORF that is complete or has more
    BlastP hits.
    """
    rfromto = re.compile(':([0-9]+)-([0-9]+)')
    dna = os.path.join(RESULTS, DATASET, 'annotation', 'blasts', s, s + '_nocont.fsa.gz')
    prot = os.path.join(RESULTS, DATASET, 'annotation', 'blasts', s, s + '_nocont_transdecoder.fsa.gz')

    # Build the exclusion set once instead of calling unique() for every record.
    excluded_ids = set(id_err[0].unique())
    fasta = {}
    with gzip.open(dna, "rt") as dna_handle:
        for dna_record in SeqIO.parse(dna_handle, "fasta"):
            if len(dna_record.seq) < 200:
                continue
            if dna_record.id.replace('|', '_') in excluded_ids:
                continue
            # 'r' is set later when the sequence must be reverse-complemented.
            fasta[dna_record.id] = {'f': dna_record, 'r': False}

    df_blast_p = blastp[blastp[0].isin(fasta)]
    df_blast_go = blast_go[blast_go[0].isin(fasta)]
    df_cdd = cdd[cdd[0].isin(fasta)]

    prots = {}
    with gzip.open(prot, "rt") as prot_handle:
        for prot_record in SeqIO.parse(prot_handle, "fasta"):
            id_parts = prot_record.id.split('.')
            transcript_id = id_parts[0]
            if transcript_id not in fasta:
                continue
            orf_id = id_parts[1]

            # The fifth description field carries ':<from>-<to>'; the second
            # has its first five characters dropped (presumably a 'type:'
            # prefix -- confirm against the TransDecoder header format).
            fields = prot_record.description.split(' ')
            span = rfromto.search(fields[4])
            p_from = int(span.group(1))
            p_to = int(span.group(2))
            orf_type = fields[1][5:]

            hits = df_blast_p[(df_blast_p[0] == transcript_id) & (df_blast_p[7] == orf_id)]
            blast_len = len(hits)

            current = prots.get(transcript_id)
            if current is not None:
                if current['d'] == 'complete':
                    continue
                if orf_type != 'complete' and current['b'] >= blast_len:
                    continue

            # Build a fresh record every time so that nothing (notably 'cc')
            # survives from an ORF that is being replaced.
            record = _annotate_orf(transcript_id, orf_id, hits, df_blast_go, df_cdd)
            record.update({
                'l': len(prot_record.seq),
                'f': p_from,
                't': p_to,
                'd': orf_type,
                'b': blast_len,
            })
            prots[transcript_id] = record

    # Location layout per ORF type: '<' / '>' mark a partial 5' / 3' end.
    location_formats = {
        '3prime_partial': '{}\t>{}',
        '5prime_partial': '<{}\t{}',
        'internal': '<{}\t>{}',
    }
    with open(os.path.join(result_dir, s + '.tbl'), "w") as tbl_output:
        for k, p in prots.items():
            tbl_output.write('>Feature\t{}\n'.format(k.replace('|', '_')))

            if p['f'] > p['t']:
                # Start after end: mirror the coordinates and flag the
                # sequence so it is reverse-complemented when written below.
                seq_len = len(fasta[k]['f'].seq)
                p['f'] = seq_len + 1 - p['f']
                p['t'] = seq_len + 1 - p['t']
                fasta[k]['r'] = True

            location = location_formats.get(p['d'], '{}\t{}')
            tbl_output.write(location.format(p['f'], p['t']))
            tbl_output.write('\tCDS\n')

            _write_product(tbl_output, p['e'])

            # NOTE(review): this emits the start coordinate itself whenever it
            # is 1 or greater than 3, although codon_start is normally limited
            # to 1-3 -- confirm the intended value before changing it.
            start_codon = 1 if 1 < p['f'] <= 3 else p['f']
            tbl_output.write('\t\t\tcodon_start\t{}\n'.format(start_codon))

            for i in p['g_in']:
                tbl_output.write('\t\t\tinference\talignment:blastp:2.9.0:INSD:{0}\n'.format(i))
            for i in p['r_in']:
                tbl_output.write('\t\t\tinference\talignment:blastp:2.9.0:RefSeq:{0}\n'.format(i))
            for i in p['cc']:
                tbl_output.write('\t\t\tinference\talignment:rpsblastp:2.9.0:CDD:{0}\n'.format(i))

            # Entries are (qualifier, value) tuples, or the raw value when the
            # GO term could not be formatted.
            go_terms = sorted(p['g'], key=lambda x: x[0], reverse=True)
            for g in go_terms:
                if isinstance(g, tuple):
                    tbl_output.write('\t\t\t')
                    tbl_output.write('\t'.join(g))
                    tbl_output.write('\n')

            for g in go_terms:
                if isinstance(g, tuple):
                    tbl_output.write('\t\t\tdb_xref\tGO:{}\n'.format(g[1].split('|')[1]))
                else:
                    tbl_output.write('\t\t\tdb_xref\t{}\n'.format(g))

            for g in p['c']:
                tbl_output.write('\t\t\tdb_xref\tCDD:{}\n'.format(g))

    with open(os.path.join(result_dir, s + '.fsa'), "w") as fasta_handle:
        for k, f in fasta.items():
            f['f'].description = ''
            if f['r']:
                f['f'].seq = f['f'].seq.reverse_complement()
            SeqIO.write(f['f'], fasta_handle, "fasta")
    print('{} with {} CDS'.format(s, len(prots)))
                
# One task per sample. The context manager tears the pool down even when a
# worker raises, so no worker processes are left behind.
with Pool(processes=THREADS) as p:
    data = p.map(worker, samples)


### Removing | from the transcript name

tbl2asn produces an error if the `|` character is in the transcript name

In [None]:
%%bash
for a in *.fsa
do
    # Skip the literal pattern when no .fsa file exists.
    [ -e "$a" ] || continue
    # Replace every "|" on header lines (not just the first), matching the
    # transcript names written to the .tbl files.
    sed '/^>/ s/|/_/g' "$a" > "$a.tmp"
    mv -v "$a.tmp" "$a"
done

### Run these commands in a terminal to generate submission files

In [None]:
os.chdir(result_dir)
# Build both command lines first, then print them with a blank line between.
cd_command = 'COMMAND #1: cd {}'.format(result_dir)
tbl2asn_command = 'COMMAND #2: {}/tbl2asn -p . -j "[organism=Opuntia streptacantha][moltype=transcribed_RNA][tech=TSA][gcode=1]" -V tvb -a s -t template.sbt  -c x -M t'.format(BIN)
print(cd_command)
print()
print(tbl2asn_command)
