In [1]:
import os
import sys
import pandas as pd
import math
import re
import numpy as np

In [2]:
gene_contig_file = "/Users/u241374/mike_tisza/sandbox/test_ct2_0804a/ct2_tmp/contig_gene_annotation_summary.pruned.tsv"

tRNA_table = "/Users/u241374/mike_tisza/sandbox/test_ct2_0804a/ct2_tmp/oriented_hallmark_contigs.pruned.tRNAscan.tsv"

In [3]:
gene_contig_df = pd.read_csv(gene_contig_file, sep = "\t")

tRNA_df = pd.read_csv(tRNA_table, index_col=False, sep = "\t", names = ['con_chunk', 'tRNA_num', 'gene_c1', 'gene_c2', 
                                                                      'evidence_description', 'tRNA_codon', 'other1', 'other2', 'tRNA_score'])

tRNA_df['at_pos'] = tRNA_df['con_chunk'].str.find("@")

tRNA_df['contig'] = tRNA_df.apply(
    lambda x: x["con_chunk"][0:x["at_pos"]], axis = 1)

tRNA_df['chunk_name'] = tRNA_df.apply(
    lambda x: x["con_chunk"][x["at_pos"]+1:-1], axis = 1)

tRNA_df['gene_start'] = np.where(tRNA_df['gene_c1'] < tRNA_df['gene_c2'], tRNA_df['gene_c1'], tRNA_df['gene_c2'])

tRNA_df['gene_stop'] = np.where(tRNA_df['gene_c1'] > tRNA_df['gene_c2'], tRNA_df['gene_c1'], tRNA_df['gene_c2'])

tRNA_df['gene_orient'] = np.where(tRNA_df['gene_c1'] < tRNA_df['gene_c2'], "+", "-")

tRNA_df['evidence_acession'] = "tRNAscan-SE score: " + tRNA_df['tRNA_score'].astype(str)

tRNA_df['Evidence_source'] = "tRNAscan-SE"

tRNA_df['gene_name'] = "tRNA-" + tRNA_df['evidence_description'].astype(str)

tRNA_df = tRNA_df[['contig', 'chunk_name', 'gene_start', 'gene_stop', 'gene_name', 
                   'gene_orient', 'evidence_description', 'evidence_acession', 'Evidence_source']]

tRNA_df.head()



Unnamed: 0,contig,chunk_name,gene_start,gene_stop,gene_name,gene_orient,evidence_description,evidence_acession,Evidence_source
0,test_ct2_0804a_6137,Chunk_0,60733,60806,tRNA-Cys,-,Cys,tRNAscan-SE score: 58.9,tRNAscan-SE
1,test_ct2_0804a_6137,Chunk_0,60604,60677,tRNA-Arg,-,Arg,tRNAscan-SE score: 66.5,tRNAscan-SE


In [4]:
#test_df = gene_contig_df.merge(tRNA_df, how = "left", on = ['contig', 'chunk_name'])

test_df = pd.concat([gene_contig_df, tRNA_df], ignore_index=True)

In [5]:
test_df['chunk_name'] = test_df['chunk_name'].fillna("nochunk")


In [6]:

chunk_grouped_df = test_df.groupby(['contig', 'chunk_name'], dropna = False)


test_chunk = chunk_grouped_df.get_group(('test_SRS_0803e43597', 'Chunk_0'))

test_chunk.query("Evidence_source == 'tRNAscan-SE'")

KeyError: ('test_SRS_0803e43597', 'Chunk_0')

In [None]:
#list(chunk_grouped_df.groups)

In [49]:
#test_chunk.first

if not test_chunk['chunk_name'].agg(pd.Series.mode)[0] == "no_chunk":  
    print(">Feature " + test_chunk['contig'].agg(pd.Series.mode)[0] + "@" + \
          test_chunk['chunk_name'].agg(pd.Series.mode)[0] + " Table1")
else:
    print(">Feature " + test_chunk['contig'].agg(pd.Series.mode)[0] + " Table1")    

for index, row in test_chunk.iterrows():
    trna_number = 1

    if row['Evidence_source'] == "hallmark_hmm" or row['Evidence_source'] == "common_virus_hmm": #my hmms
        tagstr = ("protein_id" + "\tlcl|" + row['gene_name'])
        productstr = re.sub("-", " ", row['evidence_description'])
        inferencestr = ("inference\tprotein motif " + str(row['evidence_acession']))
    
    elif row['Evidence_source'] == "mmseqs_cdd":  #mmseqs_cdd
        
        tagstr = ("protein_id" + "\tlcl|" + row['gene_name'])
        productstr = re.sub("\..*", "", row['evidence_description'])
        inferencestr = ("inference\tprotein motif CDD:" + str(row['evidence_acession']))

    elif row['Evidence_source'] == "tRNAscan-SE": ## tRNAs
        tagstr = ("gene\ttRNA" + str(trna_number))
        productstr = row['gene_name']
        inferencestr = ("inference\t" + row['evidence_acession'])
        trna_number =+ 1

    elif pd.isnull(row['Evidence_source']): #hypos
        tagstr = ("protein_id" + "\tlcl|" + row['gene_name'])
        productstr = row['evidence_description']
        inferencestr = ("note\tno search hits")
    else:
        raise Exception("this shouldn't happen")
        tagstr = "help"
        productstr = "help"
        inferencestr = "help"        
    
    print(str(row['gene_start']) + "\t" + str(row['gene_stop']))
    print("\t\t\t" + tagstr )
    print("\t\t\tproduct" + "\t" + productstr )
    print("\t\t\t" + inferencestr )

>Feature test_SRS_0803e43597@Chunk_0 Table1
385	565
			protein_id	lcl|test_SRS_0803e43597_3
			product	hypothetical protein
			note	no search hits
642	750
			protein_id	lcl|test_SRS_0803e43597_4
			product	hypothetical protein
			note	no search hits
721	841
			protein_id	lcl|test_SRS_0803e43597_5
			product	hypothetical protein
			note	no search hits
973	1990
			protein_id	lcl|test_SRS_0803e43597_6
			product	hypothetical protein
			note	no search hits
2054	2375
			protein_id	lcl|test_SRS_0803e43597_7
			product	hypothetical protein
			note	no search hits
2436	2592
			protein_id	lcl|test_SRS_0803e43597_8
			product	hypothetical protein
			note	no search hits
2605	2818
			protein_id	lcl|test_SRS_0803e43597_9
			product	hypothetical protein
			note	no search hits
2817	3012
			protein_id	lcl|test_SRS_0803e43597_10
			product	hypothetical protein
			note	no search hits
3004	3346
			protein_id	lcl|test_SRS_0803e43597_11
			product	hypothetical protein
			note	no search hits
3338	3683
			pro

In [7]:
chunk_grouped_df = test_df.groupby(['contig', 'chunk_name'], dropna = False)


In [10]:
for name, seq_group in chunk_grouped_df:
    print(f">Feature {name[0]}@{name[1]} Table1")

>Feature test_ct2_0804a_103@nochunk Table1
>Feature test_ct2_0804a_1048@nochunk Table1
>Feature test_ct2_0804a_1085@nochunk Table1
>Feature test_ct2_0804a_1339@nochunk Table1
>Feature test_ct2_0804a_1392@nochunk Table1
>Feature test_ct2_0804a_1552@nochunk Table1
>Feature test_ct2_0804a_1584@nochunk Table1
>Feature test_ct2_0804a_1658@nochunk Table1
>Feature test_ct2_0804a_1675@nochunk Table1
>Feature test_ct2_0804a_1690@nochunk Table1
>Feature test_ct2_0804a_1706@nochunk Table1
>Feature test_ct2_0804a_1724@nochunk Table1
>Feature test_ct2_0804a_1807@nochunk Table1
>Feature test_ct2_0804a_1853@nochunk Table1
>Feature test_ct2_0804a_188@nochunk Table1
>Feature test_ct2_0804a_194@nochunk Table1
>Feature test_ct2_0804a_1951@nochunk Table1
>Feature test_ct2_0804a_2008@nochunk Table1
>Feature test_ct2_0804a_2046@nochunk Table1
>Feature test_ct2_0804a_2084@nochunk Table1
>Feature test_ct2_0804a_2112@nochunk Table1
>Feature test_ct2_0804a_2224@nochunk Table1
>Feature test_ct2_0804a_226@nochunk