In [1]:
import pandas as pd

# these two packages are good for searching and navigating file systems
import os
import os.path as op

In [2]:
# Path to the DRAM-v annotations table used as the worked example in this notebook.
# NOTE(review): this is an absolute, machine-specific path, so the notebook will not
# run elsewhere as-is -- consider building it from a configurable data directory.
exe_dramv = "/Users/juliabrown/Google Drive/My Drive/projects/OMZvir_round2/MH_project/dramv/cv1_AM-654-B04/annotations.tsv"

In [3]:
# Load the tab-separated annotations table.
# The first column of the file has no header, so pandas names it 'Unnamed: 0' and
# the frame keeps a default integer index.
df = pd.read_csv(exe_dramv, sep = "\t")

In [5]:
# List the columns available in the annotations table.
df.columns

Index(['Unnamed: 0', 'fasta', 'scaffold', 'gene_position', 'start_position',
       'end_position', 'strandedness', 'rank', 'ko_id', 'kegg_hit', 'viral_id',
       'viral_hit', 'viral_RBH', 'viral_identity', 'viral_bitScore',
       'viral_eVal', 'peptidase_id', 'peptidase_family', 'peptidase_hit',
       'peptidase_RBH', 'peptidase_identity', 'peptidase_bitScore',
       'peptidase_eVal', 'pfam_hits', 'cazy_ids', 'cazy_hits',
       'cazy_subfam_ec', 'cazy_best_hit', 'vogdb_id', 'vogdb_hits',
       'vogdb_categories', 'heme_regulatory_motif_count', 'is_transposon',
       'amg_flags'],
      dtype='object')

In [60]:
# Show only the rows that have a value in 'cazy_ids' (missing entries are dropped).
df['cazy_ids'].dropna()

7     GT25
8     GT11
10    GT11
Name: cazy_ids, dtype: object

In [7]:
# Free-text hit columns that carry an annotation description; these are the
# column types that get_ann_text knows how to parse.
ann_columns = ['viral_hit', 'kegg_hit','pfam_hits', 'vogdb_hits']

In [14]:
# Example raw 'viral_hit' string (row label 0) for trying out the parser.
exe_vhit_text = df.loc[0, 'viral_hit']

In [56]:
def _strip_trailing_bracket(text):
    '''
    Remove one trailing "[...]" group (and surrounding whitespace) from text.

    Only a bracket group that closes at the very end of the string is removed,
    so brackets that belong to the name itself are kept, e.g.
    "3-oxoacyl-[acyl-carrier-protein] synthase II [EC:2.3.1.179]"
    becomes "3-oxoacyl-[acyl-carrier-protein] synthase II".
    '''
    text = text.strip()
    if not text.endswith("]"):
        return text

    # walk backwards to find the "[" that balances the final "]"
    depth = 0
    for idx in range(len(text) - 1, -1, -1):
        if text[idx] == "]":
            depth += 1
        elif text[idx] == "[":
            depth -= 1
            if depth == 0:
                return text[:idx].strip()

    # unbalanced brackets: fall back to cutting at the first "["
    return text.split("[")[0].strip()


def get_ann_text(hit_text, column_type = 'viral_hit'):
    '''
    Reduce a raw DRAMv hit string to just its annotation text.

    args:
        hit_text: value from one of the DRAMv hit columns. Non-string values
            (e.g. the NaN pandas uses for a missing hit, or None) are
            returned unchanged.
        column_type: name of the column hit_text came from; one of
            'viral_hit', 'kegg_hit', 'pfam_hits' or 'vogdb_hits'
    returns:
        text string of just annotation information, not organism or hit id.
        None is returned when column_type is not one of the four names above.

    exe_input: YP_004325053.1 hypothetical protein PSSM7_226 [Prochlorococcus phage P-SSM7]
    exe_output: hypothetical protein PSSM7_226
    '''

    # missing hits are not strings; pass them straight through so that
    # Series.apply keeps them as missing values
    if not isinstance(hit_text, str):
        return hit_text

    if column_type == 'viral_hit':
        # "<accession> <description> [<organism>]" -> "<description>"
        no_org = _strip_trailing_bracket(hit_text)
        no_acc_id = no_org.partition(" ")[2].strip()
        return no_acc_id

    if column_type in ['kegg_hit']:
        # "<description> [EC:...]" -> "<description>"
        return _strip_trailing_bracket(hit_text)

    if column_type == 'pfam_hits':
        # "<name> [PFxxxxx.x]; <name> [PFxxxxx.x]" -> "<name>;<name>"
        no_pf_ids = ";".join([_strip_trailing_bracket(text) for text in hit_text.split(";")])
        return no_pf_ids

    if column_type == 'vogdb_hits':
        # e.g. "sp|Q5UQ62|YR655_MIMIV Putative glycosyltransferase R655; Xh"
        # The category code follows the last ";", so split from the right to
        # keep any ";" that is part of the description itself.
        no_code = hit_text.rsplit(";", 1)[0]
        no_acc = no_code.partition(" ")[2]
        return no_acc
                                 
        
# Smoke-test the parser with one example hit per column type.
example_hits = [
    (exe_vhit_text, 'viral_hit'),
    ('Nitrite and sulphite reductase 4Fe-4S domain [PF01077.25]; Nitrite/Sulfite reductase ferredoxin-like half domain [PF03460.20]', 'pfam_hits'),
    ('sp|Q5UQ62|YR655_MIMIV Putative glycosyltransferase R655; Xh', 'vogdb_hits'),
]
for example_text, example_column in example_hits:
    print(get_ann_text(example_text, column_type = example_column))

hypothetical protein PSSM7_226
Nitrite and sulphite reductase 4Fe-4S domain;Nitrite/Sulfite reductase ferredoxin-like half domain
Putative glycosyltransferase R655


In [57]:
# Add one cleaned "<source>_ann_text" column per raw hit column by running
# get_ann_text over every value in that column.
cid = 'viral_hit'
ann_text_sources = {
    'viral_ann_text': cid,
    'kegg_ann_text': 'kegg_hit',
    'pfam_ann_text': 'pfam_hits',
    'vogdb_ann_text': 'vogdb_hits',
}
for text_col, hit_col in ann_text_sources.items():
    df[text_col] = df[hit_col].apply(get_ann_text, args = (hit_col,))

In [76]:
# Display the table; the cleaned *_ann_text columns appear as the last columns.
df

Unnamed: 0.1,Unnamed: 0,fasta,scaffold,gene_position,start_position,end_position,strandedness,rank,ko_id,kegg_hit,...,vogdb_id,vogdb_hits,vogdb_categories,heme_regulatory_motif_count,is_transposon,amg_flags,viral_ann_text,kegg_ann_text,pfam_ann_text,vogdb_ann_text
0,SCGC_AM-654-B04_contig1||full_1,cv1_AM-654-B04,SCGC_AM-654-B04_contig1||full,1,1,267,-1,E,,,...,VOG08791,REFSEQ hypothetical protein; Xu,Xu,0,False,F,hypothetical protein PSSM7_226,,,hypothetical protein
1,SCGC_AM-654-B04_contig1||full_2,cv1_AM-654-B04,SCGC_AM-654-B04_contig1||full,2,306,464,-1,E,,,...,,,,0,False,F,,,,
2,SCGC_AM-654-B04_contig1||full_3,cv1_AM-654-B04,SCGC_AM-654-B04_contig1||full,3,506,2014,-1,C,K00366,ferredoxin-nitrite reductase [EC:1.7.7.1],...,,,,0,False,MKF,,ferredoxin-nitrite reductase,Nitrite and sulphite reductase 4Fe-4S domain;N...,
3,SCGC_AM-654-B04_contig1||full_4,cv1_AM-654-B04,SCGC_AM-654-B04_contig1||full,4,2015,2950,-1,C,K21993,formate transporter,...,,,,0,False,F,,formate transporter,Formate/nitrite transporter,
4,SCGC_AM-654-B04_contig1||full_5,cv1_AM-654-B04,SCGC_AM-654-B04_contig1||full,5,3216,3386,-1,E,,,...,,,,0,False,F,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,SCGC_AM-654-B04_contig45||full_2,cv1_AM-654-B04,SCGC_AM-654-B04_contig45||full,2,1337,6220,-1,D,,,...,,,,0,False,F,,,SprB repeat,
151,SCGC_AM-654-B04_contig45||full_3,cv1_AM-654-B04,SCGC_AM-654-B04_contig45||full,3,6250,6486,-1,E,,,...,,,,0,False,F,,,,
152,SCGC_AM-654-B04_contig51||full_1,cv1_AM-654-B04,SCGC_AM-654-B04_contig51||full,1,1,1431,-1,E,,,...,,,,0,False,F,,,,
153,SCGC_AM-654-B04_contig51||full_2,cv1_AM-654-B04,SCGC_AM-654-B04_contig51||full,2,1439,2773,-1,E,,,...,,,,0,False,F,,,,


In [65]:
# Print the cleaned Pfam text row by row, stopping once the row label passes 10.
for row_label, row in df.iterrows():
    pfam_text = get_ann_text(row['pfam_hits'], column_type = 'pfam_hits')
    print(row_label, pfam_text)
    if row_label > 10:
        break

0 nan
1 nan
2 Nitrite and sulphite reductase 4Fe-4S domain;Nitrite/Sulfite reductase ferredoxin-like half domain
3 Formate/nitrite transporter
4 nan
5 nan
6 nan
7 Glycosyltransferase family 25 (LPS biosynthesis protein)
8 Glycosyl transferase family 11
9 N-terminal domain of galactosyltransferase;Glycosyltransferase like family 2
10 Glycosyl transferase family 11
11 NAD dependent epimerase/dehydratase family;GDP-mannose 4,6 dehydratase;RmlD substrate binding domain


In [84]:
import math

def grab_annotation(line):
    '''
    Pick a single annotation for one row of the annotations table.

    The hit columns are tried in order of preference (kegg_hit, pfam_hits,
    viral_hit, vogdb_hits). The first hit that does not mention
    "hypothetical" (in any letter case) wins. If every available hit is
    hypothetical, the first available one is used instead.

    args:
        line: one row of the table (anything indexable by column name, e.g.
            the Series that DataFrame.apply(..., axis=1) passes in)
    returns:
        (annotation_text, source_column); (nan, nan) when none of the four
        columns holds a string.
    '''
    col_preference = ['kegg_hit', 'pfam_hits', 'viral_hit', 'vogdb_hits']

    # first column that has any hit at all, used when nothing better turns up
    fallback_col = None

    for col in col_preference:
        raw_hit = line[col]

        # missing hits are NaN (a float), not text
        if not isinstance(raw_hit, str):
            continue

        # compare in lower case so "Hypothetical protein" is caught as well
        if 'hypothetical' not in raw_hit.lower():
            return get_ann_text(raw_hit, column_type = col), col

        if fallback_col is None:
            fallback_col = col

    if fallback_col is not None:
        return get_ann_text(line[fallback_col], column_type = fallback_col), fallback_col

    return math.nan, math.nan

In [85]:
# Run grab_annotation on every row; result_type='expand' turns each returned
# (text, source) tuple into two columns, which are stored under the names below.
best_hits = df.apply(grab_annotation, axis=1, result_type='expand')
df[['annotation', 'annotation_source']] = best_hits

In [88]:
# Fraction of rows that ended up with a non-missing value in 'annotation'.
len(df['annotation'].dropna()) / len(df)

0.7935483870967742

In [93]:
# Rows whose chosen annotation contains the (case-sensitive) substring 'nitr';
# na=False makes rows without an annotation count as non-matches.
df[df['annotation'].str.contains('nitr', na=False)]

Unnamed: 0.1,Unnamed: 0,fasta,scaffold,gene_position,start_position,end_position,strandedness,rank,ko_id,kegg_hit,...,vogdb_categories,heme_regulatory_motif_count,is_transposon,amg_flags,viral_ann_text,kegg_ann_text,pfam_ann_text,vogdb_ann_text,annotation,annotation_source
2,SCGC_AM-654-B04_contig1||full_3,cv1_AM-654-B04,SCGC_AM-654-B04_contig1||full,3,506,2014,-1,C,K00366,ferredoxin-nitrite reductase [EC:1.7.7.1],...,,0,False,MKF,,ferredoxin-nitrite reductase,Nitrite and sulphite reductase 4Fe-4S domain;N...,,ferredoxin-nitrite reductase,kegg_hit
