# 13_dramv_functions

This notebook creates and tests functions for summarizing metabolic potential of viral genes identified by DRAM-v.

## Load Packages

In [2]:
import pandas as pd
import math
import glob
import os # these two packages are good for searching and navigating file systems
import os.path as op

pd.set_option('display.max_columns', None)

## Function 1: get_ann_text

This function formats the text from the following columns: 'viral_hit', 'kegg_hit', 'pfam_hits', 'vogdb_hits'

In [10]:
def get_ann_text(hit_text, column_type = 'viral_hit'): # column_type = 'viral_hit' sets the default column_type as 'viral_hit'; 
                                                        # if column_type isn't specified, then the function assumes it is 'viral_hit'
    '''
    args:
        hit_text: text string from DRAMv for 'viral_hit' column
    returns:
        text string of just annotation information, not organism or hit id
    
    exe_input: YP_004325053.1 hypothetical protein PSSM7_226 [Prochlorococcus phage P-SSM7]
    exe_output: hypothetical protein PSSM7_226
    '''
    
    if type(hit_text) == float: # if the text belongs to the float type, return that text; float = NA in these data
        return hit_text
    
    if column_type == 'viral_hit': # if the column_type is 'viral_hit',
        no_org = hit_text.split("[")[0] # split the text by brackets ('[') and return the first part of the resulting text
        no_acc_id = " ".join(no_org.split(" ")[1:-1]) # join the resulting text from the line above with a space, 
                                                        # then split by a space and return the second part of the text without the 2nd to last character
        return no_acc_id
    
    if column_type in ['kegg_hit']: # if the column_type is 'kegg_hit',
        no_ee = hit_text.split("[")[0].strip() # split the text by brackets and return the first part of the resulting text; strip () removes trailing space
        return no_ee
    
    if column_type == 'pfam_hits': # if the column_type is 'pfam_hits',
        no_pf_ids = ";".join([text.split("[")[0].strip() for text in hit_text.split(";")]) # split the text by brackets and return the first part of the
                                    # resulting text without the trailing space then split by a semicolon (;) 
                                    # and return all pfam annotations joined using a semicolon (there are multiple annotations in this one column)
        return no_pf_ids
    
    if column_type == 'vogdb_hits': # if the column_type is 'vogdb_hits',
        '''sp|Q5UQ62|YR655_MIMIV Putative glycosyltransferase R655; Xh'''
        no_code = hit_text.split(";")[0] # split the text by a semicolon and return the first part of the resulting text
        no_acc = " ".join(no_code.split(" ")[1:]) # split the text from the line above with a space and return the second part of the text joined by a space
        return no_acc

In [14]:
# get_ann_text test using cv1_AM-654-B02

ex_file_path = '/Users/melissaherring/Google Drive/My Drive/MH_project/dramv/cv1_AM-654-B02/annotations.tsv' # create file path
columns_to_keep = ['Unnamed: 0', 'kegg_hit', 'viral_hit', 'pfam_hits', 'vogdb_hits'] # make a list of columns to look at
df = pd.read_csv(ex_file_path, sep = "\t")[columns_to_keep] # read the file and store only the columns from the list as a variable named df

df['viral_ann_text'] = df['viral_hit'].apply(get_ann_text, args = ('viral_hits',))
df['kegg_ann_text'] = df['kegg_hit'].apply(get_ann_text, args = ('kegg_hit',))
df['pfam_ann_text'] = df['pfam_hits'].apply(get_ann_text, args = ('pfam_hits',))
df['vogdb_ann_text'] = df['vogdb_hits'].apply(get_ann_text, args = ('vogdb_hits',))
df

Unnamed: 0.1,Unnamed: 0,kegg_hit,viral_hit,pfam_hits,vogdb_hits,viral_ann_text,kegg_ann_text,pfam_ann_text,vogdb_ann_text
0,SCGC_AM-654-B02_contig100||full_1,,,,sp|P00970|DNLI_BPT4 DNA ligase; XhXr,,,,DNA ligase
1,SCGC_AM-654-B02_contig100||full_2,,,,,,,,
2,SCGC_AM-654-B02_contig100||full_3,,,,,,,,
3,SCGC_AM-654-B02_contig100||full_4,,,,,,,,
4,SCGC_AM-654-B02_contig100||full_5,,,,,,,,
5,SCGC_AM-654-B02_contig100||full_6,,,,,,,,
6,SCGC_AM-654-B02_contig100||full_7,,,,,,,,
7,SCGC_AM-654-B02_contig106||full_1,,,,,,,,
8,SCGC_AM-654-B02_contig106||full_2,,,,,,,,
9,SCGC_AM-654-B02_contig106||full_3,,YP_010772222.1 MAG: hypothetical protein QIT37...,AAA domain (dynein-related subfamily) [PF07728...,sp|P04526|LOADL_BPT4 Sliding-clamp-loader larg...,,,AAA domain (dynein-related subfamily),Sliding-clamp-loader large subunit
