## dramv_sum_funs_dev

This notebook develops and tests summary functions for the dramv annotation outputs produced by 13_clean_dramv_annot_script.

# Load packages and data

In [1]:
import pandas as pd
import math
import glob
from collections import defaultdict
import os # these two packages are good for searching and navigating file systems
import os.path as op

# Show every column when a DataFrame is displayed (no truncation of wide frames).
pd.set_option('display.max_columns', None)

# One cleaned dramv annotation table, used as the worked example in the cells below.
example_path = '/Users/melissaherring/Google Drive/My Drive/MH_project/dramv_trim/jv-119-vMAG_31.csv'
example = pd.read_csv(example_path)

In [2]:
# Preview the first five rows of the example annotation table.
example.head(n=5)

Unnamed: 0,fasta,annotation,annotation_source,amg_flags,V,M,A,P,E,K,T,F,B
0,jv-119-vMAG_31,Phosphotransferase enzyme family;Choline/ethan...,pfam_hits,F,0,0,0,0,0,0,0,1,0
1,jv-119-vMAG_31,,,F,0,0,0,0,0,0,0,1,0
2,jv-119-vMAG_31,,,F,0,0,0,0,0,0,0,1,0
3,jv-119-vMAG_31,Acetyltransferase (GNAT) family,pfam_hits,F,0,0,0,0,0,0,0,1,0
4,jv-119-vMAG_31,Neck protein gp13,vogdb_hits,VF,1,0,0,0,0,0,0,1,0


In [6]:
# create a data frame that summarizes by annotation source

# Name both columns explicitly. A bare value_counts().reset_index() labels them
# ('index', 'annotation_source') in pandas 1.x but ('annotation_source', 'count')
# in pandas >= 2.0, which makes the pivot below fail with a KeyError.
source_counts = (
    example['annotation_source']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='annotation_source')
)

# Tag every row with the identifier held in the first column of the first row
# (the `fasta` column in the example file) so it can serve as the pivot index.
source_counts['ID'] = example.iloc[0, 0]

# One row per ID, one column per annotation source, values = number of genes.
source_df = source_counts.pivot(index='ID', columns='index', values='annotation_source')
source_df

Unnamed: 0,index,annotation_source,ID
0,pfam_hits,16,jv-119-vMAG_31
1,vogdb_hits,3,jv-119-vMAG_31
2,kegg_hit,1,jv-119-vMAG_31


In [29]:
# Make sure every expected annotation source has a column, rename the counts,
# and add the total number of annotated genes.

expected_sources = ['kegg_hit', 'viral_hit', 'pfam_hits', 'vogdb_hits']
count_names = {'kegg_hit': 'kegg_count', 'pfam_hits': 'pfam_count', 'vogdb_hits': 'vogdb_count'}

for source in expected_sources:
    # Also look for the already-renamed column: without this check, re-running
    # the cell would re-insert e.g. 'kegg_hit' and the rename below would then
    # produce a duplicate 'kegg_count' column.
    renamed = count_names.get(source, source)
    if source not in source_df.columns and renamed not in source_df.columns:
        # Append the missing source at the end with a count of zero.
        source_df.insert(loc=len(source_df.columns), column=source, value=0)

# rename() without inplace returns a new frame; 'viral_hit' keeps its name.
source_df = source_df.rename(columns=count_names)
source_df['total_genes_annot'] = source_df[
    ['kegg_count', 'viral_hit', 'pfam_count', 'vogdb_count']
].sum(axis=1)
source_df

index,kegg_count,pfam_count,vogdb_count,viral_hit,total_genes_annot
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
jv-119-vMAG_31,1,16,3,0,20


In [30]:
# For each single-letter flag column, record how many rows of the example
# table have that flag set to 1. Columns are added in the order listed.
for flag in ['V', 'M', 'A', 'P', 'E', 'K', 'T', 'F', 'B']:
    is_flagged = example[flag] == 1
    source_df[flag + '_count'] = int(is_flagged.sum())

source_df

index,kegg_count,pfam_count,vogdb_count,viral_hit,total_genes_annot,V_count,M_count,A_count,P_count,E_count,K_count,T_count,F_count,B_count
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
jv-119-vMAG_31,1,16,3,0,20,7,4,0,0,0,4,0,22,3


In [46]:
# Summarize every cleaned dramv annotation CSV into one row per file and
# write the combined table to disk.

csv_pattern = "/Users/melissaherring/Google Drive/My Drive/MH_project/dramv_trim/*.csv"
# glob returns paths in arbitrary order; sort them so the row order of the
# summary is the same on every run.
csv_file_paths = sorted(glob.glob(csv_pattern))

# Annotation sources that must always appear in the summary, in the order in
# which missing ones are appended.
EXPECTED_SOURCES = ('kegg_hit', 'viral_hit', 'pfam_hits', 'vogdb_hits')
# 'viral_hit' is intentionally absent: it keeps its original name.
SOURCE_RENAMES = {'kegg_hit': 'kegg_count', 'pfam_hits': 'pfam_count', 'vogdb_hits': 'vogdb_count'}
TOTAL_COLUMNS = ['kegg_count', 'viral_hit', 'pfam_count', 'vogdb_count']
FLAG_COLUMNS = ('V', 'M', 'A', 'P', 'E', 'K', 'T', 'F', 'B')


def summarize_dramv_file(df):
    """Collapse one annotation table into a single summary row.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least one row, an ``annotation_source`` column and the
        flag columns listed in ``FLAG_COLUMNS``. The value in the first column
        of the first row is used as the row identifier.

    Returns
    -------
    pandas.DataFrame
        One row indexed by ``ID`` containing the gene count per annotation
        source (missing expected sources are filled with 0),
        ``total_genes_annot`` (sum of the four expected sources) and one
        ``<flag>_count`` column per flag (rows where the flag equals 1).

    Raises
    ------
    IndexError
        If ``df`` has no rows, because no identifier can be read.
    """
    sample_id = df.iloc[0, 0]

    # sort_index() gives alphabetical source columns. Building the row with
    # to_frame().T works on every pandas version (value_counts().reset_index()
    # changed its column names in pandas 2.0) and still yields a row when the
    # file has no annotated genes at all.
    source_counts = df['annotation_source'].value_counts().sort_index()
    summary = source_counts.to_frame(name=sample_id).T
    summary.index.name = 'ID'
    summary.columns.name = 'index'

    for source in EXPECTED_SOURCES:
        if source not in summary.columns:
            summary[source] = 0

    summary = summary.rename(columns=SOURCE_RENAMES)
    summary['total_genes_annot'] = summary[TOTAL_COLUMNS].sum(axis=1)

    for flag in FLAG_COLUMNS:
        summary[flag + '_count'] = int((df[flag] == 1).sum())

    return summary


dfs_list = []

for csv_path in csv_file_paths:
    df = pd.read_csv(csv_path)
    if df.empty:
        # No first row means no identifier to label the summary row with.
        print("Skipping {}: file has no rows".format(csv_path))
        continue
    dfs_list.append(summarize_dramv_file(df))

if dfs_list:
    # Concatenate once, after the loop, instead of rebuilding the frame per file.
    result_df = pd.concat(dfs_list)
    result_df.to_csv('/Users/melissaherring/Google Drive/My Drive/MH_project/full_summary.csv')
else:
    # Define result_df explicitly so a stale value from an earlier run is never reused.
    result_df = pd.DataFrame()
    print("No usable CSV files matched {!r}; full_summary.csv was not written".format(csv_pattern))

In [34]:
# summarize by annotation
# Count how many rows carry each distinct annotation string, then label every
# count with the identifier taken from the first column of the first row.
annotation_counts = example['annotation'].value_counts().to_frame()
annots = annotation_counts.assign(ID=example.iloc[0, 0])
annots

Unnamed: 0,annotation,ID
Putative 2OG-Fe(II) oxygenase,2,jv-119-vMAG_31
2OG-Fe(II) oxygenase superfamily,1,jv-119-vMAG_31
"Bacteriophage T4, Gp8",1,jv-119-vMAG_31
Bacteriophage T4 gp9/10-like protein,1,jv-119-vMAG_31
hypothetical protein,1,jv-119-vMAG_31
Baseplate J-like protein,1,jv-119-vMAG_31
Baseplate wedge protein gp25,1,jv-119-vMAG_31
PAAR motif,1,jv-119-vMAG_31
Gp5 N-terminal OB domain,1,jv-119-vMAG_31
Phosphotransferase enzyme family;Choline/ethanolamine kinase,1,jv-119-vMAG_31


In [None]:
# TODO: make an annotation category?