## dramv_sum_funs_dev

This notebook creates and tests summarizing functions for the dramv annotations outputs from 13_clean_dramv_annot_script.

# Load packages and data

In [5]:
import pandas as pd
import math
import glob
from collections import defaultdict
import os # these two packages are good for searching and navigating file systems
import os.path as op

pd.set_option('display.max_columns', None)

#example = pd.read_csv('/Users/melissaherring/Google Drive/My Drive/MH_project/dramv_trim/jv-119-vMAG_31.csv')

In [50]:
example.head()

Unnamed: 0,fasta,scaffold,start_position,end_position,annotation,annotation_source,amg_flags,V,M,A,P,E,K,T,F,B
0,jv-119-vMAG_31,jv-119-k141_3927498,3,527,Phosphotransferase enzyme family;Choline/ethan...,pfam_hits,F,0,0,0,0,0,0,0,1,0
1,jv-119-vMAG_31,jv-119-k141_3927498,506,1273,,,F,0,0,0,0,0,0,0,1,0
2,jv-119-vMAG_31,jv-119-k141_3927498,1266,1883,,,F,0,0,0,0,0,0,0,1,0
3,jv-119-vMAG_31,jv-119-k141_3927498,1897,2385,Acetyltransferase (GNAT) family,pfam_hits,F,0,0,0,0,0,0,0,1,0
4,jv-119-vMAG_31,jv-119-k141_3927498,2397,3218,Neck protein gp13,vogdb_hits,VF,1,0,0,0,0,0,0,1,0


In [6]:
# create a data frame that summarizes by annotation source

source_df = pd.DataFrame(example['annotation_source'].value_counts().reset_index())
source_df['ID'] = example.iloc[0, 0]
source_df = source_df.pivot(index='ID', columns='index', values='annotation_source')
source_df

Unnamed: 0,index,annotation_source,ID
0,pfam_hits,16,jv-119-vMAG_31
1,vogdb_hits,3,jv-119-vMAG_31
2,kegg_hit,1,jv-119-vMAG_31


In [29]:
for col in source_df:
    if 'kegg_hit' not in source_df.columns:
        source_df.insert(loc=len(source_df.columns), column='kegg_hit', value=0)
    if 'viral_hit' not in source_df.columns:
        source_df.insert(loc=len(source_df.columns), column='viral_hit', value=0)
    if 'pfam_hits' not in source_df.columns:
        source_df.insert(loc=len(source_df.columns), column='pfam_hits', value=0)
    if 'vogdb_hits' not in source_df.columns:
        source_df.insert(loc=len(source_df.columns), column='vogdb_hits', value=0)

source_df.rename(columns={'kegg_hit':'kegg_count','pfam_hits': 'pfam_count','vogdb_hits':'vogdb_count'}, inplace=True)
source_df['total_genes_annot'] = source_df['kegg_count'] + source_df['viral_hit'] + source_df['pfam_count'] + source_df['vogdb_count']
source_df

index,kegg_count,pfam_count,vogdb_count,viral_hit,total_genes_annot
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
jv-119-vMAG_31,1,16,3,0,20


In [30]:
source_df['V_count'] = len(example[example['V'] == 1])
source_df['M_count'] = len(example[example['M'] == 1])
source_df['A_count'] = len(example[example['A'] == 1])
source_df['P_count'] = len(example[example['P'] == 1])
source_df['E_count'] = len(example[example['E'] == 1])
source_df['K_count'] = len(example[example['K'] == 1])
source_df['T_count'] = len(example[example['T'] == 1])
source_df['F_count'] = len(example[example['F'] == 1])
source_df['B_count'] = len(example[example['B'] == 1])

source_df

index,kegg_count,pfam_count,vogdb_count,viral_hit,total_genes_annot,V_count,M_count,A_count,P_count,E_count,K_count,T_count,F_count,B_count
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
jv-119-vMAG_31,1,16,3,0,20,7,4,0,0,0,4,0,22,3


In [46]:
# for loop

csv_pattern = "/Users/melissaherring/Google Drive/My Drive/MH_project/dramv_trim/*.csv"
csv_file_paths = glob.glob(csv_pattern)

dfs_list = []

for file in csv_file_paths:
    df = pd.read_csv(file)
    df_name = df.iloc[0, 0]
    df_count = pd.DataFrame(df['annotation_source'].value_counts().reset_index())
    df_count['ID'] = df_name
    
    df_piv = df_count.pivot(index='ID', columns='index', values='annotation_source')

    for col in df_piv:
        if 'kegg_hit' not in df_piv.columns:
            df_piv.insert(loc=len(df_piv.columns), column='kegg_hit', value=0)
        if 'viral_hit' not in df_piv.columns:
            df_piv.insert(loc=len(df_piv.columns), column='viral_hit', value=0)
        if 'pfam_hits' not in df_piv.columns:
            df_piv.insert(loc=len(df_piv.columns), column='pfam_hits', value=0)
        if 'vogdb_hits' not in df_piv.columns:
            df_piv.insert(loc=len(df_piv.columns), column='vogdb_hits', value=0)
            
    df_piv.rename(columns={'kegg_hit':'kegg_count','pfam_hits': 'pfam_count','vogdb_hits':'vogdb_count'}, inplace=True)
    df_piv['total_genes_annot'] = df_piv['kegg_count'] + df_piv['viral_hit'] + df_piv['pfam_count'] + df_piv['vogdb_count']
    
    df_piv['V_count'] = len(df[df['V'] == 1])
    df_piv['M_count'] = len(df[df['M'] == 1])
    df_piv['A_count'] = len(df[df['A'] == 1])
    df_piv['P_count'] = len(df[df['P'] == 1])
    df_piv['E_count'] = len(df[df['E'] == 1])
    df_piv['K_count'] = len(df[df['K'] == 1])
    df_piv['T_count'] = len(df[df['T'] == 1])
    df_piv['F_count'] = len(df[df['F'] == 1])
    df_piv['B_count'] = len(df[df['B'] == 1])
    
    dfs_list.append(df_piv)
    

    
result_df = pd.concat(dfs_list)
result_df.to_csv('/Users/melissaherring/Google Drive/My Drive/MH_project/full_summary.csv')

# add columns for: host phylogeny, sample, depth, sag/mag

In [6]:
# for loop

csv_pattern = "/Users/juliabrown/Google Drive/My Drive/projects/OMZvir_round2/MH_project/dramv_trim/*.csv"

def create_summary_row(file):
    df = pd.read_csv(file)
    df_name = df.iloc[0, 0]
    df_count = pd.DataFrame(df['annotation_source'].value_counts().reset_index())
    df_count['ID'] = df_name

    df_piv = df_count.pivot(index='ID', columns='index', values='annotation_source')

    for col in df_piv:
        if 'kegg_hit' not in df_piv.columns:
            df_piv.insert(loc=len(df_piv.columns), column='kegg_hit', value=0)
        if 'viral_hit' not in df_piv.columns:
            df_piv.insert(loc=len(df_piv.columns), column='viral_hit', value=0)
        if 'pfam_hits' not in df_piv.columns:
            df_piv.insert(loc=len(df_piv.columns), column='pfam_hits', value=0)
        if 'vogdb_hits' not in df_piv.columns:
            df_piv.insert(loc=len(df_piv.columns), column='vogdb_hits', value=0)

    df_piv.rename(columns={'kegg_hit':'kegg_count','pfam_hits': 'pfam_count','vogdb_hits':'vogdb_count'}, inplace=True)
    df_piv['total_genes_annot'] = df_piv['kegg_count'] + df_piv['viral_hit'] + df_piv['pfam_count'] + df_piv['vogdb_count']

    df_piv['V_count'] = len(df[df['V'] == 1])
    df_piv['M_count'] = len(df[df['M'] == 1])
    df_piv['A_count'] = len(df[df['A'] == 1])
    df_piv['P_count'] = len(df[df['P'] == 1])
    df_piv['E_count'] = len(df[df['E'] == 1])
    df_piv['K_count'] = len(df[df['K'] == 1])
    df_piv['T_count'] = len(df[df['T'] == 1])
    df_piv['F_count'] = len(df[df['F'] == 1])
    df_piv['B_count'] = len(df[df['B'] == 1])
    return df_piv


def summarize_files(csv_pattern):
    csv_file_paths = glob.glob(csv_pattern)

    dfs_list = []

    for file in csv_file_paths:
        df_piv = create_summary_row(file)
        dfs_list.append(df_piv)

    result_df = pd.concat(dfs_list)
    return result_df
    
    #result_df.to_csv('/Users/melissaherring/Google Drive/My Drive/MH_project/full_summary.csv')

    # add columns for: host phylogeny, sample, depth, sag/mag


In [7]:
summarize_files(csv_pattern)

index,pfam_count,kegg_count,viral_hit,vogdb_count,total_genes_annot,V_count,M_count,A_count,P_count,E_count,K_count,T_count,F_count,B_count
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
jv-132-vMAG_16,3,0,0,0,3,1,0,0,0,0,0,0,39,0
jv-132-vMAG_3,4,0,0,0,4,2,0,0,0,0,0,0,13,0
cv1_AM-664-O20,7,0,0,0,7,2,0,0,1,0,0,0,34,0
cv1_AM-680-O21,5,4,0,0,9,1,3,0,0,0,2,0,15,3
cv1_AM-660-G21,10,4,0,3,17,4,1,0,0,0,1,0,29,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
jv-119-vMAG_36,3,0,0,4,7,1,0,0,0,0,0,0,16,0
jv-119-vMAG_22,4,0,0,4,8,3,0,0,0,0,0,0,23,0
cv1_AM-678-M21,50,17,0,36,103,18,10,0,2,3,8,0,213,0
jv-132-vMAG_33,7,3,0,4,14,2,0,0,0,0,0,0,49,0


In [51]:
# summarize by annotation
m_flag = example[example['M'] != 0]
annots = pd.DataFrame(m_flag['annotation'].value_counts())
annots['ID'] = example.iloc[0, 0]
annots
m_flag

Unnamed: 0,fasta,scaffold,start_position,end_position,annotation,annotation_source,amg_flags,V,M,A,P,E,K,T,F,B
13,jv-119-vMAG_31,jv-119-k141_5494923,1015,2247,Asparagine synthase;Glutamine amidotransferase...,pfam_hits,MKF,0,1,0,0,0,1,0,1,0
18,jv-119-vMAG_31,jv-119-k141_5494923,4057,4872,Putative 2OG-Fe(II) oxygenase,pfam_hits,MKFB,0,1,0,0,0,1,0,1,1
19,jv-119-vMAG_31,jv-119-k141_5494923,4878,5453,2OG-Fe(II) oxygenase superfamily,pfam_hits,MKFB,0,1,0,0,0,1,0,1,1
20,jv-119-vMAG_31,jv-119-k141_5494923,5446,6105,Putative 2OG-Fe(II) oxygenase,pfam_hits,MKB,0,1,0,0,0,1,0,0,1


In [None]:
# for loop to subset each fasta by M flag and combine outputs together
# get unique list of annotions of M flag
# create a data frame with the each annotation as a row and count as a column- total; split total by sample and add a column for each fasta
# add columns for scaffold, start position, end position

* how many vSAGs and vMAGs have integrases?

* Counts of "phage integrases" per individual
    * table that has ['fasta','integrase_count', 'has_integrase']

* summarizing AMG by individual
    * table that has total AMGs/indivual
    * pull out KEGG ID from original kegg column/EC value from original pfam column and then in each case make summary table that counts by each different kegID/EC value 
    * indivdual == fasta
    * EC X.Y.Z.W