## dramv_sum_funs_dev

This notebook creates and tests summarizing functions for the dramv annotations outputs from 13_clean_dramv_annot_script.

# Load packages and data

In [1]:
import pandas as pd
import math
import glob
from collections import defaultdict
import os # these two packages are good for searching and navigating file systems
import os.path as op

pd.set_option('display.max_columns', None)

# function to split classification and get rid of _
def split_classification(df):
    df[['domain', 'phyla', 'class', 'order', 'family', 'genus', 'species']] = df.classification.str.split(';', expand=True)
    df['domain'] = df['domain'].str.replace('d__', '') # remove the d__ in front of all observations
    df['phyla'] = df['phyla'].str.replace('p__', '')
    df['class'] = df['class'].str.replace('c__', '')
    df['order'] = df['order'].str.replace('o__', '')
    df['family'] = df['family'].str.replace('f__', '')
    df['genus'] = df['genus'].str.replace('g__', '')
    df['species'] = df['species'].str.replace('s__', '')
    return df

example = pd.read_csv('/Users/melissaherring/Google Drive/My Drive/MH_project/dramv_trim/jv-119-vMAG_31.csv')

cols_to_keep = ['vir_id','depth', 'domain', 'phyla', 'class', 'order', 'family', 'genus', 'species']

sag_tax = pd.read_csv('~/Documents/Bigelow/Virus_Project/OMZ_MH_Analysis/Data/sag_data/MPvsag_info_230818.csv')
sag_tax.rename(columns={'classification_via_GTDBTk': 'classification'}, inplace=True) # rename classification column
sag_tax = split_classification(sag_tax)[cols_to_keep]

mag_tax = pd.read_csv('~/Documents/Bigelow/Virus_Project/OMZ_MH_Analysis/Data/proximeta_viral_files/vMAG_associations.csv')
mag_cols= ['virus_name', 'sample_name', 'sample_depth','domain', 'phyla', 'class', 'order', 'family', 'genus', 'species',]
mag_tax = mag_tax[mag_cols]

In [None]:
example.head()[:20]

In [None]:
sag_tax.head()

In [None]:
# create a data frame that summarizes by annotation source

source_df = pd.DataFrame(example['annotation_source'].value_counts().reset_index())
source_df['ID'] = example.iloc[0, 0]
source_df = source_df.pivot(index='ID', columns='index', values='annotation_source')
source_df

In [None]:
def create_db_cols(df):
    for col in df:
        if 'kegg_hit' not in df.columns:
            df.insert(loc=len(df.columns), column='kegg_hit', value=0)
        if 'viral_hit' not in df.columns:
            df.insert(loc=len(source_df.columns), column='viral_hit', value=0)
        if 'pfam_hits' not in df.columns:
            df.insert(loc=len(source_df.columns), column='pfam_hits', value=0)
        if 'vogdb_hits' not in df.columns:
            df.insert(loc=len(source_df.columns), column='vogdb_hits', value=0)
    return df

In [None]:
create_db_cols(source_df)

In [None]:
source_df.rename(columns={'kegg_hit':'kegg_count','pfam_hits': 'pfam_count','vogdb_hits':'vogdb_count'}, inplace=True)
source_df['total_genes_annot'] = source_df['kegg_count'] + source_df['viral_hit'] + source_df['pfam_count'] + source_df['vogdb_count']
source_df

In [None]:
source_df['V_count'] = len(example[example['V'] == 1])
source_df['M_count'] = len(example[example['M'] == 1])
source_df['A_count'] = len(example[example['A'] == 1])
source_df['P_count'] = len(example[example['P'] == 1])
source_df['E_count'] = len(example[example['E'] == 1])
source_df['K_count'] = len(example[example['K'] == 1])
source_df['T_count'] = len(example[example['T'] == 1])
source_df['F_count'] = len(example[example['F'] == 1])
source_df['B_count'] = len(example[example['B'] == 1])

source_df

In [6]:
def create_db_cols(df):
    for col in df:
        if 'kegg_hit' not in df.columns:
            df.insert(loc=len(df.columns), column='kegg_hit', value=0)
        if 'viral_hit' not in df.columns:
            df.insert(loc=len(df.columns), column='viral_hit', value=0)
        if 'pfam_hits' not in df.columns:
            df.insert(loc=len(df.columns), column='pfam_hits', value=0)
        if 'vogdb_hits' not in df.columns:
            df.insert(loc=len(df.columns), column='vogdb_hits', value=0)
    return df

# function for creating counts summary 

def create_count_row(file):
    df = pd.read_csv(file)
    df_name = df.iloc[0, 0]
    df_count = pd.DataFrame(df['annotation_source'].value_counts().reset_index())
    df_count['ID'] = df_name

    df_piv = df_count.pivot(index='ID', columns='index', values='annotation_source')
    df_piv = create_db_cols(df_piv)
    df_piv.rename(columns={'kegg_hit':'kegg_count','pfam_hits': 'pfam_count','vogdb_hits':'vogdb_count'}, inplace=True)
    df_piv['total_genes_annot'] = df_piv['kegg_count'] + df_piv['viral_hit'] + df_piv['pfam_count'] + df_piv['vogdb_count']

    df_piv['V_count'] = len(df[df['V'] == 1])
    df_piv['M_count'] = len(df[df['M'] == 1])
    df_piv['A_count'] = len(df[df['A'] == 1])
    df_piv['P_count'] = len(df[df['P'] == 1])
    df_piv['E_count'] = len(df[df['E'] == 1])
    df_piv['K_count'] = len(df[df['K'] == 1])
    df_piv['T_count'] = len(df[df['T'] == 1])
    df_piv['F_count'] = len(df[df['F'] == 1])
    df_piv['B_count'] = len(df[df['B'] == 1])
    
    return df_piv


def count_concat(csv_pattern):
    csv_file_paths = glob.glob(csv_pattern)

    dfs_list = []

    for file in csv_file_paths:
        df_piv = create_count_row(file)
        dfs_list.append(df_piv)

    result_df = pd.concat(dfs_list)
    return result_df
    



In [5]:
csv_pattern = "/Users/melissaherring/Google Drive/My Drive/MH_project/dramv_trim/*.csv"
result = count_concat(csv_pattern)
result.head()
result.to_csv('/Users/melissaherring/Google Drive/My Drive/MH_project/dramv_full_summary.csv')

In [19]:
# def for creating data_type column
def assign_data_type(row):
    if 'jv-' in row.name:
        return 'vMAG'
    else:
        return 'vSAG'

In [20]:
result['data_type'] = result.apply(assign_data_type, axis=1)
result.head()

index,pfam_count,kegg_count,viral_hit,vogdb_count,total_genes_annot,V_count,M_count,A_count,P_count,E_count,K_count,T_count,F_count,B_count,data_type
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
jv-132-vMAG_16,3,0,0,0,3,1,0,0,0,0,0,0,39,0,vMAG
jv-132-vMAG_3,4,0,0,0,4,2,0,0,0,0,0,0,13,0,vMAG
cv1_AM-664-O20,7,0,0,0,7,2,0,0,1,0,0,0,34,0,vSAG
cv1_AM-680-O21,5,4,0,0,9,1,3,0,0,0,2,0,15,3,vSAG
cv1_AM-660-G21,10,4,0,3,17,4,1,0,0,0,1,0,29,0,vSAG


In [None]:
sag_tax.head()

In [None]:
mag_tax.head()

In [23]:
# function for adding metadata columns

# if data_type = vSAG, use sag lookup table
# if data_type = vMAG, use mag lookup table

new_column = []

for index, row in df.iterrows():
    key = row['Key']
    lookup_result = lookup_table[lookup_table['Key'] == key]['Value2'].values
    if len(lookup_result) > 0:
        new_column.append(lookup_result[0])
    else:
        new_column.append('Key not found')

df['New_Column'] = new_column

In [23]:
result = pd.read_csv('/Users/melissaherring/Google Drive/My Drive/MH_project/dramv_full_summary.csv')

sag_cols = ['vir_id','depth', 'classification_via_GTDBTk']
sag_tax = pd.read_csv('~/Documents/Bigelow/Virus_Project/OMZ_MH_Analysis/Data/sag_data/MPvsag_info_230818.csv')[sag_cols]
sag_tax.rename(columns={'classification_via_GTDBTk': 'classification','depth':'sample_depth'}, inplace=True)
sag_tax = split_classification(sag_tax)
sag_tax['virus_name'] = sag_tax['vir_id'].str.replace('vir', 'cv1')

mag_tax = pd.read_csv('~/Documents/Bigelow/Virus_Project/OMZ_MH_Analysis/Data/proximeta_viral_files/vMAG_associations.csv')
mag_cols= ['virus_name', 'sample_depth','domain', 'phyla', 'class', 'order', 'family', 'genus', 'species',]
mag_tax = mag_tax[mag_cols]

result[['virus_name','

for row in result:
    if result['data_type'] == 'vSAG':
        result.merge(sag_tax, on=['ID','virus_name'], how='left')
    else: result.merge(mag_tax, on=['ID','vir_id'], how = 'left')

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [21]:
metadata_cols(result)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [14]:
mag_tax.head()

Unnamed: 0,virus_name,sample_depth,domain,phyla,class,order,family,genus,species
0,vMAG_32,400,Bacteria,Pseudomonadota,Gammaproteobacteria,Pseudomonadales,HTCC2089,,
1,vMAG_32,400,Bacteria,Pseudomonadota,Gammaproteobacteria,Pseudomonadales,HTCC2089,,
2,vMAG_32,400,Bacteria,Pseudomonadota,Gammaproteobacteria,Pseudomonadales,HTCC2089,,
3,vMAG_32,400,Bacteria,Pseudomonadota,Gammaproteobacteria,Pseudomonadales,HTCC2089,,
4,vMAG_1,400,Unclassified Bacteria,,,,,,


In [13]:
sag_tax.head()

Unnamed: 0,vir_id,depth,classification,domain,phyla,class,order,family,genus,species,virus_name
0,vir_AM-654-B02,80,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,Bacteria,Proteobacteria,Gammaproteobacteria,Arenicellales,UBA5680,UBA5680,UBA5680 sp002420425,cv1_AM-654-B02
1,vir_AM-654-B17,80,d__Bacteria;p__Proteobacteria;c__Alphaproteoba...,Bacteria,Proteobacteria,Alphaproteobacteria,Pelagibacterales,Pelagibacteraceae,Pelagibacter,,cv1_AM-654-B17
2,vir_AM-654-B06,80,d__Bacteria;p__Proteobacteria;c__Alphaproteoba...,Bacteria,Proteobacteria,Alphaproteobacteria,Pelagibacterales,Pelagibacteraceae,GCA-2704625,GCA-2704625 sp017640245,cv1_AM-654-B06
3,vir_AM-654-C02,80,d__Bacteria;p__Proteobacteria;c__Alphaproteoba...,Bacteria,Proteobacteria,Alphaproteobacteria,Pelagibacterales,Pelagibacteraceae,Pelagibacter,,cv1_AM-654-C02
4,vir_AM-654-B04,80,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__...,Bacteria,Bacteroidota,Bacteroidia,Flavobacteriales,BACL11,DUAL01,,cv1_AM-654-B04


In [None]:
# for loop to subset each fasta by M flag and combine outputs together
# get unique list of annotions of M flag
# create a data frame with the each annotation as a row and count as a column- total; split total by sample and add a column for each fasta
# add columns for scaffold, start position, end position

* how many vSAGs and vMAGs have integrases?

* Counts of "phage integrases" per individual
    * table that has ['fasta','integrase_count', 'has_integrase']

* summarizing AMG by individual
    * table that has total AMGs/indivual
    * pull out KEGG ID from original kegg column/EC value from original pfam column and then in each case make summary table that counts by each different kegID/EC value 
    * indivdual == fasta
    * EC X.Y.Z.W

In [10]:
# Plot of average num of annots per genome

summary = pd.read_csv('/Users/melissaherring/Google Drive/My Drive/MH_project/dramv_full_summary.csv')
summary.head()

Unnamed: 0,ID,pfam_count,kegg_count,viral_hit,vogdb_count,total_genes_annot,V_count,M_count,A_count,P_count,E_count,K_count,T_count,F_count,B_count
0,jv-132-vMAG_16,3,0,0,0,3,1,0,0,0,0,0,0,39,0
1,jv-132-vMAG_3,4,0,0,0,4,2,0,0,0,0,0,0,13,0
2,cv1_AM-664-O20,7,0,0,0,7,2,0,0,1,0,0,0,34,0
3,cv1_AM-680-O21,5,4,0,0,9,1,3,0,0,0,2,0,15,3
4,cv1_AM-660-G21,10,4,0,3,17,4,1,0,0,0,1,0,29,0


In [None]:
sns.stripplot(x="species", y="sepal_length", data=data, jitter=True)