## dramv_summary_functions

This notebook creates and tests summarizing functions for the dramv annotations outputs from 13_clean_dramv_annot_script.

# Load packages and data

In [55]:
import pandas as pd
import math
import glob
import os # these two packages are good for searching and navigating file systems
import os.path as op

pd.set_option('display.max_columns', None)

example = pd.read_csv('/Users/melissaherring/Google Drive/My Drive/MH_project/dramv_trim/cv1_AM-654-B02.csv')

In [26]:
example.head()

Unnamed: 0,fasta,rank,kegg_hit,viral_hit,pfam_hits,vogdb_hits,annotation,annotation_source,amg_flags
0,cv1_AM-654-B02,E,,,,sp|P00970|DNLI_BPT4 DNA ligase; XhXr,DNA ligase,vogdb_hits,VF
1,cv1_AM-654-B02,E,,,,,,,F
2,cv1_AM-654-B02,E,,,,,,,F
3,cv1_AM-654-B02,E,,,,,,,F
4,cv1_AM-654-B02,E,,,,,,,F


In [37]:
# create a data frame that summarizes by annotation source

source_df = pd.DataFrame(example['annotation_source'].value_counts().reset_index())
source_df['ID'] = example.iloc[0, 0]
source_df = source_df.pivot(index='ID', columns='index', values='annotation_source')
source_df.rename(columns={'kegg_hit':'kegg_count','pfam_hits': 'pfam_count','vogdb_hits':'vogdb_count'}, inplace=True)
source_df

index,kegg_count,pfam_count,vogdb_count
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cv1_AM-654-B02,3,6,1


In [None]:
#example['amg_flags'].str.slice(0, 1)

width = 1  
num_parts = len(example['amg_flags'].iloc[0]) // width

# Initialize column names
columns = ['{}'.format(i) for i in range(1, num_parts + 1)]

# Loop through the fixed width and create new columns
for i in range(num_parts):
    start = i * width
    end = start + width
    example[columns[i]] = example['amg_flags'].str.slice(start, end)

example

In [70]:
# create columns for all amg flags
# if the column name is in amg_flags column, print column name

# V, M, A, P, E, K, T, F, B

new_column_names = ['V', 'M', 'A', 'P', 'E', 'K', 'T', 'F', 'B']

example.assign(**{name: None for name in new_column_names})

for col in example[new_column_names]:
    
    if col in example['amg_flags']:
        example[col] = print(col)
    else: example['col'] = None

example


Unnamed: 0,fasta,rank,kegg_hit,viral_hit,pfam_hits,vogdb_hits,annotation,annotation_source,amg_flags,V,M,A,P,E,K,T,F,B,col
0,cv1_AM-654-B02,E,,,,sp|P00970|DNLI_BPT4 DNA ligase; XhXr,DNA ligase,vogdb_hits,VF,,,,,,,,,,
1,cv1_AM-654-B02,E,,,,,,,F,,,,,,,,,,
2,cv1_AM-654-B02,E,,,,,,,F,,,,,,,,,,
3,cv1_AM-654-B02,E,,,,,,,F,,,,,,,,,,
4,cv1_AM-654-B02,E,,,,,,,F,,,,,,,,,,
5,cv1_AM-654-B02,E,,,,,,,F,,,,,,,,,,
6,cv1_AM-654-B02,E,,,,,,,F,,,,,,,,,,
7,cv1_AM-654-B02,E,,,,,,,F,,,,,,,,,,
8,cv1_AM-654-B02,E,,,,,,,F,,,,,,,,,,
9,cv1_AM-654-B02,D,,YP_010772222.1 MAG: hypothetical protein QIT37...,AAA domain (dynein-related subfamily) [PF07728...,sp|P04526|LOADL_BPT4 Sliding-clamp-loader larg...,AAA domain (dynein-related subfamily),pfam_hits,VF,,,,,,,,,,


In [39]:
# summarize by amg_flags

flags_df = pd.DataFrame(example['amg_flags'].value_counts().reset_index())
flags_df['ID'] = example.iloc[0,0]
flags_df = flags_df.pivot(index='ID', columns='index', values='amg_flags')
flags_df.rename(columns={'F':'F_amg_flag_count','PF':'PF_amg_flag_count','VF':'VF_amg_flag_count'}, inplace=True)
flags_df

index,F_amg_flag_count,PF_amg_flag_count,VF_amg_flag_count
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cv1_AM-654-B02,23,1,3


In [40]:
combo = pd.concat([source_df,flags_df])
combo

index,kegg_count,pfam_count,vogdb_count,F_amg_flag_count,PF_amg_flag_count,VF_amg_flag_count
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
cv1_AM-654-B02,3.0,6.0,1.0,,,
cv1_AM-654-B02,,,,23.0,1.0,3.0


In [None]:
# summarize by annotation

In [None]:
# make annotation category?