In [17]:
import glob
import os
import pandas as pd
import csv
from types import *
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pickle
import swifter
from os import path

## Function Definitions

In [1]:
#TODO: memory-greedy: the whole dictionary of profiles is kept in memory.
# With >100 mg it would choke on memory; better to process profiles one by one instead of keeping a dict.

In [2]:
def filter_cogs(groupped_cog_df, msk=None):
    """
    Keep only the COG hits that pass a quality mask.

    parameters:
        groupped_cog_df: dataframe of COG hits. The default mask reads its
            `full_e_val`, `alignment_length` and `percent_id` columns.
        msk: optional boolean mask (anything accepted by `df[msk]`). When None,
            a hit is kept if its e-value is below 1e-6, or if its alignment is
            longer than 100 and its percent identity is above 50.

    returns:
        The rows of groupped_cog_df selected by the mask.
    """
    # `is None` instead of truthiness: a pandas/numpy mask has no single truth value
    if msk is None:
        # Every comparison is parenthesised on its own because `&` and `|`
        # bind tighter than `<` and `>`.
        msk = ((groupped_cog_df.full_e_val < 1e-6) |
               ((groupped_cog_df.alignment_length > 100) &
                (groupped_cog_df.percent_id > 50)))
    return groupped_cog_df[msk]

In [3]:
def extract_raw_cog_hits_gff(cog_df_outfile):
    """
    Flatten a COG hit table read from a GFF file into a typed dataframe.

    Assumptions:
    only one COG of same type per gene: by bitscore, eval(reversed), alen

    parameters:
        cog_df_outfile: dataframe of raw rows, addressed by position:
            column 0 is used as the gene id, column 2 as the COG id and
            column 8 holds a `key=value;key=value` attribute string. The
            attributes must contain `ID`, `fake_percent_id`,
            `alignment_length`, `full_sequence_e-value` and
            `full_sequence_bitscore`.

    returns:
        dataframe with columns gene_id, cog_id, hit_id, percent_id,
        alignment_length, full_e_val, bitscore (one row per input row);
        the four numeric columns are cast to float64.
    """
    data_col = cog_df_outfile.iloc[:, 8]
    cog_col = cog_df_outfile.iloc[:, 2]
    gene_id_col = cog_df_outfile.iloc[:, 0]

    # output column -> attribute key it is read from
    attribute_sources = (('hit_id', 'ID'),
                         ('percent_id', 'fake_percent_id'),
                         ('alignment_length', 'alignment_length'),
                         ('full_e_val', 'full_sequence_e-value'),
                         ('bitscore', 'full_sequence_bitscore'))

    columns = {'gene_id': [], 'cog_id': [], 'hit_id': [], 'percent_id': [],
               'alignment_length': [], 'full_e_val': [], 'bitscore': []}

    for cog_id, gene_id, str_data in zip(cog_col, gene_id_col, data_col):
        dict_data = {}  # one attribute string broken down as a dictionary
        for entry in str_data.split(';'):
            if not entry.strip():
                # a trailing ';' leaves an empty entry behind; nothing to parse
                continue
            # split on the first '=' only, so values may themselves contain '='
            k, v = entry.split("=", 1)
            dict_data[k] = v
        columns['gene_id'].append(gene_id)
        columns['cog_id'].append(cog_id)
        for out_name, attr_key in attribute_sources:
            columns[out_name].append(dict_data[attr_key])

    counts_df = pd.DataFrame(columns)

    # attribute values arrive as strings; convert the numeric ones
    counts_df = counts_df.astype({'percent_id': np.float64,
                                  'alignment_length': np.float64,
                                  'full_e_val': np.float64,
                                  'bitscore': np.float64})

    return counts_df

In [4]:
def extract_raw_cog_hits_gen(cog_df_outfile):
    """
    Flatten a tabular COG hit file into the common hit dataframe layout.

    Expected input layout, addressed by column position:
<taxon_oid>.<au>.cog.txt (from NCBI RPSBLAST)
0 gene_id - Gene object identifier of query gene
1 cog_id - COG identifier
2 percent_identity - Percent identity of aligned amino acid residues
3 align_length - Alignment length
4 query_start - Start coordinate of alignment on query gene
5 query_end - End coordinate of alignment on query gene
6 subj_start - Start coordinate of alignment on subject sequence
7 subj_end - End coordinate of alignment on subject sequence
8 evalue - Expectation value
9 bit_score - Bit score of alignment

    returns:
        dataframe with columns gene_id, cog_id, hit_id, percent_id,
        alignment_length, full_e_val, bitscore. `hit_id` is the row's index
        label in the input; the four numeric columns are cast to float64.
    """
    by_position = cog_df_outfile.iloc

    # Pull the positional columns first, in the same order as before, so a
    # too-narrow input fails on the same column.
    gene_ids = by_position[:, 0]
    cog_ids = by_position[:, 1]
    identities = by_position[:, 2]
    lengths = by_position[:, 3]
    e_values = by_position[:, 8]
    bitscores = by_position[:, 9]

    hits = pd.DataFrame({
        'gene_id': list(gene_ids),
        'cog_id': list(cog_ids),
        'hit_id': list(cog_df_outfile.index),
        'percent_id': list(identities),
        'alignment_length': list(lengths),
        'full_e_val': list(e_values),
        'bitscore': list(bitscores),
    })

    numeric_columns = ('percent_id', 'alignment_length', 'full_e_val', 'bitscore')
    return hits.astype({name: np.float64 for name in numeric_columns})

<taxon_oid>.<au>.cog.txt (from NCBI RPSBLAST)
0 gene_id - Gene object identifier of query gene
1 cog_id - COG identifier
2 percent_identity - Percent identity of aligned amino acid residues
3 align_length - Alignment length
4 query_start - Start coordinate of alignment on query gene
5 query_end - End coordinate of alignment on query gene
6 subj_start - Start coordinate of alignment on subject sequence
7 subj_end - End coordinate of alignment on subject sequence
8 evalue - Expectation value
9 bit_score - Bit score of alignment

In [5]:
def process_raw_cogs(raw_cogs_df):
    """
    Collapse multiple hits of the same COG on the same gene into one row.

    Each (gene_id, cog_id) group is passed to `my_agg`, which combines the
    hits of the group into a single set of values. This variant is slow on
    large tables; `process_raw_cogs_fast` keeps only the best hit instead.

    parameters:
        raw_cogs_df: dataframe of raw hits with `gene_id` and `cog_id`
            columns plus whatever columns `my_agg` reads.

    returns:
        The result of the grouped `apply`, with the index reset.
    """
    # No pre-sorting here: the aggregation does not depend on row order, and
    # the sorted copy that used to be built was never used.
    gr_gene_cog = raw_cogs_df.groupby(['gene_id', 'cog_id'], as_index=False).apply(my_agg).reset_index()
    return gr_gene_cog

In [6]:
def process_raw_cogs_fast(raw_cogs_df):
    """
    Keep a single, best hit for every (gene_id, cog_id) pair.

    Hits are ranked within each pair by highest bitscore, then lowest
    e-value, then longest alignment, then highest percent identity; the
    first value of each column in that order is kept.

    parameters:
        raw_cogs_df: dataframe with gene_id, cog_id, bitscore, full_e_val,
            alignment_length and percent_id columns.

    returns:
        dataframe with columns gene_id, cog_id, bitscore, full_e_val,
        percent_id, alignment_length and one row per pair.
    """
    # (column, ascending?) pairs, most significant key first
    ranking = (('gene_id', True),
               ('cog_id', True),
               ('bitscore', False),
               ('full_e_val', True),
               ('alignment_length', False),
               ('percent_id', False))
    ranked_hits = raw_cogs_df.sort_values([column for column, _ in ranking],
                                          ascending=[direction for _, direction in ranking])

    # after ranking, the first row of each group is the best hit
    kept_columns = ('bitscore', 'full_e_val', 'percent_id', 'alignment_length')
    take_first = {column: 'first' for column in kept_columns}
    per_pair = ranked_hits.groupby(['gene_id', 'cog_id'], as_index=False)
    return per_pair.agg(take_first)

In [7]:
def my_agg(x):
    """
    Combine several hits into one row of summary values.

    percent_id, bitscore and full_e_val become averages weighted by
    alignment_length; alignment_length becomes the sum over the hits.

    parameters:
        x: dataframe (one group of hits) with percent_id, bitscore,
            full_e_val and alignment_length columns.

    returns:
        pd.Series indexed by percent_id, bitscore, full_e_val,
        alignment_length.
    """
    weights = x['alignment_length']
    total_length = weights.sum()

    def _length_weighted_mean(column):
        # sum(value * length) / sum(length)
        return (x[column] * weights).sum() / total_length

    combined = {column: _length_weighted_mean(column)
                for column in ('percent_id', 'bitscore', 'full_e_val')}
    combined['alignment_length'] = total_length
    return pd.Series(combined, index=['percent_id', 'bitscore', 'full_e_val', 'alignment_length'])

### functional_annotation specific

In [8]:
def summarize_by_function(unstacked_fn_df, function_metadata_df=None, agg_col='Relative_count_percent'):
    """
    Sum a value column per function and drop functions whose total is zero.

    parameters:
        unstacked_fn_df: dataframe with a `Function` column and the column
            named by agg_col.
        function_metadata_df: optional dataframe indexed by function id. When
            given, it is left-joined with the per-function sums, so functions
            absent from unstacked_fn_df get a total of 0 before filtering.
        agg_col: name of the column to sum; 'Count' totals are cast to int.

    returns:
        dataframe of per-function totals, keeping only the rows whose last
        column is non-zero.
    """
    summary_df = unstacked_fn_df.groupby('Function').aggregate({agg_col: 'sum'})
    # `is not None` instead of truthiness: a DataFrame has no single truth value
    if function_metadata_df is not None:
        merged_summary = (function_metadata_df
                          .merge(summary_df, how='left', left_index=True, right_on='Function')
                          .reset_index(drop=True)
                          .fillna(0))
    else:
        merged_summary = summary_df
    if agg_col == 'Count':
        merged_summary['Count'] = merged_summary['Count'].astype(int)
    # the aggregated column is the last one in both branches
    return merged_summary[merged_summary.iloc[:, -1] != 0]

In [9]:
def extract_func_counts(annot_df, func=['cog']):
    """
    Extract absolute function counts from JGI functional annotation file
    
    parameters:
        annot_df: dataframe of raw functional annotation. Column 0 is used as
            the contig id and column 8 holds a `key=value;key=value`
            attribute string.
        func=['cog', 'pfam', 'tigrfam']: JGI keyword for a function annotation
        
    returns:
        counts_df_dct: first key is a function according to a func list. Values are dataframes 
        with unstacked function information (columns gene_id, Function,
        contig_id, Count; one row per gene/function id pair, Count always 1).
    """
    unstacked_dct = {fn: {'gene_id': [], 'Function': [], 'contig_id': [], 'Count': []}
                     for fn in func}

    data_col = annot_df.iloc[:, 8]
    contig_col = annot_df.iloc[:, 0]

    for str_data, contig_id in zip(data_col, contig_col):
        dict_data = {}  # one string entry broken down as dictionary
        for entry in str_data.split(';'):
            if not entry.strip():
                # a trailing ';' leaves an empty entry behind; nothing to parse
                continue
            # split on the first '=' only, so values may themselves contain '='
            k, v = entry.split("=", 1)
            dict_data[k] = v  # values may still be stacked
        for fn in func:
            if fn in dict_data:
                # one gene can carry several comma-separated ids of a function type
                for fn_id in dict_data[fn].split(','):
                    unstacked_dct[fn]['contig_id'].append(contig_id)
                    unstacked_dct[fn]['gene_id'].append(dict_data['ID'])
                    unstacked_dct[fn]['Function'].append(fn_id)
                    unstacked_dct[fn]['Count'].append(1)

    counts_df_dct = {fn: pd.DataFrame(unstacked_dct[fn]) for fn in func}

    return counts_df_dct

In [10]:
def find_data(path, parse_folders=True):
    """
    Locate input files for a glob pattern.

    parameters:
        path: glob pattern handed to the selected finder.
        parse_folders: when true, use `find_data_in_folders`; otherwise use
            `find_data_plain`.

    returns:
        Whatever the selected finder returns.
    """
    finder = find_data_in_folders if parse_folders else find_data_plain
    return finder(path)

In [11]:
def find_data_in_folders(path_to_data_folders):
    """
    Find the COG file and the functional annotation file of every data folder.

    parameters:
        path_to_data_folders: glob pattern matching the data folders.

    returns:
        dict keyed by folder path; each value is a dict with the keys
        'cog_file' (the single entry matching `*cog*` in the folder) and
        'func_annot_file' (the single entry matching
        `*functional_annotation*`). Folders that do not contain exactly one
        of each are reported with a printed message and left out.
    """
    file_data_dct = {}

    for folder in glob.glob(path_to_data_folders):
        cog_file = glob.glob(folder + "/*cog*")
        func_annot_file = glob.glob(folder + "/*functional_annotation*")

        # exactly one match is required so the choice of file is unambiguous
        if len(cog_file) != 1:
            print("Inconsistent cog file in folder: " + folder)
            continue
        if len(func_annot_file) != 1:
            print("Inconsistent annotation file in folder: " + folder)
            continue

        file_data_dct[folder] = {'cog_file': cog_file[0],
                                 'func_annot_file': func_annot_file[0]}

    return file_data_dct

In [12]:
def find_data_plain(path):
    """
    Find COG files in a flat layout (no per-sample folders).

    parameters:
        path: general glob pattern; its matches are filtered afterwards to
            those whose lowercased path contains 'cog'.

    returns:
        dict with one entry per kept file. Keys are the pattern itself
        followed by "_" and a running counter; each value holds the file
        under 'cog_file' and an empty string under 'func_annot_file'.
    """
    candidates = glob.glob(path)
    cog_files = [candidate for candidate in candidates if 'cog' in candidate.lower()]

    file_data_dct = {}
    for position, cog_file in enumerate(cog_files):
        entry_key = path + "_" + str(position)
        file_data_dct[entry_key] = {'cog_file': cog_file, 'func_annot_file': ""}
    return file_data_dct

In [13]:
def export_function_profiles(fn_profiles_dct, 
                             out_folder_path="", 
                             export_type_lst=None,
                             suffix="", minimal_fname=False,
                            function_metadata_path=None):
    """
    Write profile tables to tab-separated files, one file per profile.

    parameters:
        fn_profiles_dct: dict of dicts, {profile id: {profile type: dataframe}}.
        out_folder_path: folder the files are written to.
        export_type_lst: profile types to export; None exports all of them.
        suffix: optional text appended (after "_") to the file name.
        minimal_fname: name files `<id>.tsv` instead of
            `<id>_<type><suffix>.tsv`. NOTE: with several exported types per
            id they all get the same name, so later ones overwrite earlier ones.
        function_metadata_path: optional tab-separated file with a `COG ID`
            column. When given, each table is left-joined with it and reduced
            to the columns `COG ID`, `COG Name`, `Gene Count`.

    returns:
        None; the output is the written files.
    """
    if suffix:
        suffix = "_" + suffix

    # Loaded on first use and then reused: the metadata is identical for every
    # exported profile, and nothing is read at all when nothing is exported.
    func_full = None

    for taxon_oid, profiles in fn_profiles_dct.items():
        if export_type_lst is None:
            profile_types = list(profiles.keys())
        else:
            profile_types = [i for i in profiles.keys() if i in export_type_lst]

        for profile_type in profile_types:
            if minimal_fname:
                fname = taxon_oid + ".tsv"
            else:
                fname = taxon_oid + "_" + profile_type + suffix + ".tsv"

            outfile_path = path.join(out_folder_path, fname)
            # in the summarized tables the gene_id column holds per-COG counts
            df = profiles[profile_type].rename(columns={'cog_id': 'COG ID', 'gene_id': "Gene Count"})
            if function_metadata_path:
                if func_full is None:
                    func_full = pd.read_csv(function_metadata_path, delimiter='\t')
                    # drop the unnamed columns that pandas creates for header-less fields
                    func_full = func_full.loc[:, ~func_full.columns.str.match('Unnamed')]
                    func_full = func_full.set_index("COG ID", drop=True)
                df = df.merge(func_full, how="left", left_on="COG ID", right_index=True)
                df = df[['COG ID', 'COG Name', 'Gene Count']]

            df.to_csv(outfile_path, sep='\t', index=False)
            
            

In [14]:
def retrieve_cog_tables(source_data_dct, to_filt_lst=[], parse_taxon_oid=True):
    """
    Read every COG file and build its raw, grouped and filtered hit tables.

    parameters:
        source_data_dct: dict whose values are dicts holding the file path
            under 'cog_file' (as returned by `find_data`).
        to_filt_lst: profile ids to skip.
        parse_taxon_oid: when true, the profile id is the text after the
            first "_" of the first "/"-separated path field containing
            "IMG_"; otherwise it is the file name up to its first "_" or ".".

    returns:
        dict keyed by profile id; each value is a dict with the dataframes
        'raw' (all hits), 'grouped' (best hit per gene/COG pair) and
        'filtered' (grouped hits that pass `filter_cogs`).

    raises:
        AssertionError: when no profile id can be derived for a file.
    """
    output_profiles_dct = {}

    for folder, file_dct in source_data_dct.items():
        cog_file = file_dct['cog_file']

        # Reset for every file: otherwise a path without an "IMG_" field would
        # silently reuse the id of the previous file and overwrite its tables.
        taxon_id = ""
        if parse_taxon_oid:
            for field in cog_file.split('/'):
                if "IMG_" in field:
                    taxon_id = field.split("_")[1]
                    break
        else:
            taxon_id = os.path.basename(cog_file)
            taxon_id = taxon_id.split("_")[0].split(".")[0]
        print(taxon_id)

        assert len(taxon_id) > 0, "No profile identifier detected"

        if taxon_id in to_filt_lst:
            continue

        raw_cog_data = pd.read_csv(cog_file, delimiter='\t', header=None)

        output_profiles_dct[taxon_id] = {}
        # the two supported layouts are told apart by "gff" in the file path
        if "gff" in cog_file.lower():
            output_profiles_dct[taxon_id]['raw'] = extract_raw_cog_hits_gff(raw_cog_data)
        else:
            output_profiles_dct[taxon_id]['raw'] = extract_raw_cog_hits_gen(raw_cog_data)

        # slightly more aggressive but way faster than process_raw_cogs
        output_profiles_dct[taxon_id]['grouped'] = process_raw_cogs_fast(output_profiles_dct[taxon_id]['raw'])

        output_profiles_dct[taxon_id]['filtered'] = filter_cogs(output_profiles_dct[taxon_id]['grouped'])

    return output_profiles_dct


#         unstacked_by_fn_dct[taxon_id] = extract_func_counts(current_profile, functions)
#         result_dct[taxon_id] = {}


#         for fn, unstacked_fn_df in unstacked_by_fn_dct[taxon_id].items():
#             result_dct[taxon_id][fn] = {}

#             result_dct[taxon_id][fn]['raw'] = summarize_by_function(unstacked_fn_df, None, agg_col='Count')




In [15]:
def summarize_cog_tables(tables_dct):
    """
    Count the rows of every table per COG id.

    parameters:
        tables_dct: dict of dicts, {profile id: {table type: dataframe}};
            each dataframe needs `cog_id` and `gene_id` columns.

    returns:
        dict with the same two levels of keys; each dataframe is replaced by
        one with a `cog_id` column and a `gene_id` column holding the number
        of rows of that COG.
    """
    def _rows_per_cog(table):
        return table.groupby('cog_id').aggregate({'gene_id': len}).reset_index()

    return {
        taxon_id: {dtype: _rows_per_cog(table) for dtype, table in per_type.items()}
        for taxon_id, per_type in tables_dct.items()
    }

## COG parsing

In [17]:
#!mkdir request_number44_shale_fracking/

In [28]:
#data_files_dct = find_data("cog_training_data/cog_files/*", parse_folders=False)
# Flat layout (parse_folders=False): every match of the glob whose path contains "cog" is taken as a COG file.
data_files_dct = find_data("request_number45_contaminated/*", parse_folders=False)

In [29]:
data_files_dct

{'request_number45_contaminated/*_0': {'cog_file': 'request_number45_contaminated/Ga0209514.COG',
  'func_annot_file': ''}}

In [30]:
# parse_taxon_oid=False: the profile id is taken from the file name (text before its first "_" or ".").
cog_page_df_dct = retrieve_cog_tables(data_files_dct, parse_taxon_oid=False)

Ga0209514


In [31]:
# Per-COG row counts for each of the 'raw', 'grouped' and 'filtered' tables of every profile.
cog_counts_df_dct = summarize_cog_tables(cog_page_df_dct)

In [32]:
#export_function_profiles(cog_page_df_dct,export_type_lst=['filtered'] , suffix="cog")

In [33]:
#summary_dct = summarize_cog_tables(cog_page_df_dct)

In [34]:
#!mkdir cog_training_data/cog_profiles
!mkdir request_number45_contaminated/

mkdir: cannot create directory ‘request_number45_contaminated/’: File exists


In [35]:
# Export the filtered per-COG counts, joined with the COG names from the metadata table.
# NOTE(review): the mkdir cell above creates "request_number45_contaminated/", while this
# call writes to "request_number45_contaminated_cog_output/" — confirm that folder exists.
export_function_profiles(cog_counts_df_dct, export_type_lst=['filtered'],
                         out_folder_path="request_number45_contaminated_cog_output/", 
                         suffix="", function_metadata_path='coglist123340_25-mar-2018.tsv')

In [30]:
# NOTE(review): execution count 30 is lower than the cells around it, so this looks like a
# leftover from an earlier run. minimal_fname=True names each file "<profile id>.tsv".
export_function_profiles(cog_counts_df_dct, export_type_lst=['filtered'],
                         out_folder_path="cog_training_data/cog_profiles", minimal_fname=True)

## Checkups, tests

In [18]:
data_files_dct

{'cog_training_small/IMG_3300034404/IMG_Data': {'cog_file': 'cog_training_small/IMG_3300034404/IMG_Data/Ga0374090_cog.gff',
  'func_annot_file': 'cog_training_small/IMG_3300034404/IMG_Data/Ga0374090_functional_annotation.gff'},
 'cog_training_small/IMG_3300038493/IMG_Data': {'cog_file': 'cog_training_small/IMG_3300038493/IMG_Data/Ga0188167_cog.gff',
  'func_annot_file': 'cog_training_small/IMG_3300038493/IMG_Data/Ga0188167_functional_annotation.gff'},
 'cog_training_small/IMG_3300045129/IMG_Data': {'cog_file': 'cog_training_small/IMG_3300045129/IMG_Data/Ga0495008_cog.gff',
  'func_annot_file': 'cog_training_small/IMG_3300045129/IMG_Data/Ga0495008_functional_annotation.gff'},
 'cog_training_small/IMG_3300045855/IMG_Data': {'cog_file': 'cog_training_small/IMG_3300045855/IMG_Data/Ga0485835_cog.gff',
  'func_annot_file': 'cog_training_small/IMG_3300045855/IMG_Data/Ga0485835_functional_annotation.gff'}}

In [23]:
!ls *cog*tsv

raw_cogs.tsv  sorted_raw_cogs.tsv


In [31]:
!tail -n +2 3300034404_filtered_cog.tsv | cut -f2 | sort | uniq | wc -l

2276


In [103]:
# NOTE(review): `taxon_oid` is not defined in this cell; it appears to rely on the value left
# over from the stats loop below (execution count 102), so it fails on a fresh kernel.
cog_page_df_dct[taxon_oid]['filtered']

Unnamed: 0,gene_id,cog_id,bitscore,full_e_val,percent_id,alignment_length
3,Ga0495008_000001_171121_171699,COG0817,55.0,1.500000e-13,93.79,182.0
4,Ga0495008_000001_17516_20077,COG0463,148.1,1.300000e-41,76.04,274.0
7,Ga0495008_000001_188249_188872,COG2214,73.0,8.800000e-19,26.24,180.0
8,Ga0495008_000001_191757_193220,COG0328,97.0,2.000000e-26,94.23,190.0
9,Ga0495008_000001_1_768,COG4112,63.9,3.300000e-16,75.86,207.0
...,...,...,...,...,...,...
162013,Ga0495008_301438_1_201,COG3391,61.5,1.700000e-15,17.37,67.0
162019,Ga0495008_301451_1_201,COG3391,70.5,3.100000e-18,17.37,67.0
162021,Ga0495008_301457_3_203,COG3391,65.3,1.200000e-16,17.37,67.0
162027,Ga0495008_301484_1_201,COG0208,58.3,1.300000e-14,16.67,65.0


In [102]:
# Per-profile report: how many hits and COGs survive grouping and filtering.
for taxon_oid in cog_page_df_dct.keys():
    stage_tables = cog_page_df_dct[taxon_oid]
    raw_genes = len(stage_tables['raw'])
    grp_genes = len(stage_tables['grouped'])
    flt_genes = len(stage_tables['filtered'])
    #jgi_genes = len(unstacked_by_fn_dct[taxon_oid]['cog'])

    # number of gene ids per COG, computed once per stage:
    # rows -> unique COGs, sum -> total COG assignments
    grp_per_cog = stage_tables['grouped'].groupby('cog_id').agg({'gene_id':'count'})
    grp_u_cog = len(grp_per_cog)
    grp_t_cog = int(grp_per_cog.sum())
    flt_per_cog = stage_tables['filtered'].groupby('cog_id').agg({'gene_id':'count'})
    flt_u_cog = len(flt_per_cog)
    flt_t_cog = int(flt_per_cog.sum())
    #jgi_u_cog = len(result_dct[taxon_oid]['cog']['raw'])
    #jgi_t_cog = result_dct[taxon_oid]['cog']['raw'].Count.sum()

    # percentages are relative to the grouped (pre-filter) numbers
    flt_genes_pct = f" ({flt_genes/grp_genes*100:.2f}%)"
    flt_u_cog_pct = f" ({flt_u_cog/grp_u_cog*100:.2f}%)"
    flt_t_cog_pct = f" ({flt_t_cog/grp_t_cog*100:.2f}%)"

    print("******** Taxon ID:\t", taxon_oid, "**********")
    print("COG processing stats:")
    print("\traw genes:", raw_genes)
    print("\tgrouped genes:", grp_genes)
    print("\tgrouped unique COGs:", grp_u_cog)
    print("\tgrouped total COGs:", grp_t_cog)
    print("\tfiltered genes:", flt_genes, flt_genes_pct)
    print("\tfiltered unique COGs:", flt_u_cog, flt_u_cog_pct)
    print("\tfiltered total COGs:", flt_t_cog, flt_t_cog_pct)

#     print("JGI stats:")
#     print("\tunstacked genes:", jgi_genes)
#     print("\tunique COGs: ", jgi_u_cog)
#     print("\ttotal COGs:", jgi_t_cog)
    print()


******** Taxon ID:	 Ga0188167 **********
COG processing stats:
	raw genes: 14194
	grouped genes: 13336
	grouped unique COGs: 2279
	grouped total COGs: 13336
	filtered genes: 11180  (83.83%)
	filtered unique COGs: 1897  (83.24%)
	filtered total COGs: 11180  (83.83%)

******** Taxon ID:	 Ga0374090 **********
COG processing stats:
	raw genes: 47954
	grouped genes: 45949
	grouped unique COGs: 3006
	grouped total COGs: 45949
	filtered genes: 36355  (79.12%)
	filtered unique COGs: 2276  (75.72%)
	filtered total COGs: 36355  (79.12%)

******** Taxon ID:	 Ga0485835 **********
COG processing stats:
	raw genes: 243471
	grouped genes: 233310
	grouped unique COGs: 4341
	grouped total COGs: 233310
	filtered genes: 193706  (83.03%)
	filtered unique COGs: 3660  (84.31%)
	filtered total COGs: 193706  (83.03%)

******** Taxon ID:	 Ga0495008 **********
COG processing stats:
	raw genes: 168485
	grouped genes: 162035
	grouped unique COGs: 4440
	grouped total COGs: 162035
	filtered genes: 116364  (71.81%)
