In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
# All input and output files live under the directory named by $DATA_DIR;
# a missing variable raises KeyError here rather than later.
data_dir = os.environ['DATA_DIR']


def _in_data_dir(filename):
    """Return the path of `filename` inside `data_dir`."""
    return os.path.join(data_dir, filename)


# Input tables.
isolates_data_fp = _in_data_dir('isolates.tsv')
val_strains_fp = _in_data_dir('validation_strains_ml_predictions.tsv')

# Output files: cladogram + GraPhlAn annotation for the full set and for the actives subset.
full_cladogram_data = _in_data_dir('full_cladogram.txt')
full_graphlan_data = _in_data_dir('full_graphlan.txt')
actives_cladogram_data = _in_data_dir('actives_cladogram.txt')
actives_graphlan_data = _in_data_dir('actives_graphlan.txt')

In [3]:
isolates_df = pd.read_csv(isolates_data_fp, sep='\t')


def _clade_path(isolate):
    """Build the dot-separated clade string for one isolate row.

    The path is phylum.class.family.genus.<epithet>.strain_id (the `order`
    column is not part of it). The epithet is the second whitespace-separated
    token of `species`; when there is no second token (including a missing
    species, which stringifies to a single token) the placeholder
    'gtdb_novel_strain' is used instead.
    """
    name_tokens = str(isolate['species']).split()
    epithet = name_tokens[1] if len(name_tokens) > 1 else 'gtdb_novel_strain'
    return (f"{isolate['phylum']}.{isolate['class']}.{isolate['family']}."
            f"{isolate['genus']}.{epithet}.{isolate.strain_id}")


# One (strain_id, clade_str) pair per row of the isolates table, in row order.
_strain_clade_pairs = [(isolate.strain_id, _clade_path(isolate))
                       for _, isolate in isolates_df.iterrows()]
clades_list = {
    'strain_id': [strain for strain, _ in _strain_clade_pairs],
    'clade_str': [clade for _, clade in _strain_clade_pairs],
}
clades_df = pd.DataFrame(clades_list)

# Attach the clade string to every isolate by strain id.
isolates_clades_df = isolates_df.merge(clades_df, on='strain_id', how='left')
isolates_clades_df.tail(3)

Unnamed: 0,strain_id,selection_tool,sample_type,country,state,phylum,class,order,family,genus,...,sa_purity_test,sa_purity_confirmed,bs_primary_test,bs_confirmation_test,bs_confirmed_activity,bs_purity_test,bs_purity_confirmed,sa_ml_disease_ctrl,bs_ml_disease_ctrl,clade_str
1224,AFS099881,diversity,Core,United States,North Carolina,Firmicutes_I,Bacilli_A,Paenibacillales,Paenibacillaceae,Paenibacillus,...,0,0,0.0,0.0,0.0,0.0,0.0,,,Firmicutes_I.Bacilli_A.Paenibacillaceae.Paenib...
1225,AFS099918,diversity,Amphibian,United States,North Carolina,Firmicutes,Bacilli,Staphylococcales,Staphylococcaceae,Staphylococcus,...,0,0,1.0,0.0,0.0,0.0,0.0,,,Firmicutes.Bacilli.Staphylococcaceae.Staphyloc...
1226,AFS099934,diversity,Soil,Uganda,Amuria,Firmicutes_I,Bacilli_A,Paenibacillales,Paenibacillaceae,Paenibacillus,...,0,0,0.0,0.0,0.0,0.0,0.0,,,Firmicutes_I.Bacilli_A.Paenibacillaceae.Paenib...


In [4]:
# Load the tab-separated validation-strain predictions table and preview its first rows.
val_strains_df = pd.read_csv(val_strains_fp, delimiter='\t')
val_strains_df.head(n=5)

Unnamed: 0,strain_id,model_support
0,AFS000256,er
1,AFS000734,all_models
2,AFS002168,rf_and_nn
3,AFS003480,er
4,AFS004460,all_models


In [5]:
# Build the strain_id -> model_support lookup once instead of scanning
# val_strains_df for every isolate row. Keeping the first occurrence of a
# strain id mirrors taking the first matching row of the validation table.
model_support_by_strain = (
    val_strains_df
    .drop_duplicates(subset='strain_id', keep='first')
    .set_index('strain_id')['model_support']
    .to_dict()
)


def update_selection_tool(row):
    """Return the selection-tool label to use for one isolate row.

    Parameters
    ----------
    row : mapping / pandas.Series
        Must provide 'strain_id' and 'selection_tool'.

    Returns
    -------
    The strain's `model_support` value from `val_strains_df` when the strain
    appears there; otherwise the row's existing 'selection_tool' value.
    """
    # A plain dict lookup also avoids interpolating the strain id into a
    # DataFrame.query() expression, which breaks on ids containing quotes.
    return model_support_by_strain.get(row['strain_id'], row['selection_tool'])


isolates_clades_df['selection_tool'] = isolates_clades_df.apply(update_selection_tool, axis=1)
isolates_clades_df.head(3)

Unnamed: 0,strain_id,selection_tool,sample_type,country,state,phylum,class,order,family,genus,...,sa_purity_test,sa_purity_confirmed,bs_primary_test,bs_confirmation_test,bs_confirmed_activity,bs_purity_test,bs_purity_confirmed,sa_ml_disease_ctrl,bs_ml_disease_ctrl,clade_str
0,AFS000005,genome_similarity,Soil,Uganda,Amuria,Firmicutes,Bacilli,Bacillales,Bacillaceae_H,Bacillus_C,...,0,0,1.0,0.0,0.0,0.0,0.0,,,Firmicutes.Bacilli.Bacillaceae_H.Bacillus_C.me...
1,AFS000006,genome_similarity,Soil,United States,Texas,Proteobacteria,Gammaproteobacteria,Pseudomonadales,Pseudomonadaceae,Pseudomonas_E,...,0,0,1.0,0.0,0.0,0.0,0.0,,35.0,Proteobacteria.Gammaproteobacteria.Pseudomonad...
2,AFS000009,genome_similarity,Soil,United States,Texas,Proteobacteria,Gammaproteobacteria,Pseudomonadales,Pseudomonadaceae,Pseudomonas_E,...,1,1,1.0,1.0,1.0,1.0,1.0,,80.0,Proteobacteria.Gammaproteobacteria.Pseudomonad...


In [6]:
# Distinct labels currently present in the selection_tool column.
isolates_clades_df['selection_tool'].unique()

array(['genome_similarity', 'spectrum', 'er', 'all_models', 'diversity',
       'rf_and_nn', 'rf', 'nn_and_er', 'rf_and_er', 'none'], dtype=object)

In [7]:
def generate_graphlan_files(df, graphlan_fn, cladogram_fn):
    """Write a cladogram file and a GraPhlAn annotation file for `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per isolate. Required columns: 'clade_str', 'phylum',
        'country', 'selection_tool', 'sa_confirmed_activity' and
        'bs_confirmed_activity'.
    graphlan_fn : str
        Path of the annotation file to (over)write.
    cladogram_fn : str
        Path of the cladogram file to (over)write; it receives one
        `clade_str` per row of `df`, in row order.

    Annotation file layout
    ----------------------
    1. Global formatting options, one tab-separated option per line.
    2. ``ring_color`` lines, one per distinct clade (sorted by clade string):
       ring 1 = phylum, ring 2 = country, ring 3 = selection tool.
    3. Clade-marker lines for active isolates: edge colour 'active1' when
       exactly one of the two confirmed-activity columns equals 1, 'active2'
       when both do. Inactive isolates get no marker lines.

    When several rows share a clade string, the first such row supplies the
    values for that clade.

    Raises
    ------
    KeyError
        If a phylum, country or selection-tool value has no entry in the
        colour table. All annotation lines are built before any file is
        opened, so nothing is written in that case.
    """
    colors = {'Actinobacteriota': '#de8735',
              'Firmicutes': '#5d7c61',
              'Firmicutes_I': '#4886af',
              'Proteobacteria': '#dc523f',
              'Bacteroidota': '#92a488',
              'ncbi_16s': '#C0C0C0',
              'poor_genome_quality': '#C0C0C0',
              'active1': '#FCDC04',
              'active2': '#000000',
              'Uganda': '#FCDC04',
              'United States': '#3C3B6E',
              'spectrum': '#C0C0C0',
              'nn_and_er': 'r',
              'rf_and_er': 'r',
              'rf_and_nn': 'r',
              'er': 'r',
              'rf': 'r',
              'all_models': 'r',
              'none': 'darkviolet',
              'genome_similarity': '#BD7B00',
              'diversity': '#FFFFFF'
              }

    # Global options are kept as a list of complete lines so that source-code
    # indentation can never leak into the file (an indented triple-quoted
    # string would prefix every option with spaces and add blank lines).
    annotation_lines = [
        'clade_separation\t0.5',
        'branch_thickness\t0.25',
        'branch_bracket_depth\t0.5',
        'branch_bracket_width\t0.2',
        'clade_marker_size\t0',
        'clade_marker_edge_color\t#555555',
        'clade_marker_edge_width\t0.75',
        'start_rotation\t270',
        'ring_internal_separator_thickness\t2\t0.5',
        'ring_internal_separator_thickness\t3\t0.5',
        'ring_external_separator_thickness\t3\t0.5',
    ]

    def color_for(column, row, clade):
        """Look up the colour for row[column], failing with a descriptive KeyError."""
        value = row[column]
        try:
            return colors[value]
        except KeyError:
            raise KeyError(
                f"no color defined for {column}={value!r} (clade {clade})"
            ) from None

    # Group once; the first row of each clade supplies its annotation values.
    clade_rows = [(clade, group.iloc[0]) for clade, group in df.groupby('clade_str')]

    # Rings: 1 = phylum, 2 = country of origin, 3 = selection tool.
    for ring, column in enumerate(('phylum', 'country', 'selection_tool'), start=1):
        for clade, row in clade_rows:
            annotation_lines.append(
                f"{clade}\tring_color\t{ring}\t{color_for(column, row, clade)}"
            )

    # Highlight active isolates: one colour for activity in exactly one assay,
    # another for activity in both.
    for clade, row in clade_rows:
        sa_active = row['sa_confirmed_activity'] == 1
        bs_active = row['bs_confirmed_activity'] == 1
        if not (sa_active or bs_active):
            continue
        edge_color = colors['active2'] if (sa_active and bs_active) else colors['active1']
        annotation_lines.append(f"{clade}\tclade_marker_size\t15")
        annotation_lines.append(f"{clade}\tclade_marker_edge_color\t{edge_color}")

    with open(cladogram_fn, 'w') as tree_file:
        tree_file.writelines(f'{clade}\n' for clade in df.clade_str)

    with open(graphlan_fn, 'w') as annot_file:
        annot_file.writelines(f'{line}\n' for line in annotation_lines)

In [8]:
# Full data set: every isolate is written to the cladogram/annotation files.
generate_graphlan_files(isolates_clades_df, full_graphlan_data, full_cladogram_data)

# Subset: isolates whose confirmed activity equals 1 in at least one of the two assays.
sa_active = isolates_clades_df['sa_confirmed_activity'].eq(1)
bs_active = isolates_clades_df['bs_confirmed_activity'].eq(1)
actives_df = isolates_clades_df[sa_active | bs_active]
generate_graphlan_files(actives_df, actives_graphlan_data, actives_cladogram_data)