<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Filter-out-interactions-based-on-taxonomy" data-toc-modified-id="Filter-out-interactions-based-on-taxonomy-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Filter out interactions based on taxonomy</a></span></li><li><span><a href="#Filter-out-microbes-based-on-taxonomy" data-toc-modified-id="Filter-out-microbes-based-on-taxonomy-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Filter out microbes based on taxonomy</a></span></li></ul></div>

In [1]:
# This block is just for importing the necessary libraries.  
import os
from collections import defaultdict
# Numerical libraries
import pandas as pd
import numpy as np
import biom
import arviz as az
from scipy.spatial.distance import euclidean
# Plotting libraries
import matplotlib.pyplot as plt
import matplotlib
import matplotlib.patches as mpatches
import seaborn as sns
from matplotlib_venn import venn2, venn3
# custom utility and plotting functions
from util import (extract_differentials, select_features, 
                  get_genomic_data, collapse_transcripts, 
                  aggregate_pathways,
                  ranking, btest, log_pvalue, read_kegg_dict,
                  ilr_transform_differentials,
                  rename_clades, create_projection,
                  match_all_differentials)
from plot import (rankplot, networkplot, vectorplot)
import random

# Directory paths. Relative entries are resolved against the notebook's
# working directory; '~' entries are left unexpanded here.
dan_directory = '../sfari/data/sra_shotgun/Dan2020'
averina_directory = '../sfari/data/sra_shotgun/Averina2020'
wang_directory = '../sfari/data/sra_shotgun/Wang2021'
taxa_directory = '~/databases/wol/taxonomy'
gtdb_directory = '~/databases/wol/taxonomy/gtdb'
# Sub-directories derived from results_dir.
results_dir = '../results'
mmvec_edge_dir = f'{results_dir}/mmvec/Network'
kegg_dir = f'{results_dir}/kegg'
hsa_dir = f'{results_dir}/hsa_kegg'
# Seed both NumPy's global RNG and the stdlib RNG so reruns are repeatable.
np.random.seed(0)
random.seed(0)
%matplotlib inline

In [2]:
# Tab-separated MMvec rank tables: one file per dataset directory
# (Dan, Wang, Averina), all sharing the same model-summary file name.
mmvec_dir = '../results/mmvec/cond_probs/mmvec'
fname = 'latent_dim_3_input_prior_1.00_output_prior_1.00_beta1_0.90_beta2_0.95_ranks.txt'
dan_ranks = pd.read_csv(
    f'{mmvec_dir}/Dan/biom2022/model_summary_new/{fname}',
    sep='\t', index_col=0)
wang_ranks = pd.read_csv(
    f'{mmvec_dir}/Wang/biom2022/model_summary_new/{fname}',
    sep='\t', index_col=0)
averina_ranks = pd.read_csv(
    f'{mmvec_dir}/Averina/biom2022/model_summary_new/{fname}',
    sep='\t', index_col=0)

# Header-less lineage table: the first column becomes the index and the
# lineage string ends up in the integer-labelled column 1.
taxonomy = pd.read_csv(
    '~/ceph/wol2/wol2/taxonomy/lineages.txt',
    sep='\t', index_col=0, header=None)

In [3]:
def slashes_f(x):
    """Turn a '; '-separated lineage string into a '/'-separated path.

    The first '; '-delimited field is discarded, the remaining fields are
    joined with '/', and every occurrence of a rank prefix
    ('d__', 'p__', 'c__', 'o__', 'f__', 'g__', 's__') is removed.

    Parameters
    ----------
    x : str
       Lineage string, e.g. 'd__Bacteria; p__Firmicutes; c__Bacilli'.

    Returns
    -------
    str
       Prefix-free path, e.g. 'Firmicutes/Bacilli'.
    """
    # str.split always yields at least one field, so this unpacking is safe.
    _first, *lower_levels = x.split('; ')
    path = '/'.join(lower_levels)
    # Prefixes are stripped one after another, in this fixed order.
    for prefix in ('d__', 'p__', 'c__', 'o__', 'f__', 'g__', 's__'):
        path = path.replace(prefix, '')
    return path
# Store the '/'-separated, prefix-free lineage path next to the raw lineage.
taxonomy['gtdb'] = taxonomy[1].map(slashes_f)

In [4]:
def rank_hits(ranks, k, pos=True):
    """ Creates an edge list based on rank matrix.

    For every row of `ranks` the `k` columns with the largest
    (``pos=True``) or smallest (``pos=False``) values are selected and
    returned as one edge per (row, selected column) pair.

    Parameters
    ----------
    ranks : pd.DataFrame
       Matrix of ranks (aka conditional probabilities). Rows become edge
       sources and columns become edge destinations. The index may carry
       any name (or none); values are assumed numeric and NaN-free.
    k : int
       Number of nearest neighbors; must satisfy
       ``1 <= k <= ranks.shape[1]``.
    pos : bool
       Specifies either most associated or least associated.
       This is a proxy to positively correlated or negatively correlated.

    Returns
    -------
    edges : pd.DataFrame
       List of edges with columns ``['src', 'rank', 'dest']`` where
       ``rank`` (float64) is the value ``ranks.loc[src, dest]``. Rows are
       ordered neighbor-slot by neighbor-slot: slot 0 for every source,
       then slot 1, and so on. Within a source, slots follow ascending
       value order.

    Raises
    ------
    ValueError
       If `k` is smaller than 1 or larger than the number of columns.
    """
    n_src, n_dest = ranks.shape
    if not 1 <= k <= n_dest:
        raise ValueError(
            f'k must be between 1 and the number of columns ({n_dest}); got {k}')

    values = ranks.to_numpy()
    # Row-wise ascending order of the column positions.
    order = np.argsort(values, axis=1)
    picked = order[:, -k:] if pos else order[:, :k]

    # Flatten column-major so the output keeps the "all sources for slot 0,
    # then all sources for slot 1, ..." layout of the melted table.
    col_pos = picked.ravel(order='F')
    row_pos = np.tile(np.arange(n_src), k)

    # Positional lookups: no per-row .loc calls, no reliance on the index
    # name, and duplicated labels cannot make a lookup ambiguous.
    edges = pd.DataFrame({
        'src': ranks.index.to_numpy()[row_pos],
        'rank': values[row_pos, col_pos].astype(np.float64),
        'dest': ranks.columns.to_numpy()[col_pos],
    })
    return edges

In [5]:
# For each dataset: take the 5 top-ranked columns per row, then keep only
# the edges whose rank value exceeds 1.
dan_edges, wang_edges, averina_edges = (
    rank_hits(rank_matrix, 5).query("rank > 1")
    for rank_matrix in (dan_ranks, wang_ranks, averina_ranks)
)

In [6]:
# Load the GPD metadata table and discard rows that have no
# 'Host_range_isolates' entry.
gpd_metadata = (
    pd.read_table('../results/mmvec/GPD_metadata.tsv')
    .dropna(subset=['Host_range_isolates'])
)

In [7]:
# Maps the phylum names listed as keys to the alternative names used when
# phylum renaming is requested in fix_f.
lookup = {
 'Actinobacteriota' : 'Actinobacteria',
 'Bacteroidota' : 'Bacteroidetes',
 'Desulfobacterota': 'Deltaproteobacteria',
 'Synergistota': 'Synergistetes',
 'Campylobacterota': 'Epsilonproteobacteria',
 'Fusobacteriota': 'Fusobacteria'}

def fix_f(x, fix_phylum=False):
    """Optionally rename the leading (phylum) level of a '/'-separated taxon path.

    Parameters
    ----------
    x : str
       Taxon path such as 'Bacteroidota/Bacteroidia/Bacteroidales'.
    fix_phylum : bool
       When False (the default) `x` is returned untouched. When True the
       text before the first '/' is replaced by its `lookup` entry, if it
       has one.

    Returns
    -------
    str
       The (possibly renamed) taxon path.
    """
    if not fix_phylum:
        return x
    # Only the first path component is rewritten, so a phylum name that
    # happens to reappear further down the path is left alone.
    phylum, sep, remainder = x.partition('/')
    return lookup.get(phylum, phylum) + sep + remainder
        
def all_taxa(x):
    """Split a comma-separated taxon field into a list of unique taxa.

    Parameters
    ----------
    x : object
       Raw field value; null values (as judged by `pd.isnull`) yield an
       empty list, anything else is converted to `str` and split on ','.

    Returns
    -------
    list
       The distinct entries after passing each through `fix_f`; the order
       is whatever the intermediate set yields.
    """
    if pd.isnull(x):
        return []
    return list({fix_f(taxon) for taxon in str(x).split(',')})
    
# One list of host taxa per remaining GPD metadata row.
gpd_taxa = gpd_metadata['Host_range_taxon'].apply(all_taxa).tolist()

# Filter out interactions based on taxonomy

In [8]:
# Pool every host taxon named in the GPD metadata into a single set.
# A set comprehension is linear in the total number of entries, whereas
# sum(list_of_lists, []) re-copies the growing list on every addition.
all_gpd_taxa = {taxon for taxa in gpd_taxa for taxon in taxa}

# Keep the taxonomy rows whose 'gtdb' path is exactly one of those taxa.
idx = taxonomy['gtdb'].isin(all_gpd_taxa)
subtaxa = taxonomy.loc[idx]

In [9]:
# Edge-table shapes in the order Dan, Wang, Averina.
print(*(edges.shape for edges in (dan_edges, wang_edges, averina_edges)))

(1076, 3) (506, 3) (1415, 3)


In [10]:
# Total number of edges across the three datasets.
sum(len(edges) for edges in (dan_edges, wang_edges, averina_edges))

2997

# Filter out microbes based on taxonomy

In [11]:
# Join each edge table to `subtaxa` on the destination id, then relabel the
# endpoints as virus / microbe. pandas' default inner join keeps only edges
# whose destination has a row in `subtaxa`.
wang_edges, dan_edges, averina_edges = (
    edges
    .merge(subtaxa, left_on='dest', right_index=True)
    .rename(columns={'src': 'virus', 'dest': 'microbe'})
    for edges in (wang_edges, dan_edges, averina_edges)
)

In [12]:
# Tag every edge table (in place) with the dataset it came from.
for dataset_label, edge_frame in (('Wang', wang_edges),
                                  ('Dan', dan_edges),
                                  ('Averina', averina_edges)):
    edge_frame['dataset'] = dataset_label

# Stack the three tables row-wise; the rename only has an effect if a table
# still carries 'src' / 'dest' columns.
all_edges = (
    pd.concat((wang_edges, dan_edges, averina_edges), axis=0)
    .rename(columns={'src': 'virus', 'dest': 'microbe'})
)

In [13]:
# Display the taxonomy rows whose 'gtdb' path matched a GPD host taxon.
subtaxa

Unnamed: 0_level_0,1,gtdb
0,Unnamed: 1_level_1,Unnamed: 2_level_1
G000006865,d__Bacteria; p__Firmicutes; c__Bacilli; o__Lac...,Firmicutes/Bacilli/Lactobacillales/Streptococc...
G000006925,d__Bacteria; p__Proteobacteria; c__Gammaproteo...,Proteobacteria/Gammaproteobacteria/Enterobacte...
G000007265,d__Bacteria; p__Firmicutes; c__Bacilli; o__Lac...,Firmicutes/Bacilli/Lactobacillales/Streptococc...
G000007465,d__Bacteria; p__Firmicutes; c__Bacilli; o__Lac...,Firmicutes/Bacilli/Lactobacillales/Streptococc...
G000007645,d__Bacteria; p__Firmicutes; c__Bacilli; o__Sta...,Firmicutes/Bacilli/Staphylococcales/Staphyloco...
...,...,...
G900091655,d__Bacteria; p__Bacteroidota; c__Bacteroidia; ...,Bacteroidota/Bacteroidia/Bacteroidales/Marinif...
G900091675,d__Bacteria; p__Bacteroidota; c__Bacteroidia; ...,Bacteroidota/Bacteroidia/Bacteroidales/Marinif...
G900104395,d__Bacteria; p__Fusobacteriota; c__Fusobacteri...,Fusobacteriota/Fusobacteriia/Fusobacteriales/F...
G900113395,d__Bacteria; p__Firmicutes; c__Bacilli; o__Sta...,Firmicutes/Bacilli/Staphylococcales/Staphyloco...


In [14]:
#all_edges['microbe'] = all_edges['microbe'].astype(np.str)
# merge microbial metadata
#all_edges = pd.merge(all_edges, gpd_metadata, left_on='virus', right_on='GPD_id')

# merge GPD metadata
#all_edges = pd.merge(all_edges, subtaxa, left_on='microbe', right_index=True)

In [15]:
# Column names for the per-rank taxonomy table, in lineage order.
cols = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']

def taxa_f(x):
    """Split a ';'-separated lineage string into one name per rank.

    Parameters
    ----------
    x : str
       Lineage such as 'd__Bacteria; p__Firmicutes; c__Bacilli', where each
       field has the form '<prefix>__<name>'.

    Returns
    -------
    list
       One entry per field: the name with surrounding whitespace removed,
       or `np.nan` when the field carries no name (e.g. 'g__').
    """
    def f(y):
        # Split on the first '__' only, so a name that itself contains '__'
        # is kept whole.
        _prefix, _sep, name = y.partition('__')
        name = name.strip()
        # An empty name marks an unassigned rank; report it as missing.
        return name if name else np.nan

    return list(map(f, x.split(';')))
    
# Expand every lineage string into one column per rank, aligned on the
# taxonomy index, and attach those columns to the taxonomy table.
taxa_df = pd.DataFrame(
    taxonomy[1].apply(taxa_f).tolist(),
    columns=cols,
    index=taxonomy.index,
)
taxonomy = taxonomy.merge(taxa_df, left_index=True, right_index=True)

In [16]:
# Attach the per-rank taxonomy columns to each edge via its microbe id and
# drop the raw lineage string (integer-labelled column 1) before exporting.
all_edges = (
    all_edges
    .merge(taxonomy[cols], left_on='microbe', right_index=True)
    .drop(columns=[1])
)
all_edges.to_csv('../results/interaction_metadata.txt', sep='\t')

In [17]:
# Display the combined edge table with its per-rank taxonomy columns.
all_edges

Unnamed: 0,virus,rank,microbe,gtdb,dataset,kingdom,phylum,class,order,family,genus,species
87,uvig_143237,11.800369,G000785515,Firmicutes/Bacilli/Lactobacillales/Streptococc...,Wang,Bacteria,Firmicutes,Bacilli,Lactobacillales,Streptococcaceae,Streptococcus,Streptococcus salivarius
134,uvig_216549,7.767537,G000785515,Firmicutes/Bacilli/Lactobacillales/Streptococc...,Wang,Bacteria,Firmicutes,Bacilli,Lactobacillales,Streptococcaceae,Streptococcus,Streptococcus salivarius
182,uvig_236116,5.448109,G000785515,Firmicutes/Bacilli/Lactobacillales/Streptococc...,Wang,Bacteria,Firmicutes,Bacilli,Lactobacillales,Streptococcaceae,Streptococcus,Streptococcus salivarius
251,uvig_284465,4.564700,G000785515,Firmicutes/Bacilli/Lactobacillales/Streptococc...,Wang,Bacteria,Firmicutes,Bacilli,Lactobacillales,Streptococcaceae,Streptococcus,Streptococcus salivarius
275,uvig_293127,6.438794,G000785515,Firmicutes/Bacilli/Lactobacillales/Streptococc...,Wang,Bacteria,Firmicutes,Bacilli,Lactobacillales,Streptococcaceae,Streptococcus,Streptococcus salivarius
...,...,...,...,...,...,...,...,...,...,...,...,...
3084,uvig_370156,7.377481,G000434095,Bacteroidota/Bacteroidia/Bacteroidales/Bactero...,Averina,Bacteria,Bacteroidota,Bacteroidia,Bacteroidales,Bacteroidaceae,Bacteroides,Bacteroides fragilis
3088,uvig_371046,6.930983,G000434095,Bacteroidota/Bacteroidia/Bacteroidales/Bactero...,Averina,Bacteria,Bacteroidota,Bacteroidia,Bacteroidales,Bacteroidaceae,Bacteroides,Bacteroides fragilis
3193,uvig_422447,9.869628,G000434095,Bacteroidota/Bacteroidia/Bacteroidales/Bactero...,Averina,Bacteria,Bacteroidota,Bacteroidia,Bacteroidales,Bacteroidaceae,Bacteroides,Bacteroides fragilis
3195,uvig_422532,9.095410,G000434095,Bacteroidota/Bacteroidia/Bacteroidales/Bactero...,Averina,Bacteria,Bacteroidota,Bacteroidia,Bacteroidales,Bacteroidaceae,Bacteroides,Bacteroides fragilis
