<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Filter-out-interactions-based-on-taxonomy" data-toc-modified-id="Filter-out-interactions-based-on-taxonomy-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Filter out interactions based on taxonomy</a></span></li><li><span><a href="#Filter-out-microbes-based-on-taxonomy" data-toc-modified-id="Filter-out-microbes-based-on-taxonomy-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Filter out microbes based on taxonomy</a></span></li></ul></div>

In [1]:
# This block is just for importing the necessary libraries.  
import os
from collections import defaultdict
# Numerical libraries
import pandas as pd
import numpy as np
import biom
import arviz as az
from scipy.spatial.distance import euclidean
# Plotting libraries
import matplotlib.pyplot as plt
import matplotlib
import matplotlib.patches as mpatches
import seaborn as sns
from matplotlib_venn import venn2, venn3
# custom utility and plotting functions
from util import (extract_differentials, select_features, 
                  get_genomic_data, collapse_transcripts, 
                  aggregate_pathways,
                  ranking, btest, log_pvalue, read_kegg_dict,
                  ilr_transform_differentials,
                  rename_clades, create_projection,
                  match_all_differentials)
from plot import (rankplot, networkplot, vectorplot)
import random

# directory paths
# Per-study shotgun data directories.
dan_directory = '../sfari/data/sra_shotgun/Dan2020'
averina_directory = '../sfari/data/sra_shotgun/Averina2020'
wang_directory = '../sfari/data/sra_shotgun/Wang2021'
# Taxonomy reference tables; the GTDB tables sit in a sub-folder of the
# taxonomy folder, so derive that path instead of repeating the prefix.
# NOTE(review): '~' is a per-user location - confirm it exists on the machine
# running this notebook.
taxa_directory = '~/databases/wol/taxonomy'
gtdb_directory = f'{taxa_directory}/gtdb'
# Result locations, all relative to a single results root.
results_dir = '../results'
mmvec_edge_dir = f'{results_dir}/mmvec/Network'
kegg_dir = f'{results_dir}/kegg'
hsa_dir = f'{results_dir}/hsa_kegg'
# Seed both NumPy's global generator and the stdlib generator.
np.random.seed(0)
random.seed(0)
%matplotlib inline

In [2]:
# Load taxonomies
# taxid.map has no header row, so its two columns are labelled 0 and 1; it is
# read as text so the resulting index holds strings.
taxids = pd.read_table(f'{taxa_directory}/taxid.map', header=None, dtype=str)
ranks = pd.read_table(f'{gtdb_directory}/ranks.tsv')
# Join the rank table to the id map on the genome id ('gid' == column 0),
# index the result by column 1 and discard the now-redundant column 0.
taxonomy = (
    pd.merge(ranks, taxids, left_on='gid', right_on=0)
    .set_index(1)
    .drop(columns=0)
)

In [3]:
def concat_slash(x):
    """Build a '/'-delimited lineage string running from phylum to species.

    Parameters
    ----------
    x : mapping-like (for example one row of a DataFrame)
        Must provide the keys 'phylum', 'class', 'order', 'family',
        'genus' and 'species'.

    Returns
    -------
    str
        The six rank values, each converted with ``str``, joined by '/'.
        Because of the ``str`` conversion a missing value shows up literally
        (e.g. as 'nan') rather than being skipped.
    """
    # The kingdom level is not part of the lineage string.
    rank_names = ('phylum', 'class', 'order', 'family', 'genus', 'species')
    return '/'.join(str(x[rank]) for rank in rank_names)

In [4]:
# One 'phylum/class/order/family/genus/species' string per row of the taxonomy table.
wol_taxa = taxonomy.apply(concat_slash, axis=1).to_numpy().tolist()

In [5]:
# Phage metadata table; built from results_dir so the location stays in sync
# with the other result paths defined in the configuration cell.
# Rows without any entry in 'Host_range_isolates' are discarded.
gpd_metadata = (
    pd.read_table(f'{results_dir}/mmvec/GPD_metadata.tsv')
    .dropna(subset=['Host_range_isolates'])
)

In [6]:
# Phylum renaming table: each key is the phylum spelling to be replaced and
# each value is its replacement. Not applied unless phylum renaming is
# enabled in fix_f.
# NOTE(review): keys look like GTDB-style names and values like older
# NCBI-style names - confirm before relying on it.
lookup = {
    'Actinobacteriota': 'Actinobacteria',
    'Bacteroidota': 'Bacteroidetes',
    'Desulfobacterota': 'Deltaproteobacteria',
    'Synergistota': 'Synergistetes',
    'Campylobacterota': 'Epsilonproteobacteria',
    'Fusobacteriota': 'Fusobacteria',
}
    
def fix_f(x, mapping=None):
    """Optionally rename the phylum (first '/'-separated field) of a lineage.

    Parameters
    ----------
    x : str
        Slash-delimited lineage string whose first field is the phylum.
    mapping : dict, optional
        Phylum name -> replacement name. When omitted or empty (the default)
        the lineage is returned untouched. Pass ``lookup`` to turn on
        phylum-level renaming.

    Returns
    -------
    str
        ``x`` itself when no mapping is given or the phylum is not a key of
        ``mapping``; otherwise ``x`` with only its leading field replaced.
    """
    if not mapping:
        return x
    # Split off just the leading field so that an identical name occurring
    # deeper in the lineage is never rewritten.
    phylum, sep, rest = x.partition('/')
    return mapping.get(phylum, phylum) + sep + rest
        
def all_taxa(x):
    """Split a comma-separated lineage field into a list of distinct entries.

    Parameters
    ----------
    x : object
        Cell value holding zero or more lineages separated by ','.
        A null value (as judged by ``pd.isnull``) stands for "no lineages".

    Returns
    -------
    list
        Distinct lineage strings, each passed through ``fix_f``. The order is
        whatever set iteration yields, so it must not be relied upon.
    """
    if pd.isnull(x):
        return []
    distinct = {fix_f(entry) for entry in str(x).split(',')}
    return list(distinct)
    
# One list of distinct host lineages per phage, in the row order of gpd_metadata.
gpd_taxa = [all_taxa(entry) for entry in gpd_metadata['Host_range_taxon']]

# Filter out interactions based on taxonomy

In [7]:
# Pool the host lineages of every phage into one set. The comprehension makes
# a single pass over all entries; the previous sum(gpd_taxa, []) re-copied the
# growing accumulator list on every addition (quadratic in the total size).
all_gpd_taxa = {taxon for taxa in gpd_taxa for taxon in taxa}
# Lineages that occur both in the WoL taxonomy and among the phage hosts.
common_taxa = set(wol_taxa) & all_gpd_taxa

In [8]:
def contains_f(x):
    """Return True when at least one element of ``x`` is in ``common_taxa``.

    Parameters
    ----------
    x : iterable of hashable
        Lineage strings to test (an empty iterable yields False).
    """
    return not common_taxa.isdisjoint(x)

# Boolean flag per gpd_metadata row: does the phage have at least one host
# lineage that is also present in the WoL taxonomy?
idx = [contains_f(taxa) for taxa in gpd_taxa]

# Metadata rows for the flagged phages, and their GPD identifiers.
gpd_subset = gpd_metadata.loc[idx]
viruses = gpd_subset['GPD_id']

In [9]:
def _load_edges(name, min_logit=1):
    """Read one edge table and keep only its stronger edges.

    Parameters
    ----------
    name : str
        Cohort label used in the file name ``edges_<name>.txt`` inside
        ``mmvec_edge_dir``.
    min_logit : float, optional
        Rows are kept only when ``cond_logit`` is strictly greater than this
        value (default 1, the threshold used for every cohort here).

    Returns
    -------
    pandas.DataFrame
        The rows of the edge table that pass the threshold.
    """
    edges = pd.read_table(f'{mmvec_edge_dir}/edges_{name}.txt')
    return edges.loc[edges.cond_logit > min_logit]


# Same loading and thresholding for every cohort.
averina_edges = _load_edges('Averina')
dan_edges = _load_edges('Dan')
pths_edges = _load_edges('PTHS')
wang_edges = _load_edges('Wang')

In [10]:
# Edge-table dimensions after the cond_logit threshold (PTHS is not reported).
print(*(edges.shape for edges in (dan_edges, wang_edges, averina_edges)))

(23896, 3) (20584, 3) (34100, 3)


In [11]:
# Total number of thresholded edges across the Dan, Wang and Averina cohorts.
sum(len(edges) for edges in (dan_edges, wang_edges, averina_edges))

78580

# Filter out microbes based on taxonomy

In [12]:
def contains_taxonomy(x):
    """Tell whether a microbe id resolves to a lineage found in ``common_taxa``.

    Parameters
    ----------
    x : object
        Microbe identifier; it is converted with ``str`` before being looked
        up in the index of ``taxonomy``.

    Returns
    -------
    bool
        False when the id is absent from ``taxonomy``; otherwise whether the
        lineage string built by ``concat_slash`` for that row is a member of
        ``common_taxa``.
    """
    taxid = str(x)
    if taxid not in taxonomy.index:
        return False
    lineage = concat_slash(taxonomy.loc[taxid])
    return lineage in common_taxa
def _keep_shared_lineages(edges):
    """Return the rows of ``edges`` whose 'microbe' passes ``contains_taxonomy``."""
    keep = edges['microbe'].apply(contains_taxonomy)
    return edges.loc[keep]


# Apply the same microbe filter to every cohort's edge table.
wang_edges = _keep_shared_lineages(wang_edges)
dan_edges = _keep_shared_lineages(dan_edges)
averina_edges = _keep_shared_lineages(averina_edges)
pths_edges = _keep_shared_lineages(pths_edges)

In [13]:
# Edge-table dimensions after the microbe taxonomy filter (PTHS is not reported).
print(*(edges.shape for edges in (dan_edges, wang_edges, averina_edges)))

(10626, 3) (10574, 3) (10076, 3)


In [14]:
# Total number of edges left across the Dan, Wang and Averina cohorts.
sum(len(edges) for edges in (dan_edges, wang_edges, averina_edges))

31276

In [15]:
# Label every edge with the cohort it came from, then stack the three tables
# row-wise. The PTHS edges are left out of the combined table.
wang_edges = wang_edges.assign(dataset='Wang')
dan_edges = dan_edges.assign(dataset='Dan')
averina_edges = averina_edges.assign(dataset='Averina')
# pths_edges['dataset'] = 'PTHS'
all_edges = pd.concat([wang_edges, dan_edges, averina_edges], axis=0)

In [16]:
# The taxonomy index holds string ids, so the join key has to be str too.
# The builtin str is used because np.str was only an alias for it and is
# no longer available in current NumPy releases.
all_edges['microbe'] = all_edges['microbe'].astype(str)
# merge GPD (phage) metadata, matching each edge's virus to its GPD_id
all_edges = pd.merge(all_edges, gpd_metadata, left_on='virus', right_on='GPD_id')
# merge microbial taxonomy, matching each edge's microbe to the taxonomy index
all_edges = pd.merge(all_edges, taxonomy, left_on='microbe', right_index=True)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  all_edges['microbe'] = all_edges['microbe'].astype(np.str)


In [17]:
# Write the annotated edge table as tab-separated text under results_dir, so
# the output location follows the configuration cell. The row index is
# written as well (to_csv default), as the first, unnamed column.
all_edges.to_csv(f'{results_dir}/interaction_metadata.txt', sep='\t')

In [18]:
# Display the merged edge table (edges + phage metadata + microbial taxonomy).
all_edges

Unnamed: 0,microbe,virus,cond_logit,dataset,GPD_id,Source,GPD_VC,Size,Predicted_phage_taxon,Host_range_isolates,...,checkV_termini,Novel,gid,domain,phylum,class,order,family,genus,species
0,1262981,uvig_108002,4.954891,Wang,uvig_108002,Metagenome,37815,32601,Myoviridae,"GCA_003464755,GCF_000164195,12718_7#19,20298_3...",...,No,Yes,G000435955,Bacteria,Firmicutes,Bacilli,Erysipelotrichales,Erysipelotrichaceae,Eubacterium_D,Eubacterium_D innocuum
22,1262981,uvig_108002,4.954891,Wang,uvig_108002,Metagenome,37815,32601,Myoviridae,"GCA_003464755,GCF_000164195,12718_7#19,20298_3...",...,No,Yes,G000435955,Bacteria,Firmicutes,Bacilli,Erysipelotrichales,Erysipelotrichaceae,Eubacterium_D,Eubacterium_D innocuum
44,1262981,ivig_2376,4.954647,Wang,ivig_2376,Isolate,18797,40415,,GCF_000011045,...,No,Yes,G000435955,Bacteria,Firmicutes,Bacilli,Erysipelotrichales,Erysipelotrichaceae,Eubacterium_D,Eubacterium_D innocuum
66,1262981,ivig_2376,4.954647,Wang,ivig_2376,Isolate,18797,40415,,GCF_000011045,...,No,Yes,G000435955,Bacteria,Firmicutes,Bacilli,Erysipelotrichales,Erysipelotrichaceae,Eubacterium_D,Eubacterium_D innocuum
88,1262981,ivig_2256,4.915556,Wang,ivig_2256,Isolate,41768,39592,,GCA_003602765,...,No,Yes,G000435955,Bacteria,Firmicutes,Bacilli,Erysipelotrichales,Erysipelotrichaceae,Eubacterium_D,Eubacterium_D innocuum
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6007,1871003,uvig_492171,1.052579,Averina,uvig_492171,Metagenome,3999,59704,,GCA_003482185,...,No,Yes,G900104605,Bacteria,Bacteroidota,Bacteroidia,Bacteroidales,Rikenellaceae,Tidjanibacter,Tidjanibacter massiliensis
6041,1871003,uvig_568287,1.173307,Averina,uvig_568287,Metagenome,908,119423,,"12718_7#33,GCA_003466785",...,No,No,G900104605,Bacteria,Bacteroidota,Bacteroidia,Bacteroidales,Rikenellaceae,Tidjanibacter,Tidjanibacter massiliensis
6066,1871003,uvig_568287,1.173307,Averina,uvig_568287,Metagenome,908,119423,,"12718_7#33,GCA_003466785",...,No,No,G900104605,Bacteria,Bacteroidota,Bacteroidia,Bacteroidales,Rikenellaceae,Tidjanibacter,Tidjanibacter massiliensis
6189,1871003,ivig_4295,1.122697,Averina,ivig_4295,Isolate,18307,23261,,21673_4#3,...,No,No,G900104605,Bacteria,Bacteroidota,Bacteroidia,Bacteroidales,Rikenellaceae,Tidjanibacter,Tidjanibacter massiliensis


In [19]:
# Display the taxonomy table used for the microbe lookups above.
taxonomy

Unnamed: 0_level_0,gid,domain,phylum,class,order,family,genus,species
1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
398511,G000005825,Bacteria,Firmicutes,Bacilli,Bacillales,Bacillaceae_D,Bacillus_S,Bacillus_S pseudofirmus
456320,G000006175,Archaea,Euryarchaeota,Methanococci,Methanococcales,Methanococcaceae,Methanococcus,Methanococcus voltae
306537,G000006605,Bacteria,Actinobacteriota,Actinobacteria,Corynebacteriales,Corynebacteriaceae,Corynebacterium,Corynebacterium jeikeium
160492,G000006725,Bacteria,Proteobacteria,Gammaproteobacteria,Xanthomonadales,Xanthomonadaceae,Xylella,Xylella fastidiosa
243277,G000006745,Bacteria,Proteobacteria,Gammaproteobacteria,Enterobacterales,Vibrionaceae,Vibrio,Vibrio cholerae
...,...,...,...,...,...,...,...,...
484498,G900156675,Bacteria,Proteobacteria,Gammaproteobacteria,Pseudomonadales,Saccharospirillaceae,Oleibacter,Oleibacter marinus
477680,G900156765,Bacteria,Bacteroidota,Bacteroidia,Chitinophagales,Chitinophagaceae,Filimonas,Filimonas lacunae
1926284,G900156885,Bacteria,Firmicutes,Bacilli,Lactobacillales,Lactobacillaceae,Lactobacillus_H,
1926277,G900156915,Bacteria,Firmicutes,Bacilli,Bacillales,Amphibacillaceae,Sediminibacillus_A,Sediminibacillus_A massiliensis
