<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Filter-out-interactions-based-on-taxonomy" data-toc-modified-id="Filter-out-interactions-based-on-taxonomy-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Filter out interactions based on taxonomy</a></span></li><li><span><a href="#Filter-out-microbes-based-on-taxonomy" data-toc-modified-id="Filter-out-microbes-based-on-taxonomy-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Filter out microbes based on taxonomy</a></span></li></ul></div>

In [1]:
# This block is just for importing the necessary libraries.  
import os
from collections import defaultdict
# Numerical libraries
import pandas as pd
import numpy as np
import biom
import arviz as az
from scipy.spatial.distance import euclidean
# Plotting libraries
import matplotlib.pyplot as plt
import matplotlib
import matplotlib.patches as mpatches
import seaborn as sns
from matplotlib_venn import venn2, venn3
# custom utility and plotting functions
from util import (extract_differentials, select_features, 
                  get_genomic_data, collapse_transcripts, 
                  aggregate_pathways,
                  ranking, btest, log_pvalue, read_kegg_dict,
                  ilr_transform_differentials,
                  rename_clades, create_projection,
                  match_all_differentials)
from plot import (rankplot, networkplot, vectorplot)
import random

# directory paths
# Raw shotgun-data folders, one per study, plus the taxonomy folder that the
# taxonomy-loading cell reads taxid.map and ranks.tsv from.
dan_directory = '../sfari/data/sra_shotgun/Dan2020'
averina_directory = '../sfari/data/sra_shotgun/Averina2020'
wang_directory = '../sfari/data/sra_shotgun/Wang2021'
taxa_directory = '~/databases/wol/taxonomy'

# Result folders, all derived from results_dir; mmvec_edge_dir holds the
# edges_<dataset>.txt tables loaded further down.
results_dir = '../results'
mmvec_edge_dir = f'{results_dir}/mmvec/Network'
kegg_dir = f'{results_dir}/kegg'
hsa_dir = f'{results_dir}/hsa_kegg'
# Seed both NumPy's global RNG and the stdlib RNG.
np.random.seed(0)
random.seed(0)
%matplotlib inline

In [2]:
# Load taxonomies
# taxid.map is read without a header row, so pandas labels its two columns 0
# and 1, and every field is kept as a string.
taxids = pd.read_table(f'{taxa_directory}/taxid.map', header=None, dtype=str)
ranks = pd.read_table(f'{taxa_directory}/ranks.tsv')
# Join the rank table to the map on the genome identifier, index the result by
# column 1 of the map (presumably the taxid), and drop column 0, which only
# repeats `genome` after the join.
taxonomy = (
    ranks
    .merge(taxids, left_on='genome', right_on=0)
    .set_index(1)
    .drop(columns=0)
)

In [3]:
# Display the taxonomy table (rank columns, indexed by column 1 of taxid.map).
taxonomy

Unnamed: 0_level_0,genome,kingdom,phylum,class,order,family,genus,species
1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
398511,G000005825,Bacteria,Firmicutes,Bacilli,Bacillales,Bacillaceae,Bacillus,Bacillus pseudofirmus
456320,G000006175,Archaea,Euryarchaeota,Methanococci,Methanococcales,Methanococcaceae,Methanococcus,Methanococcus voltae
306537,G000006605,Bacteria,Actinobacteria,Actinobacteria,Corynebacteriales,Corynebacteriaceae,Corynebacterium,Corynebacterium jeikeium
160492,G000006725,Bacteria,Proteobacteria,Gammaproteobacteria,Xanthomonadales,Xanthomonadaceae,Xylella,Xylella fastidiosa
243277,G000006745,Bacteria,Proteobacteria,Gammaproteobacteria,Vibrionales,Vibrionaceae,Vibrio,Vibrio cholerae
...,...,...,...,...,...,...,...,...
484498,G900156675,Bacteria,Proteobacteria,Gammaproteobacteria,Oceanospirillales,Oceanospirillaceae,Oleibacter,Oleibacter marinus
477680,G900156765,Bacteria,Bacteroidetes,Chitinophagia,Chitinophagales,Chitinophagaceae,Filimonas,Filimonas lacunae
1926284,G900156885,Bacteria,Firmicutes,Bacilli,Lactobacillales,Lactobacillaceae,Lactobacillus,Lactobacillus sp. Marseille-P3519
1926277,G900156915,Bacteria,Firmicutes,Bacilli,Bacillales,Bacillaceae,Aquibacillus,Aquibacillus sp. Marseille-P3518


In [4]:
def concat_slash(x):
    """Build a '/'-separated lineage string from phylum down to species.

    ``x`` is anything that can be indexed by rank name, such as one row of
    ``taxonomy``.  Kingdom is not part of the result.  Every value goes
    through ``str`` before joining, so a missing rank becomes text (for
    example NaN becomes 'nan') instead of raising.
    """
    rank_names = ('phylum', 'class', 'order', 'family', 'genus', 'species')
    return '/'.join(str(x[rank]) for rank in rank_names)

In [5]:
# One lineage string (phylum/.../species) per taxonomy row, in row order.
wol_taxa = list(taxonomy.apply(concat_slash, axis=1).values)

In [6]:
# Load the GPD metadata table and keep only the rows that have a value in
# Host_range_isolates.
gpd_metadata = (
    pd.read_table('../results/mmvec/GPD_metadata.tsv')
    .dropna(subset=['Host_range_isolates'])
)

In [7]:
# Renames applied to the leading component of a host lineage string so that it
# uses the same phylum spelling as the `taxonomy` table.
# NOTE(review): 'Deltaproteobacteria' and 'Epsilonproteobacteria' look like
# class-level names; confirm that lineages renamed this way can still match a
# taxonomy lineage, which starts at phylum.
lookup = {
    'Actinobacteriota': 'Actinobacteria',
    'Bacteroidota': 'Bacteroidetes',
    'Desulfobacterota': 'Deltaproteobacteria',
    'Synergistota': 'Synergistetes',
    'Campylobacterota': 'Epsilonproteobacteria',
    'Fusobacteriota': 'Fusobacteria',
}


def fix_f(x):
    """Rename the leading component of a '/'-separated lineage via ``lookup``.

    Parameters
    ----------
    x : str
        Lineage string; only the text before the first '/' is considered.

    Returns
    -------
    str
        ``x`` with its first component replaced when that component is a key
        of ``lookup``; otherwise ``x`` unchanged.

    Only the first component is rewritten.  A plain ``str.replace`` would also
    rewrite the same substring wherever it reappears deeper in the lineage.
    """
    head, sep, rest = x.partition('/')
    return lookup.get(head, head) + sep + rest
        
def all_taxa(x):
    """Split a comma-separated host-range value into unique lineage strings.

    A null ``x`` (as judged by ``pd.isnull``) yields an empty list.  Any other
    value is converted to ``str`` and split on ','; each piece is passed
    through ``fix_f`` and duplicates are dropped.  The returned list follows
    set iteration order, so callers should not rely on its ordering.
    """
    if pd.isnull(x):
        return []
    return list({fix_f(piece) for piece in str(x).split(',')})
    
# One list of host lineages per gpd_metadata row, in row order (rows with a
# null Host_range_taxon get an empty list).
gpd_taxa = list(gpd_metadata['Host_range_taxon'].apply(all_taxa).values)

# Filter out interactions based on taxonomy

In [8]:
# Pool every lineage named by any GPD record into one set.  The comprehension
# flattens in a single pass, whereas sum(list_of_lists, []) builds a new,
# longer list on every addition.
all_gpd_taxa = {taxon for taxa in gpd_taxa for taxon in taxa}
# Lineages present both in the WoL taxonomy and in the GPD host ranges.
common_taxa = set(wol_taxa) & all_gpd_taxa

In [9]:
def contains_f(x):
    """Return True when at least one element of ``x`` is in ``common_taxa``."""
    return not common_taxa.isdisjoint(x)

# Boolean mask, one entry per gpd_metadata row: True when the row's host
# lineages include at least one lineage from common_taxa.
idx = [contains_f(taxa) for taxa in gpd_taxa]

# Metadata rows selected by the mask, and their GPD identifiers.
# NOTE(review): the edge-filtering cells shown after this one do not use
# gpd_subset or viruses; confirm whether the edges should be restricted to them.
gpd_subset = gpd_metadata.loc[idx]
viruses = gpd_metadata.loc[idx, 'GPD_id']

In [10]:
def _strong_edges(path):
    """Read a tab-separated edge table and keep rows with cond_logit above 1."""
    edges = pd.read_table(path)
    return edges.loc[edges['cond_logit'] > 1]


# One filtered edge table per dataset.
averina_edges = _strong_edges(f'{mmvec_edge_dir}/edges_Averina.txt')

dan_edges = _strong_edges(f'{mmvec_edge_dir}/edges_Dan.txt')

pths_edges = _strong_edges(f'{mmvec_edge_dir}/edges_PTHS.txt')

wang_edges = _strong_edges(f'{mmvec_edge_dir}/edges_Wang.txt')

# Filter out microbes based on taxonomy

In [11]:
def contains_taxonomy(x):
    """Tell whether microbe id ``x`` has a lineage listed in ``common_taxa``.

    Parameters
    ----------
    x : object
        Microbe identifier; it is converted to ``str`` because the
        ``taxonomy`` index holds strings.

    Returns
    -------
    bool
        True when ``x`` is in the ``taxonomy`` index and the lineage of at
        least one of its rows is in ``common_taxa``; False otherwise.
    """
    x = str(x)
    if x not in taxonomy.index:
        return False
    # Select with a list so the result is always a DataFrame.  With a scalar
    # label, an id that occurs more than once in the index comes back as a
    # DataFrame instead of a Series; concat_slash then stringifies whole
    # columns and the id is silently rejected.
    matches = taxonomy.loc[[x]]
    return any(concat_slash(row) in common_taxa for _, row in matches.iterrows())
# Keep, in every dataset, only the edges whose microbe passes contains_taxonomy.
wang_edges = wang_edges.loc[wang_edges['microbe'].map(contains_taxonomy)]
dan_edges = dan_edges.loc[dan_edges['microbe'].map(contains_taxonomy)]
averina_edges = averina_edges.loc[averina_edges['microbe'].map(contains_taxonomy)]
pths_edges = pths_edges.loc[pths_edges['microbe'].map(contains_taxonomy)]

In [12]:
# Tag each edge table, in place, with the name of the dataset it came from.
for _frame, _label in (
    (wang_edges, 'Wang'),
    (dan_edges, 'Dan'),
    (averina_edges, 'Averina'),
    (pths_edges, 'PTHS'),
):
    _frame['dataset'] = _label

In [13]:
# Stack the per-dataset edge tables into a single frame.
all_edges = pd.concat((wang_edges, dan_edges, averina_edges, pths_edges), axis=0)
# The taxonomy index holds strings, so microbe ids are cast to str before the
# join.  The built-in `str` is used because the `np.str` alias is deprecated
# since NumPy 1.20 and absent from later releases.
all_edges['microbe'] = all_edges['microbe'].astype(str)
# merge GPD (virus) metadata
all_edges = pd.merge(all_edges, gpd_metadata, left_on='virus', right_on='GPD_id')
# merge microbial taxonomy
# NOTE(review): the displayed result shows repeated (microbe, virus) rows;
# confirm whether the inputs carry duplicates that should be dropped.
all_edges = pd.merge(all_edges, taxonomy, left_on='microbe', right_index=True)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  all_edges['microbe'] = all_edges['microbe'].astype(np.str)


In [14]:
# Write the annotated edge table as tab-separated text; the frame's index is
# written as the first column.
all_edges.to_csv('../results/interaction_metadata.txt', sep='\t')

In [15]:
# Display the merged edge table (edge columns, GPD metadata, microbial taxonomy).
all_edges

Unnamed: 0,microbe,virus,cond_logit,dataset,GPD_id,Source,GPD_VC,Size,Predicted_phage_taxon,Host_range_isolates,...,checkV_termini,Novel,genome,kingdom,phylum,class,order,family,genus,species
0,29466,ivig_2376,4.949254,Wang,ivig_2376,Isolate,18797,40415,,GCF_000011045,...,No,Yes,G001553335,Bacteria,Firmicutes,Negativicutes,Veillonellales,Veillonellaceae,Veillonella,Veillonella parvula
8,29466,ivig_2376,4.949254,Wang,ivig_2376,Isolate,18797,40415,,GCF_000011045,...,No,Yes,G001553335,Bacteria,Firmicutes,Negativicutes,Veillonellales,Veillonellaceae,Veillonella,Veillonella parvula
16,29466,ivig_2256,4.537919,Wang,ivig_2256,Isolate,41768,39592,,GCA_003602765,...,No,Yes,G001553335,Bacteria,Firmicutes,Negativicutes,Veillonellales,Veillonellaceae,Veillonella,Veillonella parvula
24,29466,ivig_2256,4.537919,Wang,ivig_2256,Isolate,41768,39592,,GCA_003602765,...,No,Yes,G001553335,Bacteria,Firmicutes,Negativicutes,Veillonellales,Veillonellaceae,Veillonella,Veillonella parvula
32,29466,uvig_108002,4.431313,Wang,uvig_108002,Metagenome,37815,32601,Myoviridae,"GCA_003464755,GCF_000164195,12718_7#19,20298_3...",...,No,Yes,G001553335,Bacteria,Firmicutes,Negativicutes,Veillonellales,Veillonellaceae,Veillonella,Veillonella parvula
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3198,1871003,ivig_4295,1.122697,Averina,ivig_4295,Isolate,18307,23261,,21673_4#3,...,No,No,G900104605,Bacteria,Bacteroidetes,Bacteroidia,Bacteroidales,Rikenellaceae,Tidjanibacter,Tidjanibacter massiliensis
3238,1871003,uvig_173554,1.054612,Averina,uvig_173554,Metagenome,3999,60158,,GCA_003482185,...,No,Yes,G900104605,Bacteria,Bacteroidetes,Bacteroidia,Bacteroidales,Rikenellaceae,Tidjanibacter,Tidjanibacter massiliensis
3242,1871003,uvig_173554,1.054612,Averina,uvig_173554,Metagenome,3999,60158,,GCA_003482185,...,No,Yes,G900104605,Bacteria,Bacteroidetes,Bacteroidia,Bacteroidales,Rikenellaceae,Tidjanibacter,Tidjanibacter massiliensis
3246,1871003,uvig_492171,1.052579,Averina,uvig_492171,Metagenome,3999,59704,,GCA_003482185,...,No,Yes,G900104605,Bacteria,Bacteroidetes,Bacteroidia,Bacteroidales,Rikenellaceae,Tidjanibacter,Tidjanibacter massiliensis


In [16]:
# Display the taxonomy table again for reference; nothing above modifies it
# after it is loaded.
taxonomy

Unnamed: 0_level_0,genome,kingdom,phylum,class,order,family,genus,species
1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
398511,G000005825,Bacteria,Firmicutes,Bacilli,Bacillales,Bacillaceae,Bacillus,Bacillus pseudofirmus
456320,G000006175,Archaea,Euryarchaeota,Methanococci,Methanococcales,Methanococcaceae,Methanococcus,Methanococcus voltae
306537,G000006605,Bacteria,Actinobacteria,Actinobacteria,Corynebacteriales,Corynebacteriaceae,Corynebacterium,Corynebacterium jeikeium
160492,G000006725,Bacteria,Proteobacteria,Gammaproteobacteria,Xanthomonadales,Xanthomonadaceae,Xylella,Xylella fastidiosa
243277,G000006745,Bacteria,Proteobacteria,Gammaproteobacteria,Vibrionales,Vibrionaceae,Vibrio,Vibrio cholerae
...,...,...,...,...,...,...,...,...
484498,G900156675,Bacteria,Proteobacteria,Gammaproteobacteria,Oceanospirillales,Oceanospirillaceae,Oleibacter,Oleibacter marinus
477680,G900156765,Bacteria,Bacteroidetes,Chitinophagia,Chitinophagales,Chitinophagaceae,Filimonas,Filimonas lacunae
1926284,G900156885,Bacteria,Firmicutes,Bacilli,Lactobacillales,Lactobacillaceae,Lactobacillus,Lactobacillus sp. Marseille-P3519
1926277,G900156915,Bacteria,Firmicutes,Bacilli,Bacillales,Bacillaceae,Aquibacillus,Aquibacillus sp. Marseille-P3518
