Analyze the output of metagenome-atlas
======================================

In [None]:
# load libraries

%matplotlib inline
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt

#load my scripts
from utils.mag_scripts import * 
from utils.barplots import * 

import os
# ete3 has some interactive parts, but we don't have access to them here,
# so Qt is pointed at its 'offscreen' platform before ete3 is imported
os.environ['QT_QPA_PLATFORM']='offscreen'
import ete3

# suppress warnings
# NOTE(review): 'ignore' without a category hides every warning, deprecation notices included
import warnings
warnings.filterwarnings('ignore')

In [None]:
# change into the directory that holds the atlas output;
# the relative paths used in this notebook ('Results/...', 'genomes/...') are resolved from here
%cd ../Example/

# Taxonomy

In [None]:
# Taxonomy table; the first column of the file becomes the index.
# (presumably one row per genome and one column per taxonomic rank - confirm against the file)
taxonomy_path = 'Results/taxonomy.tsv'
Tax = pd.read_table(taxonomy_path, index_col=0)

# preview the first rows
Tax.head()

In [None]:
# Build a short display label for every row of the taxonomy table.
# Forward-filling along the columns copies the last known rank into `species`
# when no species is assigned; those rows additionally get their index value
# appended to the label.
species_missing = Tax.species.isnull()
Labels = Tax.ffill(axis=1).species.copy()
Labels.loc[species_missing] = Labels.loc[species_missing] + (' ' + Labels.index[species_missing])

## Draw tree

In [None]:
# Parse the Newick tree file into an ete3 tree object.
tree_file = 'genomes/tree/gtdbtk.bac120.nwk'
T = ete3.Tree(tree_file)

In [None]:
# Assign one colour to every phylum that occurs in the taxonomy table.
unique_phyla = Tax.phylum.unique()
phyla_palette = [
    '#bf423f',
    '#bf973f',
    '#91bf3f',
    '#3fbf42',
    '#3fbf97',
    '#3f91bf',
    '#423fbf',
    '#973fbf',
    '#bf3f91',
]
# Cycle through the palette by position so that every phylum gets a colour even
# when there are more phyla than palette entries. Pairing the two with zip()
# stops at the shorter sequence, which would leave the extra phyla without an
# entry and make later phyla_colors[...] lookups raise KeyError.
phyla_colors = {
    phylum: phyla_palette[position % len(phyla_palette)]
    for position, phylum in enumerate(unique_phyla)
}
    

def layout(node):
    """Layout callback applied to each node when the tree is rendered.

    Sets the node's image-style size to 0. Leaves additionally get a text face
    with their entry from ``Labels`` placed at "branch-right", and a node style
    whose background colour is the colour of the leaf's phylum.

    NOTE(review): ``set_style`` is given a freshly built ``NodeStyle`` that only
    specifies ``bgcolor``; whether that overrides the size set on the first line
    for leaves should be confirmed against ete3.
    """
    node.img_style["size"] = 0
    if not node.is_leaf():
        return

    leaf_id = node.name
    label_face = ete3.TextFace(Labels.loc[leaf_id])
    ete3.add_face_to_node(label_face, node, 0, position="branch-right")

    leaf_phylum = Tax.loc[leaf_id, 'phylum']
    node.set_style(ete3.NodeStyle(bgcolor=phyla_colors[leaf_phylum]))
        


# Tree style: mode 'c' (the circular layout per the ete3 docs - confirm),
# no automatic leaf names (the layout callback adds its own labels), scale 200.
ts = ete3.TreeStyle()
ts.mode = 'c'
ts.show_leaf_name = False
ts.scale = 200

# Legend in the title area: a coloured dot in column 0 and the phylum name in column 1.
for phylum_name in unique_phyla:
    legend_dot = ete3.CircleFace(radius=15, color=phyla_colors[phylum_name])
    legend_text = ete3.TextFace(phylum_name, fsize=15)
    ts.title.add_face(legend_dot, column=0)
    ts.title.add_face(legend_text, column=1)

# Render inline in the notebook; this stays the last expression so that the image is shown.
T.render('%%inline', tree_style=ts, layout=layout)

# Genome quality 

In [None]:
# Genome quality table, built in a single chain:
#   Quality_Score - Completeness minus five times Contamination
#   Lineage       - first whitespace-separated token of 'Marker lineage'
#   Id            - copy of the index as a regular column (used in the chart tooltip)
# followed by the taxonomy columns (joined on the index) and the short labels as 'Name'.
genome_quality = (
    pd.read_table('Results/genome_completeness.tsv', index_col=0)
    .assign(
        Quality_Score=lambda q: q.eval('Completeness -5*Contamination'),
        Lineage=lambda q: q['Marker lineage'].map(lambda s: s.split()[0]),
        Id=lambda q: q.index,
    )
    .join(Tax)
    .assign(Name=Labels)
)

In [None]:

# Fixed axis ranges: contamination 0-10, completeness 50-100.
xscale = alt.Scale(domain=(0, 10))
yscale = alt.Scale(domain=(50, 100))

# Contamination vs. completeness, one semi-transparent circle per row of
# genome_quality, coloured by phylum, with a tooltip identifying the genome.
quality_scatter = (
    alt.Chart(genome_quality)
    .mark_circle(opacity=0.6)
    .encode(
        x=alt.X('Contamination', scale=xscale, title='Contamination [%]'),
        y=alt.Y('Completeness', scale=yscale, title='Completeness [%]'),
        color='phylum',
        tooltip=['Name', 'Id', 'Contamination', 'Completeness'],
    )
)

# last expression of the cell, so the interactive chart is displayed
quality_scatter.interactive()

# Abundance

## Mapping rate

In [None]:
# Raw counts per genome, read as a tab-separated table and transposed so that
# the file's columns become the rows.
Counts = pd.read_table('Results/counts/raw_counts_genomes.tsv', index_col=0).transpose()

# preview the first rows
Counts.head()

In [None]:
# Mapping rate per sample, multiplied by 100 (presumably stored as a fraction
# in the file, so this yields percent - confirm).
# The `squeeze=True` argument of read_table was deprecated in pandas 1.4 and
# removed in pandas 2.0; squeezing the single data column after reading gives
# the same Series and works on old and new pandas versions.
mapping_rate = pd.read_table('Results/mapping_rate.tsv', index_col=0).squeeze('columns') * 100

f, ax = plt.subplots(figsize=(2, 4))
# fix the y-axis to the full 0-100 range before plotting
ax.set_ylim([0, 100])
ax.set_xlabel('Samples')
sns.swarmplot(y=mapping_rate, ax=ax)

ax.set_title('Mapping rate')


## Relative abundance


For the relative abundance we use the coverage over the genome, not the raw counts. This implicitly normalizes for genome size. The coverage is calculated as the median of the coverage values calculated in 1 kb blocks.

In [None]:
# Median coverage table; the first column of the file becomes the index.
coverage_path = "Results/counts/median_coverage_genomes.tsv"
D = pd.read_table(coverage_path, index_col=0)

# preview the first rows
D.head()

In [None]:
#calculate relative abundance

# Divide every row of D by its row total, so that each row sums to 1.
row_totals = D.sum(axis=1)
relab = D.div(row_totals, axis=0)

In [None]:
# get most abundant genomes

# Mean relative abundance of each genome across the rows of relab, in percent.
# The y-axis is labelled '[%]', but summing the fractions over all rows does
# not give a percentage; the mean times 100 does. When no values are missing
# the ranking is the same as with the sum.
counts_per_genome = relab.mean().mul(100).sort_values()

# values are sorted ascending, so the ten largest are the last ten entries
top_genomes = counts_per_genome.tail(10)
ax = top_genomes.plot.bar(figsize=(10, 5))

# replace the index values on the x-axis with the short labels
_ = ax.set_xticklabels(Labels.loc[top_genomes.index])
ax.set_title('Most abundant genomes')
ax.set_ylabel('Abundance [%]')

### Typical bar chart

In [None]:

# taxonomic rank (a column of Tax) at which the abundances are aggregated
level = 'family'

# Sum the relative abundances of all columns of relab that share the same value
# of Tax[level]. DataFrame.groupby(..., axis=1) is deprecated since pandas 2.1
# (and the notebook silences that warning), so the table is transposed, grouped
# by rows, and transposed back, which yields the same result.
grouped_data = relab.T.groupby(Tax[level]).sum().T

ax = BarPlot(grouped_data)

# title the legend with the rank name, in bold
ax.legend_.set_title(level, {'weight': 'bold'})


# Functional annotation


Relative abundance of functional annotations per sample

The abundance is calculated as the sum of the relative abundance of all bacteria containing a function.

## CAZy

In [None]:
#CAZy
# Annotation table; the first column of the file becomes the index.
CAZy_annotations_genome = pd.read_table('Results/annotations/CAZy.tsv', index_col=0)
# presence/absence matrix: 1 where the annotation value is > 0, otherwise 0
CAZy_presence = (CAZy_annotations_genome > 0).astype(int)
# Only the last expression of a cell is rendered automatically, so a bare
# `.head()` in the middle of the cell shows nothing; display it explicitly.
display(CAZy_presence.head())


# Matrix product of the relative abundances with the presence matrix: for each
# row of relab, the summed relative abundance of all entries that carry a function.
# (assumes the columns of relab match the index of CAZy_presence - confirm)
function_relab = relab @ CAZy_presence

sns.clustermap(function_relab)

function_relab.head()

## KEGG orthologs

In [None]:
#Kegg orthologs

# Annotation table; the first column of the file becomes the index.
Kegg_annotations_genome = pd.read_table('Results/annotations/KO.tsv', index_col=0)
# presence/absence matrix: 1 where the annotation value is > 0, otherwise 0
Kegg_presence = (Kegg_annotations_genome > 0).astype(int)
# Only the last expression of a cell is rendered automatically, so a bare
# `.head()` in the middle of the cell shows nothing; display it explicitly.
display(Kegg_presence.head())


# Matrix product of the relative abundances with the presence matrix: for each
# row of relab, the summed relative abundance of all entries that carry a function.
# (assumes the columns of relab match the index of Kegg_presence - confirm)
function_relab = relab @ Kegg_presence

sns.clustermap(function_relab)

function_relab.head()