## This notebook formats the MEBS output for the heatmap.py script

#### Import all required modules

In [2]:
import pandas as pd
import os as os
import glob as glob
import numpy as np

#### Navigate to directory with MEBS outputs

In [3]:
# Make the MEBS directory the working directory so relative file names resolve against it
mebs_dir = "/Users/nastassia.patin/Desktop/Projects/eCruises_TIMESERIES/Metagenomes_2018-2021/MEBS"
os.chdir(mebs_dir)

##### Import the MEBS output sheet and rename the unnamed first column to 'Genome'

In [6]:
# Read the tab-separated MEBS output and give the column pandas labels
# 'Unnamed: 0' the name 'Genome', then preview the first rows
df = (
    pd.read_csv("eCruises_TIMESERIES-MAGs-mebs-out.tsv", sep='\t')
    .rename(columns={'Unnamed: 0': 'Genome'})
)
df.head(3)

Unnamed: 0,Genome,sulfur,carbon,oxygen,iron,nitrogen,markers,<sulfur comp>,sulfur_1,sulfur_2,...,nitrogen_25,nitrogen_26,nitrogen_27,nitrogen_28,<markers comp>,markers_1,markers_2,markers_3,markers_4,markers_5
0,S19C702-Lasker2018,-1.85,0.638,1.435,1.418,3.539,61,29.4,50,0.0,...,0,0.0,20,25,31.3,11.2,24.4,53.7,41.9,25.2
1,S34C165-Flyer2018,-2.0,2.378,1.711,2.147,5.528,105.000*,32.1,75,0.0,...,0,0.0,20,25,45.7,54.1,20.9,57.4,38.1,58.0
2,S78C920-Flyer2018,1.226,2.425,3.882,3.62,10.504,87,37.5,75,0.0,...,0,0.0,40,50,42.1,12.2,40.7,74.1,54.3,29.0


### Merge the genome taxonomy level of interest with the MEBS output sheet

#### Navigate to directory containing MAG taxonomies

In [26]:
# Make the MAG_QC directory the working directory so relative file names resolve against it
mag_qc_dir = "/Users/nastassia.patin/Desktop/Projects/eCruises_TIMESERIES/Metagenomes_2018-2021/MAG_QC/"
os.chdir(mag_qc_dir)

##### Import the MAG taxonomy sheet

In [27]:
# Read the comma-separated MAG taxonomy sheet and preview the first rows
tax = pd.read_csv("MAGs_2018-2021_HQ_dRep_quality_tax_noCs_noFirm.csv", sep=',')
tax.head(n=3)

Unnamed: 0,bin,quality,total_scgs,supporting_scgs,domain,phylum,class,order,family,genus,species,dataset,completion,redundancy,quality.1
0,1903c122_100m-2_MaxBin2_021-Lasker2019,56.32,15,11,Bacteria,Actinobacteriota,Acidimicrobiia,Acidimicrobiales,MedAcidi-G1,S20-B6,S20-B6 sp002699725,Lasker2019,70.42,2.82,56.32
1,1903c123_70m-2_MaxBin2_008-Lasker2019,59.15,13,9,Bacteria,Actinobacteriota,Acidimicrobiia,Acidimicrobiales,MedAcidi-G1,UBA9410,UBA9410 sp014237715,Lasker2019,59.15,0.0,59.15
2,D0089A_S15_MaxBin2_008,52.11,16,16,Bacteria,Actinobacteriota,Acidimicrobiia,Actinomarinales,Actinomarinaceae,Actinomarina,,Lasker2018,52.11,0.0,52.11


In [28]:
# Taxonomic rank to carry into the merged table
taxon = 'phylum'
# Keep only the genome identifier and the chosen rank.
# Rename first, then select by the new name: rename skips a 'bin' column that is
# already gone, so re-running this cell with the same `taxon` no longer raises KeyError.
tax = tax.rename(columns={'bin': 'Genome'})[['Genome', taxon]]
tax.head(3)

Unnamed: 0,Genome,phylum
0,1903c122_100m-2_MaxBin2_021-Lasker2019,Actinobacteriota
1,1903c123_70m-2_MaxBin2_008-Lasker2019,Actinobacteriota
2,D0089A_S15_MaxBin2_008,Actinobacteriota


##### Merge onto the taxonomy file (right join), because it excludes MAGs from control samples, contaminants (Firmicutes), and the zr2760_34 MAG

In [31]:
# Right join on 'Genome': every genome listed in `tax` is kept, and MEBS rows
# whose genome is absent from `tax` are dropped
df_tax = pd.merge(df, tax, how='right', on='Genome')
df_tax.head(3)

Unnamed: 0,Genome,sulfur,carbon,oxygen,iron,nitrogen,markers,<sulfur comp>,sulfur_1,sulfur_2,...,nitrogen_26,nitrogen_27,nitrogen_28,<markers comp>,markers_1,markers_2,markers_3,markers_4,markers_5,phylum
0,1903c122_100m-2_MaxBin2_021-Lasker2019,1.127,6.579,4.038,2.146,11.501,108.000*,56.8,75,0.0,...,0.0,60,50,49.8,16.3,62.8,70.4,65.7,33.6,Actinobacteriota
1,1903c123_70m-2_MaxBin2_008-Lasker2019,1.117,3.333,4.264,3.655,11.951,106.000*,55.5,75,0.0,...,0.0,80,50,48.1,17.3,60.5,68.5,62.9,31.3,Actinobacteriota
2,D0089A_S15_MaxBin2_008,1.232,3.264,0.447,0.477,4.477,77,25.8,75,0.0,...,0.0,20,50,38.3,9.2,31.4,75.9,47.6,27.5,Actinobacteriota


In [33]:
# Put the taxonomy column first and leave the other columns in their existing order
cols = df_tax.columns.tolist()
taxon_pos = cols.index(taxon)
cols = [cols[taxon_pos]] + cols[:taxon_pos] + cols[taxon_pos + 1:]
df_tax = df_tax.loc[:, cols]
df_tax.head(3)

Unnamed: 0,phylum,Genome,sulfur,carbon,oxygen,iron,nitrogen,markers,<sulfur comp>,sulfur_1,...,nitrogen_25,nitrogen_26,nitrogen_27,nitrogen_28,<markers comp>,markers_1,markers_2,markers_3,markers_4,markers_5
0,Actinobacteriota,1903c122_100m-2_MaxBin2_021-Lasker2019,1.127,6.579,4.038,2.146,11.501,108.000*,56.8,75,...,0,0.0,60,50,49.8,16.3,62.8,70.4,65.7,33.6
1,Actinobacteriota,1903c123_70m-2_MaxBin2_008-Lasker2019,1.117,3.333,4.264,3.655,11.951,106.000*,55.5,75,...,0,0.0,80,50,48.1,17.3,60.5,68.5,62.9,31.3
2,Actinobacteriota,D0089A_S15_MaxBin2_008,1.232,3.264,0.447,0.477,4.477,77,25.8,75,...,0,0.0,20,50,38.3,9.2,31.4,75.9,47.6,27.5


In [34]:
# drop the 'markers' column from the table
df_tax = df_tax.drop(columns=['markers'])

Unnamed: 0,phylum,Genome,sulfur,carbon,oxygen,iron,nitrogen,<sulfur comp>,sulfur_1,sulfur_2,...,nitrogen_25,nitrogen_26,nitrogen_27,nitrogen_28,<markers comp>,markers_1,markers_2,markers_3,markers_4,markers_5
0,Actinobacteriota,1903c122_100m-2_MaxBin2_021-Lasker2019,1.127,6.579,4.038,2.146,11.501,56.8,75,0.0,...,0,0.0,60,50,49.8,16.3,62.8,70.4,65.7,33.6
1,Actinobacteriota,1903c123_70m-2_MaxBin2_008-Lasker2019,1.117,3.333,4.264,3.655,11.951,55.5,75,0.0,...,0,0.0,80,50,48.1,17.3,60.5,68.5,62.9,31.3
2,Actinobacteriota,D0089A_S15_MaxBin2_008,1.232,3.264,0.447,0.477,4.477,25.8,75,0.0,...,0,0.0,20,50,38.3,9.2,31.4,75.9,47.6,27.5


In [49]:
# strip the '*' character from every string value, keeping the rest of the value.
# The pattern is a raw string: '\*' in a normal literal is an invalid escape
# sequence that newer Python versions warn about; the regex itself is unchanged.
# NOTE(review): values that carried a '*' presumably remain strings after this
# step - confirm whether the heatmap script needs numeric dtypes.
df_tax = df_tax.replace(r'\*', '', regex=True)
df_tax

Unnamed: 0,phylum,Genome,sulfur,carbon,oxygen,iron,nitrogen,<sulfur comp>,sulfur_1,sulfur_2,...,nitrogen_25,nitrogen_26,nitrogen_27,nitrogen_28,<markers comp>,markers_1,markers_2,markers_3,markers_4,markers_5
0,Actinobacteriota,1903c122_100m-2_MaxBin2_021-Lasker2019,1.127,6.579,4.038,2.146,11.501,56.8,75,0.0,...,0,0.0,60,50,49.8,16.3,62.8,70.4,65.7,33.6
1,Actinobacteriota,1903c123_70m-2_MaxBin2_008-Lasker2019,1.117,3.333,4.264,3.655,11.951,55.5,75,0.0,...,0,0.0,80,50,48.1,17.3,60.5,68.5,62.9,31.3
2,Actinobacteriota,D0089A_S15_MaxBin2_008,1.232,3.264,0.447,0.477,4.477,25.8,75,0.0,...,0,0.0,20,50,38.3,9.2,31.4,75.9,47.6,27.5
3,Actinobacteriota,D0159A_S42_MaxBin2_008,-0.913,0.349,1.493,2.134,4.199,19.7,50,0.0,...,0,0.0,40,25,36.0,9.2,29.1,72.2,43.8,26.0
4,Actinobacteriota,S10C755-Lasker2018,1.030,6.827,2.334,1.642,8.740,50.2,50,0.0,...,0,0.0,40,25,40.3,13.3,41.9,63.0,55.2,28.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
524,Verrucomicrobiota,S9C83-Lasker2018,2.496,1.044,1.788,5.464,6.928,50.2,50,0.0,...,0,0.0,20,25,48.7,17.3,67.4,61.1,67.6,29.8
525,Verrucomicrobiota,vae_254-Lasker2019,1.232,2.353,4.776,4.473,9.033,47.1,75,33.3,...,0,0.0,60,50,71.7,24.5,93.0,98.1,97.1,45.8
526,Verrucomicrobiota,vae_260-Lasker2019,1.887,3.031,4.894,2.487,5.777,41.9,75,33.3,...,0,0.0,60,50,35.9,11.2,40.7,51.9,49.5,26.0
527,Verrucomicrobiota,zr2760_12_MaxBin2_018,2.605,4.127,3.185,4.421,9.392,51.1,75,0.0,...,0,0.0,80,25,47.4,17.3,61.6,64.8,61.9,31.3


In [50]:
# Show the row(s) for one specific genome
df_tax[df_tax['Genome'].eq('S51C58-Lasker2018')]

Unnamed: 0,phylum,Genome,sulfur,carbon,oxygen,iron,nitrogen,<sulfur comp>,sulfur_1,sulfur_2,...,nitrogen_25,nitrogen_26,nitrogen_27,nitrogen_28,<markers comp>,markers_1,markers_2,markers_3,markers_4,markers_5
339,Proteobacteria,S51C58-Lasker2018,1.156,8.06,7.367,4.136,15.847,66.4,75,33.3,...,0,0.0,60,50,62.5,21.4,90.7,72.2,88.6,39.7


##### Export

In [51]:
# Move to the MEBS directory and write the table there as a tab-separated file,
# leaving out the DataFrame index
mebs_dir = "/Users/nastassia.patin/Desktop/Projects/eCruises_TIMESERIES/Metagenomes_2018-2021/MEBS"
os.chdir(mebs_dir)
df_tax.to_csv("eCruises_MAGs_HQ_derep_MEBS-forheatmap.tsv", index=False, sep='\t')