## This notebook merges results from the sourmash-gather output into one sheet with results from all samples.

#### Import all required modules

In [1]:
import pandas as pd
import os as os
import glob as glob
import numpy as np

#### Navigate to your working directory, which should contain both spreadsheets you want to merge

In [2]:
os.chdir("/Users/nastassia.patin/Desktop/Projects/Lasker2019/Long read paper/mSystems/Revisions/new-sourmash/nts-GTDB-MMETSP/tax_out")

#### Import an example sheet

In [3]:
summary = pd.read_csv("1903c117_50m-1_tax.summarized.csv")
summary.head()

Unnamed: 0,query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank
0,,superkingdom,0.057079,d__Bacteria,41f6a87e,1903c117_50m-1_bmtag_interleaved.fq.gz,0.057079,35107000
1,,superkingdom,0.014186,d__Archaea,41f6a87e,1903c117_50m-1_bmtag_interleaved.fq.gz,0.014186,8725000
2,,superkingdom,0.010271,Viridiplantae,41f6a87e,1903c117_50m-1_bmtag_interleaved.fq.gz,0.010271,6317000
3,,superkingdom,0.002268,Haptophyta,41f6a87e,1903c117_50m-1_bmtag_interleaved.fq.gz,0.002268,1395000
4,,superkingdom,0.00194,Stramenopila,41f6a87e,1903c117_50m-1_bmtag_interleaved.fq.gz,0.00194,1193000


In [4]:
summary.shape

(414, 8)

In [5]:
genus = summary[summary['rank'] == 'genus']
genus = genus[['fraction', 'lineage']]

In [6]:
genus.lineage.nunique()

99

In [7]:
genus.head()

Unnamed: 0,fraction,lineage
143,0.008308,d__Archaea;p__Thermoplasmatota;c__Poseidoniia;...
144,0.006619,Viridiplantae;Chlorophyta;Prasinophyceae;Mamie...
145,0.005281,d__Bacteria;p__Proteobacteria;c__Alphaproteoba...
146,0.004626,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...
147,0.003715,d__Bacteria;p__Proteobacteria;c__Alphaproteoba...


In [8]:
genus2 = genus.groupby(['lineage']).sum()
genus2.head()

Unnamed: 0_level_0,fraction
lineage,Unnamed: 1_level_1
Alveolata;Dinophyta;Dinophyceae;Suessiales;Suessiaceae;Polarella,8.1e-05
Haptophyta;Haptophyta;Prymnesiophyceae;Isochrysidales;Isochrysidaceae;Isochrysis,0.000283
Haptophyta;Haptophyta;Prymnesiophyceae;Isochrysidales;Noelaerhabdaceae;Emiliania,0.00127
Haptophyta;Haptophyta;Prymnesiophyceae;Isochrysidales;Noelaerhabdaceae;Gephyrocapsa,0.000101
Haptophyta;Haptophyta;Prymnesiophyceae;Phaeocystales;Phaeocystaceae;Phaeocystis,0.000174


### Function to reformat individual data sheet

In [9]:
def format_sourmash_summary(sheet, sample):
    taxon = sheet[sheet['rank'] == 'species'] # select desired level of taxonomic resolution
    taxon = taxon[['bp_match_at_rank', 'lineage']]
    taxon2 = taxon.groupby(['lineage']).sum()
    taxon2['sample'] = sample
    return(taxon2)

### Apply function to all data sheets and combine

In [13]:
summary = []

for file in glob.glob("*summarized.csv"):
    sheet = pd.read_csv(file)
    a, b, c = file.split('_')
    sample = a + "_" + b
    df = format_sourmash_summary(sheet, sample)
    summary.append(df)

summary = pd.concat(summary)

In [14]:
summary.head()

Unnamed: 0_level_0,bp_match_at_rank,sample
lineage,Unnamed: 1_level_1,Unnamed: 2_level_1
Cryptophyta;Cryptophyta;Cryptophyceae;Pyrenomonadales;Geminigeracea;Geminigera;Geminigera sp.,54000,1903c117_50m-2
Haptophyta;Haptophyta;Prymnesiophyceae;Isochrysidales;Isochrysidaceae;Isochrysis;Isochrysis sp.,119000,1903c117_50m-2
Haptophyta;Haptophyta;Prymnesiophyceae;Isochrysidales;Noelaerhabdaceae;Emiliania;Emiliania huxleyi,378000,1903c117_50m-2
Haptophyta;Haptophyta;Prymnesiophyceae;Phaeocystales;Phaeocystaceae;Phaeocystis;Phaeocystis antarctica,59000,1903c117_50m-2
Haptophyta;Haptophyta;Prymnesiophyceae;Phaeocystales;Phaeocystacear;Phaeocystis;Phaeocystis sp.,210000,1903c117_50m-2


#### Pivot table so each sample is a column

In [15]:
summary_pivoted = summary.pivot_table(index='lineage', columns='sample', values='bp_match_at_rank')
summary_pivoted = summary_pivoted.fillna(0)
summary_pivoted.head()

sample,1903c111_10m-1,1903c111_10m-2,1903c111_10m-3,1903c117_50m-1,1903c117_50m-2,1903c117_50m-3,1903c118_23m-2,1903c118_23m-3,1903c119_11m-2,1903c119_11m-3,...,1903c144_13m-2,1903c144_13m-3,Las19c107_10m-1,Las19c107_10m-2,Las19c107_10m-3,Las19c135_5m-1,Las19c135_5m-2,Las19c135_5m-3,Las19c138_27m-1,Las19c138_27m-3
lineage,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alveolata;Ciliophora;Litostomatea;Cyclotrichiida;Mesodiniidae;Mesodinium;Mesodinium pulex,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100000.0,0.0
Alveolata;Ciliophora;Spirotrichea;Tintinnida;Ptychocylididae;Favella;Favella ehrenbergii,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,157000.0,0.0,0.0,0.0,0.0,206000.0,0.0,0.0,0.0,0.0
Alveolata;Dinophyta;Dinophyceae;Gonyualacales;Crypthecodiniacea;Crypthecodinium;Crypthecodinium cohnii,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,94000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,60000.0,0.0,56000.0,0.0
Alveolata;Dinophyta;Dinophyceae;Gonyualacales;Goniodomataceae;Alexandrium;Alexandrium tamarense,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,109000.0,0.0
Alveolata;Dinophyta;Dinophyceae;Suessiales;Suessiaceae;Pelagodinium;Pelagodinium bii,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,52000.0


#### Save the merged table as a csv

In [16]:
summary_pivoted.to_csv("Lasker2019_Illumina_PacBio_sourmash_GTDB_MMETSP_species_bpnumbers.csv")

## Import sheet of other sample group and merge

In [31]:
os.chdir("/Users/nastassia.patin/Desktop/Projects/Lasker 2019/Metagenomes/sourmash/summarized-PacBiomatches/")
summary_illumina = pd.read_csv("Lasker2019_Illumina_sourmash_species_bpnumbers.csv")
summary_illumina.head()

Unnamed: 0,lineage,1903c111_10m-1,1903c117_50m-1,1903c117_50m-2,1903c118_23m-2,1903c119_11m-2,1903c122_28m-1,1903c122_28m-2,1903c123_10m-1,1903c123_10m-2,...,1903c127_7m-1,1903c127_7m-2,1903c129_26m-1,1903c129_26m-2,1903c144_13m-2,Las19c107_10m-1,Las19c107_10m-2,Las19c135_5m-1,Las19c135_5m-2,Las19c138_27m-1
0,d__Archaea;p__Thermoplasmatota;c__Poseidoniia_...,332000.0,61000.0,0.0,0.0,0.0,286000.0,195000.0,0.0,0.0,...,0.0,0.0,363000.0,328000.0,80000.0,143000.0,72000.0,0.0,0.0,590000.0
1,d__Archaea;p__Thermoplasmatota;c__Poseidoniia_...,1794000.0,1657000.0,1426000.0,424000.0,513000.0,1842000.0,1864000.0,1683000.0,1565000.0,...,1588000.0,1532000.0,1776000.0,1843000.0,1318000.0,1748000.0,1681000.0,1339000.0,1115000.0,1749000.0
2,d__Archaea;p__Thermoplasmatota;c__Poseidoniia_...,85000.0,0.0,0.0,0.0,0.0,56000.0,0.0,0.0,0.0,...,0.0,0.0,115000.0,57000.0,0.0,0.0,0.0,0.0,0.0,88000.0
3,d__Archaea;p__Thermoplasmatota;c__Poseidoniia_...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,d__Archaea;p__Thermoplasmatota;c__Poseidoniia_...,262000.0,167000.0,92000.0,91000.0,0.0,171000.0,261000.0,186000.0,130000.0,...,0.0,0.0,390000.0,419000.0,0.0,267000.0,212000.0,0.0,0.0,314000.0


In [32]:
summary_pacbio = summary_pivoted.reset_index()
summary_pacbio.head()

sample,lineage,1903c111_10m-3,1903c117_50m-3,1903c118_23m-3,1903c119_11m-3,1903c122_28m-3,1903c123_10m-3,1903c124_15m-3,1903c126_45m-3,1903c127_7m-3,1903c129_26m-3,1903c144_13m-3,Las19c107_10m-3,Las19c135_5m-3,Las19c138_27m-3
0,d__Archaea;p__Thermoplasmatota;c__Poseidoniia_...,0.0,247000.0,0.0,0.0,0.0,0.0,504000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,d__Archaea;p__Thermoplasmatota;c__Poseidoniia_...,0.0,1848000.0,0.0,0.0,0.0,0.0,1726000.0,0.0,0.0,0.0,0.0,0.0,0.0,374000.0
2,d__Archaea;p__Thermoplasmatota;c__Poseidoniia_...,0.0,62000.0,0.0,0.0,0.0,0.0,137000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,d__Archaea;p__Thermoplasmatota;c__Poseidoniia_...,0.0,0.0,0.0,0.0,0.0,0.0,107000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,d__Archaea;p__Thermoplasmatota;c__Poseidoniia_...,0.0,163000.0,0.0,0.0,0.0,0.0,347000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
summary_all = summary_pacbio.merge(summary_illumina, how='outer', on='lineage')
summary_all.head()

Unnamed: 0,lineage,1903c111_10m-3,1903c117_50m-3,1903c118_23m-3,1903c119_11m-3,1903c122_28m-3,1903c123_10m-3,1903c124_15m-3,1903c126_45m-3,1903c127_7m-3,...,1903c127_7m-1,1903c127_7m-2,1903c129_26m-1,1903c129_26m-2,1903c144_13m-2,Las19c107_10m-1,Las19c107_10m-2,Las19c135_5m-1,Las19c135_5m-2,Las19c138_27m-1
0,d__Archaea;p__Thermoplasmatota;c__Poseidoniia_...,0.0,247000.0,0.0,0.0,0.0,0.0,504000.0,0.0,0.0,...,0.0,0.0,363000.0,328000.0,80000.0,143000.0,72000.0,0.0,0.0,590000.0
1,d__Archaea;p__Thermoplasmatota;c__Poseidoniia_...,0.0,1848000.0,0.0,0.0,0.0,0.0,1726000.0,0.0,0.0,...,1588000.0,1532000.0,1776000.0,1843000.0,1318000.0,1748000.0,1681000.0,1339000.0,1115000.0,1749000.0
2,d__Archaea;p__Thermoplasmatota;c__Poseidoniia_...,0.0,62000.0,0.0,0.0,0.0,0.0,137000.0,0.0,0.0,...,0.0,0.0,115000.0,57000.0,0.0,0.0,0.0,0.0,0.0,88000.0
3,d__Archaea;p__Thermoplasmatota;c__Poseidoniia_...,0.0,0.0,0.0,0.0,0.0,0.0,107000.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,d__Archaea;p__Thermoplasmatota;c__Poseidoniia_...,0.0,163000.0,0.0,0.0,0.0,0.0,347000.0,0.0,0.0,...,0.0,0.0,390000.0,419000.0,0.0,267000.0,212000.0,0.0,0.0,314000.0


In [34]:
summary_all.to_csv("Lasker2019_Illumina_PacBio_sourmash_species_bpnumbers.csv")