# Compare data between pipelines

- [x] Standardise data tables between pipelines
- [ ] Create distance matrices
- [ ] Create ordinations

In [1]:
import pandas as pd
from pytaxonkit import lineage

In [3]:
# Load the main table written by each pipeline; all three are tab-separated.

# Ampliseq: discard the "ID" column and replace missing values with "".
aseq = pd.read_csv(
    "../data/tables/main_tables/ampliseq-rel-table-ASV_with-DADA2-tax.tsv",
    sep="\t",
)
aseq = aseq.drop(columns="ID").fillna("")

# Tourmaline: the first line of the file is skipped before the header is read.
tour = pd.read_csv(
    "../data/tables/main_tables/tourmaline_taxa_sample_table.tsv",
    sep="\t",
    skiprows=1,
)

# OBITools: low_memory=False makes pandas read the file in a single pass.
obit = pd.read_csv(
    "../data/tables/main_tables/OBITools_main_table.tsv",
    sep="\t",
    low_memory=False,
)

In [4]:
# Column names that are excluded from the comparison.
remove = [
    "pcrnegcontrol3",
    "pcrposcontrol3",
    "pcrposcontrol2",
    "pcrposcontrol1",
    "I49A",
    "I49B",
    "I49C",
    "I50A",
    "I50B",
    "I50C",
]

# Sample names: every Tourmaline column after the first one, minus the
# excluded names above.
samples = [name for name in tour.columns[1:] if name not in remove]

# Taxonomic rank names used as column labels in every standardised table.
ranks = ["Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species"]

In [5]:
# Formatting Ampliseq table

# Per-sample statistics; the first column of the file becomes the row index.
aseq_stats = pd.read_csv(
    "../data/ampliseq/ampliseq_results/qiime2/abundance_tables/count_table_filter_stats.tsv",
    sep="\t",
    index_col=0,
)

# Turn relative abundances into absolute ones: every sample column is
# multiplied by that sample's "filtered_tax_filter" value. The Series index
# (sample names) is aligned with the DataFrame columns.
aseq.loc[:, samples] = aseq[samples] * aseq_stats.loc[samples, "filtered_tax_filter"]

In [6]:
# Formatting OBITools table

# Remove the "sample:" tag from every column name.
obit = obit.rename(columns=lambda name: name.replace("sample:", ""))

# Keep only the sample columns, indexed by taxid.
obit = obit[[*samples, "taxid"]].set_index("taxid")

# Look up the lineage of every taxid and split the semicolon-separated
# "Lineage" string into one column per rank. Assigning `ranks` as the column
# labels requires the split to yield exactly len(ranks) columns.
obit_lineages = lineage(obit.index)["Lineage"].str.split(";", expand=True)
obit_lineages.columns = ranks

# Attach the lineage columns to the counts by row index.
# NOTE(review): this pairs rows by position, which assumes lineage() returns
# one row per input taxid, in input order, with a 0..n-1 index -- confirm
# against pytaxonkit.
obit = pd.merge(
    obit_lineages,
    obit.reset_index(drop=True),
    left_index=True,
    right_index=True,
)

In [7]:
# Formatting Tourmaline table
#
# "#OTU ID" holds a semicolon-separated lineage. Every "D_" and "__" is
# removed from it, the string is split into one column per rank, and the
# first character of every resulting cell is dropped.
# NOTE(review): labels presumably look like "D_0__Name", so the dropped
# character is the rank digit. A label without that prefix also loses its
# first character (e.g. "Unassigned" becomes "nassigned") -- confirm.
#
# Vectorised .str methods are used instead of DataFrame.applymap, which is
# deprecated since pandas 2.1. They also leave missing cells (produced by
# split(expand=True) when a lineage has fewer fields) as missing values
# rather than raising TypeError on None[1:].
tour_lineages = (
    tour["#OTU ID"]
    .str.replace("D_", "", regex=False)
    .str.replace("__", "", regex=False)
    .str.split(";", expand=True)
    .apply(lambda column: column.str[1:])
)
tour_lineages.columns = ranks

# Replace the lineage string column with the per-rank columns (joined on index).
tour = tour_lineages.merge(tour.drop("#OTU ID", axis=1), left_index=True, right_index=True)

In [8]:
# Standardising all tables
#
# Species names: underscores become spaces and full stops are removed.
# regex=False is passed explicitly because "." is a wildcard when treated as
# a regular expression and the default of `regex` differs between pandas
# versions; both patterns are meant literally.
aseq["Species"] = aseq["Species"].str.replace("_", " ", regex=False).str.replace(".", "", regex=False)
tour["Species"] = tour["Species"].str.replace("_", " ", regex=False).str.replace(".", "", regex=False)
obit["Species"] = obit["Species"].str.replace("_", " ", regex=False).str.replace(".", "", regex=False)

# Give every table the same layout: rank columns first, then sample columns.
aseq = aseq[ranks + samples]
tour = tour[ranks + samples]
obit = obit[ranks + samples]

In [9]:
# Generate tables by ranks
#
# For every pipeline and every rank, sum the numeric columns per taxon,
# discard the placeholder labels, write the table to
# ../data/tables/<pipeline>_<rank>.csv and record how many taxa remain.

# Explicit name -> table mapping (instead of eval() on the variable name).
tables = {"aseq": aseq, "tour": tour, "obit": obit}
n_observations = {}

for pipeline, df in tables.items():
    n_observations[pipeline] = {}
    for rank in ranks:
        gb = df.groupby(rank).sum(numeric_only=True)
        # "" and "nassigned" are not real taxa; errors="ignore" skips
        # whichever of them is absent from this table.
        gb = gb.drop(["", "nassigned"], errors="ignore")
        n_observations[pipeline][rank] = len(gb)
        gb.to_csv(f"../data/tables/{pipeline}_{rank}.csv")

# Number of distinct taxa per rank (rows) and pipeline (columns).
pd.DataFrame(n_observations)

Unnamed: 0,aseq,tour,obit
Kingdom,1,1,1
Phylum,1,1,1
Class,2,2,2
Order,39,42,40
Family,96,125,99
Genus,162,242,166
Species,206,267,175
