# Process Raw Data
> Gati Aher, SUM2021

To download notebook as pdf, follow: https://towardsdatascience.com/jupyter-notebook-to-pdf-in-a-few-lines-3c48d68a7a63

In [1]:
import skbio
import pandas as pd
import numpy as np
import re

## Data Files

Format the following raw files:
* `data/raw/FCF_master.csv`
    * table of OTU common name x sample x count
* `data/raw/FCF_annotations.csv`
    * table of sample x variables
    
* `data/raw/tree/081616JHnew515Fcomplete-pr.fasta.otus.fa`
    * map of OTU number to 16 sRNA sequence
* `data/raw/tree/081616JHnew515Fcomplete-pr.fasta.otus.fa.OTU.txt`
    * map of OTU number to taxonomy / common name
* `data/raw/tree/Pairwise_distances_all_OTU_Muscle`
    * pairwise distances calculated by Jean

## 1. OTU Data

In [2]:
# map ids
otu_raw = pd.read_csv("data/raw/tree/081616JHnew515Fcomplete-pr.fasta.otus.fa.OTU.txt", sep="\t", index_col=0)
otu_raw.head()

Unnamed: 0_level_0,16rRNA,Percent Homology,evalue,bitscore,homology extract gibbsfree_otu/gibbsfree_homolog,total length gibbsfree_otu/otu_lenth/gibbsfree_homolog/homolog_length,Olin1,Olin2,Olin3,Olin4,...,Olin80,Olin81,Olin82,Olin83,Olin84,Olin85,Olin86,Olin87,Olin88,Unnamed: 95
otu name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
OTU_1,eu528231.1 seasonal dynamics mudflat mouth maj...,100.0,0.0,711.813612,,,6,11,13,20,...,11,19,25,22,16,16,6,5,4,
OTU_2,uncultured opitutus sp. ;k__bacteria;p__verruc...,96.969697,0.0,661.319285,,,17546,18302,32674,9700,...,31216,17838,12257,19679,16779,15614,2107,42046,56298,
OTU_3,rhodopseudomonas palustris ;k__bacteria;p__pro...,99.493671,0.0,704.600137,,,3169,3033,4495,3455,...,1073,4207,1275,2274,4900,2027,1140,3192,2086,
OTU_4,ay297802.1 waterlogged archaeological wood clo...,99.492386,0.0,702.796768,,,5243,3526,4658,29020,...,17928,19104,11308,11368,26958,15939,3121,30080,23869,
OTU_5,rhodoblastus acidophilus ;k__bacteria;p__prote...,99.746835,0.0,708.206874,,,3809,1128,1313,1001,...,17129,22577,7156,26149,9243,2211,1368,13865,5190,


## 1.1 Taxomony Table

In [3]:
taxonomy_table = otu_raw["16rRNA"]
taxonomy_table

otu name
OTU_1      eu528231.1 seasonal dynamics mudflat mouth maj...
OTU_2      uncultured opitutus sp. ;k__bacteria;p__verruc...
OTU_3      rhodopseudomonas palustris ;k__bacteria;p__pro...
OTU_4      ay297802.1 waterlogged archaeological wood clo...
OTU_5      rhodoblastus acidophilus ;k__bacteria;p__prote...
                                 ...                        
OTU_639    uncultured spirochaeta sp. ;k__bacteria;p__spi...
OTU_640    ab079679.1 rhodopseudomonas sp. hmd89  ;k__bac...
OTU_641    ay297803.1 waterlogged archaeological wood clo...
OTU_642    uncultured pontibacter sp. ;k__bacteria;p__bac...
OTU_643    ay297802.1 waterlogged archaeological wood clo...
Name: 16rRNA, Length: 643, dtype: object

In [4]:
taxonomy_table.to_csv("data/processed/taxonomy_table.csv")
print("taxonomy_table.values.shape", taxonomy_table.values.shape)

taxonomy_table.values.shape (643,)


## 1.2 OTU Counts

In [5]:
# map MR DNA sample names to real sample names
sample_names = pd.read_csv("data/raw/sample_names.csv", index_col=0)
sample_names.head()

Unnamed: 0_level_0,key
name,Unnamed: 1_level_1
Olin1,C0C
Olin2,1C3A
Olin3,1C3B
Olin4,1G3A
Olin5,1G3B


In [6]:
df_counts = otu_raw.filter(regex='^Olin')
df_counts = df_counts.rename(columns=sample_names.to_dict()['key'])
df_counts = df_counts.T
# NOTE: 3G10C and 2M7C were removed because they were incorrectly sequenced
df_counts = df_counts.drop(["2M7C", "3G10C"]) # drop row by index
df_counts.head()

otu name,OTU_1,OTU_2,OTU_3,OTU_4,OTU_5,OTU_6,OTU_7,OTU_8,OTU_9,OTU_10,...,OTU_634,OTU_635,OTU_636,OTU_637,OTU_638,OTU_639,OTU_640,OTU_641,OTU_642,OTU_643
C0C,6,17546,3169,5243,3809,9546,1830,1913,321,1329,...,5,0,0,0,0,1,4270,0,0,0
1C3A,11,18302,3033,3526,1128,13226,13726,786,282,288,...,1,0,3,0,0,0,2662,0,0,0
1C3B,13,32674,4495,4658,1313,18642,26926,1695,357,856,...,3,0,2,0,0,0,3657,0,0,0
1G3A,20,9700,3455,29020,1001,22919,19167,941,202,143,...,2,0,11,0,0,0,3324,1,0,0
1G3B,18,4428,2142,18697,758,17614,16826,373,204,363,...,0,0,8,0,0,0,1628,4,0,0


In [7]:
df_counts.to_csv("data/processed/FCF_absolute_counts.csv")
print("df_counts.values.shape", df_counts.values.shape)

df_counts.values.shape (86, 643)


## 1.3 Normalize OTU Counts as Relative Abundance

In [8]:
# create relative counts
df_counts_rel = df_counts.div(df_counts.sum(axis=1), axis=0)
df_counts_rel.head()

otu name,OTU_1,OTU_2,OTU_3,OTU_4,OTU_5,OTU_6,OTU_7,OTU_8,OTU_9,OTU_10,...,OTU_634,OTU_635,OTU_636,OTU_637,OTU_638,OTU_639,OTU_640,OTU_641,OTU_642,OTU_643
C0C,6.8e-05,0.199079,0.035956,0.059488,0.043217,0.10831,0.020763,0.021705,0.003642,0.015079,...,5.7e-05,0.0,0.0,0.0,0.0,1.1e-05,0.048448,0.0,0.0,0.0
1C3A,0.000153,0.253979,0.042089,0.048931,0.015653,0.183539,0.190478,0.010907,0.003913,0.003997,...,1.4e-05,0.0,4.2e-05,0.0,0.0,0.0,0.036941,0.0,0.0,0.0
1C3B,0.000111,0.277887,0.038229,0.039616,0.011167,0.158547,0.229002,0.014416,0.003036,0.00728,...,2.6e-05,0.0,1.7e-05,0.0,0.0,0.0,0.031102,0.0,0.0,0.0
1G3A,0.00019,0.092192,0.032838,0.275816,0.009514,0.21783,0.18217,0.008944,0.00192,0.001359,...,1.9e-05,0.0,0.000105,0.0,0.0,0.0,0.031592,1e-05,0.0,0.0
1G3B,0.000244,0.059906,0.028979,0.252949,0.010255,0.238298,0.227637,0.005046,0.00276,0.004911,...,0.0,0.0,0.000108,0.0,0.0,0.0,0.022025,5.4e-05,0.0,0.0


In [9]:
# check that total relative sample counts sum to 1
df_counts_rel.sum(axis=1)

C0C      1.0
1C3A     1.0
1C3B     1.0
1G3A     1.0
1G3B     1.0
        ... 
3M10C    1.0
3G10A    1.0
3G10B    1.0
2M10A    1.0
2M10C    1.0
Length: 86, dtype: float64

In [10]:
df_counts_rel.to_csv("data/processed/FCF_relative_counts.csv")
print("otu_counts_rel.values.shape", df_counts_rel.values.shape)

otu_counts_rel.values.shape (86, 643)


## 3. Counts from Master OTU List 

### 3.1 Absolute Counts

In [34]:
otu_master = pd.read_csv("data/raw/FCF_master.csv", index_col=0).T
otu_master = otu_master.drop(["2M7C"]) # drop row by index
otu_master

Unnamed: 0,opitutus spp.,paludibacter propionicigenes,magnetospirillum sp.,rhodopseudomonas palustris,acetobacter spp.,bacteroides spp.,pleomorphomonas oryzae,afipia sp.,rhodoblastus acidophilus,spirochaeta spp.,...,bosea spp.,corynebacterium durum,rubrivivax gelatinosus,anaerococcus hydrogenalis,globicatella spp.,finegoldia magna,dietzia spp.,granulicatella elegans,actinomyces marimammalium,veillonella dispar
C0C,18304,16452,9817,9279,6578,6400,5409,4521,3913,1958,...,0,0,0,0,0,0,0,0,0,0
1C3A,19239,4060,13664,9191,902,1891,3647,2798,1163,257,...,0,0,0,0,0,0,0,0,0,0
1C3B,34155,6121,19194,13838,1395,3117,4828,3858,1357,221,...,0,0,1,0,0,0,0,0,0,0
1C5A,16237,1967,24640,8188,720,1520,2641,2308,2046,265,...,0,0,1,0,0,0,0,0,0,0
1C5B,23247,6002,16356,16142,1381,4462,6110,4695,3101,605,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3G7A,25593,319,3256,11598,10247,69747,28315,891,7625,15,...,0,0,0,0,0,0,0,0,0,0
3G7B,36316,109,3496,7593,9236,40432,30320,703,3922,15,...,0,1,0,0,0,0,0,0,0,0
3G7C,34948,136,4123,9157,12384,40225,28382,721,4594,16,...,0,0,0,0,0,0,0,0,0,0
3G10A,17610,250,18156,16920,8832,72267,27647,1603,9659,1,...,0,0,0,0,0,0,0,0,0,0


In [35]:
otu_master.to_csv("data/processed/FCF_absolute_counts_master.csv")
print("otu_master.values.shape", otu_master.values.shape)

otu_master.values.shape (86, 153)


### 3.2 Normalize OTU Counts as Relative Abundance

In [37]:
# create relative counts
df_counts_rel_master = otu_master.div(otu_master.sum(axis=1), axis=0)
df_counts_rel_master.head()

Unnamed: 0,opitutus spp.,paludibacter propionicigenes,magnetospirillum sp.,rhodopseudomonas palustris,acetobacter spp.,bacteroides spp.,pleomorphomonas oryzae,afipia sp.,rhodoblastus acidophilus,spirochaeta spp.,...,bosea spp.,corynebacterium durum,rubrivivax gelatinosus,anaerococcus hydrogenalis,globicatella spp.,finegoldia magna,dietzia spp.,granulicatella elegans,actinomyces marimammalium,veillonella dispar
C0C,0.207679,0.186666,0.111385,0.10528,0.074635,0.072615,0.061371,0.051296,0.044397,0.022216,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1C3A,0.266982,0.056341,0.189617,0.127545,0.012517,0.026242,0.05061,0.038828,0.016139,0.003566,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1C3B,0.290483,0.052058,0.163242,0.11769,0.011864,0.02651,0.041061,0.032812,0.011541,0.00188,...,0.0,0.0,9e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1C5A,0.194116,0.023516,0.294575,0.097889,0.008608,0.018172,0.031574,0.027592,0.02446,0.003168,...,0.0,0.0,1.2e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1C5B,0.18787,0.048505,0.13218,0.130451,0.01116,0.036059,0.049378,0.037942,0.025061,0.004889,...,0.0,8e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
df_counts_rel_master.to_csv("data/processed/FCF_relative_counts_master.csv")
print("df_counts_rel_master.values.shape", df_counts_rel_master.values.shape)

df_counts_rel_master.values.shape (86, 153)


## 4. Format Annotation Table

In [11]:
df_annotations_from_file = pd.read_csv("data/raw/FCF_annotations.csv", index_col=0) # these aren't in the right order
df_annotations_from_file

Unnamed: 0,series,food,day
C0C,C0,cellulose,10
1C3A,1C,cellulose,3
1C3B,1C,cellulose,3
1C5A,1C,cellulose,5
1C5B,1C,cellulose,5
...,...,...,...
3G7A,3G,cellulose,7
3G7B,3G,cellulose,7
3G7C,3G,cellulose,7
3G10A,3G,cellulose,10


In [23]:
# Build Correct Annotations File
# columns: index series carbon transfer day

df_annotations = pd.DataFrame(index=df_counts.index)
ann_series = []
ann_carbon = []
ann_transfer = []
ann_group = []
ann_day = []

#list(map(str, df_annotations_from_file["day"].values))

for i in df_counts.index:
    # series
    if i == "C0C":
        ann_series.append("C0C")
    else:
        ann_series.append(i[1])
    # carbon
    if i == "C0C":
        ann_carbon.append("C0C")
    elif df_annotations_from_file.loc[i, "food"] == "cellulose":
        ann_carbon.append("C")
    elif df_annotations_from_file.loc[i, "food"] == "glucose":
        ann_carbon.append("G")
    elif df_annotations_from_file.loc[i, "food"] == "malate":
        ann_carbon.append("M")
    else:
        ann_carbon.append("X")
    # transfer
    if i == "C0C":
        ann_transfer.append("C0C")
    else:
        ann_transfer.append(i[0])
    # group
    if i == "C0C":
        ann_group.append("C0C")
    else:
        ann_group.append(i[0:2])
    # day
    if i == "C0C":
        ann_day.append("C0C")
    else:
        ann_day.append(i[2:-1])
        
print("ann_group", ann_group)
print("ann_series", ann_series)
print("ann_carbon", ann_carbon)
print("ann_transfer", ann_transfer)
print("ann_day", ann_day)

df_annotations["group"] = ann_group
df_annotations["series"] = ann_series
df_annotations["carbon"] = ann_carbon
df_annotations["transfer"] = ann_transfer
df_annotations["day"] = ann_day

df_annotations.head()

ann_group ['C0C', '1C', '1C', '1G', '1G', '1G', '1M', '1M', '1M', '1M', '1C', '1C', '1G', '1G', '1G', '1M', '1M', '1C', '1C', '1G', '1G', '1G', '1M', '1M', '1M', '1C', '1C', '1G', '1G', '1G', '1M', '1M', '1M', '2C', '2C', '2G', '2G', '2G', '2M', '2M', '2M', '2C', '2C', '2G', '2G', '2G', '2M', '2M', '2G', '2G', '2G', '2M', '2M', '2C', '2C', '2G', '2G', '2G', '2M', '1C', '1C', '3G', '3G', '3G', '3M', '3M', '3M', '3G', '3G', '3G', '3M', '3M', '3M', '3G', '3G', '3G', '3M', '3M', '3M', '3M', '3M', '3M', '3G', '3G', '2M', '2M']
ann_series ['C0C', 'C', 'C', 'G', 'G', 'G', 'M', 'M', 'M', 'M', 'C', 'C', 'G', 'G', 'G', 'M', 'M', 'C', 'C', 'G', 'G', 'G', 'M', 'M', 'M', 'C', 'C', 'G', 'G', 'G', 'M', 'M', 'M', 'C', 'C', 'G', 'G', 'G', 'M', 'M', 'M', 'C', 'C', 'G', 'G', 'G', 'M', 'M', 'G', 'G', 'G', 'M', 'M', 'C', 'C', 'G', 'G', 'G', 'M', 'C', 'C', 'G', 'G', 'G', 'M', 'M', 'M', 'G', 'G', 'G', 'M', 'M', 'M', 'G', 'G', 'G', 'M', 'M', 'M', 'M', 'M', 'M', 'G', 'G', 'M', 'M']
ann_carbon ['C0C', 'C', 'C',

Unnamed: 0,group,series,carbon,transfer,day
C0C,C0C,C0C,C0C,C0C,C0C
1C3A,1C,C,C,1,3
1C3B,1C,C,C,1,3
1G3A,1G,G,G,1,3
1G3B,1G,G,G,1,3


In [24]:
# Groups
groups = set(df_annotations["group"].values)
print("groups:", groups)

groups: {'C0C', '2G', '1C', '1G', '1M', '2M', '2C', '3G', '3M'}


In [25]:
# Save
df_annotations.to_csv("data/processed/FCF_annotations_corrected.csv")
print("df_annotations.shape", df_annotations.shape)

df_annotations.shape (86, 5)


## 5. Handle Newick Tree Creation

In [21]:
from skbio.sequence import DNA
from skbio.sequence.distance import hamming
from skbio import DistanceMatrix
from skbio.tree import nj

### Attempt 1: From Sequences

In [26]:
# get sequence data
sequence_file = open("data/raw/tree/081616JHnew515Fcomplete-pr.fasta.otus.fa", 'r')
sequence_lines = [l.upper() for l in sequence_file.readlines()]
dna_lines = [DNA(str(seq), metadata=seq.metadata) for seq in skbio.io.read(sequence_lines, format='fasta')]

# make distance matrix
dm = np.zeros((len(dna_lines), len(dna_lines)))

for i, dna in enumerate(dna_lines):
    for j in range(0, i // 2):
        # perform pair-wise alignment of dna to make them equal length
        msa = skbio.alignment.local_pairwise_align_ssw(dna_lines[i], dna_lines[j])
        # get hamming distance
        dist = hamming(msa[0][0], msa[0][1])
        dm[i][j] = dist
        dm[j][i] = dist

In [27]:
dna_lines

[DNA
 ---------------------------------------------------------------------
 Metadata:
     'description': ''
     'id': 'OTU_1'
 Stats:
     length: 407
     has gaps: False
     has degenerates: False
     has definites: True
     GC-content: 54.55%
 ---------------------------------------------------------------------
 0   TACGAAGGGG GCAAGCGTTG TTCGGAATTA CTGGGCGTAA AGCGCACGCA GGCGGCGATC
 60  CAAGTCAGAA GTGAAAGCCC TGGGCTCAAC CCGGGAACTG CTTTTGATAC TGGGTTGCTA
 ...
 300 TGGGGTGCAT GCACCTCAGT GTCGAAGCTA ACGCGATAAG CACTCCGCCT GGGGAGTACG
 360 GCCGCAAGGT TAAAACTCAA AGGAATTGAC GGGGTTCAGT GCCGAAG,
 DNA
 ---------------------------------------------------------------------
 Metadata:
     'description': ''
     'id': 'OTU_2'
 Stats:
     length: 401
     has gaps: False
     has degenerates: False
     has definites: True
     GC-content: 53.37%
 ---------------------------------------------------------------------
 0   TACAGAGACT GCAAGCGTTA TTCGGATTCA CTGGGCGTAA AGGGTGCGCA GGCGGCCATG
 60  TG

In [28]:
# create tree
otu_ids = [dna.metadata["id"] for dna in dna_lines]
distance_matrix = DistanceMatrix(dm, otu_ids)
tree = nj(distance_matrix)

In [29]:
with open("data/processed/newick_tree.txt", "w") as newick_file:
    tree.write(newick_file, format='newick')
print("saved tree:", str(tree))

saved tree: (OTU_642:0.0,(OTU_641:0.0,(OTU_639:0.0,(OTU_640:0.0,(OTU_637:0.0,(OTU_638:0.0,(OTU_635:0.0,(OTU_636:0.0,(OTU_633:0.0,(OTU_634:0.0,(OTU_631:0.0,(OTU_632:0.0,(OTU_630:0.0,(OTU_629:0.0,(OTU_627:0.0,(OTU_628:0.0,(OTU_626:0.0,(OTU_625:0.0,(OTU_623:0.0,(OTU_624:0.0,(OTU_622:0.0,(OTU_621:0.0,(OTU_619:0.0,(OTU_620:0.0,(OTU_618:0.0,(OTU_617:0.0,(OTU_615:0.0,(OTU_616:0.0,(OTU_611:0.0,(OTU_612:0.0,(OTU_614:0.0,(OTU_613:0.0,(OTU_610:0.0,(OTU_609:0.0,(OTU_607:0.0,(OTU_599:0.0,(OTU_603:0.0,(OTU_608:0.0,(OTU_601:0.0,(OTU_602:0.0,(OTU_598:0.0,(OTU_605:0.0,(OTU_600:0.0,(OTU_606:0.0,(OTU_604:0.0,(OTU_597:0.0,(OTU_596:0.0,(OTU_595:0.0,(OTU_591:0.0,(OTU_593:0.0,(OTU_594:0.0,(OTU_590:0.0,(OTU_589:0.0,(OTU_588:0.0,(OTU_587:0.0,(OTU_585:0.0,(OTU_586:0.0,(OTU_584:0.0,(OTU_592:0.0,(OTU_583:0.0,(OTU_582:0.0,(OTU_581:0.0,(OTU_574:0.0,(OTU_577:0.0,(OTU_578:0.0,(OTU_579:0.0,(OTU_580:0.0,(OTU_575:0.0,(OTU_576:0.0,(OTU_573:0.0,(OTU_572:0.0,(OTU_565:0.0,(OTU_568:0.0,(OTU_570:0.0,(OTU_562:0.0,(OTU_571:0.0,

### Attempt 2: From Pairwise Distances

In [4]:
pairwise_distances_muscle = pd.read_csv("data/raw/tree/Pairwise_distances_all_OTU_Muscle", index_col=0)

In [18]:
x = np.nan_to_num(pairwise_distances_muscle.to_numpy())
filled_diag_x = np.rot90(np.fliplr(x))
filled_x = x + filled_diag_x
filled_x

array([[0.        , 0.29639958, 0.20446642, ..., 0.17966551, 0.44625324,
        0.11749654],
       [0.29639958, 0.        , 0.3280139 , ..., 0.33295201, 0.44860921,
        0.29738399],
       [0.20446642, 0.3280139 , 0.        , ..., 0.14433162, 0.43510314,
        0.1340228 ],
       ...,
       [0.17966551, 0.33295201, 0.14433162, ..., 0.        , 0.47751537,
        0.08173676],
       [0.44625324, 0.44860921, 0.43510314, ..., 0.47751537, 0.        ,
        0.42090375],
       [0.11749654, 0.29738399, 0.1340228 , ..., 0.08173676, 0.42090375,
        0.        ]])

In [22]:
# create tree
otu_ids_pd = list(pairwise_distances_muscle.index)
distance_matrix_pd = DistanceMatrix(filled_x, otu_ids_pd)
tree_pd = nj(distance_matrix_pd)

In [23]:
with open("data/processed/newick_tree_from_pairwise_distances.txt", "w") as newick_file:
    tree_pd.write(newick_file, format='newick')
print("saved tree:", str(tree_pd))

saved tree: ((((OTU_573:0.029358,OTU_553:0.018553):0.030347,(((OTU_605:0.03029,(OTU_632:0.005624,OTU_383:0.022983):0.005747):0.002845,(OTU_4:0.000655,((OTU_362:0.00526,(OTU_295:0.004387,(OTU_283:0.013471,(OTU_456:0.00369,OTU_197:0.037353):0.00335):0.008521):0.016208):0.014756,(OTU_259:0.025642,((OTU_539:0.018331,(OTU_516:0.0,(OTU_239:0.003255,(OTU_537:0.0,OTU_481:0.047651):0.017735):0.002171):0.00469):0.009446,(OTU_641:0.01978,(OTU_629:0.006787,OTU_555:0.021033):0.003832):0.002241):0.002584):0.00742):0.00376):0.001562):0.003348,(((OTU_567:0.015791,OTU_333:0.028767):0.003235,(OTU_548:0.016019,(OTU_643:0.021896,OTU_387:0.014483):0.010389):0.00161):0.006735,((OTU_527:0.023098,(OTU_551:0.051681,OTU_528:0.001109):0.032465):0.004637,((OTU_355:0.070604,OTU_299:0.019877):0.023521,(OTU_61:0.021202,(OTU_393:0.023371,(OTU_432:0.031078,(OTU_606:0.025695,OTU_161:0.018578):0.009):0.014971):0.013297):0.00136):0.005361):0.002713):0.001351):0.001434):0.00396,((((OTU_200:0.003399,(OTU_478:0.0,(OTU_544:0