# Format
> By Gati Aher  
> Sept 17, 2021

**Dataset:** FCF Carbon Perturbation (Cellulose-Glucose-Malate)

**Goal:** Put raw data in more workable format for downstream analysis

## Outputs

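* `data/raw/FCF/processed_OTU_counts.tsv`
* `data/raw/FCF/processed_sample_metadata.tsv`
* `data/raw/FCF/processed_feature_metadata.tsv`
* `data/raw/FCF/processed_taxonomy.tsv`
* `data/raw/FCF/processed_taxonomy_table.tsv`
* `data/raw/FCF/renamed_081616JHnew515Fcomplete-pr.fasta.otus.fa`

Note: the save cells write into the `raw_location` folder (`data/raw/FCF/`) with a `processed_` / `renamed_` prefix, not into `data/processed_FCF/`.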

In [1]:
# Base folder for the FCF files (a path relative to the notebook's working directory).
# Most reads and all writes in this notebook build their paths from it.
raw_location = "../data/raw/FCF"

In [2]:
# imports
import skbio
import pandas as pd
import numpy as np
import re

## Load Raw Data

Inputs:
* Sample ID: Sample Name
    * (Jean): `sample_names.csv`
* Sample Name x Meta Information
    * (Jean): `FCF_annotations.csv`
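* OTU ID-Taxonomy x Sample Name x OTU Counts
    * (MR DNA): `081616JHnew515Fcomplete-pr.fasta.otus.fa.OTU.txt`
* Sample ID x Cluster Decision
    * `../data/processed_FCF_53OTUs/cluster_decisions.csv` (read from outside the raw folder)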

In [3]:
# Lookup table: MR DNA sample ID (index) -> sample name (column "key").
# Every sample name gets a leading "s" so that none of them starts with a digit.
sample_names = (
    pd.read_csv(f"{raw_location}/sample_names.csv", index_col=0)
    .assign(key=lambda frame: 's' + frame['key'].astype(str))
)

sample_names

Unnamed: 0_level_0,key
name,Unnamed: 1_level_1
Olin1,sC0C
Olin2,s1C03A
Olin3,s1C03B
Olin4,s1G03A
Olin5,s1G03B
...,...
Olin84,s3G10A
Olin85,s3G10B
Olin86,s3G10C
Olin87,s2M10A


In [10]:
# Lookup table: sample ID (index) -> decided cluster.
# The file's "x" column is exposed under the name "cluster".
sample_clusters = (
    pd.read_csv("../data/processed_FCF_53OTUs/cluster_decisions.csv", index_col=0)
    .rename(columns={"x": "cluster"})
)
sample_clusters

Unnamed: 0,cluster
s3M07C,late
s3M05B,late
s3M07A,late
s3M05A,late
s3M05C,late
...,...
s2M10B,mid
s2C03A,mid
s2C03B,mid
s2C05A,mid


In [11]:
# Per-sample annotations keyed by the un-prefixed sample name (first CSV column).
# Row order follows the file, which is not the order wanted downstream.
sample_annotations_from_file = pd.read_csv(
    f"{raw_location}/FCF_annotations.csv",
    index_col=0,
)
sample_annotations_from_file

Unnamed: 0,series,food,day
C0C,C0,cellulose,10
1C03A,1C,cellulose,3
1C03B,1C,cellulose,3
1C05A,1C,cellulose,5
1C05B,1C,cellulose,5
...,...,...,...
3G07A,3G,cellulose,7
3G07B,3G,cellulose,7
3G07C,3G,cellulose,7
3G10A,3G,cellulose,10


In [12]:
# Raw MR DNA table, one row per OTU: annotation columns followed by the
# absolute counts for each sample. The index is labelled "featureID".
otu_raw = pd.read_csv(
    f"{raw_location}/081616JHnew515Fcomplete-pr.fasta.otus.fa.OTU.txt",
    sep="\t",
    index_col=0,
).rename_axis(index="featureID")
otu_raw.head()

Unnamed: 0_level_0,16rRNA,Percent Homology,evalue,bitscore,homology extract gibbsfree_otu/gibbsfree_homolog,total length gibbsfree_otu/otu_lenth/gibbsfree_homolog/homolog_length,Olin1,Olin2,Olin3,Olin4,...,Olin80,Olin81,Olin82,Olin83,Olin84,Olin85,Olin86,Olin87,Olin88,Unnamed: 95
featureID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
OTU_1,eu528231.1 seasonal dynamics mudflat mouth maj...,100.0,0.0,711.813612,,,6,11,13,20,...,11,19,25,22,16,16,6,5,4,
OTU_2,uncultured opitutus sp. ;k__bacteria;p__verruc...,96.969697,0.0,661.319285,,,17546,18302,32674,9700,...,31216,17838,12257,19679,16779,15614,2107,42046,56298,
OTU_3,rhodopseudomonas palustris ;k__bacteria;p__pro...,99.493671,0.0,704.600137,,,3169,3033,4495,3455,...,1073,4207,1275,2274,4900,2027,1140,3192,2086,
OTU_4,ay297802.1 waterlogged archaeological wood clo...,99.492386,0.0,702.796768,,,5243,3526,4658,29020,...,17928,19104,11308,11368,26958,15939,3121,30080,23869,
OTU_5,rhodoblastus acidophilus ;k__bacteria;p__prote...,99.746835,0.0,708.206874,,,3809,1128,1313,1001,...,17129,22577,7156,26149,9243,2211,1368,13865,5190,


## Process Data

Outputs:
* Sample Name x Meta Information
    * `sample_metadata.tsv`
* OTU ID x Sample ID x HTS Counts
    * `OTU_counts.tsv`
* OTU ID x Taxonomy
    * `taxonomy.tsv`

### Create `sample_metadata.tsv`

In [13]:
# Build the per-sample metadata table from the raw sample names.
# index: sampleID ("s" + raw name)
# columns: group series carbon transfer day replicate
#
# A raw name is read positionally: character 0 is the transfer number,
# character 1 the series letter, the last character the replicate letter and
# everything in between the day. The sample "C0C" does not follow that layout
# and receives fixed values instead.

raw_names = list(sample_annotations_from_file.index)


def _carbon_source(sn):
    # "C0C" is labelled "original"; a food value other than the three known
    # carbon sources is recorded as "X".
    if sn == "C0C":
        return "original"
    food = sample_annotations_from_file.loc[sn, "food"]
    for known in ("cellulose", "glucose", "malate"):
        if food == known:
            return known
    return "X"


ann_index = ["s" + sn for sn in raw_names]
ann_series = ["C0C" if sn == "C0C" else sn[1] for sn in raw_names]
ann_carbon = [_carbon_source(sn) for sn in raw_names]
ann_transfer = [0 if sn == "C0C" else int(sn[0]) for sn in raw_names]
ann_group = ["C0C" if sn == "C0C" else sn[0:2] for sn in raw_names]
ann_day = [10 if sn == "C0C" else int(sn[2:-1]) for sn in raw_names]
ann_replicate = ["C0C" if sn == "C0C" else sn[-1] for sn in raw_names]

sample_metadata = pd.DataFrame(
    {
        "group": ann_group,
        "series": ann_series,
        "carbon": ann_carbon,
        "transfer": ann_transfer,
        "day": ann_day,
        "replicate": ann_replicate,
        "sampleID": ann_index,
    }
).set_index("sampleID")

print("samples:", list(sample_metadata["group"]))
# s2M07C is excluded from the metadata (its counts column is dropped later as well)
sample_metadata = sample_metadata.drop(index=["s2M07C"])
print("sample_metadata.shape", sample_metadata.shape)

samples: ['C0C', '1C', '1C', '1C', '1C', '1C', '1C', '1C', '1C', '1C', '1C', '2C', '2C', '2C', '2C', '2C', '2C', '1M', '1M', '1M', '1M', '1M', '1M', '1M', '1M', '1M', '1M', '1M', '1M', '2M', '2M', '2M', '2M', '2M', '2M', '2M', '2M', '2M', '2M', '2M', '3M', '3M', '3M', '3M', '3M', '3M', '3M', '3M', '3M', '3M', '3M', '3M', '1G', '1G', '1G', '1G', '1G', '1G', '1G', '1G', '1G', '1G', '1G', '1G', '2G', '2G', '2G', '2G', '2G', '2G', '2G', '2G', '2G', '2G', '2G', '2G', '3G', '3G', '3G', '3G', '3G', '3G', '3G', '3G', '3G', '3G', '3G']
sample_metadata.shape (86, 6)


In [14]:
# Sanity check that "day" was stored as an integer type.
# sample_metadata is indexed by string sample IDs, so an integer key must be
# resolved explicitly by position with .iloc; plain [1] relies on a deprecated
# positional fallback.
type(sample_metadata["day"].iloc[1])

numpy.int64

## Add sample clusters to sample metadata

In [15]:
# Attach the decided cluster to every sample (left join on the sample ID index,
# so samples without a decision get NaN).
# A "cluster" column left over from an earlier run of this cell is dropped
# first; otherwise re-running would create "cluster_x"/"cluster_y" and the
# cells below, which read row["cluster"], would fail.
sample_metadata = (
    sample_metadata
    .drop(columns=["cluster"], errors="ignore")
    .merge(sample_clusters, left_index=True, right_index=True, how="left")
)
sample_metadata

Unnamed: 0_level_0,group,series,carbon,transfer,day,replicate,cluster
sampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
sC0C,C0C,C0C,original,0,10,C0C,late
s1C03A,1C,C,cellulose,1,3,A,late
s1C03B,1C,C,cellulose,1,3,B,late
s1C05A,1C,C,cellulose,1,5,A,mid
s1C05B,1C,C,cellulose,1,5,B,late
...,...,...,...,...,...,...,...
s3G07A,3G,G,cellulose,3,7,A,late
s3G07B,3G,G,cellulose,3,7,B,late
s3G07C,3G,G,cellulose,3,7,C,late
s3G10A,3G,G,cellulose,3,10,A,late


In [19]:
# fill in other clusters
def sample_cluster(row):
    """Return the adjusted cluster label for one sample row.

    Reads row["group"], row["day"] and row["cluster"]:
    * groups "1M" and "1G" are always "perturbed";
    * groups "1C", "2G", "2M" are "early" on day 3, and also on day 5 when
      the existing label is "late" (manual examination overrode those labels);
    * every other row keeps its existing "cluster" value.
    """
    group = row["group"]
    if group == "1M" or group == "1G":
        return "perturbed"

    if group in ["1C", "2G", "2M"]:
        # manual examination showed that day-3 samples differ from "mid"
        if row["day"] == 3:
            return "early"
        # manual examination showed that a "late" label on day 5 is wrong
        if row["day"] == 5 and row["cluster"] == "late":
            return "early"

    return row["cluster"]

# Replace the merged labels with the manually adjusted ones, one row at a time.
sample_metadata["cluster"] = sample_metadata.apply(sample_cluster, axis=1)
sample_metadata

Unnamed: 0_level_0,group,series,carbon,transfer,day,replicate,cluster
sampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
sC0C,C0C,C0C,original,0,10,C0C,late
s1C03A,1C,C,cellulose,1,3,A,early
s1C03B,1C,C,cellulose,1,3,B,early
s1C05A,1C,C,cellulose,1,5,A,mid
s1C05B,1C,C,cellulose,1,5,B,early
...,...,...,...,...,...,...,...
s3G07A,3G,G,cellulose,3,7,A,late
s3G07B,3G,G,cellulose,3,7,B,late
s3G07C,3G,G,cellulose,3,7,C,late
s3G10A,3G,G,cellulose,3,10,A,late


In [21]:
sample_metadata["cluster"].value_counts()

late         32
perturbed    24
mid          21
early         9
Name: cluster, dtype: int64

In [22]:
# add more focused sample clusters to sample metadata
def focused_sample_cluster(row):
    """Return a finer-grained cluster label for one sample row.

    "perturbed" rows are split by group ("1G" / "1M"); "mid" and "late" rows
    are split by series, with series "C0C" counted as "late C". Any row that
    matches none of these keeps its "cluster" value unchanged.
    """
    coarse = row["cluster"]

    if coarse == "perturbed":
        for group, label in (("1G", "perturbed G"), ("1M", "perturbed M")):
            if row["group"] == group:
                return label

    by_series = (
        ("mid", (("C", "mid C"), ("G", "mid G"), ("M", "mid M"))),
        ("late", (("C", "late C"), ("C0C", "late C"), ("G", "late G"), ("M", "late M"))),
    )
    for stage, labels in by_series:
        if coarse == stage:
            for series, label in labels:
                if row["series"] == series:
                    return label

    return row["cluster"]

# Store the finer-grained label in its own column, computed one row at a time.
sample_metadata["focused cluster"] = sample_metadata.apply(focused_sample_cluster, axis=1)
sample_metadata

Unnamed: 0_level_0,group,series,carbon,transfer,day,replicate,cluster,focused cluster
sampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
sC0C,C0C,C0C,original,0,10,C0C,late,late C
s1C03A,1C,C,cellulose,1,3,A,early,early
s1C03B,1C,C,cellulose,1,3,B,early,early
s1C05A,1C,C,cellulose,1,5,A,mid,mid C
s1C05B,1C,C,cellulose,1,5,B,early,early
...,...,...,...,...,...,...,...,...
s3G07A,3G,G,cellulose,3,7,A,late,late G
s3G07B,3G,G,cellulose,3,7,B,late,late G
s3G07C,3G,G,cellulose,3,7,C,late,late G
s3G10A,3G,G,cellulose,3,10,A,late,late G


In [23]:
sample_metadata["focused cluster"].value_counts()

late G         14
perturbed M    12
perturbed G    12
late M         11
early           9
mid M           8
late C          7
mid C           7
mid G           6
Name: focused cluster, dtype: int64

### Create `OTU_counts.tsv`

In [24]:
# Table of OTU ID x sample name x absolute counts, built in four steps:
#   1. keep only the per-sample count columns (names starting with "Olin")
#   2. rename those MR DNA sample IDs to sample names
#   3. drop s3G10C and s2M07C, which were removed because they were incorrectly sequenced
#   4. put the columns in the same order as the rows of sample_metadata
OTU_counts = (
    otu_raw.filter(regex='^Olin')
    .rename(columns=sample_names["key"].to_dict())
    .drop(columns=["s3G10C", "s2M07C"])
    .reindex(columns=list(sample_metadata.index))
)

In [25]:
OTU_counts

Unnamed: 0_level_0,sC0C,s1C03A,s1C03B,s1C05A,s1C05B,s1C07A,s1C07B,s1C10A,s1C10B,s1C20A,...,s3G03B,s3G03C,s3G05A,s3G05B,s3G05C,s3G07A,s3G07B,s3G07C,s3G10A,s3G10B
featureID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
OTU_1,6,11,13,30,15,23,25,5,8,0,...,30,46,1,14,10,6,3,1,16,16
OTU_2,17546,18302,32674,15446,22216,25011,21224,13067,13340,6896,...,6869,8322,9792,21106,39348,24470,34872,33561,16779,15614
OTU_3,3169,3033,4495,3138,4601,3633,1919,2952,2110,5739,...,610,599,993,2016,1433,3008,1979,2067,4900,2027
OTU_4,5243,3526,4658,2475,5861,5526,3808,4545,4121,4551,...,7179,6226,10400,21570,21360,27375,29348,27454,26958,15939
OTU_5,3809,1128,1313,1989,3024,2419,1540,1956,869,2546,...,923,1287,2062,3568,2621,7346,3789,4428,9243,2211
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
OTU_639,1,0,0,1,1,0,0,0,1,2,...,0,0,0,0,0,0,0,0,0,0
OTU_640,4270,2662,3657,2136,4412,2622,1964,2673,1856,4256,...,312,248,302,535,498,755,607,602,1442,684
OTU_641,0,0,0,1,3,3,0,5,0,0,...,0,3,1,3,0,9,4,6,4,1
OTU_642,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Create `feature_metadata.tsv` 

In [26]:
# format table of OTU ID x taxonomy & function
#
# Each "16rRNA" annotation is split on ";". The first field is stored as
# "Name"; every later field is matched against a rank prefix and stored in
# that rank's column. The k/p/c/o/f/g/s ranks are additionally concatenated
# into a "taxonomy" lineage string that starts with "Root;". An OTU without a
# "k__" field keeps the placeholder "Unassigned" at the start of that string.
feature_metadata = pd.DataFrame(index=otu_raw.index, columns=["Name"])

# (prefix, output column, appended to the lineage string?)
# The value is always sliced with len(prefix), so the offset cannot drift from
# the prefix. The previous hand-written offset for "subphylum__" was 9 instead
# of 11 and left "m_" in front of every subphylum value.
rank_prefixes = [
    ("k__", "kingdom", True),
    ("p__", "phylum", True),
    ("c__", "class", True),
    ("o__", "order", True),
    ("f__", "family", True),
    ("g__", "genus", True),
    ("s__", "species", True),
    ("superkingdom__", "superkingdom", False),
    ("superphylum__", "superphylum", False),
    # NOTE(review): these two prefixes are matched without a trailing "__", so
    # a field written as "subclass__x" would keep the underscores in its value.
    # Confirm against the raw file before changing.
    ("subclass", "subclass", False),
    ("suborder", "suborder", False),
    ("subphylum__", "subphylum", False),
]

for otu, desc in otu_raw.iterrows():
    parts = desc["16rRNA"].split(";")
    taxonomy = "Unassigned"
    for i, part in enumerate(parts):
        if i == 0:
            feature_metadata.loc[otu, "Name"] = part
            continue
        for prefix, column, in_lineage in rank_prefixes:
            if not part.startswith(prefix):
                continue
            feature_metadata.loc[otu, column] = part[len(prefix):]
            if prefix == "k__":
                # the kingdom field (re)starts the lineage string
                taxonomy = "Root;" + part + ";"
            elif in_lineage:
                taxonomy += part + ";"
            break
        else:
            # no known rank prefix matched: report the field and move on
            print("OTU:", otu, "Extra PART:", part)
    feature_metadata.loc[otu, "taxonomy"] = taxonomy

OTU: OTU_18 Extra PART: species group  pseudomonas fluorescens group
OTU: OTU_526 Extra PART: jpl_saf&apos
OTU: OTU_526 Extra PART:  during msl mission spacecraft assembly clean room floor clone gi8_sp_i21  


In [27]:
feature_metadata

Unnamed: 0_level_0,Name,kingdom,phylum,class,order,family,genus,species,superkingdom,taxonomy,superphylum,subclass,suborder,subphylum
featureID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
OTU_1,eu528231.1 seasonal dynamics mudflat mouth maj...,bacteria,proteobacteria,alphaproteobacteria,rhodospirillales,rhodospirillaceae,magnetospirillum,magnetospirillum sp.,bacteria,Root;k__bacteria;p__proteobacteria;c__alphapro...,,,,
OTU_2,uncultured opitutus sp.,bacteria,verrucomicrobia,opitutae,opitutales,opitutaceae,opitutus,opitutus spp.,bacteria,Root;k__bacteria;p__verrucomicrobia;c__opituta...,chlamydiae/verrucomicrobia group,,,
OTU_3,rhodopseudomonas palustris,bacteria,proteobacteria,alphaproteobacteria,rhizobiales,bradyrhizobiaceae,rhodopseudomonas,rhodopseudomonas palustris,bacteria,Root;k__bacteria;p__proteobacteria;c__alphapro...,,,,
OTU_4,ay297802.1 waterlogged archaeological wood clo...,bacteria,proteobacteria,alphaproteobacteria,rhizobiales,methylocystaceae,pleomorphomonas,pleomorphomonas oryzae,bacteria,Root;k__bacteria;p__proteobacteria;c__alphapro...,,,,
OTU_5,rhodoblastus acidophilus,bacteria,proteobacteria,alphaproteobacteria,rhizobiales,bradyrhizobiaceae,rhodoblastus,rhodoblastus acidophilus,bacteria,Root;k__bacteria;p__proteobacteria;c__alphapro...,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
OTU_639,uncultured spirochaeta sp.,bacteria,spirochaetes,spirochaetia,spirochaetales,spirochaetaceae,spirochaeta,spirochaeta spp.,bacteria,Root;k__bacteria;p__spirochaetes;c__spirochaet...,,,,
OTU_640,ab079679.1 rhodopseudomonas sp. hmd89,bacteria,proteobacteria,alphaproteobacteria,rhizobiales,bradyrhizobiaceae,afipia,afipia sp.,bacteria,Root;k__bacteria;p__proteobacteria;c__alphapro...,,,,
OTU_641,ay297803.1 waterlogged archaeological wood clo...,bacteria,proteobacteria,alphaproteobacteria,rhizobiales,methylocystaceae,pleomorphomonas,pleomorphomonas oryzae,bacteria,Root;k__bacteria;p__proteobacteria;c__alphapro...,,,,
OTU_642,uncultured pontibacter sp.,bacteria,bacteroidetes,cytophagia,cytophagales,cytophagaceae,pontibacter,pontibacter spp.,bacteria,Root;k__bacteria;p__bacteroidetes;c__cytophagi...,bacteroidetes/chlorobi group,,,


### Additional Cleaning (Do Not Threshold OTUs)

In [28]:
# Show the OTUs whose counts add up to zero across all samples (none expected).
OTU_counts.loc[OTU_counts.sum(axis="columns") == 0]

Unnamed: 0_level_0,sC0C,s1C03A,s1C03B,s1C05A,s1C05B,s1C07A,s1C07B,s1C10A,s1C10B,s1C20A,...,s3G03B,s3G03C,s3G05A,s3G05B,s3G05C,s3G07A,s3G07B,s3G07C,s3G10A,s3G10B
featureID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [29]:
# List the OTUs whose lineage string is still the "Unassigned" placeholder,
# then remove them from both the counts table and the feature table.
not_assigned = feature_metadata.loc[feature_metadata["taxonomy"] == "Unassigned"]
print(not_assigned["Name"])

OTU_counts = OTU_counts.drop(index=not_assigned.index)
print("OTU_counts.shape", OTU_counts.shape)

feature_metadata = feature_metadata.drop(index=not_assigned.index)
print("feature_metadata.shape", feature_metadata.shape)

featureID
OTU_97     No hits found
OTU_112    No hits found
OTU_124    No hits found
OTU_152    No hits found
OTU_154    No hits found
Name: Name, dtype: object
OTU_counts.shape (638, 86)
feature_metadata.shape (638, 14)


In [30]:
# List the OTUs whose kingdom is anything other than "bacteria" (a missing
# kingdom also counts), then remove them from both tables.
not_bacteria = feature_metadata.loc[feature_metadata["kingdom"] != "bacteria"]
print(not_bacteria["Name"])

OTU_counts = OTU_counts.drop(index=not_bacteria.index)
print("OTU_counts.shape", OTU_counts.shape)

feature_metadata = feature_metadata.drop(index=not_bacteria.index)
print("feature_metadata.shape", feature_metadata.shape)

featureID
OTU_55     uncultured methanosaeta sp. 
OTU_221                 vitis vinifera 
Name: Name, dtype: object
OTU_counts.shape (636, 86)
feature_metadata.shape (636, 14)


## Get Phylogenetic Info and Alignment Scores

* OTU ID: DNA Sequences
    * (MR DNA): `081616JHnew515Fcomplete-pr.fasta.otus.fa`
* OTU ID x OTU ID Pairwise Alignment Scores
    * (Muscle): `Pairwise_distances_all_OTU_Muscle`
* OTU ID Phylogenetic Tree
    * (Muscle): `MUSCLE_alignment_ML_tree_all_OTUs`

### Rename OTU ID: DNA Sequences mapping (for Jean, to visually analyze phylogenetic tree)

In [31]:
from skbio.sequence import DNA

# Read the raw FASTA with every line upper-cased. The context manager closes
# the handle as soon as the lines are in memory.
with open(f"{raw_location}/081616JHnew515Fcomplete-pr.fasta.otus.fa", 'r') as sequence_file:
    sequence_lines = [line.upper() for line in sequence_file]

# All sequences as DNA objects, under their original IDs.
# NOTE(review): not used again in the visible notebook; kept in case other code relies on it.
dna_lines = [DNA(str(seq), metadata=seq.metadata) for seq in skbio.io.read(sequence_lines, format='fasta')]

# Sequences of the OTUs still present in feature_metadata, with the ID
# extended by the OTU's "Name" (spaces become "_", dots are removed).
dna_lines_renamed = []
for seq in skbio.io.read(sequence_lines, format='fasta'):
    str_seq = str(seq)
    metadata = seq.metadata
    if metadata['id'] in feature_metadata.index:
        otu_name = feature_metadata.loc[metadata['id'], "Name"]
        metadata['id'] = metadata['id'] + "_" + otu_name.replace(" ", "_").replace(".", "")
        dna_lines_renamed.append(DNA(str_seq, metadata=metadata))

# save
# Writing inside a context manager guarantees the file is flushed and closed.
# The handle was previously left open, so the last records could be missing on disk.
with open(f"{raw_location}/renamed_081616JHnew515Fcomplete-pr.fasta.otus.fa", "w") as dna_lines_renamed_file:
    for seq in dna_lines_renamed:
        skbio.io.write(seq, into=dna_lines_renamed_file, format='fasta')

### Import Newick Phylogenetic Tree

In [32]:
# import phylogenetic weights and use in an example
from skbio import TreeNode

# Read the Newick tree inside a context manager so the file handle is closed
# once parsing is done (it was previously left open).
with open(f"{raw_location}/MUSCLE_alignment_ML_tree_all_OTUs", "r") as tree_file:
    tree = TreeNode.read(tree_file)

tree_rooted = tree.root_at_midpoint()
# print only the start of the Newick string as a quick visual check
print(str(tree)[:50], "...")

(((((((((((((((((((((((((((((((((((((((OTU_199:0.0 ...


## Alpha Diversity

In [33]:
# alpha diversity w/ phylogenetic weights example 
from skbio.diversity import alpha_diversity
# alpha_diversity(metric="faith_pd", counts=OTU_counts.T.values, ids=OTU_counts.columns, tree=tree_rooted, otu_ids=[n.name for n in tree_rooted.tips()], validate=True)

In [34]:
# ADD raw counts ("RC") to sample meta data: the column sums of OTU_counts,
# i.e. one total per sample, matched to the metadata rows by sample ID.
sample_metadata["RC"] = OTU_counts.sum(axis="index")
sample_metadata

Unnamed: 0_level_0,group,series,carbon,transfer,day,replicate,cluster,focused cluster,RC
sampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
sC0C,C0C,C0C,original,0,10,C0C,late,late C,88135
s1C03A,1C,C,cellulose,1,3,A,early,early,72061
s1C03B,1C,C,cellulose,1,3,B,early,early,117579
s1C05A,1C,C,cellulose,1,5,A,mid,mid C,83646
s1C05B,1C,C,cellulose,1,5,B,early,early,123739
...,...,...,...,...,...,...,...,...,...
s3G07A,3G,G,cellulose,3,7,A,late,late G,177164
s3G07B,3G,G,cellulose,3,7,B,late,late G,146638
s3G07C,3G,G,cellulose,3,7,C,late,late G,153173
s3G10A,3G,G,cellulose,3,10,A,late,late G,188359


In [35]:
# ADD shdiv to sample meta data
# Shannon diversity per sample (one row of OTU_counts.T per sample).
# The log base is pinned to 2, the base this cell was written for, so the
# values do not change if the library's default base differs between versions.
shdiv = alpha_diversity(
    metric="shannon",
    counts=OTU_counts.T.values,
    ids=OTU_counts.columns,
    base=2,
)
sample_metadata["shdiv"] = shdiv
sample_metadata

Unnamed: 0_level_0,group,series,carbon,transfer,day,replicate,cluster,focused cluster,RC,shdiv
sampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
sC0C,C0C,C0C,original,0,10,C0C,late,late C,88135,3.799602
s1C03A,1C,C,cellulose,1,3,A,early,early,72061,3.364793
s1C03B,1C,C,cellulose,1,3,B,early,early,117579,3.236063
s1C05A,1C,C,cellulose,1,5,A,mid,mid C,83646,3.346074
s1C05B,1C,C,cellulose,1,5,B,early,early,123739,3.798067
...,...,...,...,...,...,...,...,...,...,...
s3G07A,3G,G,cellulose,3,7,A,late,late G,177164,3.211457
s3G07B,3G,G,cellulose,3,7,B,late,late G,146638,3.295120
s3G07C,3G,G,cellulose,3,7,C,late,late G,153173,3.407462
s3G10A,3G,G,cellulose,3,10,A,late,late G,188359,3.289271


# Save

In [36]:
# Write every processed table as tab-separated text into the raw_location folder.
# Order: OTU counts, sample metadata, full feature table, lineage string only,
# and the seven main ranks as separate columns.
main_ranks = ["kingdom", "phylum", "class", "order", "family", "genus", "species"]
tables_to_save = [
    (f"{raw_location}/processed_OTU_counts.tsv", OTU_counts),
    (f"{raw_location}/processed_sample_metadata.tsv", sample_metadata),
    (f"{raw_location}/processed_feature_metadata.tsv", feature_metadata),
    (f"{raw_location}/processed_taxonomy.tsv", feature_metadata["taxonomy"]),
    (f"{raw_location}/processed_taxonomy_table.tsv", feature_metadata[main_ranks]),
]
for out_path, table in tables_to_save:
    table.to_csv(out_path, sep='\t')