# Format
> By Gati Aher  
> April 28, 2021

**Dataset:** FCF-FCD Biotic Perturbation

**Goal:** Put raw data in more workable format for downstream analysis

## Outputs

* `data/raw/biotic/processed_feature_metadata.tsv`
* `data/raw/biotic/processed_OTU_counts.tsv`
* `data/raw/biotic/processed_sample_metadata.tsv`
* `data/raw/biotic/processed_taxonomy_table.tsv`
* `data/raw/biotic/processed_taxonomy.tsv`
* `data/raw/biotic/renamed_081616JHnew515Fcomplete-pr.fasta.otus.fa`

In [1]:
raw_location = "../data/raw/biotic"

In [2]:
# imports
import skbio
import pandas as pd
import numpy as np
import re

## Load Raw Data

Inputs:
* Sample Name x Meta Information
    * (Jean): `biotic_annotations.csv`
* OTU ID-Taxonomy x Sample Name x OTU Counts
    * (VAMPS): `matrix-1649873574615.csv`

In [3]:
# get table of sample names to sample meta data
sample_annotations_from_file = pd.read_csv(f"{raw_location}/biotic_annotations.csv", index_col=0) # these aren't in the right order
sample_annotations_from_file

Unnamed: 0_level_0,sample_name,series,day,replicate,experiment,replicate (original)
sampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
D_start_200A,LAZ_OLIN_Bv4v5--Olin_FCD_200A_Bv4v5,D_start,200,A,start 200,A
D_start_200B,LAZ_OLIN_Bv4v5--Olin_FCD_200B_Bv4v5,D_start,200,B,start 200,B
D_start_200C,LAZ_OLIN_Bv4v5--Olin_FCD_200C_Bv4v5,D_start,200,C,start 200,C
D_011A,LAZ_OLIN_Bv4v5--Olin_FCD_Day_11A_Bv4v5,D,11,A,none,A
D_011B,LAZ_OLIN_Bv4v5--Olin_FCD_Day_11B_Bv4v5,D,11,B,none,B
...,...,...,...,...,...,...
Fn_014D,LAZ_OLIN_Bv4v5--OlinTC3_34,Fn,14,D,native_Rhodobacter,O
Fn_014E,LAZ_OLIN_Bv4v5--OlinTC3_35,Fn,14,E,native_Rhodobacter,P
F_003D,LAZ_OLIN_Bv4v5--OlinTC3_6,F,3,D,none,O
Fa_003D,LAZ_OLIN_Bv4v5--OlinTC3_7,Fa,3,D,new_Rubrivivax,O


In [4]:
# get table of OTU ID-taxonomy x absolute OTU counts
otu_raw = pd.read_csv(f"{raw_location}/matrix-1649873574615.csv", sep=",", index_col=0, header=1)
otu_raw.index.names = ['featureID']
otu_raw.head()

Unnamed: 0_level_0,LAZ_OLIN_Bv4v5--OlinTC3_1,LAZ_OLIN_Bv4v5--OlinTC3_11,LAZ_OLIN_Bv4v5--OlinTC3_12,LAZ_OLIN_Bv4v5--OlinTC3_13,LAZ_OLIN_Bv4v5--OlinTC3_16,LAZ_OLIN_Bv4v5--OlinTC3_17,LAZ_OLIN_Bv4v5--OlinTC3_18,LAZ_OLIN_Bv4v5--OlinTC3_2,LAZ_OLIN_Bv4v5--OlinTC3_21,LAZ_OLIN_Bv4v5--OlinTC3_22,...,LAZ_OLIN_Bv4v5--Olin_FCF_New_Day_7C_Bv4v5,LAZ_OLIN_Bv4v5--Olin_FCF_day_0B_Bv4v5,LAZ_OLIN_Bv4v5--Olin_FCF_day_11A_Bv4v5,LAZ_OLIN_Bv4v5--Olin_FCF_day_11B_Bv4v5,LAZ_OLIN_Bv4v5--Olin_FCF_day_3A_Bv4v5,LAZ_OLIN_Bv4v5--Olin_FCF_day_3B_Bv4v5,LAZ_OLIN_Bv4v5--Olin_FCF_day_5A_Bv4v5,LAZ_OLIN_Bv4v5--Olin_FCF_day_5B_Bv4v5,LAZ_OLIN_Bv4v5--Olin_FCF_day_7A_Bv4v5,LAZ_OLIN_Bv4v5--Olin_FCF_day_7B_Bv4v5
featureID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Bacteria,0,0,0,0,0,0,0,0,0,0,...,0,1,1,1,0,1,2,0,0,0
Bacteria;Acidobacteria,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Bacteria;Acidobacteria;Acidobacteriia;Acidobacteriales,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
Bacteria;Acidobacteria;Acidobacteriia;Acidobacteriales;Acidobacteriaceae;Acidicapsa,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Bacteria;Acidobacteria;Acidobacteriia;Acidobacteriales;Acidobacteriaceae;Terriglobus,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Process Data

Outputs:
* Sample Name x Meta Information
    * `processed_sample_metadata.tsv`
* OTU ID x Sample ID x HTS Counts
    * `processed_OTU_counts.tsv`
* OTU ID x Taxonomy
    * `processed_taxonomy.tsv`

### Create `sample_metadata.tsv`

In [5]:
sample_metadata = sample_annotations_from_file[["series", "day", "replicate", "experiment"]]
sample_metadata

Unnamed: 0_level_0,series,day,replicate,experiment
sampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
D_start_200A,D_start,200,A,start 200
D_start_200B,D_start,200,B,start 200
D_start_200C,D_start,200,C,start 200
D_011A,D,11,A,none
D_011B,D,11,B,none
...,...,...,...,...
Fn_014D,Fn,14,D,native_Rhodobacter
Fn_014E,Fn,14,E,native_Rhodobacter
F_003D,F,3,D,none
Fa_003D,Fa,3,D,new_Rubrivivax


In [6]:
type(sample_metadata["day"][1])

numpy.int64

### Create `OTU_counts.tsv`

In [7]:
# generate list of otu features (Ex: OTU_1)
otu_features = ["OTU_" + str(i+1) for i in range(len(otu_raw))]
# extract list of otu taxa
otu_16rRNA = otu_raw.index
# create dictionary
otu_featureID_dict = dict(zip(otu_16rRNA, otu_features))

# get list of sample ids
sampleIDs = sample_annotations_from_file.index
# get list of sample names
samples_names = sample_annotations_from_file["sample_name"]
# create dictionary
otu_sampleID_dict = dict(zip(samples_names, sampleIDs))

# rename otu_16rRNA to otu features
OTU_counts = otu_raw.rename(index=otu_featureID_dict)
OTU_counts
# rename sample names to sample ids
OTU_counts = OTU_counts.rename(columns=otu_sampleID_dict)

# order columns by order in sample metadata
OTU_counts = OTU_counts.reindex(columns=list(sample_metadata.index))

In [8]:
OTU_counts

Unnamed: 0_level_0,D_start_200A,D_start_200B,D_start_200C,D_011A,D_011B,D_003A,D_003B,D_005A,D_005B,D_007A,...,Fn_000D,F_014D,F_014E,Fa_014D,Fa_014E,Fn_014D,Fn_014E,F_003D,Fa_003D,Fn_003D
featureID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
OTU_1,1301,1646,1492,1067,2841,2031,1153,928,1446,1137,...,0,0,0,0,1,0,0,0,0,0
OTU_2,29,51,35,247,405,171,165,79,67,127,...,0,0,0,0,0,0,0,0,0,0
OTU_3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
OTU_4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
OTU_5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
OTU_334,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
OTU_335,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
OTU_336,292,357,278,1217,1411,915,900,816,850,1244,...,408,150,307,236,654,1092,734,1050,347,597
OTU_337,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Create `feature_metadata.tsv` 

In [9]:
# format table of OTU ID x taxonomy & function
feature_metadata = pd.DataFrame({"Name": otu_16rRNA}, index=otu_features)

for otu, desc in feature_metadata.iterrows(): 
    parts = desc["Name"].split(";")
    taxonomy = "Unassigned"
    
    # print(len(parts), parts)
    for i, part in enumerate(parts):
        temp = part
        if "empty" in part.lower():
            temp = "Unassigned"
        if "na" in part.lower():
            temp = "Unassigned"
        if i == 0:
            feature_metadata.loc[otu, "kingdom"] = temp
            taxonomy = "Root;" + part + ";"
        if i == 1:
            feature_metadata.loc[otu, "phylum"] = temp
            taxonomy += temp + ";"
        if i == 2:
            feature_metadata.loc[otu, "class"] = temp
            taxonomy += temp + ";"
        if i == 3:
            feature_metadata.loc[otu, "order"] = temp
            taxonomy += temp + ";"
        if i == 4:
            feature_metadata.loc[otu, "family"] = temp
            taxonomy += temp + ";"
        if i == 5:
            feature_metadata.loc[otu, "genus"] = temp
            taxonomy += temp + ";"
        if i == 6:
            feature_metadata.loc[otu, "species"] = temp
            taxonomy += temp + ";"
        if i > 6:
            print("OTU:", otu, "Extra PART:", part)

    feature_metadata.loc[otu, "taxonomy"] = taxonomy
    feature_metadata.fillna("Unassigned", inplace=True)

In [10]:
feature_metadata

Unnamed: 0,Name,kingdom,taxonomy,phylum,class,order,family,genus,species
OTU_1,Bacteria,Bacteria,Root;Bacteria;,Unassigned,Unassigned,Unassigned,Unassigned,Unassigned,Unassigned
OTU_2,Bacteria;Acidobacteria,Bacteria,Root;Bacteria;Acidobacteria;,Acidobacteria,Unassigned,Unassigned,Unassigned,Unassigned,Unassigned
OTU_3,Bacteria;Acidobacteria;Acidobacteriia;Acidobac...,Bacteria,Root;Bacteria;Acidobacteria;Acidobacteriia;Aci...,Acidobacteria,Acidobacteriia,Acidobacteriales,Unassigned,Unassigned,Unassigned
OTU_4,Bacteria;Acidobacteria;Acidobacteriia;Acidobac...,Bacteria,Root;Bacteria;Acidobacteria;Acidobacteriia;Aci...,Acidobacteria,Acidobacteriia,Acidobacteriales,Acidobacteriaceae,Acidicapsa,Unassigned
OTU_5,Bacteria;Acidobacteria;Acidobacteriia;Acidobac...,Bacteria,Root;Bacteria;Acidobacteria;Acidobacteriia;Aci...,Acidobacteria,Acidobacteriia,Acidobacteriales,Acidobacteriaceae,Terriglobus,Unassigned
...,...,...,...,...,...,...,...,...,...
OTU_334,Bacteria;Thermotogae;Thermotogae;Thermotogales...,Bacteria,Root;Bacteria;Thermotogae;Thermotogae;Thermoto...,Thermotogae,Thermotogae,Thermotogales,Thermotogaceae,Unassigned,Unassigned
OTU_335,Bacteria;Verrucomicrobia,Bacteria,Root;Bacteria;Verrucomicrobia;,Verrucomicrobia,Unassigned,Unassigned,Unassigned,Unassigned,Unassigned
OTU_336,Bacteria;Verrucomicrobia;Opitutae;Opitutales;O...,Bacteria,Root;Bacteria;Verrucomicrobia;Opitutae;Opituta...,Verrucomicrobia,Opitutae,Opitutales,Opitutaceae,Opitutus,Unassigned
OTU_337,Bacteria;Verrucomicrobia;Opitutae;Opitutales;O...,Bacteria,Root;Bacteria;Verrucomicrobia;Opitutae;Opituta...,Verrucomicrobia,Opitutae,Opitutales,Opitutaceae,Opitutus,VeSm13


In [11]:
# check for unassigned genus
print(feature_metadata[feature_metadata["genus"] == "Unassigned"].shape)
# check for unassigned species
print(feature_metadata[feature_metadata["species"] == "Unassigned"].shape)

(107, 9)
(251, 9)


In [12]:
# Rename "Unassigned" to "x_latest-taxa"  
for otu, row in feature_metadata.iterrows():
    hierarchy = ["kingdom", "phylum", "class", "order", "family", "genus", "species"]
    fill_taxa = "Fill"
    for taxrank in hierarchy:
        taxa = row[taxrank]
        if (taxa != "Unassigned"):
            fill_taxa = taxrank[0] + "_" + taxa
        else:
            feature_metadata.loc[otu, taxrank] = fill_taxa

feature_metadata

Unnamed: 0,Name,kingdom,taxonomy,phylum,class,order,family,genus,species
OTU_1,Bacteria,Bacteria,Root;Bacteria;,k_Bacteria,k_Bacteria,k_Bacteria,k_Bacteria,k_Bacteria,k_Bacteria
OTU_2,Bacteria;Acidobacteria,Bacteria,Root;Bacteria;Acidobacteria;,Acidobacteria,p_Acidobacteria,p_Acidobacteria,p_Acidobacteria,p_Acidobacteria,p_Acidobacteria
OTU_3,Bacteria;Acidobacteria;Acidobacteriia;Acidobac...,Bacteria,Root;Bacteria;Acidobacteria;Acidobacteriia;Aci...,Acidobacteria,Acidobacteriia,Acidobacteriales,o_Acidobacteriales,o_Acidobacteriales,o_Acidobacteriales
OTU_4,Bacteria;Acidobacteria;Acidobacteriia;Acidobac...,Bacteria,Root;Bacteria;Acidobacteria;Acidobacteriia;Aci...,Acidobacteria,Acidobacteriia,Acidobacteriales,Acidobacteriaceae,Acidicapsa,g_Acidicapsa
OTU_5,Bacteria;Acidobacteria;Acidobacteriia;Acidobac...,Bacteria,Root;Bacteria;Acidobacteria;Acidobacteriia;Aci...,Acidobacteria,Acidobacteriia,Acidobacteriales,Acidobacteriaceae,Terriglobus,g_Terriglobus
...,...,...,...,...,...,...,...,...,...
OTU_334,Bacteria;Thermotogae;Thermotogae;Thermotogales...,Bacteria,Root;Bacteria;Thermotogae;Thermotogae;Thermoto...,Thermotogae,Thermotogae,Thermotogales,Thermotogaceae,f_Thermotogaceae,f_Thermotogaceae
OTU_335,Bacteria;Verrucomicrobia,Bacteria,Root;Bacteria;Verrucomicrobia;,Verrucomicrobia,p_Verrucomicrobia,p_Verrucomicrobia,p_Verrucomicrobia,p_Verrucomicrobia,p_Verrucomicrobia
OTU_336,Bacteria;Verrucomicrobia;Opitutae;Opitutales;O...,Bacteria,Root;Bacteria;Verrucomicrobia;Opitutae;Opituta...,Verrucomicrobia,Opitutae,Opitutales,Opitutaceae,Opitutus,g_Opitutus
OTU_337,Bacteria;Verrucomicrobia;Opitutae;Opitutales;O...,Bacteria,Root;Bacteria;Verrucomicrobia;Opitutae;Opituta...,Verrucomicrobia,Opitutae,Opitutales,Opitutaceae,Opitutus,VeSm13


### Additional Cleaning (Do Not Threshold OTUs)

In [13]:
# check if there are OTUs that appear in no samples
OTU_counts[OTU_counts.sum(axis=1) == 0]

Unnamed: 0_level_0,D_start_200A,D_start_200B,D_start_200C,D_011A,D_011B,D_003A,D_003B,D_005A,D_005B,D_007A,...,Fn_000D,F_014D,F_014E,Fa_014D,Fa_014E,Fn_014D,Fn_014E,F_003D,Fa_003D,Fn_003D
featureID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [14]:
# check if there are Unassigned taxonomy OTUs
not_assigned = feature_metadata[feature_metadata["taxonomy"] == "Unassigned"]
print(not_assigned.Name)

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
OTU_counts.loc[list(not_assigned.index), :]

Series([], Name: Name, dtype: object)


Unnamed: 0_level_0,D_start_200A,D_start_200B,D_start_200C,D_011A,D_011B,D_003A,D_003B,D_005A,D_005B,D_007A,D_007B,F_000A,LF_start_100A,LF_start_100B,LF_start_100C,LF_start_200A,LF_start_200B,LF_start_200B2,LF_start_200C,LF_start_200C2,LF_start_050A,LF_start_050B,LF_start_050B2,LF_000B,LF_011A,LF_011B,LF_003A,LF_003B,LF_005A,LF_005B,LF_007A,LF_007B,FD_004A,FD_010A,FD_015A,FD_004B,FD_010B,FD_015B,FD_004C,FD_010C,FD_015C,LFa_000A,LFa_000B,LFa_011A,LFa_011B,LFa_003A,LFa_003B,LFa_003C,LFa_005A,LFa_005B,LFa_005C,LFa_007A,LFa_007B,LFa_007C,F_000D,F_005D,Fa_005D,Fn_005D,F_007D,Fa_007D,Fn_007D,Fa_000D,F_011D,Fa_011D,Fn_000D,F_014D,F_014E,Fa_014D,Fa_014E,Fn_014D,Fn_014E,F_003D,Fa_003D,Fn_003D
featureID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1


In [15]:
# check if any of the unassigned OTUs appear in 20% of samples
(OTU_counts.loc[list(not_assigned.index), :] > 1).sum(axis=1)

Series([], dtype: int64)

In [16]:
OTU_counts = OTU_counts.drop(not_assigned.index)
print("OTU_counts.shape", OTU_counts.shape)

feature_metadata = feature_metadata.drop(not_assigned.index)
print("feature_metadata.shape", feature_metadata.shape)

OTU_counts.shape (338, 74)
feature_metadata.shape (338, 9)


In [17]:
# remove OTUs that are not bacteria
not_bacteria = feature_metadata[feature_metadata["kingdom"] != "Bacteria"]
print(not_bacteria.Name)

OTU_counts = OTU_counts.drop(not_bacteria.index)
print("OTU_counts.shape", OTU_counts.shape)

feature_metadata = feature_metadata.drop(not_bacteria.index)
print("feature_metadata.shape", feature_metadata.shape)

Series([], Name: Name, dtype: object)
OTU_counts.shape (338, 74)
feature_metadata.shape (338, 9)


## Alpha Diversity

In [18]:
# alpha diversity w/ phylogenetic weights example 
from skbio.diversity import alpha_diversity

In [19]:
sample_metadata

Unnamed: 0_level_0,series,day,replicate,experiment
sampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
D_start_200A,D_start,200,A,start 200
D_start_200B,D_start,200,B,start 200
D_start_200C,D_start,200,C,start 200
D_011A,D,11,A,none
D_011B,D,11,B,none
D_003A,D,3,A,none
D_003B,D,3,B,none
D_005A,D,5,A,none
D_005B,D,5,B,none
D_007A,D,7,A,none


In [20]:
OTU_counts

Unnamed: 0_level_0,D_start_200A,D_start_200B,D_start_200C,D_011A,D_011B,D_003A,D_003B,D_005A,D_005B,D_007A,D_007B,F_000A,LF_start_100A,LF_start_100B,LF_start_100C,LF_start_200A,LF_start_200B,LF_start_200B2,LF_start_200C,LF_start_200C2,LF_start_050A,LF_start_050B,LF_start_050B2,LF_000B,LF_011A,LF_011B,LF_003A,LF_003B,LF_005A,LF_005B,LF_007A,LF_007B,FD_004A,FD_010A,FD_015A,FD_004B,FD_010B,FD_015B,FD_004C,FD_010C,FD_015C,LFa_000A,LFa_000B,LFa_011A,LFa_011B,LFa_003A,LFa_003B,LFa_003C,LFa_005A,LFa_005B,LFa_005C,LFa_007A,LFa_007B,LFa_007C,F_000D,F_005D,Fa_005D,Fn_005D,F_007D,Fa_007D,Fn_007D,Fa_000D,F_011D,Fa_011D,Fn_000D,F_014D,F_014E,Fa_014D,Fa_014E,Fn_014D,Fn_014E,F_003D,Fa_003D,Fn_003D
featureID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1
OTU_1,1301,1646,1492,1067,2841,2031,1153,928,1446,1137,1174,1,0,1,1,0,1,1,0,0,0,1,1,1,1,1,0,1,2,0,0,0,281,681,317,619,617,116,613,637,498,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
OTU_2,29,51,35,247,405,171,165,79,67,127,111,0,1,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,10,13,2,29,21,2,37,10,4,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
OTU_3,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
OTU_4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
OTU_5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
OTU_6,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
OTU_7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
OTU_8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
OTU_9,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
OTU_10,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [21]:
# ADD raw counts to sample meta data
sample_metadata["RC"] = OTU_counts.sum(axis = 0)
sample_metadata

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_metadata["RC"] = OTU_counts.sum(axis = 0)


Unnamed: 0_level_0,series,day,replicate,experiment,RC
sampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
D_start_200A,D_start,200,A,start 200,89310
D_start_200B,D_start,200,B,start 200,114611
D_start_200C,D_start,200,C,start 200,100652
D_011A,D,11,A,none,97130
D_011B,D,11,B,none,108420
D_003A,D,3,A,none,99098
D_003B,D,3,B,none,82085
D_005A,D,5,A,none,126397
D_005B,D,5,B,none,98060
D_007A,D,7,A,none,114468


In [22]:
# ADD shdiv to sample meta data
shdiv = alpha_diversity(metric="shannon", ids=OTU_counts.columns, counts=OTU_counts.T.values) # default log base 2
sample_metadata["shdiv"] = shdiv
sample_metadata

Unnamed: 0_level_0,series,day,replicate,experiment,RC,shdiv
sampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
D_start_200A,D_start,200,A,start 200,89310,1.734827
D_start_200B,D_start,200,B,start 200,114611,1.613681
D_start_200C,D_start,200,C,start 200,100652,1.530892
D_011A,D,11,A,none,97130,2.198627
D_011B,D,11,B,none,108420,2.473297
D_003A,D,3,A,none,99098,2.742455
D_003B,D,3,B,none,82085,2.600032
D_005A,D,5,A,none,126397,1.68547
D_005B,D,5,B,none,98060,1.702726
D_007A,D,7,A,none,114468,1.880626


# Save

In [23]:
# save OTU counts
OTU_counts.to_csv(f"{raw_location}/processed_OTU_counts.tsv", sep='\t')
# save sample metadata
sample_metadata.to_csv(f"{raw_location}/processed_sample_metadata.tsv", sep='\t')
# save OTU taxonomy
feature_metadata.to_csv(f"{raw_location}/processed_feature_metadata.tsv", sep='\t')
feature_metadata["taxonomy"].to_csv(f"{raw_location}/processed_taxonomy.tsv", sep='\t')
feature_metadata[["kingdom", "phylum", "class", "order", "family", "genus", "species"]].to_csv(f"{raw_location}/processed_taxonomy_table.tsv", sep='\t')