# Format
> By Gati Aher  
> March 28, 2022

**Dataset:** FCF Biotic Perturbation (None-New-Native)

**Goal:** Put the raw data into a more workable format for downstream analysis.

## Outputs

* `data/processed_FCF/OTU_counts.tsv`
* `data/processed_FCF/taxonomy.tsv`
* `data/processed_FCF/sample_metadata.tsv`
* `data/processed_FCF/renamed_081616JHnew515Fcomplete-pr.fasta.otus.fa`

In [3]:
# Folder holding the raw input CSVs; the save cell at the end of the notebook also writes its TSV files here.
raw_location = "../data/raw/FCF_biotic"

In [4]:
# imports
import skbio
import pandas as pd
import numpy as np
import re

## Load Raw Data

Inputs:
* Sample Name x Meta Information
    * (Jean): `sample-metadata.csv`
* OTU ID-Taxonomy x Sample Name x OTU Counts
    * (VAMPS): `tax-data.csv`

In [3]:
# Read the per-sample metadata (first CSV column becomes the index) and
# normalise the column headers to the lowercase names used downstream.
sample_metadata = (
    pd.read_csv(f"{raw_location}/sample-metadata.csv", index_col=0)
    .rename(
        columns={
            "Label": "label",
            "Additional": "addition",
            "Series": "series",
            "Day": "day",
            "Replicate": "replicate",
        }
    )
)
sample_metadata

Unnamed: 0_level_0,label,addition,day,series,replicate
sampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
N00B,LAZ_OLIN_Bv4v5--Olin_FCF_day_0B_Bv4v5,none,0,N,B
N00O,LAZ_OLIN_Bv4v5--OlinTC3_1,none,0,N,O
N03A,LAZ_OLIN_Bv4v5--Olin_FCF_day_3A_Bv4v5,none,3,N,A
N03B,LAZ_OLIN_Bv4v5--Olin_FCF_day_3B_Bv4v5,none,3,N,B
N03O,LAZ_OLIN_Bv4v5--OlinTC3_6,none,3,N,O
N05A,LAZ_OLIN_Bv4v5--Olin_FCF_day_5A_Bv4v5,none,5,N,A
N05B,LAZ_OLIN_Bv4v5--Olin_FCF_day_5B_Bv4v5,none,5,N,B
N05O,LAZ_OLIN_Bv4v5--OlinTC3_11,none,5,N,O
N07A,LAZ_OLIN_Bv4v5--Olin_FCF_day_7A_Bv4v5,none,7,N,A
N07B,LAZ_OLIN_Bv4v5--Olin_FCF_day_7B_Bv4v5,none,7,N,B


In [4]:
# Load the comma-separated table of absolute OTU counts: the "Unnamed: 0"
# column carries the taxonomy string, every other column is one sample label.
otu_table_path = f"{raw_location}/tax-data.csv"
otu_raw = pd.read_csv(otu_table_path)
otu_raw

Unnamed: 0.1,Unnamed: 0,LAZ_OLIN_Bv4v5--OlinTC3_1,LAZ_OLIN_Bv4v5--OlinTC3_11,LAZ_OLIN_Bv4v5--OlinTC3_12,LAZ_OLIN_Bv4v5--OlinTC3_13,LAZ_OLIN_Bv4v5--OlinTC3_16,LAZ_OLIN_Bv4v5--OlinTC3_17,LAZ_OLIN_Bv4v5--OlinTC3_18,LAZ_OLIN_Bv4v5--OlinTC3_2,LAZ_OLIN_Bv4v5--OlinTC3_21,...,LAZ_OLIN_Bv4v5--Olin_FCF_New_Day_7C_Bv4v5,LAZ_OLIN_Bv4v5--Olin_FCF_day_0B_Bv4v5,LAZ_OLIN_Bv4v5--Olin_FCF_day_11A_Bv4v5,LAZ_OLIN_Bv4v5--Olin_FCF_day_11B_Bv4v5,LAZ_OLIN_Bv4v5--Olin_FCF_day_3A_Bv4v5,LAZ_OLIN_Bv4v5--Olin_FCF_day_3B_Bv4v5,LAZ_OLIN_Bv4v5--Olin_FCF_day_5A_Bv4v5,LAZ_OLIN_Bv4v5--Olin_FCF_day_5B_Bv4v5,LAZ_OLIN_Bv4v5--Olin_FCF_day_7A_Bv4v5,LAZ_OLIN_Bv4v5--Olin_FCF_day_7B_Bv4v5
0,Bacteria,0,0,0,0,0,0,0,0,0,...,0,1,1,1,0,1,2,0,0,0
1,Bacteria;Acidobacteria,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Bacteria;Acidobacteria;Acidobacteriia;Acidobac...,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,Bacteria;Actinobacteria;Actinobacteria;Actinom...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,Bacteria;Actinobacteria;Actinobacteria;Coryneb...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
274,Bacteria;Tenericutes;Mollicutes;Acholeplasmata...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
275,Bacteria;Thermotogae;Thermotogae;Thermotogales...,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
276,Bacteria;Verrucomicrobia,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
277,Bacteria;Verrucomicrobia;Opitutae;Opitutales;O...,415,276,270,416,182,490,443,790,526,...,7517,6393,16889,15252,7092,7069,15076,13588,10705,11708


## Process Data

Outputs:
* Sample Name x Meta Information
    * `sample_metadata.tsv`
* OTU ID x Sample ID x HTS Counts
    * `OTU_counts.tsv`
* OTU ID x Taxonomy
    * `taxonomy.tsv`

### Create `sample_metadata.tsv`

In [5]:
# create transfer column
def label_transfer(row):
    """Return the transfer label for one sample-metadata row.

    The label is the row's ``series`` value prefixed with "2" when its
    ``replicate`` is "A", "B" or "C", and with "1" for any other replicate.

    Parameters
    ----------
    row : mapping-like
        Must support ``row["replicate"]`` and ``row["series"]``; ``series``
        must be a string so it can be concatenated to the prefix.

    Returns
    -------
    str
        The prefix followed by the series value, e.g. "2N".
    """
    prefix = "2" if row["replicate"] in ("A", "B", "C") else "1"
    return prefix + row["series"]

# apply() with axis=1 hands each row to label_transfer; the returned labels become the new "transfer" column
sample_metadata["transfer"] = sample_metadata.apply(label_transfer, axis=1)
sample_metadata

Unnamed: 0_level_0,label,addition,day,series,replicate,transfer
sampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
N00B,LAZ_OLIN_Bv4v5--Olin_FCF_day_0B_Bv4v5,none,0,N,B,2N
N00O,LAZ_OLIN_Bv4v5--OlinTC3_1,none,0,N,O,1N
N03A,LAZ_OLIN_Bv4v5--Olin_FCF_day_3A_Bv4v5,none,3,N,A,2N
N03B,LAZ_OLIN_Bv4v5--Olin_FCF_day_3B_Bv4v5,none,3,N,B,2N
N03O,LAZ_OLIN_Bv4v5--OlinTC3_6,none,3,N,O,1N
N05A,LAZ_OLIN_Bv4v5--Olin_FCF_day_5A_Bv4v5,none,5,N,A,2N
N05B,LAZ_OLIN_Bv4v5--Olin_FCF_day_5B_Bv4v5,none,5,N,B,2N
N05O,LAZ_OLIN_Bv4v5--OlinTC3_11,none,5,N,O,1N
N07A,LAZ_OLIN_Bv4v5--Olin_FCF_day_7A_Bv4v5,none,7,N,A,2N
N07B,LAZ_OLIN_Bv4v5--Olin_FCF_day_7B_Bv4v5,none,7,N,B,2N


### Create `taxonomy.tsv`

In [6]:
# format table of OTU ID x taxonomy
# Rank names, in the order the ";"-separated taxonomy strings list them.
tax_ranks = ["kingdom", "phylum", "class", "order", "family", "genus", "species"]
taxonomy_strings = otu_raw["Unnamed: 0"]

# Split every taxonomy string in one vectorised call instead of assigning cell
# by cell. Reindexing to exactly len(tax_ranks) columns ignores any pieces past
# the last rank and guarantees that every rank column exists even when no row
# reaches that depth, so later selections by rank name cannot raise KeyError.
rank_table = (
    taxonomy_strings.str.split(";", expand=True)
    .reindex(columns=range(len(tax_ranks)))
)
rank_table.columns = tax_ranks

# featureID is derived from the row label of otu_raw ("OTU_<row label>").
feature_metadata = pd.DataFrame(
    {
        "featureID": ["OTU_" + str(otu) for otu in otu_raw.index],
        "taxonomy": taxonomy_strings,
    },
    index=otu_raw.index,
).join(rank_table)

# Ranks absent from a shorter taxonomy string are labelled "Unassigned".
feature_metadata = feature_metadata.fillna(value="Unassigned")
feature_metadata = feature_metadata.set_index("featureID")
feature_metadata

Unnamed: 0_level_0,taxonomy,kingdom,phylum,class,order,family,genus,species
featureID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
OTU_0,Bacteria,Bacteria,Unassigned,Unassigned,Unassigned,Unassigned,Unassigned,Unassigned
OTU_1,Bacteria;Acidobacteria,Bacteria,Acidobacteria,Unassigned,Unassigned,Unassigned,Unassigned,Unassigned
OTU_2,Bacteria;Acidobacteria;Acidobacteriia;Acidobac...,Bacteria,Acidobacteria,Acidobacteriia,Acidobacteriales,Unassigned,Unassigned,Unassigned
OTU_3,Bacteria;Actinobacteria;Actinobacteria;Actinom...,Bacteria,Actinobacteria,Actinobacteria,Actinomycetales,Actinomycetaceae,Actinobaculum,Unassigned
OTU_4,Bacteria;Actinobacteria;Actinobacteria;Coryneb...,Bacteria,Actinobacteria,Actinobacteria,Corynebacteriales,Corynebacteriaceae,Corynebacterium,Unassigned
...,...,...,...,...,...,...,...,...
OTU_274,Bacteria;Tenericutes;Mollicutes;Acholeplasmata...,Bacteria,Tenericutes,Mollicutes,Acholeplasmatales,Acholeplasmataceae,Acholeplasma,Unassigned
OTU_275,Bacteria;Thermotogae;Thermotogae;Thermotogales...,Bacteria,Thermotogae,Thermotogae,Thermotogales,Thermotogaceae,Unassigned,Unassigned
OTU_276,Bacteria;Verrucomicrobia,Bacteria,Verrucomicrobia,Unassigned,Unassigned,Unassigned,Unassigned,Unassigned
OTU_277,Bacteria;Verrucomicrobia;Opitutae;Opitutales;O...,Bacteria,Verrucomicrobia,Opitutae,Opitutales,Opitutaceae,Opitutus,Unassigned


In [7]:
# distinct genus labels in the table; "Unassigned" is the filler for taxonomy strings that have no genus entry
feature_metadata["genus"].unique()

array(['Unassigned', 'Actinobaculum', 'Corynebacterium', 'Rhodococcus',
       'Brevibacterium', 'Propionibacterium', 'Propionicicella',
       'Propionicimonas', 'Bacteroides', 'Alkaliflexus', 'Mangroviflexus',
       'Marinifilum', 'Dysgonomonas', 'Macellibacteroides',
       'Paludibacter', 'Prevotella', 'Cloacibacterium', 'Elizabethkingia',
       'Tenacibaculum', 'Filimonas', 'Prolixibacter', 'Chlorobaculum',
       'Chlorobium', 'Chloroherpeton', 'Ornatilinea', 'Synechococcus',
       'Prochlorococcus', 'Denitrovibrio', 'Fibrobacter',
       'Staphylococcus', 'Gemella', 'Aerococcus', 'Atopostipes',
       'Enterococcus', 'Vagococcus', 'Lactobacillus', 'Streptococcus',
       'Guggenheimella', 'Clostridium_sensu_stricto_10', 'genus_NA',
       'Anaerococcus', 'Fusibacter', 'Anaerofilum', 'Incertae_Sedis',
       'Dethiosulfatibacter', 'Erysipelothrix', 'Veillonella',
       'Rhodopirellula', 'Bradyrhizobium', 'Rhodoblastus',
       'Rhodopseudomonas', 'Tardiphaga', 'Ancalomicrobiu

### Create `OTU_counts.tsv`

In [8]:
# column mapper: sample label (as used for the otu_raw columns) -> sampleID
label_by_sample = sample_metadata["label"]
# Inverting a mapping silently keeps only one entry per duplicated value,
# which would leave some columns un-renamed; fail loudly instead.
assert label_by_sample.is_unique, "duplicate sample labels: label -> sampleID mapping would be ambiguous"
sample_names = {label: sample_id for sample_id, label in label_by_sample.items()}

# row mapper: full taxonomy string -> featureID
taxonomy_by_feature = feature_metadata["taxonomy"]
# Same reasoning: two OTUs sharing one taxonomy string would both be renamed
# to the same featureID in the counts table.
assert taxonomy_by_feature.is_unique, "duplicate taxonomy strings: taxonomy -> featureID mapping would be ambiguous"
tax_names = {taxonomy: feature_id for feature_id, taxonomy in taxonomy_by_feature.items()}
tax_names

{'Bacteria': 'OTU_0',
 'Bacteria;Acidobacteria': 'OTU_1',
 'Bacteria;Acidobacteria;Acidobacteriia;Acidobacteriales': 'OTU_2',
 'Bacteria;Actinobacteria;Actinobacteria;Actinomycetales;Actinomycetaceae;Actinobaculum': 'OTU_3',
 'Bacteria;Actinobacteria;Actinobacteria;Corynebacteriales;Corynebacteriaceae;Corynebacterium': 'OTU_4',
 'Bacteria;Actinobacteria;Actinobacteria;Corynebacteriales;Corynebacteriaceae;Corynebacterium;species_NA': 'OTU_5',
 'Bacteria;Actinobacteria;Actinobacteria;Corynebacteriales;Nocardiaceae;Rhodococcus': 'OTU_6',
 'Bacteria;Actinobacteria;Actinobacteria;Micrococcales;Brevibacteriaceae;Brevibacterium': 'OTU_7',
 'Bacteria;Actinobacteria;Actinobacteria;Propionibacteriales;Propionibacteriaceae': 'OTU_8',
 'Bacteria;Actinobacteria;Actinobacteria;Propionibacteriales;Propionibacteriaceae;Propionibacterium': 'OTU_9',
 'Bacteria;Actinobacteria;Actinobacteria;Propionibacteriales;Propionibacteriaceae;Propionibacterium;species_NA': 'OTU_10',
 'Bacteria;Actinobacteria;Actinob

In [9]:
# Build the counts table in one chain: sample labels -> sampleIDs on the
# columns, then the taxonomy column becomes the index and its taxonomy
# strings are replaced by featureIDs.
OTU_counts = (
    otu_raw
    .rename(columns=sample_names)
    .rename(columns={"Unnamed: 0": "featureID"})
    .set_index("featureID")
    .rename(index=tax_names)
)
OTU_counts

Unnamed: 0_level_0,N00O,N05O,X05O,R05O,N07O,X07O,R07O,X00O,N11O,X11O,...,X07C,N00B,N11A,N11B,N03A,N03B,N05A,N05B,N07A,N07B
featureID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
OTU_0,0,0,0,0,0,0,0,0,0,0,...,0,1,1,1,0,1,2,0,0,0
OTU_1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
OTU_2,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
OTU_3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
OTU_4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
OTU_274,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
OTU_275,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
OTU_276,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
OTU_277,415,276,270,416,182,490,443,790,526,689,...,7517,6393,16889,15252,7092,7069,15076,13588,10705,11708


### Additional Cleaning (Do Not Threshold OTUs)

In [10]:
# show the OTUs whose counts add up to zero across all samples (an empty table means there are none)
OTU_counts.loc[OTU_counts.sum(axis=1).eq(0)]

Unnamed: 0_level_0,N00O,N05O,X05O,R05O,N07O,X07O,R07O,X00O,N11O,X11O,...,X07C,N00B,N11A,N11B,N03A,N03B,N05A,N05B,N07A,N07B
featureID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [11]:
# Find every feature whose kingdom is anything other than "Bacteria" and
# report its taxonomy string before removing it.
not_bacteria = feature_metadata.loc[feature_metadata["kingdom"].ne("Bacteria")]
print(not_bacteria["taxonomy"])

# Drop the same featureIDs from both tables so they keep the same rows.
non_bacterial_ids = not_bacteria.index
OTU_counts = OTU_counts.drop(index=non_bacterial_ids)
print("OTU_counts.shape", OTU_counts.shape)

feature_metadata = feature_metadata.drop(index=non_bacterial_ids)
print("feature_metadata.shape", feature_metadata.shape)

Series([], Name: taxonomy, dtype: object)
OTU_counts.shape (279, 42)
feature_metadata.shape (279, 8)


## Alpha Diversity

In [12]:
# alpha diversity w/ phylogenetic weights example 
from skbio.diversity import alpha_diversity
# alpha_diversity(metric="faith_pd", counts=OTU_counts.T.values, ids=OTU_counts.columns, tree=tree_rooted, otu_ids=[n.name for n in tree_rooted.tips()], validate=True)

In [13]:
# Total count per sample = column sums of the OTU table. Assigning the Series
# matches its labels (the OTU_counts column names) against the sample_metadata index.
sample_metadata["RC"] = OTU_counts.sum(axis="index")
sample_metadata

Unnamed: 0_level_0,label,addition,day,series,replicate,transfer,RC
sampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
N00B,LAZ_OLIN_Bv4v5--Olin_FCF_day_0B_Bv4v5,none,0,N,B,2N,100614
N00O,LAZ_OLIN_Bv4v5--OlinTC3_1,none,0,N,O,1N,8721
N03A,LAZ_OLIN_Bv4v5--Olin_FCF_day_3A_Bv4v5,none,3,N,A,2N,128490
N03B,LAZ_OLIN_Bv4v5--Olin_FCF_day_3B_Bv4v5,none,3,N,B,2N,130748
N03O,LAZ_OLIN_Bv4v5--OlinTC3_6,none,3,N,O,1N,80391
N05A,LAZ_OLIN_Bv4v5--Olin_FCF_day_5A_Bv4v5,none,5,N,A,2N,175874
N05B,LAZ_OLIN_Bv4v5--Olin_FCF_day_5B_Bv4v5,none,5,N,B,2N,168080
N05O,LAZ_OLIN_Bv4v5--OlinTC3_11,none,5,N,O,1N,29167
N07A,LAZ_OLIN_Bv4v5--Olin_FCF_day_7A_Bv4v5,none,7,N,A,2N,135770
N07B,LAZ_OLIN_Bv4v5--Olin_FCF_day_7B_Bv4v5,none,7,N,B,2N,143028


In [14]:
# ADD shdiv to sample meta data
# Shannon diversity per sample, one row of OTU_counts.T per sample. The log
# base is passed explicitly (extra keyword arguments are forwarded to the
# metric) so the values are in base 2 regardless of which default the
# installed scikit-bio release uses.
shdiv = alpha_diversity(
    metric="shannon",
    counts=OTU_counts.T.values,
    ids=OTU_counts.columns,
    base=2,
)
sample_metadata["shdiv"] = shdiv
sample_metadata

Unnamed: 0_level_0,label,addition,day,series,replicate,transfer,RC,shdiv
sampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
N00B,LAZ_OLIN_Bv4v5--Olin_FCF_day_0B_Bv4v5,none,0,N,B,2N,100614,2.406747
N00O,LAZ_OLIN_Bv4v5--OlinTC3_1,none,0,N,O,1N,8721,3.102879
N03A,LAZ_OLIN_Bv4v5--Olin_FCF_day_3A_Bv4v5,none,3,N,A,2N,128490,1.488821
N03B,LAZ_OLIN_Bv4v5--Olin_FCF_day_3B_Bv4v5,none,3,N,B,2N,130748,1.467614
N03O,LAZ_OLIN_Bv4v5--OlinTC3_6,none,3,N,O,1N,80391,2.090803
N05A,LAZ_OLIN_Bv4v5--Olin_FCF_day_5A_Bv4v5,none,5,N,A,2N,175874,1.680408
N05B,LAZ_OLIN_Bv4v5--Olin_FCF_day_5B_Bv4v5,none,5,N,B,2N,168080,1.600235
N05O,LAZ_OLIN_Bv4v5--OlinTC3_11,none,5,N,O,1N,29167,2.323295
N07A,LAZ_OLIN_Bv4v5--Olin_FCF_day_7A_Bv4v5,none,7,N,A,2N,135770,1.703652
N07B,LAZ_OLIN_Bv4v5--Olin_FCF_day_7B_Bv4v5,none,7,N,B,2N,143028,1.607483


## Save

In [16]:
# Write every processed table as a tab-separated file under raw_location.
# NOTE(review): the "Outputs" list at the top of the notebook names files under
# data/processed_FCF/, which does not match the paths written here - confirm
# which location downstream notebooks expect.
exports = [
    # OTU counts
    (f"{raw_location}/processed_OTU_counts.tsv", OTU_counts),
    # sample metadata
    (f"{raw_location}/processed_sample_metadata.tsv", sample_metadata),
    # OTU taxonomy: full table, taxonomy string only, and rank columns only
    (f"{raw_location}/processed_feature_metadata.tsv", feature_metadata),
    (f"{raw_location}/processed_taxonomy.tsv", feature_metadata["taxonomy"]),
    (
        f"{raw_location}/processed_taxonomy_table.tsv",
        feature_metadata[["kingdom", "phylum", "class", "order", "family", "genus", "species"]],
    ),
]
for destination, table in exports:
    table.to_csv(destination, sep='\t')