## This script takes individual taxon raw abundance files, with one column named 'Taxonomy' and one named with the sample identifier, and merges them into a spreadsheet that can be used for bar plots, diversity analyses, etc.

## For the Norovirus manuscript this was run after 'vsearch_out_format_for_merge.ipynb'

In [1]:
import os as os
import pandas as pd
import numpy as np
import glob as glob
from functools import reduce

In [2]:
# IPython `cd` automagic: switch the working directory to the folder holding
# the per-sample CSV files. Later cells use relative paths (the "*.csv" glob
# and the output file name), so they read from and write to this directory.
cd ~/Dropbox\ \(GaTech\)/Norovirus/vsearch/vsearch_out/baselines

/Users/npatin3/Dropbox (GaTech)/Norovirus/vsearch/vsearch_out/baselines


### Make a dictionary with sample name and associated file

In [3]:
# Map each sample name (e.g. '36_1') to its per-sample abundance file
# (e.g. '36_1_pdformat.csv') found in the current working directory.
files = {}
for file in glob.glob("*.csv"):
    # splitext only strips the final extension, so extra dots in a file name
    # do not break the parsing.
    stem, _ext = os.path.splitext(file)
    parts = stem.split('_')
    # Input files are named '<x>_<y>_<suffix>.csv'. Skip any other CSV in the
    # directory -- notably the merged 'Noro_baselines_16S_all.csv' that this
    # notebook writes here -- instead of crashing when the notebook is re-run.
    if len(parts) != 3:
        continue
    name = parts[0] + '_' + parts[1]
    files[name] = file
print(files)

{'36_1': '36_1_pdformat.csv', '37_1': '37_1_pdformat.csv', '41_1': '41_1_pdformat.csv', '49_1': '49_1_pdformat.csv', '38_1': '38_1_pdformat.csv', '28_1': '28_1_pdformat.csv', '4_1': '4_1_pdformat.csv', '15_1': '15_1_pdformat.csv', '13_1': '13_1_pdformat.csv'}


### Use the dictionary to merge all files on the Taxonomy column with a full outer merge (keeping all taxa)

In [5]:
# Load every per-sample abundance table listed in `files` into a DataFrame.
reads = [pd.read_csv(csv_name) for csv_name in files.values()]


def _outer_merge_on_taxonomy(left, right):
    """Outer-join two tables on 'Taxonomy', keeping taxa found in either one."""
    return pd.merge(left, right, on='Taxonomy', how='outer')


# Fold the list pairwise into one wide table; cells for taxa that a sample
# does not contain are left as NaN by the outer join.
reads_df = reduce(_outer_merge_on_taxonomy, reads)

In [6]:
# Preview the first ten rows of the merged table.
reads_df.iloc[:10]

Unnamed: 0,Taxonomy,36_1,37_1,41_1,49_1,38_1,28_1,4_1,15_1,13_1
0,Acidobacteria;Holophagae;Acanthopleuribacteral...,1.0,,,,,,,,
1,Actinobacteria;Actinomycetales;Actinomycetacea...,3.0,,,2.0,,3.0,,2.0,
2,Actinobacteria;Actinomycetales;Actinomycetacea...,13.0,,,,1.0,18.0,1.0,7.0,5.0
3,Actinobacteria;Actinomycetales;Actinomycetacea...,2.0,,,,,2.0,,1.0,1.0
4,Actinobacteria;Bifidobacteriales;Bifidobacteri...,908.0,,78.0,187.0,137.0,96.0,185.0,262.0,200.0
5,Actinobacteria;Bifidobacteriales;Bifidobacteri...,92.0,,9.0,7.0,5.0,1.0,,9.0,17.0
6,Actinobacteria;Bifidobacteriales;Bifidobacteri...,59.0,,17.0,20.0,21.0,9.0,7.0,40.0,26.0
7,Actinobacteria;Bifidobacteriales;Bifidobacteri...,198.0,,13.0,38.0,30.0,15.0,46.0,100.0,54.0
8,Actinobacteria;Bifidobacteriales;Bifidobacteri...,1.0,,,,,,,1.0,
9,Actinobacteria;Coriobacteriia;Coriobacteriales...,5.0,,1.0,,1.0,3.0,2.0,6.0,3.0


### Replace all instances of NaN with 0

In [7]:
# Taxa absent from a sample come out of the outer merge as NaN; store them as 0.
reads_df = reads_df.fillna(value=0)

### Save as csv

In [9]:
# Write the merged table to the current working directory without the
# DataFrame's row index. `index` is documented as a bool, so pass False
# explicitly rather than relying on None being falsy.
reads_df.to_csv("Noro_baselines_16S_all.csv", index=False)