## This script takes individual (sample-specific) taxon raw abundance files, with one column named 'Taxonomy' and one named with the sample identifier, and merges them into a spreadsheet that can be used for bar plots, diversity analyses, etc.

## You need all your individual .csv files in one directory, and no other .csv files in that directory for this to work.

## For the Norovirus manuscript this was run after 'vsearch_out_format_for_merge.ipynb'

In [3]:
import os as os
import pandas as pd
import numpy as np
import glob as glob
from functools import reduce

In [4]:
cd ~/Dropbox\ \(GaTech\)/Norovirus/vsearch/vsearch_out/timeseries

/Users/npatin3/Dropbox (GaTech)/Norovirus/vsearch/vsearch_out/timeseries


### Make a dictionary with sample name and associated file

In [6]:
files = {}
for file in glob.glob("*.csv"):
    a, b = file.split('.')
    x, y, z = a.split('_')
    name = x + '_' + y
    files[name] = file
print(files)

{'15_9': '15_9_pdformat.csv', '37_7': '37_7_pdformat.csv', '36_12': '36_12_pdformat.csv', '36_1': '36_1_pdformat.csv', '37_1': '37_1_pdformat.csv', '15_8': '15_8_pdformat.csv', '15_3': '15_3_pdformat.csv', '36_9': '36_9_pdformat.csv', '37_8': '37_8_pdformat.csv', '37_10': '37_10_pdformat.csv', '15_1': '15_1_pdformat.csv', '15_6': '15_6_pdformat.csv', '15_7': '15_7_pdformat.csv', '37_11': '37_11_pdformat.csv', '37_9': '37_9_pdformat.csv', '37_4': '37_4_pdformat.csv', '36_5': '36_5_pdformat.csv', '36_11': '36_11_pdformat.csv', '36_4': '36_4_pdformat.csv'}


### Use the dictionary to merge all files on the Taxonomy file with a fill outer merge (keeping all taxa)

In [7]:
reads = []
for key in files:
    df = pd.read_csv(files[key])
    reads.append(df)

reads_df = reduce(lambda x, y: pd.merge(x, y, on = 'Taxonomy', how='outer'), reads)

In [8]:
reads_df.head(10)

Unnamed: 0,Taxonomy,15_9,37_7,36_12,36_1,37_1,15_8,15_3,36_9,37_8,37_10,15_1,15_6,15_7,37_11,37_9,37_4,36_5,36_11,36_4
0,AELO01000022.3647.5110,1.0,,,,,3.0,3.0,,,,,3.0,1.0,,,,,,
1,AY196663.1.1261,1.0,,,,,,,,,,,,,,,,,,
2,Acidobacteria;Subgroup,1.0,,,,,1.0,,,1.0,,1.0,,,,,,1.0,,
3,Acidobacteria;Thermoanaerobaculia;Thermoanaero...,1.0,,,,,1.0,1.0,1.0,,,,,,,,1.0,2.0,,
4,Actinobacteria;Actinomycetales;Actinomycetacea...,1.0,2.0,11.0,3.0,,3.0,1.0,,,,2.0,,3.0,3.0,1.0,,4.0,1.0,12.0
5,Actinobacteria;Actinomycetales;Actinomycetacea...,6.0,2.0,42.0,13.0,,4.0,11.0,3.0,5.0,2.0,7.0,4.0,7.0,6.0,4.0,1.0,8.0,13.0,40.0
6,Actinobacteria;Actinomycetales;Actinomycetacea...,1.0,,8.0,2.0,,,,,2.0,,1.0,,,2.0,,,2.0,,4.0
7,Actinobacteria;Bifidobacteriales;Bifidobacteri...,162.0,,1712.0,908.0,,587.0,236.0,21.0,119.0,,262.0,22.0,245.0,1.0,5.0,,282.0,108.0,983.0
8,Actinobacteria;Bifidobacteriales;Bifidobacteri...,11.0,,104.0,92.0,,23.0,14.0,3.0,,,9.0,,13.0,,,,25.0,6.0,101.0
9,Actinobacteria;Bifidobacteriales;Bifidobacteri...,23.0,,108.0,59.0,,68.0,38.0,1.0,11.0,,40.0,5.0,31.0,,1.0,1.0,21.0,5.0,76.0


### Replace all instances of NaN with 0

In [9]:
reads_df = reads_df.fillna(0)

### Save as csv

In [10]:
reads_df.to_csv("Noro_timeseries_16S_all.csv", index=None)