# 07/04/2020

Preparing the raw data for QIIME2.

1. Metadata validation  
2. Manifest file  

In [20]:
import pandas as pd
import numpy as np
import glob, os

In [22]:
cd /xdisk/tfaily/mig2020/extra/nathaliagg/sulfate_experiment/microbial_16S/

/xdisk/tfaily/mig2020/extra/nathaliagg/sulfate_experiment/microbial_16S


### 1. Metadata validation

I validated the metadata in `/Volumes/NGG_TFAILY_LAB_1/Methanogenesis/microbial/sequencing_data` and uploaded it here.

The `fastq` files have sample names separated by `-`, and the metadata has those names separated by `_`.

In [3]:
ls

metadata.tsv  [0m[01;34mraw_data[0m/  raw_data.zip
[m

In [28]:
# Load the tab-separated metadata table and preview its first rows.
metadf = pd.read_table("o_metadata.tsv")
metadf.head()

Unnamed: 0,SampleID,Temperature,Sulfate,Time,TechRep
0,T04_C05_T6_RA,4°C,0.5 mM,T6,RA
1,T04_C05_T6_RB,4°C,0.5 mM,T6,RB
2,T04_C05_T4_RA,4°C,0.5 mM,T4,RA
3,T04_C05_T4_RB,4°C,0.5 mM,T4,RB
4,T04_C05_T1_RA,4°C,0.5 mM,T1,RA


In [29]:
# The fastq file names use '-' where the metadata uses '_'; rewrite the
# SampleID values so both sources share the same identifiers.
hyphenated_ids = metadf['SampleID'].replace(to_replace='_', value='-', regex=True)
metadf['SampleID'] = hyphenated_ids

In [31]:
# Re-display the first rows to confirm that SampleID now uses hyphens.
metadf.head()

Unnamed: 0,SampleID,Temperature,Sulfate,Time,TechRep
0,T04-C05-T6-RA,4°C,0.5 mM,T6,RA
1,T04-C05-T6-RB,4°C,0.5 mM,T6,RB
2,T04-C05-T4-RA,4°C,0.5 mM,T4,RA
3,T04-C05-T4-RB,4°C,0.5 mM,T4,RB
4,T04-C05-T1-RA,4°C,0.5 mM,T1,RA


In [32]:
# Save the table with the hyphenated sample IDs as a tab-separated file,
# leaving out the pandas row index.
metadf.to_csv(path_or_buf='metadata.tsv', index=False, sep="\t")

### 2. Manifest file

A “manifest file” maps sample identifiers to fastq.gz or fastq absolute filepaths. The manifest file also indicates the direction of the reads in each fastq.gz or fastq file.

The manifest file is a tab-separated (i.e., .tsv) text file. The first column defines the Sample ID, while the second (and optional third) column defines the absolute filepath to the forward (and optional reverse) reads. 

In [34]:
# Collect the sample IDs in metadata row order, then report how many there are.
sampleid_new = metadf['SampleID'].tolist()
len(sampleid_new)

90

In [35]:
# Spot-check the second sample ID in the list.
sampleid_new[1]

'T04-C05-T6-RB'

Remember to set `$DATADIR` to `/xdisk/tfaily/mig2020/extra/nathaliagg/sulfate_experiment/microbial_16S/raw_data`; the manifest paths built below all start with `$DATADIR/`.

In [36]:
cd raw_data

/xdisk/tfaily/mig2020/extra/nathaliagg/sulfate_experiment/microbial_16S/raw_data


In [37]:
# Gather the gzipped fastq files from the forward (R1) and reverse (R2)
# sub-directories, relative to the current working directory.
files_forward, files_reverse = (
    glob.glob(f"{read_dir}/*.fastq.gz") for read_dir in ("R1", "R2")
)

# Both directions should report the same number of files.
print(len(files_forward), len(files_reverse))

90 90


In [38]:
# Peek at one forward-read path to see the file naming pattern on disk.
files_forward[1]

'R1/T04-C05-T1-RA_S5_L001_R1_001.fastq.gz'

In [39]:
def _match_fastq_paths(sample_ids, fastq_files, prefix="$DATADIR/"):
    """Return one prefixed fastq path per sample ID, in sample order.

    Parameters
    ----------
    sample_ids : list of str
        Sample identifiers, in the order the manifest rows should appear.
    fastq_files : list of str
        Relative fastq paths as returned by ``glob`` (e.g. ``R1/<name>.fastq.gz``).
    prefix : str
        Text prepended to every matched path. It is kept as the literal
        ``$DATADIR/`` placeholder, so that variable must be set in the
        environment before the manifest is used.

    Returns
    -------
    list of str
        ``prefix + path`` for each sample, aligned one-to-one with ``sample_ids``.

    Raises
    ------
    ValueError
        If any sample ID matches zero files or more than one file. Without
        this check a missing or duplicated match silently shifts the list,
        so later rows would be paired with the wrong sample.
    """
    matched_paths = []
    problems = []
    for sample_id in sample_ids:
        # Match on "<sample_id>_" rather than the bare ID: the file names put
        # an underscore right after the sample name, and requiring it stops an
        # ID that is a prefix of another ID from matching the wrong file.
        hits = [
            path for path in fastq_files
            if os.path.basename(path).startswith(sample_id + "_")
        ]
        if len(hits) == 1:
            matched_paths.append(prefix + hits[0])
        else:
            problems.append(f"{sample_id}: {len(hits)} matching files")
    if problems:
        raise ValueError(
            "Each sample ID must match exactly one fastq file; "
            + "; ".join(problems)
        )
    return matched_paths


# Forward and reverse paths are built the same way, so both lists line up
# row-for-row with sampleid_new.
path_f_files = _match_fastq_paths(sampleid_new, files_forward)
path_r_files = _match_fastq_paths(sampleid_new, files_reverse)

In [41]:
# Spot-check the second assembled forward path.
path_f_files[1]

'$DATADIR/R1/T04-C05-T6-RB_S2_L001_R1_001.fastq.gz'

In [42]:
# Assemble the manifest table: one row per sample, with the forward and
# reverse read paths in the second and third columns.
manifest = pd.DataFrame(
    {
        'SampleID': sampleid_new,
        'forward-absolute-filepath': path_f_files,
        'reverse-absolute-filepath': path_r_files,
    }
)
manifest.head()

Unnamed: 0,SampleID,forward-absolute-filepath,reverse-absolute-filepath
0,T04-C05-T6-RA,$DATADIR/R1/T04-C05-T6-RA_S1_L001_R1_001.fastq.gz,$DATADIR/R2/T04-C05-T6-RA_S1_L001_R2_001.fastq.gz
1,T04-C05-T6-RB,$DATADIR/R1/T04-C05-T6-RB_S2_L001_R1_001.fastq.gz,$DATADIR/R2/T04-C05-T6-RB_S2_L001_R2_001.fastq.gz
2,T04-C05-T4-RA,$DATADIR/R1/T04-C05-T4-RA_S3_L001_R1_001.fastq.gz,$DATADIR/R2/T04-C05-T4-RA_S3_L001_R2_001.fastq.gz
3,T04-C05-T4-RB,$DATADIR/R1/T04-C05-T4-RB_S4_L001_R1_001.fastq.gz,$DATADIR/R2/T04-C05-T4-RB_S4_L001_R2_001.fastq.gz
4,T04-C05-T1-RA,$DATADIR/R1/T04-C05-T1-RA_S5_L001_R1_001.fastq.gz,$DATADIR/R2/T04-C05-T1-RA_S5_L001_R2_001.fastq.gz


In [43]:
# Write the manifest as a tab-separated file into the current directory,
# leaving out the pandas row index.
manifest.to_csv(path_or_buf='manifest.tsv', sep='\t', index=False)

Done!