# Analyze 2018 Pilot runs in Parallel

## Shell Variables
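Load the shell variables used throughout this notebook by sourcing `stress_test_config.sh`, then list `$DATA_BASE` as a quick check that the configured path exists.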

In [None]:
# Pull the shared shell variables (DATA_BASE, and presumably the other paths
# used by later cells -- the config file is not visible here) into this session.
source stress_test_config.sh
# Sanity check: list the data root. Quoted so a path containing spaces is not
# word-split and an empty/unset DATA_BASE fails loudly instead of silently
# listing the current directory.
ls "$DATA_BASE"

### Running the full pipeline on all the samples

In [None]:
trim_func() {
    # Trim one sample with fastq-mcf, feeding it the concatenation of the
    # sample's lane files (L001-L004) through a named pipe.
    #
    # Arguments:
    #   $1  Path to the sample's lane-1 file, named
    #       <SAMPLE>_L001_R1_001.fastq.gz. Only the file name is used; the
    #       lane files themselves are looked up under $RAW_FASTQS.
    # Environment (must be visible to this function; expected to come from
    # the sourced config -- TODO confirm they are exported for parallel):
    #   RAW_FASTQS  directory holding <SAMPLE>_L00[1-4]_R1_001.fastq.gz
    #   ADAPTERS    adapter file passed to fastq-mcf as its first argument
    #   TRIMMED     directory receiving <SAMPLE>.trim.fastq.gz
    # Output:
    #   Prints the sample name and the lane files found to stdout.
    # Returns:
    #   The exit status of fastq-mcf, so callers such as GNU parallel can see
    #   a failed trim; 1 if the temporary FIFO could not be created.
    local fastq sample fifo_dir fifo gunzip_pid status

    fastq=${1##*/}                           # strip directory from file path
    sample=${fastq%_L001_R1_001.fastq.gz}    # strip lane/read suffix -> sample name
    echo "$sample"

    # Create the FIFO inside a private temporary directory. `mktemp -d`
    # atomically reserves the directory, unlike `mktemp --dry-run`, which only
    # prints a name that another process may claim before mkfifo runs.
    fifo_dir=$(mktemp -d) || return 1
    fifo="$fifo_dir/${sample}_pipe.fq"
    if ! mkfifo "$fifo"; then
        rm -rf "$fifo_dir"
        return 1
    fi

    # Show which lane files will be pooled for this sample.
    ls "$RAW_FASTQS/${sample}"_L00[1-4]_R1_001.fastq.gz

    # Decompress every lane into the FIFO in the background; the reader sees a
    # single concatenated FASTQ stream without an intermediate file on disk.
    gunzip -c "$RAW_FASTQS/${sample}"_L00[1-4]_R1_001.fastq.gz > "$fifo" &
    gunzip_pid=$!

    # Use the pipe as input to fastq-mcf
    # (see https://github.com/ExpressionAnalysis/ea-utils/blob/wiki/FastqMcf.md;
    # -q is the quality threshold for base removal, -x the 'N' percentage
    # threshold for cycle removal).
    fastq-mcf \
        "$ADAPTERS" \
        "$fifo" \
        -o "$TRIMMED/${sample}.trim.fastq.gz" \
        -q 20 -x 0.5
    status=$?

    # If fastq-mcf died before draining (or even opening) the FIFO, the writer
    # would stay blocked forever; stop it, reap it, then remove the FIFO.
    kill "$gunzip_pid" 2>/dev/null
    wait "$gunzip_pid" 2>/dev/null
    rm -rf "$fifo_dir"

    return "$status"
}
# Export so the child shells started by GNU parallel can call the function.
export -f trim_func

# Trim the selected samples concurrently: GNU parallel invokes trim_func once
# per lane-1 file matched by the glob, running up to $MAX_JOBS jobs at a time.
# Variables are quoted so paths with spaces are not word-split; the glob itself
# stays unquoted so the shell still expands it.
#
# Other sample selections used during development (swap the glob below):
#   "$RAW_FASTQS"/1_MA_*_L001_R1_001.fastq.gz    (run with --jobs 1 to debug serially)
#   "$RAW_FASTQS"/35_MA_*_L001_R1_001.fastq.gz
#   "$RAW_FASTQS"/*_L001_R1_001.fastq.gz         (every sample)
parallel --jobs "$MAX_JOBS" trim_func ::: "$RAW_FASTQS"/[3-4]_RZ_*_L001_R1_001.fastq.gz
