# Analyze 2018 Pilot runs in Parallel

## Shell Variables
Define the shell variables used throughout this notebook and create the output directories.

In [None]:
# List the contents of the pilot data directory.
ls /data/hts2018_pilot/

In [None]:
# Treat any reference to an unset variable as an error.
set -u

# --- Input and output locations ---------------------------------------------
export CUROUT="$HOME/work/scratch/2018_stresstest"
export DATA_BASE="/data/hts2018_pilot/"
export RAW_FASTQS="$DATA_BASE/Granek_4837_180427A5"
# CURDATA_1=$CURDATA
# export INFO=$HOME/work/myinfo
export TRIMMED="$CUROUT/trimmed_fastqs"
export GENOME_DIR="$CUROUT/genome"
export STAR_OUT="$CUROUT/star_out_stress"
export ADAPTERS="$DATA_BASE/neb_E7600_adapters_withrc.fasta"

# --- RefSeq assembly names ----------------------------------------------------
# ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/fungi/Cryptococcus_neoformans/all_assembly_versions/GCF_000149245.1_CNA3
export ACCESSION="GCF_000149245.1_CNA3"
export PREFIX="${ACCESSION}_genomic"
export FNA="${PREFIX}.fna"
# GFF and GTF are deliberately NOT derived from $PREFIX: they are set below from
# the Ensembl URLs, which is what the previous duplicate assignments resolved to.

# --- Ensembl Genomes (release 39) reference files ----------------------------
export FA_URL="ftp://ftp.ensemblgenomes.org/pub/release-39/fungi/fasta/fungi_basidiomycota1_collection/cryptococcus_neoformans_var_grubii_h99/dna/Cryptococcus_neoformans_var_grubii_h99.CNA3.dna.toplevel.fa.gz"
export GTF_URL="ftp://ftp.ensemblgenomes.org/pub/release-39/fungi/gtf/fungi_basidiomycota1_collection/cryptococcus_neoformans_var_grubii_h99/Cryptococcus_neoformans_var_grubii_h99.CNA3.39.gtf.gz"
export GFF_URL="ftp://ftp.ensemblgenomes.org/pub/release-39/fungi/gff3/fungi_basidiomycota1_collection/cryptococcus_neoformans_var_grubii_h99/Cryptococcus_neoformans_var_grubii_h99.CNA3.39.gff3.gz"

# Local file names: the last path component of each URL with the .gz suffix removed.
GFF=${GFF_URL##*/}; export GFF="${GFF%.gz}"
GTF=${GTF_URL##*/}; export GTF="${GTF%.gz}"
FA=${FA_URL##*/};   export FA="${FA%.gz}"

# --- Parallelism ---------------------------------------------------------------
export THREADS=8
export MAX_JOBS=2

echo "$GFF" "$GTF" "$FA"

# Making New Directories
# rm -rf $CUROUT
mkdir -p "$TRIMMED" "$GENOME_DIR" "$STAR_OUT"

# ls $CURDATA

### Running the full pipeline on all the samples

In [None]:
trim_func() {
    # Trim one sample with fastq-mcf, feeding it all of the sample's lanes as a
    # single concatenated stream through a named pipe.
    # See https://github.com/ExpressionAnalysis/ea-utils/blob/wiki/FastqMcf.md
    #
    # Arguments:
    #   $1  path to the sample's lane-1 file (<sample>_L001_R1_001.fastq.gz);
    #       only its file name is used, to derive the sample name.
    # Environment (read):
    #   RAW_FASTQS  directory containing <sample>_L00[1-4]_R1_001.fastq.gz
    #   ADAPTERS    adapter FASTA handed to fastq-mcf
    #   TRIMMED     directory that receives <sample>.trim.fastq.gz
    # Output:
    #   Prints the sample name and the lane files found, then whatever
    #   fastq-mcf itself prints.
    # Returns:
    #   fastq-mcf's exit status; 1 if no lane files exist or the pipe cannot
    #   be created.
    local fastq sample tmp_dir cur_pipe gunzip_pid status
    local -a lanes

    fastq=${1:?usage: trim_func <sample>_L001_R1_001.fastq.gz}
    fastq=${fastq##*/}                      # strip directory from file path
    sample=${fastq%_L001_R1_001.fastq.gz}   # strip lane/read suffix and extension
    echo "$sample"

    # Collect the sample's lane files (L001-L004). The glob part must stay unquoted.
    lanes=( "$RAW_FASTQS/${sample}"_L00[1-4]_R1_001.fastq.gz )
    if [[ ! -e ${lanes[0]} ]]; then
        # An unmatched glob is left as the literal pattern, so test for existence.
        echo "trim_func: no lane files found for sample '$sample' in $RAW_FASTQS" >&2
        return 1
    fi
    ls "${lanes[@]}"

    # Create the pipe inside a private temporary directory. This avoids the
    # race of `mktemp --dry-run`, which only proposes a name without reserving it.
    tmp_dir=$(mktemp -d) || return 1
    cur_pipe="$tmp_dir/${sample}_pipe.fq"
    if ! mkfifo "$cur_pipe"; then
        rm -rf "$tmp_dir"
        return 1
    fi

    # Decompress every lane into the pipe in the background; the reader sees
    # one concatenated stream of all lanes.
    gunzip -c "${lanes[@]}" > "$cur_pipe" &
    gunzip_pid=$!

    # Use the pipe as input to fastq-mcf, remembering its exit status so that
    # the cleanup below does not mask a failure.
    status=0
    fastq-mcf \
        "$ADAPTERS" \
        "$cur_pipe" \
        -o "$TRIMMED/${sample}.trim.fastq.gz" \
        -q 20 -x 0.5 || status=$?

    # If fastq-mcf exited without ever opening the pipe, the writer is still
    # blocked on it: terminate it (a no-op when it already finished) and reap it.
    kill "$gunzip_pid" 2>/dev/null
    wait "$gunzip_pid" 2>/dev/null

    rm -rf "$tmp_dir"
    return "$status"
}
export -f trim_func

# Run trim_func once per matching file, with at most $MAX_JOBS jobs at a time.
# Each argument is a sample's lane-1 (L001) file; trim_func derives the sample
# name from it.
# Alternative selections, kept for reference:
# parallel --jobs 1 pipeline_func ::: $RAW_FASTQS/1_MA_*_L001_R1_001.fastq.gz
# parallel --jobs $MAX_JOBS trim_func ::: $RAW_FASTQS/35_MA_*_L001_R1_001.fastq.gz
# parallel --jobs $MAX_JOBS trim_func ::: $RAW_FASTQS/*_L001_R1_001.fastq.gz
# Active selection: only files whose names match [3-4]_RZ_*.
parallel --jobs $MAX_JOBS trim_func ::: $RAW_FASTQS/[3-4]_RZ_*_L001_R1_001.fastq.gz
